BUG#1860 - Anchors of interwiki links did not get normalized
[lhc/web/wiklou.git] / includes / Sanitizer.php
1 <?php
2
3 /**
4 * (X)HTML sanitizer for MediaWiki
5 *
6 * Copyright (C) 2002-2005 Brion Vibber <brion@pobox.com> et al
7 * http://www.mediawiki.org/
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write to the Free Software Foundation, Inc.,
21 * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
22 * http://www.gnu.org/copyleft/gpl.html
23 *
24 * @package MediaWiki
25 */
26
27 class Sanitizer {
/**
 * Cleans up HTML, removes dangerous tags and attributes, and
 * removes HTML comments.
 *
 * The text is split on '<'. Each fragment that parses as a tag whose
 * name is on the allowed list (only populated when $wgUserHtml is on)
 * is re-emitted with its attributes run through fixTagAttributes();
 * every other fragment is emitted with its angle brackets escaped.
 * When $wgUseTidy is off, tag nesting is additionally tracked so that
 * mismatched closers are escaped and unclosed tags are closed at the end.
 *
 * @access private
 * @param string $text
 * @return string
 */
function removeHTMLtags( $text ) {
	global $wgUseTidy, $wgUserHtml;
	$fname = 'Sanitizer::removeHTMLtags';
	wfProfileIn( $fname );

	if( $wgUserHtml ) {
		$htmlpairs = array( # Tags that must be closed
			'b', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
			'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
			'strike', 'strong', 'tt', 'var', 'div', 'center',
			'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
			'ruby', 'rt' , 'rb' , 'rp', 'p', 'span'
		);
		$htmlsingle = array(
			'br', 'hr', 'li', 'dt', 'dd'
		);
		$htmlnest = array( # Tags that can be nested--??
			'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
			'dl', 'font', 'big', 'small', 'sub', 'sup', 'span'
		);
		$tabletags = array( # Can only appear inside table
			'td', 'th', 'tr'
		);
	} else {
		$htmlpairs = array();
		$htmlsingle = array();
		$htmlnest = array();
		$tabletags = array();
	}

	# Table-only tags are treated like single tags: never pushed on the stack
	$htmlsingle = array_merge( $tabletags, $htmlsingle );
	$htmlelements = array_merge( $htmlsingle, $htmlpairs );

	# Remove HTML comments
	$text = Sanitizer::removeHTMLcomments( $text );

	# One '<'-delimited fragment: optional slash, tag name, attribute soup,
	# closing bracket, then the text up to the next '<'.
	$tagPattern = '/^(\\/?)(\\w+)([^>]*)(\\/{0,1}>)([^<]*)$/';

	$bits = explode( '<', $text );
	$text = array_shift( $bits );
	if( !$wgUseTidy ) {
		$tagstack = array();
		$tablestack = array();
		foreach ( $bits as $x ) {
			if ( !preg_match( $tagPattern, $x, $regs ) ) {
				# Not shaped like a tag at all; show it literally
				$text .= '&lt;' . str_replace( '>', '&gt;', $x );
				continue;
			}
			list( , $slash, $t, $params, $brace, $rest ) = $regs;
			$t = strtolower( $t );

			$badtag = false;
			if ( in_array( $t, $htmlelements ) ) {
				# Check our stack
				if ( $slash ) {
					# Closing a tag...
					$newparams = '';
					if ( !in_array( $t, $htmlsingle ) ) {
						$ot = array_pop( $tagstack );
						if ( $ot !== $t ) {
							# Closer does not match the innermost open tag.
							# Restore the stack; an empty stack stays empty.
							if ( !is_null( $ot ) ) {
								array_push( $tagstack, $ot );
							}
							$badtag = true;
						}
					}
					if ( !$badtag && $t == 'table' ) {
						# Leaving a table: resume the enclosing context
						$tagstack = array_pop( $tablestack );
					}
				} else {
					# Keep track for later
					if ( in_array( $t, $tabletags ) &&
						!in_array( 'table', $tagstack ) ) {
						$badtag = true;
					} else if ( in_array( $t, $tagstack ) &&
						!in_array( $t, $htmlnest ) ) {
						$badtag = true;
					} else if ( !in_array( $t, $htmlsingle ) ) {
						if ( $t == 'table' ) {
							# Each table gets a fresh stack of its own
							array_push( $tablestack, $tagstack );
							$tagstack = array();
						}
						array_push( $tagstack, $t );
					}
					# Strip non-approved attributes from the tag
					$newparams = Sanitizer::fixTagAttributes( $params, $t );
				}
				if ( !$badtag ) {
					$rest = str_replace( '>', '&gt;', $rest );
					$text .= "<$slash$t$newparams$brace$rest";
					continue;
				}
			}
			$text .= '&lt;' . str_replace( '>', '&gt;', $x );
		}
		# Close off any remaining tags
		while ( is_array( $tagstack ) && ( $t = array_pop( $tagstack ) ) ) {
			$text .= "</$t>\n";
			if ( $t == 'table' ) {
				$tagstack = array_pop( $tablestack );
			}
		}
	} else {
		# this might be possible using tidy itself
		foreach ( $bits as $x ) {
			if ( preg_match( $tagPattern, $x, $regs ) ) {
				list( , $slash, $t, $params, $brace, $rest ) = $regs;
				$t = strtolower( $t );
				if ( in_array( $t, $htmlelements ) ) {
					$newparams = Sanitizer::fixTagAttributes( $params, $t );
					$rest = str_replace( '>', '&gt;', $rest );
					$text .= "<$slash$t$newparams$brace$rest";
					continue;
				}
			}
			$text .= '&lt;' . str_replace( '>', '&gt;', $x );
		}
	}
	wfProfileOut( $fname );
	return $text;
}
146
/**
 * Remove '<!--', '-->', and everything between.
 * To avoid leaving blank lines, when a comment is both preceded
 * and followed by a newline (ignoring spaces), trim leading and
 * trailing spaces and one of the newlines.
 *
 * An unterminated comment (no closing '-->') is left in the text as is.
 *
 * @access private
 * @param string $text
 * @return string
 */
function removeHTMLcomments( $text ) {
	$fname = 'Sanitizer::removeHTMLcomments';
	wfProfileIn( $fname );
	while ( ( $start = strpos( $text, '<!--' ) ) !== false ) {
		$end = strpos( $text, '-->', $start + 4 );
		if ( $end === false ) {
			# Unterminated comment; bail out
			break;
		}

		# Move past the closing '-->'
		$end += 3;

		# Trim space and newline if the comment is both
		# preceded and followed by a newline
		$spaceStart = max( $start - 1, 0 );
		$spaceLen = $end - $spaceStart;
		# Walk backwards over the spaces in front of the comment
		while ( substr( $text, $spaceStart, 1 ) === ' ' && $spaceStart > 0 ) {
			$spaceStart--;
			$spaceLen++;
		}
		# Walk forwards over the spaces after the comment
		while ( substr( $text, $spaceStart + $spaceLen, 1 ) === ' ' ) {
			$spaceLen++;
		}
		if ( substr( $text, $spaceStart, 1 ) === "\n"
			and substr( $text, $spaceStart + $spaceLen, 1 ) === "\n" ) {
			# Remove the comment, leading and trailing
			# spaces, and leave only one newline.
			$text = substr_replace( $text, "\n", $spaceStart, $spaceLen + 1 );
		} else {
			# Remove just the comment.
			$text = substr_replace( $text, '', $start, $end - $start );
		}
	}
	wfProfileOut( $fname );
	return $text;
}
192
/**
 * Take a tag soup fragment listing an HTML element's attributes
 * and normalize it to well-formed XML, discarding unwanted attributes.
 *
 * - Normalizes attribute names to lowercase
 * - Discards attributes not on a whitelist for the given element
 * - Turns broken or invalid entities into plaintext
 * - Double-quotes all attribute values
 * - Attributes without values are given the name as attribute
 * - Double attributes are discarded
 * - Unsafe style attributes are discarded
 * - Prepends space if there are attributes.
 *
 * @param string $text
 * @param string $element
 * @return string
 *
 * @todo Check for legal values where the DTD limits things.
 * @todo Check for unique id attribute :P
 */
function fixTagAttributes( $text, $element ) {
	if( trim( $text ) == '' ) {
		return '';
	}

	$attrib = '[A-Za-z0-9]'; #FIXME
	$space = '[\x09\x0a\x0d\x20]';
	if( !preg_match_all(
		"/(?:^|$space)($attrib+)
		  ($space*=$space*
			(?:
			 # The attribute value: quoted or alone
			  \"([^<\"]*)\"
			 | '([^<']*)'
			 |  ([a-zA-Z0-9._:-]+)
			 | (\#[0-9a-fA-F]+) # Technically wrong, but lots of
			                     # colors are specified like this.
			                     # We'll be normalizing it.
			)
		   )?(?=$space|\$)/sx",
		$text,
		$pairs,
		PREG_SET_ORDER ) ) {
		return '';
	}

	$whitelist = array_flip( Sanitizer::attributeWhitelist( $element ) );
	$attribs = array();
	foreach( $pairs as $set ) {
		$attribute = strtolower( $set[1] );
		if( !isset( $whitelist[$attribute] ) ) {
			continue;
		}

		# PCRE leaves trailing groups that did not take part in the match
		# out of $set entirely, while earlier unmatched groups are ''.
		# The highest index present therefore identifies the value form,
		# and an empty quoted value ("" or '') is still recognised.
		if( !isset( $set[2] ) || $set[2] == '' ) {
			# In XHTML, attributes must have a value.
			$value = $set[1];
		} elseif( isset( $set[6] ) ) {
			# Illegal #XXXXXX color with no quotes.
			$value = Sanitizer::normalizeAttributeValue( $set[6] );
		} elseif( isset( $set[5] ) ) {
			# No quotes.
			$value = Sanitizer::normalizeAttributeValue( $set[5] );
		} elseif( isset( $set[4] ) ) {
			# Single-quoted; may contain double quotes that need escaping
			$value = str_replace( '"', '&quot;',
				Sanitizer::normalizeAttributeValue( $set[4] ) );
		} elseif( isset( $set[3] ) ) {
			# Double-quoted
			$value = Sanitizer::normalizeAttributeValue( $set[3] );
		} else {
			wfDebugDieBacktrace( "Tag conditions not met. Something's very odd." );
		}

		# Strip javascript "expression" from stylesheets.
		# http://msdn.microsoft.com/workshop/author/dhtml/overview/recalc.asp
		if( $attribute == 'style' && preg_match(
			'/(expression|tps*:\/\/|url\\s*\().*/is',
				wfMungeToUtf8( $value ) ) ) {
			# haxx0r
			continue;
		}

		# First occurrence of an attribute wins; later duplicates are dropped
		if( !isset( $attribs[$attribute] ) ) {
			$attribs[$attribute] = "$attribute=\"$value\"";
		}
	}
	if( empty( $attribs ) ) {
		return '';
	} else {
		return ' ' . implode( ' ', $attribs );
	}
}
285
/**
 * Prepare XML source-encoded text for use as an attribute value:
 * character references are normalized first, then every whitespace
 * character (a CR+LF pair counting as one) is turned into a single space.
 *
 * See http://www.w3.org/TR/REC-xml/#AVNormalize for background,
 * but note that the result is not the attribute's value; it is an
 * XML source fragment that will be slapped into output.
 *
 * @param string $text
 * @return string
 * @access private
 */
function normalizeAttributeValue( $text ) {
	$withCleanReferences = Sanitizer::normalizeCharReferences( $text );
	$whitespace = '/\r\n|[\x20\x0d\x0a\x09]/';
	return preg_replace( $whitespace, ' ', $withCleanReferences );
}
304
/**
 * Ensure that any entities and character references are legal
 * for XML and XHTML specifically. Any stray bits will be
 * &amp;-escaped to result in a valid text fragment.
 *
 * a. any named char refs must be known in XHTML
 * b. any numeric char refs must be legal chars, not invalid or forbidden
 * c. use &#x, not &#X
 * d. fix or reject non-valid attributes
 *
 * Hexadecimal references are only recognised when they consist purely
 * of hex digits; anything else (e.g. "&#x4g1;") is treated as a stray
 * ampersand and escaped, rather than being handed to hexdec(), which
 * would silently skip the bad characters.
 *
 * @param string $text
 * @return string
 * @access private
 */
function normalizeCharReferences( $text ) {
	return preg_replace_callback(
		'/&([A-Za-z0-9]+);
		 |&\#([0-9]+);
		 |&\#x([0-9A-Fa-f]+);
		 |&\#X([0-9A-Fa-f]+);
		 |(&)/x',
		array( 'Sanitizer', 'normalizeCharReferencesCallback' ),
		$text );
}
/**
 * preg_replace_callback() handler for normalizeCharReferences().
 *
 * Group 1 is a named entity, group 2 a decimal reference, groups 3 and 4
 * a hexadecimal reference. When none of them is filled in, or the helper
 * rejects the reference by returning null, the whole match is returned
 * HTML-escaped.
 *
 * @param array $matches
 * @return string
 */
function normalizeCharReferencesCallback( $matches ) {
	if( $matches[1] != '' ) {
		$normalized = Sanitizer::normalizeEntity( $matches[1] );
	} elseif( $matches[2] != '' ) {
		$normalized = Sanitizer::decCharReference( $matches[2] );
	} elseif( $matches[3] != '' ) {
		$normalized = Sanitizer::hexCharReference( $matches[3] );
	} elseif( $matches[4] != '' ) {
		$normalized = Sanitizer::hexCharReference( $matches[4] );
	} else {
		$normalized = null;
	}

	return is_null( $normalized )
		? htmlspecialchars( $matches[0] )
		: $normalized;
}
350
/**
 * Pass a named entity through if its name is on the list of known
 * entities below; otherwise return the pseudo-entity with its
 * ampersand escaped (eg &amp;foo;). Names are case-sensitive.
 *
 * @param string $name
 * @return string
 */
function normalizeEntity( $name ) {
	# List of all named character entities defined in HTML 4.01
	# http://www.w3.org/TR/html4/sgml/entities.html
	# Built once per request; only the keys matter.
	static $htmlEntities = null;
	if( is_null( $htmlEntities ) ) {
		$htmlEntities = array_flip( array(
			'aacute', 'Aacute', 'acirc', 'Acirc', 'acute', 'aelig', 'AElig',
			'agrave', 'Agrave', 'alefsym', 'alpha', 'Alpha', 'amp', 'and',
			'ang', 'apos', 'aring', 'Aring', 'asymp', 'atilde', 'Atilde',
			'auml', 'Auml',
			'bdquo', 'beta', 'Beta', 'brvbar', 'bull',
			'cap', 'ccedil', 'Ccedil', 'cedil', 'cent', 'chi', 'Chi', 'circ',
			'clubs', 'cong', 'copy', 'crarr', 'cup', 'curren',
			'dagger', 'Dagger', 'darr', 'dArr', 'deg', 'delta', 'Delta',
			'diams', 'divide',
			'eacute', 'Eacute', 'ecirc', 'Ecirc', 'egrave', 'Egrave', 'empty',
			'emsp', 'ensp', 'epsilon', 'Epsilon', 'equiv', 'eta', 'Eta',
			'eth', 'ETH', 'euml', 'Euml', 'euro', 'exist',
			'fnof', 'forall', 'frac12', 'frac14', 'frac34', 'frasl',
			'gamma', 'Gamma', 'ge', 'gt',
			'harr', 'hArr', 'hearts', 'hellip',
			'iacute', 'Iacute', 'icirc', 'Icirc', 'iexcl', 'igrave', 'Igrave',
			'image', 'infin', 'int', 'iota', 'Iota', 'iquest', 'isin',
			'iuml', 'Iuml',
			'kappa', 'Kappa',
			'lambda', 'Lambda', 'lang', 'laquo', 'larr', 'lArr', 'lceil',
			'ldquo', 'le', 'lfloor', 'lowast', 'loz', 'lrm', 'lsaquo',
			'lsquo', 'lt',
			'macr', 'mdash', 'micro', 'middot', 'minus', 'mu', 'Mu',
			'nabla', 'nbsp', 'ndash', 'ne', 'ni', 'not', 'notin', 'nsub',
			'ntilde', 'Ntilde', 'nu', 'Nu',
			'oacute', 'Oacute', 'ocirc', 'Ocirc', 'oelig', 'OElig', 'ograve',
			'Ograve', 'oline', 'omega', 'Omega', 'omicron', 'Omicron',
			'oplus', 'or', 'ordf', 'ordm', 'oslash', 'Oslash', 'otilde',
			'Otilde', 'otimes', 'ouml', 'Ouml',
			'para', 'part', 'permil', 'perp', 'phi', 'Phi', 'pi', 'Pi', 'piv',
			'plusmn', 'pound', 'prime', 'Prime', 'prod', 'prop', 'psi', 'Psi',
			'quot',
			'radic', 'rang', 'raquo', 'rarr', 'rArr', 'rceil', 'rdquo', 'real',
			'reg', 'rfloor', 'rho', 'Rho', 'rlm', 'rsaquo', 'rsquo',
			'sbquo', 'scaron', 'Scaron', 'sdot', 'sect', 'shy', 'sigma',
			'Sigma', 'sigmaf', 'sim', 'spades', 'sub', 'sube', 'sum', 'sup',
			'sup1', 'sup2', 'sup3', 'supe', 'szlig',
			'tau', 'Tau', 'there4', 'theta', 'Theta', 'thetasym', 'thinsp',
			'thorn', 'THORN', 'tilde', 'times', 'trade',
			'uacute', 'Uacute', 'uarr', 'uArr', 'ucirc', 'Ucirc', 'ugrave',
			'Ugrave', 'uml', 'upsih', 'upsilon', 'Upsilon', 'uuml', 'Uuml',
			'weierp',
			'xi', 'Xi',
			'yacute', 'Yacute', 'yen', 'yuml', 'Yuml',
			'zeta', 'Zeta', 'zwj', 'zwnj'
		) );
	}

	if( !isset( $htmlEntities[$name] ) ) {
		return "&amp;$name;";
	}
	return "&$name;";
}
622
/**
 * Re-emit a decimal character reference in canonical "&#N;" form.
 *
 * @param string $codepoint Digits taken from the reference
 * @return string|null Null when validateCodepoint() rejects the code point
 */
function decCharReference( $codepoint ) {
	$point = intval( $codepoint );
	if( !Sanitizer::validateCodepoint( $point ) ) {
		return null;
	}
	return sprintf( '&#%d;', $point );
}
631
/**
 * Re-emit a hexadecimal character reference in canonical form:
 * lowercase "&#x" prefix and lowercase hex digits.
 *
 * @param string $codepoint Hex digits taken from the reference
 * @return string|null Null when validateCodepoint() rejects the code point
 */
function hexCharReference( $codepoint ) {
	$point = hexdec( $codepoint );
	if( !Sanitizer::validateCodepoint( $point ) ) {
		return null;
	}
	return sprintf( '&#x%x;', $point );
}
640
/**
 * Tell whether a Unicode code point may appear as a character in XML:
 * tab, line feed, carriage return, or anything in the ranges
 * 0x20-0xD7FF, 0xE000-0xFFFD and 0x10000-0x10FFFF.
 *
 * @param int $codepoint
 * @return bool
 */
function validateCodepoint( $codepoint ) {
	# The three permitted control characters
	if( $codepoint == 0x09 || $codepoint == 0x0a || $codepoint == 0x0d ) {
		return true;
	}
	# Up to, but not including, the surrogate block
	if( $codepoint >= 0x20 && $codepoint <= 0xd7ff ) {
		return true;
	}
	# Rest of the BMP, minus 0xFFFE and 0xFFFF
	if( $codepoint >= 0xe000 && $codepoint <= 0xfffd ) {
		return true;
	}
	# Supplementary planes
	return $codepoint >= 0x10000 && $codepoint <= 0x10ffff;
}
654
/**
 * Fetch the whitelist of acceptable attributes for a given
 * element name.
 *
 * The full table is built on first use and kept for the rest of the
 * request, since this is called once for every tag being sanitized.
 *
 * @param string $element
 * @return array List of attribute names; empty for unknown elements
 */
function attributeWhitelist( $element ) {
	static $list;
	if( !isset( $list ) ) {
		$list = Sanitizer::setupAttributeWhitelist();
	}
	return isset( $list[$element] )
		? $list[$element]
		: array();
}
668
/**
 * Build the table of permitted attributes per element.
 *
 * @return array Map of element name => list of allowed attribute names
 */
function setupAttributeWhitelist() {
	$common = array( 'id', 'class', 'lang', 'dir', 'title', 'style' );
	$block = array_merge( $common, array( 'align' ) );
	$tablealign = array( 'align', 'char', 'charoff', 'valign' );
	$tablecell = array( 'abbr',
	                    'axis',
	                    'headers',
	                    'scope',
	                    'rowspan',
	                    'colspan',
	                    'nowrap', # deprecated
	                    'width',  # deprecated
	                    'height'  # deprecated
	                    );

	# Numbers refer to sections in HTML 4.01 standard describing the element.
	# See: http://www.w3.org/TR/html4/
	$whitelist = array (
		# 7.5.4
		'div'        => $block,
		'center'     => $common, # deprecated
		'span'       => $block, # ??

		# 7.5.5
		'h1'         => $block,
		'h2'         => $block,
		'h3'         => $block,
		'h4'         => $block,
		'h5'         => $block,
		'h6'         => $block,

		# 7.5.6
		# address

		# 8.2.4
		# bdo

		# 9.2.1
		'em'         => $common,
		'strong'     => $common,
		'cite'       => $common,
		# dfn
		'code'       => $common,
		# samp
		# kbd
		'var'        => $common,
		# abbr
		# acronym

		# 9.2.2
		'blockquote' => array_merge( $common, array( 'cite' ) ),
		# q

		# 9.2.3
		'sub'        => $common,
		'sup'        => $common,

		# 9.3.1
		'p'          => $block,

		# 9.3.2
		'br'         => array( 'id', 'class', 'title', 'style', 'clear' ),

		# 9.3.4
		'pre'        => array_merge( $common, array( 'width' ) ),

		# 9.4
		'ins'        => array_merge( $common, array( 'cite', 'datetime' ) ),
		'del'        => array_merge( $common, array( 'cite', 'datetime' ) ),

		# 10.2
		'ul'         => array_merge( $common, array( 'type' ) ),
		'ol'         => array_merge( $common, array( 'type', 'start' ) ),
		'li'         => array_merge( $common, array( 'type', 'value' ) ),

		# 10.3
		'dl'         => $common,
		'dd'         => $common,
		'dt'         => $common,

		# 11.2.1
		# Each attribute is listed once
		'table'      => array_merge( $common,
							array( 'summary', 'width', 'border', 'frame',
									 'rules', 'cellspacing', 'cellpadding',
									 'align', 'bgcolor' ) ),

		# 11.2.2
		'caption'    => array_merge( $common, array( 'align' ) ),

		# 11.2.3
		'thead'      => array_merge( $common, $tablealign ),
		'tfoot'      => array_merge( $common, $tablealign ),
		'tbody'      => array_merge( $common, $tablealign ),

		# 11.2.4
		'colgroup'   => array_merge( $common, array( 'span', 'width' ), $tablealign ),
		'col'        => array_merge( $common, array( 'span', 'width' ), $tablealign ),

		# 11.2.5
		'tr'         => array_merge( $common, array( 'bgcolor' ), $tablealign ),

		# 11.2.6
		'td'         => array_merge( $common, $tablecell, $tablealign ),
		'th'         => array_merge( $common, $tablecell, $tablealign ),

		# 15.2.1
		'tt'         => $common,
		'b'          => $common,
		'i'          => $common,
		'big'        => $common,
		'small'      => $common,
		'strike'     => $common,
		's'          => $common,
		'u'          => $common,

		# 15.2.2
		'font'       => array_merge( $common, array( 'size', 'color', 'face' ) ),
		# basefont

		# 15.3
		'hr'         => array_merge( $common, array( 'noshade', 'size', 'width' ) ),

		# XHTML Ruby annotation text module, simple ruby only.
		# http://www.w3c.org/TR/ruby/
		'ruby'       => $common,
		# rbc
		# rtc
		'rb'         => $common,
		'rt'         => $common, #array_merge( $common, array( 'rbspan' ) ),
		'rp'         => $common,
		);
	return $whitelist;
}
806
/**
 * Reduce a fragment of (potentially invalid) HTML to plain text that
 * can be dropped literally into a double-quoted attribute value:
 * anything that looks like a tag is removed, entities and whitespace
 * are normalized, and leftover markup characters are escaped.
 *
 * @param string $text HTML fragment
 * @return string
 */
function stripAllTags( $text ) {
	# Drop everything from a '<' up to the next '>'
	$withoutTags = preg_replace( '/<[^>]*>/', '', $text );

	# Normalize &entities and whitespace
	$normalized = Sanitizer::normalizeAttributeValue( $withoutTags );

	# Unbalanced brackets and double quotes may still be present
	$escapes = array(
		'<' => '&lt;',
		'>' => '&gt;',
		'"' => '&quot;' );
	return strtr( $normalized, $escapes );
}
831
832 }
833
834 ?>