Bump to 1.5alpha2
[lhc/web/wiklou.git] / includes / Sanitizer.php
1 <?php
2
3 /**
4 * (X)HTML sanitizer for MediaWiki
5 *
6 * Copyright (C) 2002-2005 Brion Vibber <brion@pobox.com> et al
7 * http://www.mediawiki.org/
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write to the Free Software Foundation, Inc.,
21 * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
22 * http://www.gnu.org/copyleft/gpl.html
23 *
24 * @package MediaWiki
25 * @subpackage Parser
26 */
27
/**
 * Regular expression to match various types of character references in
 * Sanitizer::normalizeCharReferences and Sanitizer::decodeCharReferences
 *
 * Capture groups (the callbacks in Sanitizer rely on this numbering):
 *   1 - name of a named entity reference (&name;)
 *   2 - digits of a decimal numeric reference (&#123;)
 *   3 - digits of a hex reference written with a lowercase x (&#x7b;)
 *   4 - digits of a hex reference written with an uppercase X (&#X7B;)
 *   5 - a bare ampersand that does not start any of the above
 *
 * The hex alternatives only accept real hexadecimal digits. hexdec()
 * silently skips characters that are not hex digits, so a looser class
 * would let junk such as "&#x3gc;" be treated as a reference to U+003C.
 */
define( 'MW_CHAR_REFS_REGEX',
	'/&([A-Za-z0-9]+);
	 |&\#([0-9]+);
	 |&\#x([0-9A-Fa-f]+);
	 |&\#X([0-9A-Fa-f]+);
	 |(&)/x' );
38
/**
 * Regular expression to match HTML/XML attribute pairs within a tag.
 * Allows some... latitude.
 * Used in Sanitizer::fixTagAttributes and Sanitizer::decodeTagAttributes
 *
 * Capture groups, as consumed by Sanitizer::getTagAttributeCallback:
 *   1 - attribute name
 *   2 - the whole "= value" part (absent for a bare attribute)
 *   3 - value given in double quotes
 *   4 - value given in single quotes
 *   5 - unquoted value
 *   6 - unquoted #hex colour value
 */
# Character class for a single attribute-name character
$attrib = '[A-Za-z0-9]';
# Character class for a single whitespace character (tab, LF, CR, space)
$space = '[\x09\x0a\x0d\x20]';
define( 'MW_ATTRIBS_REGEX',
"/(?:^|$space)($attrib+)
($space*=$space*
(?:
# The attribute value: quoted or alone
\"([^<\"]*)\"
| '([^<']*)'
| ([a-zA-Z0-9!#$%&()*,\\-.\\/:;<>?@[\\]^_`{|}~]+)
| (\#[0-9a-fA-F]+) # Technically wrong, but lots of
# colors are specified like this.
# We'll be normalizing it.
)
)?(?=$space|\$)/sx" );
59
/**
 * List of all named character entities defined in HTML 4.01
 * http://www.w3.org/TR/html4/sgml/entities.html
 *
 * Maps the entity name (case-sensitive, without the leading '&' and
 * trailing ';') to its Unicode codepoint as a decimal integer.
 * Read by Sanitizer::normalizeEntity and Sanitizer::decodeEntity.
 *
 * @access private
 */
# NOTE(review): the explicit 'global' presumably keeps the table in global
# scope when this file is included from inside a function - confirm.
global $wgHtmlEntities;
$wgHtmlEntities = array(
	'Aacute' => 193,
	'aacute' => 225,
	'Acirc' => 194,
	'acirc' => 226,
	'acute' => 180,
	'AElig' => 198,
	'aelig' => 230,
	'Agrave' => 192,
	'agrave' => 224,
	'alefsym' => 8501,
	'Alpha' => 913,
	'alpha' => 945,
	'amp' => 38,
	'and' => 8743,
	'ang' => 8736,
	'Aring' => 197,
	'aring' => 229,
	'asymp' => 8776,
	'Atilde' => 195,
	'atilde' => 227,
	'Auml' => 196,
	'auml' => 228,
	'bdquo' => 8222,
	'Beta' => 914,
	'beta' => 946,
	'brvbar' => 166,
	'bull' => 8226,
	'cap' => 8745,
	'Ccedil' => 199,
	'ccedil' => 231,
	'cedil' => 184,
	'cent' => 162,
	'Chi' => 935,
	'chi' => 967,
	'circ' => 710,
	'clubs' => 9827,
	'cong' => 8773,
	'copy' => 169,
	'crarr' => 8629,
	'cup' => 8746,
	'curren' => 164,
	'dagger' => 8224,
	'Dagger' => 8225,
	'darr' => 8595,
	'dArr' => 8659,
	'deg' => 176,
	'Delta' => 916,
	'delta' => 948,
	'diams' => 9830,
	'divide' => 247,
	'Eacute' => 201,
	'eacute' => 233,
	'Ecirc' => 202,
	'ecirc' => 234,
	'Egrave' => 200,
	'egrave' => 232,
	'empty' => 8709,
	'emsp' => 8195,
	'ensp' => 8194,
	'Epsilon' => 917,
	'epsilon' => 949,
	'equiv' => 8801,
	'Eta' => 919,
	'eta' => 951,
	'ETH' => 208,
	'eth' => 240,
	'Euml' => 203,
	'euml' => 235,
	'euro' => 8364,
	'exist' => 8707,
	'fnof' => 402,
	'forall' => 8704,
	'frac12' => 189,
	'frac14' => 188,
	'frac34' => 190,
	'frasl' => 8260,
	'Gamma' => 915,
	'gamma' => 947,
	'ge' => 8805,
	'gt' => 62,
	'harr' => 8596,
	'hArr' => 8660,
	'hearts' => 9829,
	'hellip' => 8230,
	'Iacute' => 205,
	'iacute' => 237,
	'Icirc' => 206,
	'icirc' => 238,
	'iexcl' => 161,
	'Igrave' => 204,
	'igrave' => 236,
	'image' => 8465,
	'infin' => 8734,
	'int' => 8747,
	'Iota' => 921,
	'iota' => 953,
	'iquest' => 191,
	'isin' => 8712,
	'Iuml' => 207,
	'iuml' => 239,
	'Kappa' => 922,
	'kappa' => 954,
	'Lambda' => 923,
	'lambda' => 955,
	'lang' => 9001,
	'laquo' => 171,
	'larr' => 8592,
	'lArr' => 8656,
	'lceil' => 8968,
	'ldquo' => 8220,
	'le' => 8804,
	'lfloor' => 8970,
	'lowast' => 8727,
	'loz' => 9674,
	'lrm' => 8206,
	'lsaquo' => 8249,
	'lsquo' => 8216,
	'lt' => 60,
	'macr' => 175,
	'mdash' => 8212,
	'micro' => 181,
	'middot' => 183,
	'minus' => 8722,
	'Mu' => 924,
	'mu' => 956,
	'nabla' => 8711,
	'nbsp' => 160,
	'ndash' => 8211,
	'ne' => 8800,
	'ni' => 8715,
	'not' => 172,
	'notin' => 8713,
	'nsub' => 8836,
	'Ntilde' => 209,
	'ntilde' => 241,
	'Nu' => 925,
	'nu' => 957,
	'Oacute' => 211,
	'oacute' => 243,
	'Ocirc' => 212,
	'ocirc' => 244,
	'OElig' => 338,
	'oelig' => 339,
	'Ograve' => 210,
	'ograve' => 242,
	'oline' => 8254,
	'Omega' => 937,
	'omega' => 969,
	'Omicron' => 927,
	'omicron' => 959,
	'oplus' => 8853,
	'or' => 8744,
	'ordf' => 170,
	'ordm' => 186,
	'Oslash' => 216,
	'oslash' => 248,
	'Otilde' => 213,
	'otilde' => 245,
	'otimes' => 8855,
	'Ouml' => 214,
	'ouml' => 246,
	'para' => 182,
	'part' => 8706,
	'permil' => 8240,
	'perp' => 8869,
	'Phi' => 934,
	'phi' => 966,
	'Pi' => 928,
	'pi' => 960,
	'piv' => 982,
	'plusmn' => 177,
	'pound' => 163,
	'prime' => 8242,
	'Prime' => 8243,
	'prod' => 8719,
	'prop' => 8733,
	'Psi' => 936,
	'psi' => 968,
	'quot' => 34,
	'radic' => 8730,
	'rang' => 9002,
	'raquo' => 187,
	'rarr' => 8594,
	'rArr' => 8658,
	'rceil' => 8969,
	'rdquo' => 8221,
	'real' => 8476,
	'reg' => 174,
	'rfloor' => 8971,
	'Rho' => 929,
	'rho' => 961,
	'rlm' => 8207,
	'rsaquo' => 8250,
	'rsquo' => 8217,
	'sbquo' => 8218,
	'Scaron' => 352,
	'scaron' => 353,
	'sdot' => 8901,
	'sect' => 167,
	'shy' => 173,
	'Sigma' => 931,
	'sigma' => 963,
	'sigmaf' => 962,
	'sim' => 8764,
	'spades' => 9824,
	'sub' => 8834,
	'sube' => 8838,
	'sum' => 8721,
	'sup' => 8835,
	'sup1' => 185,
	'sup2' => 178,
	'sup3' => 179,
	'supe' => 8839,
	'szlig' => 223,
	'Tau' => 932,
	'tau' => 964,
	'there4' => 8756,
	'Theta' => 920,
	'theta' => 952,
	'thetasym' => 977,
	'thinsp' => 8201,
	'THORN' => 222,
	'thorn' => 254,
	'tilde' => 732,
	'times' => 215,
	'trade' => 8482,
	'Uacute' => 218,
	'uacute' => 250,
	'uarr' => 8593,
	'uArr' => 8657,
	'Ucirc' => 219,
	'ucirc' => 251,
	'Ugrave' => 217,
	'ugrave' => 249,
	'uml' => 168,
	'upsih' => 978,
	'Upsilon' => 933,
	'upsilon' => 965,
	'Uuml' => 220,
	'uuml' => 252,
	'weierp' => 8472,
	'Xi' => 926,
	'xi' => 958,
	'Yacute' => 221,
	'yacute' => 253,
	'yen' => 165,
	'Yuml' => 376,
	'yuml' => 255,
	'Zeta' => 918,
	'zeta' => 950,
	'zwj' => 8205,
	'zwnj' => 8204 );
319
320 class Sanitizer {
/**
 * Cleans up HTML, removes dangerous tags and attributes, and
 * removes HTML comments.
 *
 * The text is split on '<'. Each chunk that parses as an allowed tag is
 * re-emitted with its attributes filtered by Sanitizer::fixTagAttributes;
 * every other chunk is emitted as literal text with '<' and '>' escaped.
 * When $wgUserHtml is false no tag is allowed, so all markup is escaped.
 *
 * When $wgUseTidy is false, tag nesting is also tracked: mismatched
 * closing tags, table cells outside a table and illegal self-nesting are
 * escaped, and tags still open at the end of the text are closed.
 * When $wgUseTidy is true only the tag and attribute filtering is done.
 *
 * @access private
 * @param string $text
 * @return string
 */
function removeHTMLtags( $text ) {
	global $wgUseTidy, $wgUserHtml;
	$fname = 'Sanitizer::removeHTMLtags';
	wfProfileIn( $fname );

	if( $wgUserHtml ) {
		$htmlpairs = array( # Tags that must be closed
			'b', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
			'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
			'strike', 'strong', 'tt', 'var', 'div', 'center',
			'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
			'ruby', 'rt' , 'rb' , 'rp', 'p', 'span'
		);
		$htmlsingle = array(
			'br', 'hr', 'li', 'dt', 'dd'
		);
		$htmlnest = array( # Tags that can be nested--??
			'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
			'dl', 'font', 'big', 'small', 'sub', 'sup', 'span'
		);
		$tabletags = array( # Can only appear inside table
			'td', 'th', 'tr'
		);
	} else {
		$htmlpairs = array();
		$htmlsingle = array();
		$htmlnest = array();
		$tabletags = array();
	}

	# Table tags do not need an explicit close either
	$htmlsingle = array_merge( $tabletags, $htmlsingle );
	$htmlelements = array_merge( $htmlsingle, $htmlpairs );

	# Remove HTML comments
	$text = Sanitizer::removeHTMLcomments( $text );

	# Groups: 1 = optional closing slash, 2 = tag name, 3 = attributes,
	# 4 = closing bracket, 5 = text following the tag.
	$tagRegex = '/^(\\/?)(\\w+)([^>]*)(\\/{0,1}>)([^<]*)$/';

	$bits = explode( '<', $text );
	$text = array_shift( $bits );
	if( !$wgUseTidy ) {
		$tagstack = array();
		$tablestack = array();
		foreach ( $bits as $x ) {
			# A chunk that does not look like a tag at all is handled by
			# the escaping fall-through at the bottom of the loop.
			if ( preg_match( $tagRegex, $x, $regs ) ) {
				list( , $slash, $t, $params, $brace, $rest ) = $regs;
				$t = strtolower( $t );

				$badtag = 0;
				if ( in_array( $t, $htmlelements ) ) {
					# Check our stack
					if ( $slash ) {
						# Closing a tag...
						$ot = null;
						if ( !in_array( $t, $htmlsingle ) &&
							( $ot = array_pop( $tagstack ) ) != $t ) {
							# Not the innermost open tag: put back what
							# was popped (if anything) and reject.
							if ( !is_null( $ot ) ) {
								array_push( $tagstack, $ot );
							}
							$badtag = 1;
						} else {
							if ( $t == 'table' ) {
								# Resume the stack saved when the table opened
								$tagstack = array_pop( $tablestack );
							}
							$newparams = '';
						}
					} else {
						# Keep track for later
						if ( in_array( $t, $tabletags ) &&
							!in_array( 'table', $tagstack ) ) {
							$badtag = 1;
						} else if ( in_array( $t, $tagstack ) &&
							!in_array( $t, $htmlnest ) ) {
							$badtag = 1;
						} else if ( !in_array( $t, $htmlsingle ) ) {
							if ( $t == 'table' ) {
								# Each table gets a fresh stack of its own
								array_push( $tablestack, $tagstack );
								$tagstack = array();
							}
							array_push( $tagstack, $t );
						}
						# Strip non-approved attributes from the tag
						$newparams = Sanitizer::fixTagAttributes( $params, $t );
					}
					if ( !$badtag ) {
						$rest = str_replace( '>', '&gt;', $rest );
						$text .= "<$slash$t$newparams$brace$rest";
						continue;
					}
				}
			}
			$text .= '&lt;' . str_replace( '>', '&gt;', $x );
		}
		# Close off any remaining tags
		while ( is_array( $tagstack ) && ( $t = array_pop( $tagstack ) ) ) {
			$text .= "</$t>\n";
			if ( $t == 'table' ) {
				$tagstack = array_pop( $tablestack );
			}
		}
	} else {
		# this might be possible using tidy itself
		foreach ( $bits as $x ) {
			if ( preg_match( $tagRegex, $x, $regs ) ) {
				list( , $slash, $t, $params, $brace, $rest ) = $regs;
				$t = strtolower( $t );
				if ( in_array( $t, $htmlelements ) ) {
					$newparams = Sanitizer::fixTagAttributes( $params, $t );
					$rest = str_replace( '>', '&gt;', $rest );
					$text .= "<$slash$t$newparams$brace$rest";
					continue;
				}
			}
			$text .= '&lt;' . str_replace( '>', '&gt;', $x );
		}
	}
	wfProfileOut( $fname );
	return $text;
}
439
/**
 * Remove '<!--', '-->', and everything between.
 * To avoid leaving blank lines, when a comment is both preceded
 * and followed by a newline (ignoring spaces), trim leading and
 * trailing spaces and one of the newlines.
 *
 * An unterminated comment (no closing '-->') stops processing; it and
 * everything after it are left in the text untouched.
 *
 * @access private
 * @param string $text
 * @return string
 */
function removeHTMLcomments( $text ) {
	$fname = 'Sanitizer::removeHTMLcomments';
	wfProfileIn( $fname );
	while ( ( $start = strpos( $text, '<!--' ) ) !== false ) {
		$end = strpos( $text, '-->', $start + 4 );
		if ( $end === false ) {
			# Unterminated comment; bail out
			break;
		}

		# Move past the closing '-->'
		$end += 3;

		# Trim space and newline if the comment is both
		# preceded and followed by a newline.
		# $spaceStart starts on the character just before the comment and
		# walks left over spaces; $spaceLen then grows right over spaces.
		$spaceStart = max( $start - 1, 0 );
		$spaceLen = $end - $spaceStart;
		while ( substr( $text, $spaceStart, 1 ) === ' ' && $spaceStart > 0 ) {
			$spaceStart--;
			$spaceLen++;
		}
		while ( substr( $text, $spaceStart + $spaceLen, 1 ) === ' ' ) {
			$spaceLen++;
		}
		if ( substr( $text, $spaceStart, 1 ) === "\n"
			and substr( $text, $spaceStart + $spaceLen, 1 ) === "\n" ) {
			# Remove the comment, leading and trailing
			# spaces, and leave only one newline.
			$text = substr_replace( $text, "\n", $spaceStart, $spaceLen + 1 );
		} else {
			# Remove just the comment.
			$text = substr_replace( $text, '', $start, $end - $start );
		}
	}
	wfProfileOut( $fname );
	return $text;
}
485
/**
 * Turn the raw attribute portion of a tag into a clean, well-formed
 * attribute string, keeping only attributes allowed for the element.
 *
 * - Attribute names are lowercased
 * - Attributes missing from the element's whitelist are dropped
 * - Broken or invalid entities become plain text
 * - Every value is emitted in double quotes
 * - A value-less attribute gets its own name as value
 * - Only the first occurrence of a repeated attribute is kept
 * - 'style' values matching the unsafe-content pattern are dropped
 * - Characters and keywords that later parser passes could expand
 *   (braces, brackets, doubled apostrophes, ISBN/RFC/PMID, URL
 *   protocols) are defused with numeric character references
 * - The result starts with a space when it is not empty
 *
 * @param string $text
 * @param string $element
 * @return string
 *
 * @todo Check for legal values where the DTD limits things.
 * @todo Check for unique id attribute :P
 */
function fixTagAttributes( $text, $element ) {
	if( trim( $text ) == '' ) {
		return '';
	}

	$found = preg_match_all( MW_ATTRIBS_REGEX, $text, $pairs, PREG_SET_ORDER );
	if( !$found ) {
		return '';
	}

	# Flip so that membership is a plain isset() lookup
	$allowed = array_flip( Sanitizer::attributeWhitelist( $element ) );

	# Wiki syntax triggers and their harmless replacements
	$defuse = array(
		'{' => '&#123;',
		'[' => '&#91;',
		"''" => '&#39;&#39;',
		'ISBN' => '&#73;SBN',
		'RFC' => '&#82;FC',
		'PMID' => '&#80;MID',
	);

	$kept = array();
	foreach( $pairs as $set ) {
		$name = strtolower( $set[1] );
		if( !isset( $allowed[$name] ) ) {
			continue;
		}

		$value = Sanitizer::normalizeAttributeValue(
			Sanitizer::getTagAttributeCallback( $set ) );

		# Strip javascript "expression" from stylesheets.
		# http://msdn.microsoft.com/workshop/author/dhtml/overview/recalc.asp
		if( $name == 'style' ) {
			$decoded = Sanitizer::decodeCharReferences( $value );
			if( preg_match( '/(expression|tps*:\/\/|url\\s*\().*/is', $decoded ) ) {
				# haxx0r
				continue;
			}
		}

		# Templates and links may be expanded in later parsing,
		# creating invalid or dangerous output. Suppress this.
		$value = strtr( $value, $defuse );
		$value = preg_replace( '/(' . URL_PROTOCOLS . '):/', '\\1&#58;', $value );

		# First occurrence wins
		if( !isset( $kept[$name] ) ) {
			$kept[$name] = "$name=\"$value\"";
		}
	}

	if( empty( $kept ) ) {
		return '';
	}
	return ' ' . implode( ' ', $kept );
}
566
/**
 * Parse a partial tag string into an associative array of attribute
 * names and values. Attribute names are forced to lowercase and
 * character references in the values are decoded to UTF-8 text.
 * When an attribute is repeated, the last occurrence wins.
 *
 * @param string
 * @return array
 */
function decodeTagAttributes( $text ) {
	$result = array();

	if( trim( $text ) == '' ) {
		return $result;
	}

	$found = preg_match_all( MW_ATTRIBS_REGEX, $text, $pairs, PREG_SET_ORDER );
	if( !$found ) {
		return $result;
	}

	foreach( $pairs as $set ) {
		$raw = Sanitizer::getTagAttributeCallback( $set );
		$result[strtolower( $set[1] )] = Sanitizer::decodeCharReferences( $raw );
	}
	return $result;
}
597
/**
 * Extract the attribute value from one match set produced by
 * MW_ATTRIBS_REGEX.
 *
 * The value alternatives are checked from the highest-numbered capture
 * group down: 6 (unquoted #colour), 5 (unquoted), 4 (single-quoted),
 * 3 (double-quoted). If there is no "= value" part at all, the
 * attribute name itself is returned.
 *
 * @param array $set
 * @return string
 * @access private
 */
function getTagAttributeCallback( $set ) {
	foreach( array( 6, 5, 4, 3 ) as $group ) {
		if( isset( $set[$group] ) ) {
			return $set[$group];
		}
	}
	if( !isset( $set[2] ) ) {
		# In XHTML, attributes must have a value.
		# For 'reduced' form, return explicitly the attribute name here.
		return $set[1];
	}
	wfDebugDieBacktrace( "Tag conditions not met. This should never happen and is a bug." );
}
627
/**
 * Prepare XML source text for use as an attribute value: character
 * references are normalized, each whitespace character (a CR LF pair
 * counts as one) becomes a single space, and double quotes become
 * &quot;.
 *
 * See http://www.w3.org/TR/REC-xml/#AVNormalize for background,
 * but note that the result is not the attribute's value; it is an
 * XML source fragment meant to be placed directly into output.
 *
 * @param string $text
 * @return string
 * @access private
 */
function normalizeAttributeValue( $text ) {
	$refsFixed = Sanitizer::normalizeCharReferences( $text );
	$spaced = preg_replace( '/\r\n|[\x20\x0d\x0a\x09]/', ' ', $refsFixed );
	return str_replace( '"', '&quot;', $spaced );
}
647
/**
 * Make every entity and character reference in the text legal for
 * XML, and XHTML specifically. Anything that is not a legal reference
 * is &amp;-escaped so the result is a valid text fragment.
 *
 * a. any named char refs must be known in XHTML
 * b. any numeric char refs must be legal chars, not invalid or forbidden
 * c. use &#x, not &#X
 * d. fix or reject non-valid attributes
 *
 * @param string $text
 * @return string
 * @access private
 */
function normalizeCharReferences( $text ) {
	$callback = array( 'Sanitizer', 'normalizeCharReferencesCallback' );
	return preg_replace_callback( MW_CHAR_REFS_REGEX, $callback, $text );
}
/**
 * preg_replace_callback handler for normalizeCharReferences.
 * Routes one MW_CHAR_REFS_REGEX match to the normalizer for its kind of
 * reference; when the match is a bare ampersand, or the normalizer
 * rejects the reference, the matched text is HTML-escaped instead.
 *
 * @param array $matches match groups from MW_CHAR_REFS_REGEX
 * @return string
 */
function normalizeCharReferencesCallback( $matches ) {
	if( $matches[1] != '' ) {
		# Named entity
		$fixed = Sanitizer::normalizeEntity( $matches[1] );
	} elseif( $matches[2] != '' ) {
		# Decimal reference
		$fixed = Sanitizer::decCharReference( $matches[2] );
	} elseif( $matches[3] != '' ) {
		# Hex reference, lowercase x
		$fixed = Sanitizer::hexCharReference( $matches[3] );
	} elseif( $matches[4] != '' ) {
		# Hex reference, uppercase X
		$fixed = Sanitizer::hexCharReference( $matches[4] );
	} else {
		$fixed = null;
	}

	return is_null( $fixed )
		? htmlspecialchars( $matches[0] )
		: $fixed;
}
689
/**
 * Normalize a named entity reference. A name listed in
 * $wgHtmlEntities is returned as the reference "&name;"; any other
 * name is returned with its ampersand escaped (eg &amp;foo;) so that
 * it shows up as literal text.
 *
 * @param string $name
 * @return string
 */
function normalizeEntity( $name ) {
	global $wgHtmlEntities;
	$ampersand = isset( $wgHtmlEntities[$name] ) ? '&' : '&amp;';
	return "$ampersand$name;";
}
706
/**
 * Normalize a decimal numeric character reference.
 *
 * @param string $codepoint digits of the reference
 * @return string|null "&#N;" in canonical decimal form, or null when
 *   Sanitizer::validateCodepoint rejects the codepoint
 */
function decCharReference( $codepoint ) {
	$point = intval( $codepoint );
	return Sanitizer::validateCodepoint( $point )
		? sprintf( '&#%d;', $point )
		: null;
}
715
/**
 * Normalize a hexadecimal numeric character reference.
 *
 * @param string $codepoint hex digits of the reference
 * @return string|null "&#xN;" with a lowercase x and lowercase hex
 *   digits, or null when Sanitizer::validateCodepoint rejects the
 *   codepoint
 */
function hexCharReference( $codepoint ) {
	$point = hexdec( $codepoint );
	return Sanitizer::validateCodepoint( $point )
		? sprintf( '&#x%x;', $point )
		: null;
}
724
/**
 * Tell whether a Unicode codepoint is a valid character in XML:
 * tab, line feed, carriage return, or anything in the ranges
 * U+0020-U+D7FF, U+E000-U+FFFD and U+10000-U+10FFFF.
 *
 * @param int $codepoint
 * @return bool
 */
function validateCodepoint( $codepoint ) {
	# The three permitted control characters
	if( $codepoint == 0x09 || $codepoint == 0x0a || $codepoint == 0x0d ) {
		return true;
	}
	# Basic Multilingual Plane below the surrogate block
	if( $codepoint >= 0x20 && $codepoint <= 0xd7ff ) {
		return true;
	}
	# Rest of the BMP, leaving out U+FFFE and U+FFFF
	if( $codepoint >= 0xe000 && $codepoint <= 0xfffd ) {
		return true;
	}
	# Supplementary planes
	if( $codepoint >= 0x10000 && $codepoint <= 0x10ffff ) {
		return true;
	}
	return false;
}
738
/**
 * Replace every character reference in the text, numeric or named,
 * with the character it stands for, and return the result as a
 * UTF-8 string.
 *
 * @param string $text
 * @return string
 * @access public
 */
function decodeCharReferences( $text ) {
	$callback = array( 'Sanitizer', 'decodeCharReferencesCallback' );
	return preg_replace_callback( MW_CHAR_REFS_REGEX, $callback, $text );
}
753
/**
 * preg_replace_callback handler for decodeCharReferences.
 * Decodes one MW_CHAR_REFS_REGEX match according to its kind of
 * reference; a bare ampersand is returned unchanged.
 *
 * @param array $matches match groups from MW_CHAR_REFS_REGEX
 * @return string
 */
function decodeCharReferencesCallback( $matches ) {
	if( $matches[1] != '' ) {
		# Named entity
		return Sanitizer::decodeEntity( $matches[1] );
	}
	if( $matches[2] != '' ) {
		# Decimal reference
		return Sanitizer::decodeChar( intval( $matches[2] ) );
	}
	if( $matches[3] != '' ) {
		# Hex reference, lowercase x
		return Sanitizer::decodeChar( hexdec( $matches[3] ) );
	}
	if( $matches[4] != '' ) {
		# Hex reference, uppercase X
		return Sanitizer::decodeChar( hexdec( $matches[4] ) );
	}
	# Last case should be an ampersand by itself
	return $matches[0];
}
771
/**
 * Convert a codepoint to its UTF-8 string when
 * Sanitizer::validateCodepoint accepts it; otherwise return
 * UTF8_REPLACEMENT (U+FFFD REPLACEMENT CHARACTER).
 *
 * @param int $codepoint
 * @return string
 * @access private
 */
function decodeChar( $codepoint ) {
	if( !Sanitizer::validateCodepoint( $codepoint ) ) {
		return UTF8_REPLACEMENT;
	}
	return codepointToUtf8( $codepoint );
}
786
/**
 * Decode a named entity. A name listed in $wgHtmlEntities is returned
 * as the UTF-8 encoding of its character; any other name is returned
 * as the pseudo-entity source it came from (eg &foo;).
 *
 * @param string $name
 * @return string
 */
function decodeEntity( $name ) {
	global $wgHtmlEntities;
	if( !isset( $wgHtmlEntities[$name] ) ) {
		return "&$name;";
	}
	return codepointToUtf8( $wgHtmlEntities[$name] );
}
803
/**
 * Look up the list of attributes allowed on the given element.
 * The whitelist table is built once, on the first call, and reused
 * afterwards. Elements missing from the table get an empty list.
 *
 * @param string $element
 * @return array
 */
function attributeWhitelist( $element ) {
	static $list;
	if( !isset( $list ) ) {
		$list = Sanitizer::setupAttributeWhitelist();
	}
	if( isset( $list[$element] ) ) {
		return $list[$element];
	}
	return array();
}
820
/**
 * Build the table of attributes allowed on each HTML element.
 *
 * @return array map of lowercase element name to a list of the
 *   lowercase attribute names permitted on that element
 */
function setupAttributeWhitelist() {
	# Attribute groups shared by many elements
	$common = array( 'id', 'class', 'lang', 'dir', 'title', 'style' );
	$block = array_merge( $common, array( 'align' ) );
	$tablealign = array( 'align', 'char', 'charoff', 'valign' );
	$tablecell = array( 'abbr',
		'axis',
		'headers',
		'scope',
		'rowspan',
		'colspan',
		'nowrap', # deprecated
		'width', # deprecated
		'height', # deprecated
		'bgcolor' # deprecated
	);

	# Numbers refer to sections in HTML 4.01 standard describing the element.
	# See: http://www.w3.org/TR/html4/
	$whitelist = array (
		# 7.5.4
		'div' => $block,
		'center' => $common, # deprecated
		'span' => $block, # ??

		# 7.5.5
		'h1' => $block,
		'h2' => $block,
		'h3' => $block,
		'h4' => $block,
		'h5' => $block,
		'h6' => $block,

		# 7.5.6
		# address

		# 8.2.4
		# bdo

		# 9.2.1
		'em' => $common,
		'strong' => $common,
		'cite' => $common,
		# dfn
		'code' => $common,
		# samp
		# kbd
		'var' => $common,
		# abbr
		# acronym

		# 9.2.2
		'blockquote' => array_merge( $common, array( 'cite' ) ),
		# q

		# 9.2.3
		'sub' => $common,
		'sup' => $common,

		# 9.3.1
		'p' => $block,

		# 9.3.2
		'br' => array( 'id', 'class', 'title', 'style', 'clear' ),

		# 9.3.4
		'pre' => array_merge( $common, array( 'width' ) ),

		# 9.4
		'ins' => array_merge( $common, array( 'cite', 'datetime' ) ),
		'del' => array_merge( $common, array( 'cite', 'datetime' ) ),

		# 10.2
		'ul' => array_merge( $common, array( 'type' ) ),
		'ol' => array_merge( $common, array( 'type', 'start' ) ),
		'li' => array_merge( $common, array( 'type', 'value' ) ),

		# 10.3
		'dl' => $common,
		'dd' => $common,
		'dt' => $common,

		# 11.2.1
		# Each attribute is listed once; repeated entries add nothing
		# because the list is only used for membership checks.
		'table' => array_merge( $common,
			array( 'summary', 'width', 'border', 'frame',
				'rules', 'cellspacing', 'cellpadding',
				'align', 'bgcolor' ) ),

		# 11.2.2
		'caption' => array_merge( $common, array( 'align' ) ),

		# 11.2.3
		'thead' => array_merge( $common, $tablealign ),
		'tfoot' => array_merge( $common, $tablealign ),
		'tbody' => array_merge( $common, $tablealign ),

		# 11.2.4
		'colgroup' => array_merge( $common, array( 'span', 'width' ), $tablealign ),
		'col' => array_merge( $common, array( 'span', 'width' ), $tablealign ),

		# 11.2.5
		'tr' => array_merge( $common, array( 'bgcolor' ), $tablealign ),

		# 11.2.6
		'td' => array_merge( $common, $tablecell, $tablealign ),
		'th' => array_merge( $common, $tablecell, $tablealign ),

		# 15.2.1
		'tt' => $common,
		'b' => $common,
		'i' => $common,
		'big' => $common,
		'small' => $common,
		'strike' => $common,
		's' => $common,
		'u' => $common,

		# 15.2.2
		'font' => array_merge( $common, array( 'size', 'color', 'face' ) ),
		# basefont

		# 15.3
		'hr' => array_merge( $common, array( 'noshade', 'size', 'width' ) ),

		# XHTML Ruby annotation text module, simple ruby only.
		# http://www.w3c.org/TR/ruby/
		'ruby' => $common,
		# rbc
		# rtc
		'rb' => $common,
		'rt' => $common, #array_merge( $common, array( 'rbspan' ) ),
		'rp' => $common,
	);
	return $whitelist;
}
959
/**
 * Reduce a fragment of (potentially invalid) HTML to tag-free text,
 * encoded so that it can be placed literally inside a double-quoted
 * attribute value.
 *
 * @param string $text HTML fragment
 * @return string
 */
function stripAllTags( $text ) {
	# Actual <tags>
	$tagless = preg_replace( '/<[^>]*>/', '', $text );

	# Normalize &entities and whitespace
	$normalized = Sanitizer::normalizeAttributeValue( $tagless );

	# Will be placed into "double-quoted" attributes,
	# make sure remaining bits are safe.
	$unsafe = array('<', '>', '"');
	$escaped = array('&lt;', '&gt;', '&quot;');
	return str_replace( $unsafe, $escaped, $normalized );
}
984
985 }
986
987 ?>