Prevent blocked users from changing page protection levels
[lhc/web/wiklou.git] / includes / Sanitizer.php
1 <?php
2 /**
3 * (X)HTML sanitizer for MediaWiki
4 *
5 * Copyright (C) 2002-2005 Brion Vibber <brion@pobox.com> et al
6 * http://www.mediawiki.org/
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License along
19 * with this program; if not, write to the Free Software Foundation, Inc.,
20 * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
21 * http://www.gnu.org/copyleft/gpl.html
22 *
23 * @package MediaWiki
24 * @subpackage Parser
25 */
26
/**
 * Regular expression to match various types of character references in
 * Sanitizer::normalizeCharReferences and Sanitizer::decodeCharReferences.
 *
 * Capture groups (the callbacks in this file index $matches by these numbers):
 *   1 - name of a named entity, e.g. "amp" in "&amp;"
 *   2 - digits of a decimal reference, e.g. "38" in "&#38;"
 *   3 - digits of a lowercase-x hex reference, e.g. "26" in "&#x26;"
 *   4 - digits of an uppercase-X hex reference, e.g. "26" in "&#X26;"
 *   5 - a stray ampersand that does not start any of the above
 *
 * Hex references accept hexadecimal digits only. hexdec() skips any other
 * character, so a malformed reference such as "&#x3cg;" must not be captured
 * as a hex reference: it would be rewritten into the valid "&#x3c;". With the
 * strict class it falls through to the stray-ampersand alternative instead.
 *
 * The pattern uses the /x modifier, hence the escaped "#" characters.
 */
define( 'MW_CHAR_REFS_REGEX',
	'/&([A-Za-z0-9]+);
	 |&\#([0-9]+);
	 |&\#x([0-9A-Fa-f]+);
	 |&\#X([0-9A-Fa-f]+);
	 |(&)/x' );
37
/**
 * Regular expression to match HTML/XML attribute pairs within a tag.
 * Deliberately lenient about what it accepts as a value.
 * Used in Sanitizer::fixTagAttributes and Sanitizer::decodeTagAttributes.
 *
 * Capture groups:
 *   1 - attribute name
 *   2 - the whole "= value" part (absent for a bare attribute)
 *   3 - double-quoted value
 *   4 - single-quoted value
 *   5 - unquoted value
 *   6 - unquoted #hex colour
 */
$attrib = '[A-Za-z0-9]';
$space = '[\x09\x0a\x0d\x20]';
define( 'MW_ATTRIBS_REGEX',
	'/(?:^|' . $space . ')(' . $attrib . '+)
	  (' . $space . '*=' . $space . '*
		(?:
		 # The attribute value: quoted or alone
		  "([^<"]*)"
		 | \'([^<\']*)\'
		 |  ([a-zA-Z0-9!#$%&()*,\\-.\\/:;<>?@[\\]^_`{|}~]+)
		 |  (\#[0-9a-fA-F]+) # Technically wrong, but lots of
		                     # colors are specified like this.
		                     # We\'ll be normalizing it.
		)
	   )?(?=' . $space . '|$)/sx' );
58
/**
 * Named character entities defined in HTML 4.01, mapped from the
 * (case-sensitive) entity name to its numeric code point.
 * http://www.w3.org/TR/html4/sgml/entities.html
 * @access private
 */
global $wgHtmlEntities;
$wgHtmlEntities = array(
	'Aacute' => 193, 'aacute' => 225, 'Acirc' => 194, 'acirc' => 226,
	'acute' => 180, 'AElig' => 198, 'aelig' => 230, 'Agrave' => 192,
	'agrave' => 224, 'alefsym' => 8501, 'Alpha' => 913, 'alpha' => 945,
	'amp' => 38, 'and' => 8743, 'ang' => 8736, 'Aring' => 197,
	'aring' => 229, 'asymp' => 8776, 'Atilde' => 195, 'atilde' => 227,
	'Auml' => 196, 'auml' => 228, 'bdquo' => 8222, 'Beta' => 914,
	'beta' => 946, 'brvbar' => 166, 'bull' => 8226, 'cap' => 8745,
	'Ccedil' => 199, 'ccedil' => 231, 'cedil' => 184, 'cent' => 162,
	'Chi' => 935, 'chi' => 967, 'circ' => 710, 'clubs' => 9827,
	'cong' => 8773, 'copy' => 169, 'crarr' => 8629, 'cup' => 8746,
	'curren' => 164, 'dagger' => 8224, 'Dagger' => 8225, 'darr' => 8595,
	'dArr' => 8659, 'deg' => 176, 'Delta' => 916, 'delta' => 948,
	'diams' => 9830, 'divide' => 247, 'Eacute' => 201, 'eacute' => 233,
	'Ecirc' => 202, 'ecirc' => 234, 'Egrave' => 200, 'egrave' => 232,
	'empty' => 8709, 'emsp' => 8195, 'ensp' => 8194, 'Epsilon' => 917,
	'epsilon' => 949, 'equiv' => 8801, 'Eta' => 919, 'eta' => 951,
	'ETH' => 208, 'eth' => 240, 'Euml' => 203, 'euml' => 235,
	'euro' => 8364, 'exist' => 8707, 'fnof' => 402, 'forall' => 8704,
	'frac12' => 189, 'frac14' => 188, 'frac34' => 190, 'frasl' => 8260,
	'Gamma' => 915, 'gamma' => 947, 'ge' => 8805, 'gt' => 62,
	'harr' => 8596, 'hArr' => 8660, 'hearts' => 9829, 'hellip' => 8230,
	'Iacute' => 205, 'iacute' => 237, 'Icirc' => 206, 'icirc' => 238,
	'iexcl' => 161, 'Igrave' => 204, 'igrave' => 236, 'image' => 8465,
	'infin' => 8734, 'int' => 8747, 'Iota' => 921, 'iota' => 953,
	'iquest' => 191, 'isin' => 8712, 'Iuml' => 207, 'iuml' => 239,
	'Kappa' => 922, 'kappa' => 954, 'Lambda' => 923, 'lambda' => 955,
	'lang' => 9001, 'laquo' => 171, 'larr' => 8592, 'lArr' => 8656,
	'lceil' => 8968, 'ldquo' => 8220, 'le' => 8804, 'lfloor' => 8970,
	'lowast' => 8727, 'loz' => 9674, 'lrm' => 8206, 'lsaquo' => 8249,
	'lsquo' => 8216, 'lt' => 60, 'macr' => 175, 'mdash' => 8212,
	'micro' => 181, 'middot' => 183, 'minus' => 8722, 'Mu' => 924,
	'mu' => 956, 'nabla' => 8711, 'nbsp' => 160, 'ndash' => 8211,
	'ne' => 8800, 'ni' => 8715, 'not' => 172, 'notin' => 8713,
	'nsub' => 8836, 'Ntilde' => 209, 'ntilde' => 241, 'Nu' => 925,
	'nu' => 957, 'Oacute' => 211, 'oacute' => 243, 'Ocirc' => 212,
	'ocirc' => 244, 'OElig' => 338, 'oelig' => 339, 'Ograve' => 210,
	'ograve' => 242, 'oline' => 8254, 'Omega' => 937, 'omega' => 969,
	'Omicron' => 927, 'omicron' => 959, 'oplus' => 8853, 'or' => 8744,
	'ordf' => 170, 'ordm' => 186, 'Oslash' => 216, 'oslash' => 248,
	'Otilde' => 213, 'otilde' => 245, 'otimes' => 8855, 'Ouml' => 214,
	'ouml' => 246, 'para' => 182, 'part' => 8706, 'permil' => 8240,
	'perp' => 8869, 'Phi' => 934, 'phi' => 966, 'Pi' => 928,
	'pi' => 960, 'piv' => 982, 'plusmn' => 177, 'pound' => 163,
	'prime' => 8242, 'Prime' => 8243, 'prod' => 8719, 'prop' => 8733,
	'Psi' => 936, 'psi' => 968, 'quot' => 34, 'radic' => 8730,
	'rang' => 9002, 'raquo' => 187, 'rarr' => 8594, 'rArr' => 8658,
	'rceil' => 8969, 'rdquo' => 8221, 'real' => 8476, 'reg' => 174,
	'rfloor' => 8971, 'Rho' => 929, 'rho' => 961, 'rlm' => 8207,
	'rsaquo' => 8250, 'rsquo' => 8217, 'sbquo' => 8218, 'Scaron' => 352,
	'scaron' => 353, 'sdot' => 8901, 'sect' => 167, 'shy' => 173,
	'Sigma' => 931, 'sigma' => 963, 'sigmaf' => 962, 'sim' => 8764,
	'spades' => 9824, 'sub' => 8834, 'sube' => 8838, 'sum' => 8721,
	'sup' => 8835, 'sup1' => 185, 'sup2' => 178, 'sup3' => 179,
	'supe' => 8839, 'szlig' => 223, 'Tau' => 932, 'tau' => 964,
	'there4' => 8756, 'Theta' => 920, 'theta' => 952, 'thetasym' => 977,
	'thinsp' => 8201, 'THORN' => 222, 'thorn' => 254, 'tilde' => 732,
	'times' => 215, 'trade' => 8482, 'Uacute' => 218, 'uacute' => 250,
	'uarr' => 8593, 'uArr' => 8657, 'Ucirc' => 219, 'ucirc' => 251,
	'Ugrave' => 217, 'ugrave' => 249, 'uml' => 168, 'upsih' => 978,
	'Upsilon' => 933, 'upsilon' => 965, 'Uuml' => 220, 'uuml' => 252,
	'weierp' => 8472, 'Xi' => 926, 'xi' => 958, 'Yacute' => 221,
	'yacute' => 253, 'yen' => 165, 'Yuml' => 376, 'yuml' => 255,
	'Zeta' => 918, 'zeta' => 950, 'zwj' => 8205, 'zwnj' => 8204 );
318
319 /** @package MediaWiki */
320 class Sanitizer {
/**
 * Cleans up HTML: strips HTML comments, keeps only permitted tags with
 * permitted attributes, and escapes everything else that starts with "<".
 *
 * When $wgUserHtml is off every tag list is empty, so all markup is escaped.
 * When $wgUseTidy is off, open tags are tracked on a stack so that mismatched
 * closing tags are escaped and tags left open are closed at the end; with
 * $wgUseTidy on, permitted tags are emitted without that bookkeeping.
 *
 * @access private
 * @param string $text
 * @param callback $processCallback to do any variable or parameter replacements
 *        in HTML attribute values; called with the attribute text (by reference)
 *        and $args
 * @param array $args for the processing callback
 * @return string
 */
function removeHTMLtags( $text, $processCallback = null, $args = array() ) {
	global $wgUseTidy, $wgUserHtml;
	$fname = 'Parser::removeHTMLtags';
	wfProfileIn( $fname );

	# With user HTML disabled the tag lists stay empty.
	$htmlpairs = array();
	$htmlsingle = array();
	$htmlnest = array();
	$tabletags = array();
	if ( $wgUserHtml ) {
		# Tags that must be closed
		$htmlpairs = array(
			'b', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
			'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
			'strike', 'strong', 'tt', 'var', 'div', 'center',
			'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
			'ruby', 'rt' , 'rb' , 'rp', 'p', 'span'
		);
		$htmlsingle = array( 'br', 'hr', 'li', 'dt', 'dd' );
		# Elements that cannot have close tags
		$htmlsingleonly = array( 'br', 'hr' );
		# Tags that may appear inside themselves
		$htmlnest = array(
			'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
			'dl', 'font', 'big', 'small', 'sub', 'sup', 'span'
		);
		# Can only appear inside table
		$tabletags = array( 'td', 'th', 'tr' );
	}

	$htmlsingle = array_merge( $tabletags, $htmlsingle );
	$htmlelements = array_merge( $htmlsingle, $htmlpairs );

	# Remove HTML comments
	$text = Sanitizer::removeHTMLcomments( $text );

	# Split on "<": the first piece is plain text, every later piece is a
	# candidate tag followed by the text up to the next "<".
	$tagPattern = '/^(\\/?)(\\w+)([^>]*?)(\\/{0,1}>)([^<]*)$/';
	$segments = explode( '<', $text );
	$text = array_shift( $segments );

	if ( $wgUseTidy ) {
		# this might be possible using tidy itself
		foreach ( $segments as $segment ) {
			preg_match( $tagPattern, $segment, $parts );
			@list( $whole, $slash, $tag, $params, $brace, $rest ) = $parts;
			$tag = strtolower( $tag );
			if ( !in_array( $tag, $htmlelements ) ) {
				$text .= '&lt;' . str_replace( '>', '&gt;', $segment );
				continue;
			}
			if ( is_callable( $processCallback ) ) {
				call_user_func_array( $processCallback, array( &$params, $args ) );
			}
			$newparams = Sanitizer::fixTagAttributes( $params, $tag );
			$rest = str_replace( '>', '&gt;', $rest );
			$text .= "<$slash$tag$newparams$brace$rest";
		}
	} else {
		$tagstack = array();
		$tablestack = array();
		foreach ( $segments as $segment ) {
			# A segment that is not a tag leaves $parts empty; notices and
			# warnings from unpacking it are silenced for these two statements.
			$prev = error_reporting( E_ALL & ~( E_NOTICE | E_WARNING ) );
			preg_match( $tagPattern, $segment, $parts );
			list( $whole, $slash, $tag, $params, $brace, $rest ) = $parts;
			error_reporting( $prev );

			$badtag = 0;
			$tag = strtolower( $tag );
			if ( in_array( $tag, $htmlelements ) ) {
				if ( $slash ) {
					# Closing a tag: it must match the top of the stack
					if ( in_array( $tag, $htmlsingleonly ) ) {
						$badtag = 1;
					} elseif ( !in_array( $tag, $htmlsingle ) &&
						( $popped = @array_pop( $tagstack ) ) != $tag ) {
						# Mismatch: put back what was popped
						@array_push( $tagstack, $popped );
						$badtag = 1;
					} else {
						if ( $tag == 'table' ) {
							# Resume the stack that was active before the table
							$tagstack = array_pop( $tablestack );
						}
						$newparams = '';
					}
				} else {
					# Opening a tag: keep track for later
					if ( ( in_array( $tag, $tabletags ) && !in_array( 'table', $tagstack ) )
						|| ( in_array( $tag, $tagstack ) && !in_array( $tag, $htmlnest ) ) ) {
						# Table part outside a table, or illegal self-nesting
						$badtag = 1;
					} elseif ( in_array( $tag, $htmlsingleonly ) ) {
						# Hack to force empty tag for uncloseable elements
						$brace = '/>';
					} elseif ( !in_array( $tag, $htmlsingle ) ) {
						if ( $tag == 'table' ) {
							# Each table gets a fresh stack; the outer one is saved
							array_push( $tablestack, $tagstack );
							$tagstack = array();
						}
						array_push( $tagstack, $tag );
					}

					# Replace any variables or template parameters with
					# plaintext results.
					if ( is_callable( $processCallback ) ) {
						call_user_func_array( $processCallback, array( &$params, $args ) );
					}

					# Strip non-approved attributes from the tag
					$newparams = Sanitizer::fixTagAttributes( $params, $tag );
				}
				if ( !$badtag ) {
					$rest = str_replace( '>', '&gt;', $rest );
					$close = ( $brace == '/>' ) ? ' /' : '';
					$text .= "<$slash$tag$newparams$close>$rest";
					continue;
				}
			}
			# Unknown or rejected tag: emit it as literal text
			$text .= '&lt;' . str_replace( '>', '&gt;', $segment );
		}
		# Close off any remaining tags
		while ( is_array( $tagstack ) && ( $open = array_pop( $tagstack ) ) ) {
			$text .= "</$open>\n";
			if ( $open == 'table' ) {
				$tagstack = array_pop( $tablestack );
			}
		}
	}
	wfProfileOut( $fname );
	return $text;
}
460
/**
 * Strip every terminated HTML comment ('<!--' ... '-->') from the text.
 *
 * A comment that sits alone on its line (only spaces between it and the
 * newlines on either side) is removed together with those spaces and one
 * of the two newlines, so no blank line is left behind. An unterminated
 * comment stops processing and is left in place.
 *
 * @access private
 * @param string $text
 * @return string
 */
function removeHTMLcomments( $text ) {
	$fname='Parser::removeHTMLcomments';
	wfProfileIn( $fname );
	for ( ;; ) {
		$open = strpos( $text, '<!--' );
		if ( $open === false ) {
			break;
		}
		$close = strpos( $text, '-->', $open + 4 );
		if ( $close === false ) {
			# Unterminated comment; bail out
			break;
		}
		# Point just past the closing '-->'
		$close += 3;

		# Widen the span over the spaces around the comment, starting one
		# character before it.
		$left = max( $open - 1, 0 );
		$span = $close - $left;
		while ( $left > 0 && substr( $text, $left, 1 ) === ' ' ) {
			$left--;
			$span++;
		}
		while ( substr( $text, $left + $span, 1 ) === ' ' ) {
			$span++;
		}

		$newlineBefore = ( substr( $text, $left, 1 ) === "\n" );
		$newlineAfter = ( substr( $text, $left + $span, 1 ) === "\n" );
		if ( $newlineBefore && $newlineAfter ) {
			# Comment on a line of its own: drop it with the surrounding
			# spaces and both newlines, then put a single newline back.
			$text = substr_replace( $text, "\n", $left, $span + 1 );
		} else {
			# Remove just the comment.
			$text = substr_replace( $text, '', $open, $close - $open );
		}
	}
	wfProfileOut( $fname );
	return $text;
}
506
/**
 * Take a tag soup fragment listing an HTML element's attributes
 * and normalize it to well-formed XML, discarding unwanted attributes.
 *
 * - Normalizes attribute names to lowercase
 * - Discards attributes not on a whitelist for the given element
 * - Turns broken or invalid entities into plaintext
 * - Double-quotes all attribute values
 * - Attributes without values are given the name as attribute
 * - Double attributes are discarded (the last occurrence wins)
 * - Unsafe style attributes are discarded
 * - Prepends space if there are attributes.
 *
 * @param string $text
 * @param string $element
 * @return string
 *
 * @todo Check for legal values where the DTD limits things.
 * @todo Check for unique id attribute :P
 */
function fixTagAttributes( $text, $element ) {
	if( trim( $text ) == '' ) {
		return '';
	}

	# Unquoted attribute
	# Since we quote this later, this can be anything distinguishable
	# from the end of the attribute
	$pairs = array();
	if( !preg_match_all(
		MW_ATTRIBS_REGEX,
		$text,
		$pairs,
		PREG_SET_ORDER ) ) {
		return '';
	}

	$whitelist = array_flip( Sanitizer::attributeWhitelist( $element ) );
	$attribs = array();
	foreach( $pairs as $set ) {
		$attribute = strtolower( $set[1] );
		if( !isset( $whitelist[$attribute] ) ) {
			continue;
		}

		$raw   = Sanitizer::getTagAttributeCallback( $set );
		$value = Sanitizer::normalizeAttributeValue( $raw );

		# Strip javascript "expression" from stylesheets.
		# http://msdn.microsoft.com/workshop/author/dhtml/overview/recalc.asp
		if( $attribute == 'style' ) {
			$stripped = Sanitizer::decodeCharReferences( $value );

			// Remove any comments; IE gets token splitting wrong.
			// The "s" modifier lets "." cross newlines: decoding character
			// references above can put a line break inside a comment, and
			// such a comment must still be removed before the checks below.
			$stripped = preg_replace( '!/\\*.*?\\*/!sS', ' ', $stripped );
			$value = htmlspecialchars( $stripped );

			// ... and continue checks: undo CSS hex escapes (\65 etc.) so
			// that an escaped keyword cannot slip past the pattern below.
			// This uses a callback rather than the "e" modifier, which PHP 7
			// removed; there the old call returned NULL and the check below
			// no longer saw the style text at all.
			$stripped = preg_replace_callback(
				'!\\\\([0-9A-Fa-f]{1,6})[ \\n\\r\\t\\f]?!',
				array( 'Sanitizer', 'cssDecodeCallback' ),
				$stripped );
			$stripped = str_replace( '\\', '', $stripped );
			if( preg_match( '/(expression|tps*:\/\/|url\\s*\().*/is',
					$stripped ) ) {
				# haxx0r
				continue;
			}
		}

		# Templates and links may be expanded in later parsing,
		# creating invalid or dangerous output. Suppress this.
		$value = strtr( $value, array(
			'{'    => '&#123;',
			'['    => '&#91;',
			"''"   => '&#39;&#39;',
			'ISBN' => '&#73;SBN',
			'RFC'  => '&#82;FC',
			'PMID' => '&#80;MID',
		) );

		# Stupid hack
		$value = preg_replace_callback(
			'/(' . wfUrlProtocols() . ')/',
			array( 'Sanitizer', 'armorLinksCallback' ),
			$value );

		// If this attribute was previously set, override it.
		// Output should only have one attribute of each name.
		$attribs[$attribute] = "$attribute=\"$value\"";
	}
	if( empty( $attribs ) ) {
		return '';
	} else {
		return ' ' . implode( ' ', $attribs );
	}
}

/**
 * Regex replace callback turning one CSS hex escape into the UTF-8
 * character it stands for.
 * @param array $matches regex match; element 1 holds the hex digits
 * @return string
 * @access private
 */
function cssDecodeCallback( $matches ) {
	return codepointToUtf8( hexdec( $matches[1] ) );
}
602
/**
 * Regex replace callback that protects a matched URL protocol from later
 * link processing by replacing each colon with a numeric reference.
 * @param array $matches regex match; element 1 holds the protocol text
 * @return string
 * @access private
 */
function armorLinksCallback( $matches ) {
	$protocol = $matches[1];
	return strtr( $protocol, array( ':' => '&#58;' ) );
}
612
/**
 * Return an associative array of attribute names and values from
 * a partial tag string. Attribute names are forced to lowercase,
 * character references are decoded to UTF-8 text. If a name occurs
 * more than once, the last value is kept.
 *
 * @param string $text
 * @return array
 */
function decodeTagAttributes( $text ) {
	$decoded = array();
	if ( trim( $text ) == '' ) {
		return $decoded;
	}

	$sets = array();
	$found = preg_match_all( MW_ATTRIBS_REGEX, $text, $sets, PREG_SET_ORDER );
	if ( $found ) {
		foreach ( $sets as $set ) {
			$name = strtolower( $set[1] );
			$raw = Sanitizer::getTagAttributeCallback( $set );
			$decoded[$name] = Sanitizer::decodeCharReferences( $raw );
		}
	}
	return $decoded;
}
644
/**
 * Pick the appropriate attribute value from a match set from the
 * MW_ATTRIBS_REGEX matches.
 *
 * @param array $set one match set: 1 is the attribute name, 2 the
 *        "= value" part, 3-6 the value alternatives
 * @return string
 * @access private
 */
function getTagAttributeCallback( $set ) {
	# Value groups, highest first: 6 is an unquoted #XXXXXX color, 5 an
	# unquoted value, 4 a single-quoted value, 3 a double-quoted value.
	for ( $group = 6; $group >= 3; $group-- ) {
		if ( isset( $set[$group] ) ) {
			return $set[$group];
		}
	}

	if ( isset( $set[2] ) ) {
		# An "= value" part without any value group
		wfDebugDieBacktrace( "Tag conditions not met. This should never happen and is a bug." );
	} else {
		# In XHTML, attributes must have a value.
		# For 'reduced' form, return explicitly the attribute name here.
		return $set[1];
	}
}
674
/**
 * Normalize whitespace and character references in an XML source-
 * encoded text for an attribute value: character references are
 * normalized first, then each CRLF pair or single whitespace character
 * becomes one space, then double quotes become &quot;.
 *
 * See http://www.w3.org/TR/REC-xml/#AVNormalize for background,
 * but note that we're not returning the value, but are returning
 * XML source fragments that will be slapped into output.
 *
 * @param string $text
 * @return string
 * @access private
 */
function normalizeAttributeValue( $text ) {
	$normalized = Sanitizer::normalizeCharReferences( $text );
	$spaced = preg_replace( '/\r\n|[\x20\x0d\x0a\x09]/', ' ', $normalized );
	return str_replace( '"', '&quot;', $spaced );
}
694
/**
 * Ensure that any entities and character references are legal
 * for XML and XHTML specifically. Any stray bits will be
 * &amp;-escaped to result in a valid text fragment.
 *
 * a. any named char refs must be known in XHTML
 * b. any numeric char refs must be legal chars, not invalid or forbidden
 * c. use &#x, not &#X
 * d. fix or reject non-valid attributes
 *
 * @param string $text
 * @return string
 * @access private
 */
function normalizeCharReferences( $text ) {
	$handler = array( 'Sanitizer', 'normalizeCharReferencesCallback' );
	return preg_replace_callback( MW_CHAR_REFS_REGEX, $handler, $text );
}
/**
 * Replace callback for normalizeCharReferences: rewrite one match of
 * MW_CHAR_REFS_REGEX into its normalized form, or HTML-escape the
 * matched text when it is not an acceptable reference.
 *
 * @param array $matches
 * @return string
 */
function normalizeCharReferencesCallback( $matches ) {
	if ( $matches[1] != '' ) {
		# Named entity
		$fixed = Sanitizer::normalizeEntity( $matches[1] );
	} elseif ( $matches[2] != '' ) {
		# Decimal reference
		$fixed = Sanitizer::decCharReference( $matches[2] );
	} elseif ( $matches[3] != '' ) {
		# Hex reference written &#x...;
		$fixed = Sanitizer::hexCharReference( $matches[3] );
	} elseif ( $matches[4] != '' ) {
		# Hex reference written &#X...;
		$fixed = Sanitizer::hexCharReference( $matches[4] );
	} else {
		$fixed = null;
	}

	# A null result means the reference was rejected (or the match was a
	# bare ampersand): emit the source text escaped.
	return is_null( $fixed ) ? htmlspecialchars( $matches[0] ) : $fixed;
}
736
/**
 * Return the named entity reference unchanged when the name is listed in
 * $wgHtmlEntities; otherwise return the pseudo-entity with its ampersand
 * escaped (eg &amp;foo;).
 *
 * @param string $name
 * @return string
 */
function normalizeEntity( $name ) {
	global $wgHtmlEntities;
	$lead = isset( $wgHtmlEntities[$name] ) ? '&' : '&amp;';
	return $lead . $name . ';';
}
753
/**
 * Normalize a decimal character reference.
 *
 * @param string $codepoint the decimal digits of the reference
 * @return string|null the reference as &#N;, or null when the code point
 *         is rejected by validateCodepoint
 */
function decCharReference( $codepoint ) {
	$point = intval( $codepoint );
	return Sanitizer::validateCodepoint( $point )
		? sprintf( '&#%d;', $point )
		: null;
}
762
/**
 * Normalize a hexadecimal character reference.
 *
 * @param string $codepoint the hex digits of the reference
 * @return string|null the reference as &#x...; with lowercase digits, or
 *         null when the code point is rejected by validateCodepoint
 */
function hexCharReference( $codepoint ) {
	$point = hexdec( $codepoint );
	return Sanitizer::validateCodepoint( $point )
		? sprintf( '&#x%x;', $point )
		: null;
}
771
/**
 * Returns true if a given Unicode codepoint is a valid character in XML:
 * tab, line feed, carriage return, or anything in the ranges
 * 0x20-0xd7ff, 0xe000-0xfffd and 0x10000-0x10ffff.
 * @param int $codepoint
 * @return bool
 */
function validateCodepoint( $codepoint ) {
	if ( $codepoint == 0x09 || $codepoint == 0x0a || $codepoint == 0x0d ) {
		return true;
	}
	$ranges = array(
		array( 0x20, 0xd7ff ),
		array( 0xe000, 0xfffd ),
		array( 0x10000, 0x10ffff ),
	);
	foreach ( $ranges as $range ) {
		if ( $codepoint >= $range[0] && $codepoint <= $range[1] ) {
			return true;
		}
	}
	return false;
}
785
/**
 * Decode any character references, numeric or named entities,
 * in the text and return a UTF-8 string.
 *
 * @param string $text
 * @return string
 * @access public
 */
function decodeCharReferences( $text ) {
	$handler = array( 'Sanitizer', 'decodeCharReferencesCallback' );
	return preg_replace_callback( MW_CHAR_REFS_REGEX, $handler, $text );
}
800
/**
 * Replace callback for decodeCharReferences: decode one match of
 * MW_CHAR_REFS_REGEX.
 *
 * @param array $matches
 * @return string
 */
function decodeCharReferencesCallback( $matches ) {
	if ( $matches[1] != '' ) {
		# Named entity
		$decoded = Sanitizer::decodeEntity( $matches[1] );
	} elseif ( $matches[2] != '' ) {
		# Decimal reference
		$decoded = Sanitizer::decodeChar( intval( $matches[2] ) );
	} elseif ( $matches[3] != '' ) {
		# Hex reference written &#x...;
		$decoded = Sanitizer::decodeChar( hexdec( $matches[3] ) );
	} elseif ( $matches[4] != '' ) {
		# Hex reference written &#X...;
		$decoded = Sanitizer::decodeChar( hexdec( $matches[4] ) );
	} else {
		# Last case should be an ampersand by itself
		$decoded = $matches[0];
	}
	return $decoded;
}
818
/**
 * Return the UTF-8 string for a codepoint that passes validateCodepoint;
 * any other codepoint yields UTF8_REPLACEMENT.
 * @param int $codepoint
 * @return string
 * @access private
 */
function decodeChar( $codepoint ) {
	if ( !Sanitizer::validateCodepoint( $codepoint ) ) {
		return UTF8_REPLACEMENT;
	}
	return codepointToUtf8( $codepoint );
}
833
/**
 * Return the UTF-8 encoding of the character a named entity stands for
 * when the name is listed in $wgHtmlEntities; otherwise return the
 * pseudo-entity source unchanged (eg &foo;).
 *
 * @param string $name
 * @return string
 */
function decodeEntity( $name ) {
	global $wgHtmlEntities;
	if ( !isset( $wgHtmlEntities[$name] ) ) {
		return "&$name;";
	}
	return codepointToUtf8( $wgHtmlEntities[$name] );
}
850
/**
 * Fetch the whitelist of acceptable attributes for a given
 * element name. The full table is built once, on first use; an
 * element without an entry gets an empty list.
 *
 * @param string $element
 * @return array
 */
function attributeWhitelist( $element ) {
	static $table;
	if ( !isset( $table ) ) {
		$table = Sanitizer::setupAttributeWhitelist();
	}
	if ( isset( $table[$element] ) ) {
		return $table[$element];
	}
	return array();
}
867
/**
 * Build the table of permitted attributes, keyed by element name.
 * Section numbers refer to the HTML 4.01 standard describing the element.
 * See: http://www.w3.org/TR/html4/
 *
 * @return array element name => list of attribute names
 */
function setupAttributeWhitelist() {
	$common = array( 'id', 'class', 'lang', 'dir', 'title', 'style' );
	$block = array_merge( $common, array( 'align' ) );
	$tablealign = array( 'align', 'char', 'charoff', 'valign' );
	$tablecell = array( 'abbr',
		'axis',
		'headers',
		'scope',
		'rowspan',
		'colspan',
		'nowrap', # deprecated
		'width',  # deprecated
		'height', # deprecated
		'bgcolor' # deprecated
	);

	# Attribute lists shared by several elements
	$edit = array_merge( $common, array( 'cite', 'datetime' ) );
	$rowgroup = array_merge( $common, $tablealign );
	$column = array_merge( $common, array( 'span', 'width' ), $tablealign );
	$cell = array_merge( $common, $tablecell, $tablealign );

	# Each entry pairs a run of element names with their attribute list.
	# The order fixes the key order of the returned table.
	$groups = array(
		# 7.5.4 (center is deprecated)
		array( array( 'div' ), $block ),
		array( array( 'center' ), $common ),
		array( array( 'span' ), $block ),
		# 7.5.5
		array( array( 'h1', 'h2', 'h3', 'h4', 'h5', 'h6' ), $block ),
		# 9.2.1 (dfn, samp, kbd, abbr, acronym are not listed)
		array( array( 'em', 'strong', 'cite', 'code', 'var' ), $common ),
		# 9.2.2 (q is not listed)
		array( array( 'blockquote' ), array_merge( $common, array( 'cite' ) ) ),
		# 9.2.3
		array( array( 'sub', 'sup' ), $common ),
		# 9.3.1
		array( array( 'p' ), $block ),
		# 9.3.2
		array( array( 'br' ), array( 'id', 'class', 'title', 'style', 'clear' ) ),
		# 9.3.4
		array( array( 'pre' ), array_merge( $common, array( 'width' ) ) ),
		# 9.4
		array( array( 'ins', 'del' ), $edit ),
		# 10.2
		array( array( 'ul' ), array_merge( $common, array( 'type' ) ) ),
		array( array( 'ol' ), array_merge( $common, array( 'type', 'start' ) ) ),
		array( array( 'li' ), array_merge( $common, array( 'type', 'value' ) ) ),
		# 10.3
		array( array( 'dl', 'dd', 'dt' ), $common ),
		# 11.2.1
		array( array( 'table' ), array_merge( $common,
			array( 'summary', 'width', 'border', 'frame',
				'rules', 'cellspacing', 'cellpadding',
				'align', 'bgcolor', 'frame', 'rules',
				'border' ) ) ),
		# 11.2.2
		array( array( 'caption' ), array_merge( $common, array( 'align' ) ) ),
		# 11.2.3
		array( array( 'thead', 'tfoot', 'tbody' ), $rowgroup ),
		# 11.2.4
		array( array( 'colgroup', 'col' ), $column ),
		# 11.2.5
		array( array( 'tr' ), array_merge( $common, array( 'bgcolor' ), $tablealign ) ),
		# 11.2.6
		array( array( 'td', 'th' ), $cell ),
		# 15.2.1
		array( array( 'tt', 'b', 'i', 'big', 'small', 'strike', 's', 'u' ), $common ),
		# 15.2.2 (basefont is not listed)
		array( array( 'font' ), array_merge( $common, array( 'size', 'color', 'face' ) ) ),
		# 15.3
		array( array( 'hr' ), array_merge( $common, array( 'noshade', 'size', 'width' ) ) ),
		# XHTML Ruby annotation text module, simple ruby only.
		# http://www.w3c.org/TR/ruby/ (rbc and rtc are not listed)
		array( array( 'ruby', 'rb', 'rt', 'rp' ), $common ),
	);

	$whitelist = array();
	foreach ( $groups as $group ) {
		foreach ( $group[0] as $tag ) {
			$whitelist[$tag] = $group[1];
		}
	}
	return $whitelist;
}
1006
/**
 * Take a fragment of (potentially invalid) HTML and return
 * a version with any tags removed, encoded suitably for literal
 * inclusion in an attribute value.
 *
 * @param string $text HTML fragment
 * @return string
 */
function stripAllTags( $text ) {
	# Actual <tags>
	$tagless = preg_replace( '/<[^>]*>/', '', $text );

	# Normalize &entities and whitespace
	$normalized = Sanitizer::normalizeAttributeValue( $tagless );

	# Will be placed into "double-quoted" attributes,
	# make sure remaining bits are safe.
	return strtr( $normalized, array(
		'<' => '&lt;',
		'>' => '&gt;',
		'"' => '&quot;',
	) );
}
1031
1032 }
1033
1034 ?>