Fix 8590 : remove support for $wgUserHtml . Our specific HTML subset
[lhc/web/wiklou.git] / includes / Sanitizer.php
1 <?php
2 /**
3 * XHTML sanitizer for MediaWiki
4 *
5 * Copyright (C) 2002-2005 Brion Vibber <brion@pobox.com> et al
6 * http://www.mediawiki.org/
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License along
19 * with this program; if not, write to the Free Software Foundation, Inc.,
20 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
21 * http://www.gnu.org/copyleft/gpl.html
22 *
23 * @addtogroup Parser
24 */
25
/**
 * Regular expression to match various types of character references in
 * Sanitizer::normalizeCharReferences and Sanitizer::decodeCharReferences
 *
 * Capture groups (the callbacks of both methods index into these):
 *   1 - name of a named entity, without the leading & and trailing ;
 *   2 - digits of a decimal reference (&#NNN;)
 *   3 - hex digits of a lower-case hex reference (&#xHH;)
 *   4 - hex digits of an upper-case hex reference (&#XHH;)
 *   5 - a lone ampersand that does not start any of the above
 *
 * The hex alternatives only accept real hexadecimal digits. hexdec()
 * silently skips anything else, so a looser character class would let
 * something like "&#x4g1;" be decoded as if it were "&#x41;".
 */
define( 'MW_CHAR_REFS_REGEX',
	'/&([A-Za-z0-9]+);
	 |&\#([0-9]+);
	 |&\#x([0-9A-Fa-f]+);
	 |&\#X([0-9A-Fa-f]+);
	 |(&)/x' );
36
/**
 * Regular expression to match HTML/XML attribute pairs within a tag.
 * Allows some... latitude.
 * Used in Sanitizer::fixTagAttributes and Sanitizer::decodeTagAttributes
 *
 * Capture groups, as consumed by Sanitizer::getTagAttributeCallback:
 *   1 - attribute name
 *   2 - the whole "= value" part (absent for a value-less attribute)
 *   3 - value given in double quotes
 *   4 - value given in single quotes
 *   5 - unquoted value
 *   6 - unquoted #hex colour
 */
$attrib = '[A-Za-z0-9]';
$space = '[\x09\x0a\x0d\x20]';
define( 'MW_ATTRIBS_REGEX',
	"/(?:^|{$space})({$attrib}+)
	  ({$space}*={$space}*
		(?:
		 # The attribute value: quoted or alone
		  \"([^<\"]*)\"
		 | '([^<']*)'
		 |  ([a-zA-Z0-9!#$%&()*,\\-.\\/:;<>?@[\\]^_`{|}~]+)
		 |  (\#[0-9a-fA-F]+) # Technically wrong, but lots of
		                     # colors are specified like this.
		                     # We'll be normalizing it.
		)
	   )?(?={$space}|\$)/sx" );
57
/**
 * List of all named character entities defined in HTML 4.01
 * http://www.w3.org/TR/html4/sgml/entities.html
 *
 * Maps the entity name (case-sensitive, without & and ;) to its
 * Unicode code point. Entries are grouped by initial letter.
 * @private
 */
global $wgHtmlEntities;
$wgHtmlEntities = array(
	# A
	'Aacute' => 193, 'aacute' => 225, 'Acirc' => 194, 'acirc' => 226,
	'acute' => 180, 'AElig' => 198, 'aelig' => 230, 'Agrave' => 192,
	'agrave' => 224, 'alefsym' => 8501, 'Alpha' => 913, 'alpha' => 945,
	'amp' => 38, 'and' => 8743, 'ang' => 8736, 'Aring' => 197,
	'aring' => 229, 'asymp' => 8776, 'Atilde' => 195, 'atilde' => 227,
	'Auml' => 196, 'auml' => 228,
	# B
	'bdquo' => 8222, 'Beta' => 914, 'beta' => 946, 'brvbar' => 166,
	'bull' => 8226,
	# C
	'cap' => 8745, 'Ccedil' => 199, 'ccedil' => 231, 'cedil' => 184,
	'cent' => 162, 'Chi' => 935, 'chi' => 967, 'circ' => 710,
	'clubs' => 9827, 'cong' => 8773, 'copy' => 169, 'crarr' => 8629,
	'cup' => 8746, 'curren' => 164,
	# D
	'dagger' => 8224, 'Dagger' => 8225, 'darr' => 8595, 'dArr' => 8659,
	'deg' => 176, 'Delta' => 916, 'delta' => 948, 'diams' => 9830,
	'divide' => 247,
	# E
	'Eacute' => 201, 'eacute' => 233, 'Ecirc' => 202, 'ecirc' => 234,
	'Egrave' => 200, 'egrave' => 232, 'empty' => 8709, 'emsp' => 8195,
	'ensp' => 8194, 'Epsilon' => 917, 'epsilon' => 949, 'equiv' => 8801,
	'Eta' => 919, 'eta' => 951, 'ETH' => 208, 'eth' => 240,
	'Euml' => 203, 'euml' => 235, 'euro' => 8364, 'exist' => 8707,
	# F
	'fnof' => 402, 'forall' => 8704, 'frac12' => 189, 'frac14' => 188,
	'frac34' => 190, 'frasl' => 8260,
	# G
	'Gamma' => 915, 'gamma' => 947, 'ge' => 8805, 'gt' => 62,
	# H
	'harr' => 8596, 'hArr' => 8660, 'hearts' => 9829, 'hellip' => 8230,
	# I
	'Iacute' => 205, 'iacute' => 237, 'Icirc' => 206, 'icirc' => 238,
	'iexcl' => 161, 'Igrave' => 204, 'igrave' => 236, 'image' => 8465,
	'infin' => 8734, 'int' => 8747, 'Iota' => 921, 'iota' => 953,
	'iquest' => 191, 'isin' => 8712, 'Iuml' => 207, 'iuml' => 239,
	# K
	'Kappa' => 922, 'kappa' => 954,
	# L
	'Lambda' => 923, 'lambda' => 955, 'lang' => 9001, 'laquo' => 171,
	'larr' => 8592, 'lArr' => 8656, 'lceil' => 8968, 'ldquo' => 8220,
	'le' => 8804, 'lfloor' => 8970, 'lowast' => 8727, 'loz' => 9674,
	'lrm' => 8206, 'lsaquo' => 8249, 'lsquo' => 8216, 'lt' => 60,
	# M
	'macr' => 175, 'mdash' => 8212, 'micro' => 181, 'middot' => 183,
	'minus' => 8722, 'Mu' => 924, 'mu' => 956,
	# N
	'nabla' => 8711, 'nbsp' => 160, 'ndash' => 8211, 'ne' => 8800,
	'ni' => 8715, 'not' => 172, 'notin' => 8713, 'nsub' => 8836,
	'Ntilde' => 209, 'ntilde' => 241, 'Nu' => 925, 'nu' => 957,
	# O
	'Oacute' => 211, 'oacute' => 243, 'Ocirc' => 212, 'ocirc' => 244,
	'OElig' => 338, 'oelig' => 339, 'Ograve' => 210, 'ograve' => 242,
	'oline' => 8254, 'Omega' => 937, 'omega' => 969, 'Omicron' => 927,
	'omicron' => 959, 'oplus' => 8853, 'or' => 8744, 'ordf' => 170,
	'ordm' => 186, 'Oslash' => 216, 'oslash' => 248, 'Otilde' => 213,
	'otilde' => 245, 'otimes' => 8855, 'Ouml' => 214, 'ouml' => 246,
	# P
	'para' => 182, 'part' => 8706, 'permil' => 8240, 'perp' => 8869,
	'Phi' => 934, 'phi' => 966, 'Pi' => 928, 'pi' => 960,
	'piv' => 982, 'plusmn' => 177, 'pound' => 163, 'prime' => 8242,
	'Prime' => 8243, 'prod' => 8719, 'prop' => 8733, 'Psi' => 936,
	'psi' => 968,
	# Q
	'quot' => 34,
	# R
	'radic' => 8730, 'rang' => 9002, 'raquo' => 187, 'rarr' => 8594,
	'rArr' => 8658, 'rceil' => 8969, 'rdquo' => 8221, 'real' => 8476,
	'reg' => 174, 'rfloor' => 8971, 'Rho' => 929, 'rho' => 961,
	'rlm' => 8207, 'rsaquo' => 8250, 'rsquo' => 8217,
	# S
	'sbquo' => 8218, 'Scaron' => 352, 'scaron' => 353, 'sdot' => 8901,
	'sect' => 167, 'shy' => 173, 'Sigma' => 931, 'sigma' => 963,
	'sigmaf' => 962, 'sim' => 8764, 'spades' => 9824, 'sub' => 8834,
	'sube' => 8838, 'sum' => 8721, 'sup' => 8835, 'sup1' => 185,
	'sup2' => 178, 'sup3' => 179, 'supe' => 8839, 'szlig' => 223,
	# T
	'Tau' => 932, 'tau' => 964, 'there4' => 8756, 'Theta' => 920,
	'theta' => 952, 'thetasym' => 977, 'thinsp' => 8201, 'THORN' => 222,
	'thorn' => 254, 'tilde' => 732, 'times' => 215, 'trade' => 8482,
	# U
	'Uacute' => 218, 'uacute' => 250, 'uarr' => 8593, 'uArr' => 8657,
	'Ucirc' => 219, 'ucirc' => 251, 'Ugrave' => 217, 'ugrave' => 249,
	'uml' => 168, 'upsih' => 978, 'Upsilon' => 933, 'upsilon' => 965,
	'Uuml' => 220, 'uuml' => 252,
	# W
	'weierp' => 8472,
	# X
	'Xi' => 926, 'xi' => 958,
	# Y
	'Yacute' => 221, 'yacute' => 253, 'yen' => 165, 'Yuml' => 376,
	'yuml' => 255,
	# Z
	'Zeta' => 918, 'zeta' => 950, 'zwj' => 8205, 'zwnj' => 8204 );
317
318 class Sanitizer {
/**
 * Cleans up HTML, removes dangerous tags and attributes, and
 * removes HTML comments
 *
 * The text is split on '<'. A piece that parses as a tag from the
 * built-in element lists is emitted again with its attributes passed
 * through Sanitizer::fixTagAttributes(); any other piece is emitted with
 * its '<' and '>' escaped. When $wgUseTidy is off, the method also keeps
 * a stack of open tags so that mismatched tags are escaped and tags left
 * open at the end of the text are closed.
 *
 * @private
 * @param string $text
 * @param callback $processCallback to do any variable or parameter replacements in HTML attribute values
 * @param array $args for the processing callback
 * @return string
 */
static function removeHTMLtags( $text, $processCallback = null, $args = array() ) {
	global $wgUseTidy;

	static $htmlpairs, $htmlsingle, $htmlsingleonly, $htmlnest, $tabletags,
		$htmllist, $listtags, $htmlsingleallowed, $htmlelements, $staticInitialised;

	wfProfileIn( __METHOD__ );

	if ( !$staticInitialised ) {

		$htmlpairs = array( # Tags that must be closed
			'b', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
			'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
			'strike', 'strong', 'tt', 'var', 'div', 'center',
			'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
			'ruby', 'rt' , 'rb' , 'rp', 'p', 'span'
		);
		$htmlsingle = array(
			'br', 'hr', 'li', 'dt', 'dd'
		);
		$htmlsingleonly = array( # Elements that cannot have close tags
			'br', 'hr'
		);
		$htmlnest = array( # Tags that can be nested--??
			'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
			'dl', 'font', 'big', 'small', 'sub', 'sup', 'span'
		);
		$tabletags = array( # Can only appear inside table, we will close them
			'td', 'th', 'tr',
		);
		$htmllist = array( # Tags used by list
			'ul','ol',
		);
		$listtags = array( # Tags that can appear in a list
			'li',
		);

		$htmlsingleallowed = array_merge( $htmlsingle, $tabletags );
		$htmlelements = array_merge( $htmlsingle, $htmlpairs, $htmlnest );

		# Convert them all to hashtables for faster lookup
		$vars = array( 'htmlpairs', 'htmlsingle', 'htmlsingleonly', 'htmlnest', 'tabletags',
			'htmllist', 'listtags', 'htmlsingleallowed', 'htmlelements' );
		foreach ( $vars as $var ) {
			$$var = array_flip( $$var );
		}
		$staticInitialised = true;
	}

	# Remove HTML comments
	$text = Sanitizer::removeHTMLcomments( $text );
	$bits = explode( '<', $text );
	# Everything before the first '<' is plain text
	$text = str_replace( '>', '&gt;', array_shift( $bits ) );
	if(!$wgUseTidy) {
		# $tagstack holds the open tags of the current table level;
		# $tablestack holds the saved stacks of the enclosing levels.
		$tagstack = $tablestack = array();
		foreach ( $bits as $x ) {
			$regs = array();
			if( preg_match( '!^(/?)(\\w+)([^>]*?)(/{0,1}>)([^<]*)$!', $x, $regs ) ) {
				list( /* $qbar */, $slash, $t, $params, $brace, $rest ) = $regs;
			} else {
				$slash = $t = $params = $brace = $rest = null;
			}

			$badtag = 0 ;
			if ( isset( $htmlelements[$t = strtolower( $t )] ) ) {
				# Check our stack
				if ( $slash ) {
					# Closing a tag...
					if( isset( $htmlsingleonly[$t] ) ) {
						$badtag = 1;
					} elseif ( ( $ot = @array_pop( $tagstack ) ) != $t ) {
						if ( isset( $htmlsingleallowed[$ot] ) ) {
							# Pop all elements with an optional close tag
							# and see if we find a match below them
							$optstack = array();
							array_push ($optstack, $ot);
							while ( ( ( $ot = @array_pop( $tagstack ) ) != $t ) &&
									isset( $htmlsingleallowed[$ot] ) )
							{
								array_push ($optstack, $ot);
							}
							if ( $t != $ot ) {
								# No match. Restore the stack: first the element
								# that stopped the search (it was popped too and
								# would otherwise be lost), then the optional
								# elements in their original order.
								$badtag = 1;
								if ( !is_null( $ot ) ) {
									array_push( $tagstack, $ot );
								}
								while ( $ot = @array_pop( $optstack ) ) {
									array_push( $tagstack, $ot );
								}
							} elseif ( $t == 'table' ) {
								# The table was closed across unclosed rows/cells;
								# go back to the stack of the enclosing level, as
								# is done for a directly matching </table>.
								$tagstack = array_pop( $tablestack );
							}
						} else {
							@array_push( $tagstack, $ot );
							# <li> can be nested in <ul> or <ol>, skip those cases:
							if(!(isset( $htmllist[$ot] ) && isset( $listtags[$t] ) )) {
								$badtag = 1;
							}
						}
					} else {
						if ( $t == 'table' ) {
							$tagstack = array_pop( $tablestack );
						}
					}
					$newparams = '';
				} else {
					# Keep track for later
					if ( isset( $tabletags[$t] ) &&
					! in_array( 'table', $tagstack ) ) {
						$badtag = 1;
					} else if ( in_array( $t, $tagstack ) &&
					! isset( $htmlnest [$t ] ) ) {
						$badtag = 1 ;
					# Is it a self closed htmlpair ? (bug 5487)
					} else if( $brace == '/>' &&
					isset( $htmlpairs[$t] ) ) {
						$badtag = 1;
					} elseif( isset( $htmlsingleonly[$t] ) ) {
						# Hack to force empty tag for uncloseable elements
						$brace = '/>';
					} else if( isset( $htmlsingle[$t] ) ) {
						# Hack to not close $htmlsingle tags
						$brace = NULL;
					} else if( isset( $tabletags[$t] )
					&&  in_array($t ,$tagstack) ) {
						// New table tag but forgot to close the previous one
						$text .= "</$t>";
					} else {
						if ( $t == 'table' ) {
							array_push( $tablestack, $tagstack );
							$tagstack = array();
						}
						array_push( $tagstack, $t );
					}

					# Replace any variables or template parameters with
					# plaintext results.
					if( is_callable( $processCallback ) ) {
						call_user_func_array( $processCallback, array( &$params, $args ) );
					}

					# Strip non-approved attributes from the tag
					$newparams = Sanitizer::fixTagAttributes( $params, $t );
				}
				if ( ! $badtag ) {
					$rest = str_replace( '>', '&gt;', $rest );
					$close = ( $brace == '/>' && !$slash ) ? ' /' : '';
					$text .= "<$slash$t$newparams$close>$rest";
					continue;
				}
			}
			# Not an accepted tag: show it as literal text
			$text .= '&lt;' . str_replace( '>', '&gt;', $x);
		}
		# Close off any remaining tags
		while ( is_array( $tagstack ) && ($t = array_pop( $tagstack )) ) {
			$text .= "</$t>\n";
			if ( $t == 'table' ) { $tagstack = array_pop( $tablestack ); }
		}
	} else {
		# this might be possible using tidy itself
		foreach ( $bits as $x ) {
			# Check the match explicitly instead of silencing list() on a
			# failed match; a piece that is not a tag gets null fields,
			# exactly as in the branch above.
			$regs = array();
			if( preg_match( '/^(\\/?)(\\w+)([^>]*?)(\\/{0,1}>)([^<]*)$/', $x, $regs ) ) {
				list( /* $qbar */, $slash, $t, $params, $brace, $rest ) = $regs;
			} else {
				$slash = $t = $params = $brace = $rest = null;
			}
			if ( isset( $htmlelements[$t = strtolower( $t )] ) ) {
				if( is_callable( $processCallback ) ) {
					call_user_func_array( $processCallback, array( &$params, $args ) );
				}
				$newparams = Sanitizer::fixTagAttributes( $params, $t );
				$rest = str_replace( '>', '&gt;', $rest );
				$text .= "<$slash$t$newparams$brace$rest";
			} else {
				$text .= '&lt;' . str_replace( '>', '&gt;', $x);
			}
		}
	}
	wfProfileOut( __METHOD__ );
	return $text;
}
503
/**
 * Strip every terminated HTML comment ('<!--' through '-->') from the text.
 *
 * When a comment sits alone on its line, i.e. only spaces separate it
 * from a newline on each side, those spaces and one of the two newlines
 * are removed along with it so that no blank line is left behind.
 * An unterminated comment stops processing and is left in place.
 *
 * @private
 * @param string $text
 * @return string
 */
static function removeHTMLcomments( $text ) {
	wfProfileIn( __METHOD__ );
	for ( ;; ) {
		$open = strpos( $text, '<!--' );
		if ( $open === false ) {
			# No more comments
			break;
		}
		$close = strpos( $text, '-->', $open + 4 );
		if ( $close === false ) {
			# Unterminated comment; bail out
			break;
		}
		# Move to the first character after '-->'
		$close += 3;

		# Widen the span over the spaces surrounding the comment
		$from = max( $open - 1, 0 );
		$span = $close - $from;
		while ( $from > 0 && substr( $text, $from, 1 ) === ' ' ) {
			$from--;
			$span++;
		}
		while ( substr( $text, $from + $span, 1 ) === ' ' ) {
			$span++;
		}

		$newlineBefore = ( substr( $text, $from, 1 ) === "\n" );
		$newlineAfter = ( substr( $text, $from + $span, 1 ) === "\n" );
		if ( $newlineBefore && $newlineAfter ) {
			# Drop the comment with its surrounding spaces and both
			# newlines, putting a single newline back.
			$text = substr_replace( $text, "\n", $from, $span + 1 );
		} else {
			# Drop the comment only.
			$text = substr_replace( $text, '', $open, $close - $open );
		}
	}
	wfProfileOut( __METHOD__ );
	return $text;
}
548
/**
 * Filter an array of attribute names and values down to what is
 * acceptable for the given element type.
 *
 * - Attributes missing from the element's whitelist are dropped
 * - A 'style' value is passed through Sanitizer::checkCss() and the
 *   attribute is dropped when that returns false
 * - An 'id' value is passed through Sanitizer::escapeId()
 *
 * @param array $attribs attribute name => value
 * @param string $element
 * @return array attribute name => value
 *
 * @todo Check for legal values where the DTD limits things.
 * @todo Check for unique id attribute :P
 */
static function validateTagAttributes( $attribs, $element ) {
	$allowed = array_flip( Sanitizer::attributeWhitelist( $element ) );
	$clean = array();
	foreach( $attribs as $name => $val ) {
		if( !isset( $allowed[$name] ) ) {
			# Not whitelisted for this element
			continue;
		}

		if( $name == 'style' ) {
			# Strip javascript "expression" from stylesheets.
			# http://msdn.microsoft.com/workshop/author/dhtml/overview/recalc.asp
			$val = Sanitizer::checkCss( $val );
			if( $val === false ) {
				# Rejected as unsafe
				continue;
			}
		}

		if( $name === 'id' ) {
			$val = Sanitizer::escapeId( $val );
		}

		// A later occurrence of the same name replaces the earlier one,
		// so the result has at most one attribute of each name.
		$clean[$name] = $val;
	}
	return $clean;
}
589
/**
 * Pick apart some CSS and check it for forbidden or unsafe structures.
 * Returns a sanitized string, or false if it was just too evil.
 *
 * The returned string is the input with character references decoded
 * and CSS comments replaced by a space. For the check itself, CSS
 * backslash escapes are resolved as well; the value is rejected when the
 * result contains 'expression', 'tp://' or 'tps://' (which covers http,
 * https and ftp URLs), or 'url(' (case-insensitively).
 *
 * @param string $value
 * @return mixed sanitized string, or false when the value is rejected
 */
static function checkCss( $value ) {
	$stripped = Sanitizer::decodeCharReferences( $value );

	// Remove any comments; IE gets token splitting wrong
	$stripped = StringUtils::delimiterReplace( '/*', '*/', ' ', $stripped );

	$value = $stripped;

	// ... and continue checks
	// Resolve \HHHHHH escapes through a callback; the /e regex modifier
	// that used to do this is not available in current PHP versions.
	$stripped = preg_replace_callback(
		'!\\\\([0-9A-Fa-f]{1,6})[ \\n\\r\\t\\f]?!',
		array( 'Sanitizer', 'cssDecodeCallback' ),
		$stripped );
	$stripped = str_replace( '\\', '', $stripped );
	if( preg_match( '/(?:expression|tps*:\/\/|url\\s*\().*/is',
			$stripped ) ) {
		# haxx0r
		return false;
	}

	return $value;
}

/**
 * Regex replace callback for checkCss(): turns the hex digits of a CSS
 * backslash escape into the UTF-8 character they denote.
 * @param array $matches
 * @return string
 * @private
 */
private static function cssDecodeCallback( $matches ) {
	return codepointToUtf8( hexdec( $matches[1] ) );
}
619
/**
 * Turn the raw attribute part of a tag into a well-formed attribute
 * string, dropping everything that is not wanted.
 * The result is safe for further wikitext processing, with escaping of
 * values that could trigger problems.
 *
 * - Attribute names are lowercased
 * - Attributes not on the whitelist for the element are discarded
 * - Broken or invalid entities become plaintext
 * - Every value is double-quoted
 * - An attribute without a value gets its own name as value
 * - Repeated attributes are collapsed into one
 * - Unsafe style attributes are discarded
 * - A space is prepended if any attribute remains
 *
 * @param string $text
 * @param string $element
 * @return string
 */
static function fixTagAttributes( $text, $element ) {
	if( trim( $text ) == '' ) {
		return '';
	}

	$decoded = Sanitizer::decodeTagAttributes( $text );
	$accepted = Sanitizer::validateTagAttributes( $decoded, $element );

	# Each attribute contributes ' name="value"', so the leading space
	# appears exactly when there is at least one attribute.
	$result = '';
	foreach( $accepted as $name => $val ) {
		$encName = htmlspecialchars( $name );
		$encVal = Sanitizer::safeEncodeAttribute( $val );
		$result .= " $encName=\"$encVal\"";
	}
	return $result;
}
656
/**
 * Encode an attribute value for HTML output.
 *
 * Besides the usual HTML escaping, newline, carriage return and tab are
 * turned into numeric character references: whitespace is normalized
 * during attribute decoding, so these characters would not be preserved
 * if they were passed on literally.
 *
 * @param $text
 * @return HTML-encoded text fragment
 */
static function encodeAttribute( $text ) {
	return str_replace(
		array( "\n", "\r", "\t" ),
		array( '&#10;', '&#13;', '&#9;' ),
		htmlspecialchars( $text ) );
}
676
/**
 * Encode an attribute value for HTML tags, with extra armoring
 * against further wiki processing.
 *
 * On top of Sanitizer::encodeAttribute(), character sequences that
 * later parsing stages could expand (templates, links, magic links,
 * table pipes, double underscores, URL protocols) are broken up with
 * numeric character references.
 *
 * @param $text
 * @return HTML-encoded text fragment
 */
static function safeEncodeAttribute( $text ) {
	# Templates and links may be expanded in later parsing,
	# creating invalid or dangerous output. Suppress this.
	static $armor = array(
		'<'    => '&lt;',   // This should never happen,
		'>'    => '&gt;',   // we've received invalid input
		'"'    => '&quot;', // which should have been escaped.
		'{'    => '&#123;',
		'['    => '&#91;',
		"''"   => '&#39;&#39;',
		'ISBN' => '&#73;SBN',
		'RFC'  => '&#82;FC',
		'PMID' => '&#80;MID',
		'|'    => '&#124;',
		'__'   => '&#95;_',
	);

	$armored = strtr( Sanitizer::encodeAttribute( $text ), $armor );

	# Stupid hack: defuse the colon of anything that looks like a URL protocol
	$protocolPattern = '/(' . wfUrlProtocols() . ')/';
	return preg_replace_callback(
		$protocolPattern,
		array( 'Sanitizer', 'armorLinksCallback' ),
		$armored );
}
709
/**
 * Escape a value so that it can be used in an id attribute. The value
 * is not validated, however (see first link).
 *
 * Spaces become underscores, character references are decoded, and the
 * result is URL-encoded with '%3A' turned back into ':' and every
 * remaining '%' turned into '.'.
 *
 * @link http://www.w3.org/TR/html401/types.html#type-name Valid characters
 *                                                          in the id and
 *                                                          name attributes
 * @link http://www.w3.org/TR/html401/struct/links.html#h-12.2.3 Anchors with the id attribute
 *
 * @static
 *
 * @param string $id
 * @return string
 */
static function escapeId( $id ) {
	$underscored = strtr( $id, ' ', '_' );
	$encoded = urlencode( Sanitizer::decodeCharReferences( $underscored ) );

	# Order matters: restore colons before the generic '%' replacement
	return str_replace( array( '%3A', '%' ), array( ':', '.' ), $encoded );
}
734
/**
 * Escape a value so that it can be used as a CSS class.
 *
 * A leading digit or hyphen, control characters, spaces, ASCII
 * punctuation other than '-' and '_', and the UTF-8 non-breaking space
 * are replaced by underscores; runs of underscores are then collapsed
 * and trailing underscores removed.
 *
 * @todo For extra validity, input should be validated UTF-8.
 *
 * @link http://www.w3.org/TR/CSS21/syndata.html Valid characters/format
 *
 * @param string $class
 * @return string
 */
static function escapeClass( $class ) {
	// Convert ugly stuff to underscores...
	$escaped = preg_replace(
		'/(^[0-9\\-])|[\\x00-\\x20!"#$%&\'()*+,.\\/:;<=>?@[\\]^`{|}~]|\\xC2\\xA0/',
		'_',
		$class );
	// ...and kill underscores in ugly places
	$escaped = preg_replace( '/_+/', '_', $escaped );
	return rtrim( $escaped, '_' );
}
753
/**
 * Regex replace callback for armoring links against further processing:
 * every ':' in the first captured group is replaced by '&#58;'.
 * @param array $matches
 * @return string
 * @private
 */
private static function armorLinksCallback( $matches ) {
	$protocol = $matches[1];
	return implode( '&#58;', explode( ':', $protocol ) );
}
763
/**
 * Parse the attribute part of a tag into an associative array of
 * attribute names and values. Attribute names are forced to lowercase,
 * whitespace in values is collapsed and trimmed, and character
 * references are decoded to UTF-8 text. When a name occurs more than
 * once, the last occurrence wins.
 *
 * @param string $text
 * @return array
 */
static function decodeTagAttributes( $text ) {
	$result = array();
	$matches = array();

	$found = trim( $text ) != ''
		&& preg_match_all( MW_ATTRIBS_REGEX, $text, $matches, PREG_SET_ORDER );
	if( !$found ) {
		# Blank input or nothing that looks like an attribute
		return $result;
	}

	foreach( $matches as $match ) {
		$name = strtolower( $match[1] );
		$raw = Sanitizer::getTagAttributeCallback( $match );

		// Normalize whitespace
		$collapsed = trim( preg_replace( '/[\t\r\n ]+/', ' ', $raw ) );

		// Decode character references
		$result[$name] = Sanitizer::decodeCharReferences( $collapsed );
	}
	return $result;
}
801
/**
 * Pick the appropriate attribute value from a match set from the
 * MW_ATTRIBS_REGEX matches.
 *
 * The value groups are tried from last to first: unquoted #colour (6),
 * unquoted (5), single-quoted (4), double-quoted (3). A match without
 * any "= value" part yields the attribute name itself.
 *
 * @param array $set
 * @return string
 * @throws MWException if a value part matched without any value group
 * @private
 */
private static function getTagAttributeCallback( $set ) {
	for( $group = 6; $group >= 3; $group-- ) {
		if( isset( $set[$group] ) ) {
			return $set[$group];
		}
	}

	if( isset( $set[2] ) ) {
		throw new MWException( "Tag conditions not met. This should never happen and is a bug." );
	}

	# In XHTML, attributes must have a value.
	# For 'reduced' form, return explicitly the attribute name here.
	return $set[1];
}
831
/**
 * Normalize whitespace and character references in an XML source-
 * encoded text for an attribute value.
 *
 * Character references are normalized first; then every CR LF pair and
 * every single space, CR, LF or tab becomes one space; finally double
 * quotes are replaced by &quot;.
 *
 * See http://www.w3.org/TR/REC-xml/#AVNormalize for background,
 * but note that we're not returning the value, but are returning
 * XML source fragments that will be slapped into output.
 *
 * @param string $text
 * @return string
 * @private
 */
private static function normalizeAttributeValue( $text ) {
	$normalized = Sanitizer::normalizeCharReferences( $text );
	$spaced = preg_replace( '/\r\n|[\x20\x0d\x0a\x09]/', ' ', $normalized );
	return str_replace( '"', '&quot;', $spaced );
}
851
/**
 * Ensure that any entities and character references are legal
 * for XML and XHTML specifically. Any stray bits will be
 * &amp;-escaped to result in a valid text fragment.
 *
 * a. any named char refs must be known in XHTML
 * b. any numeric char refs must be legal chars, not invalid or forbidden
 * c. use &#x, not &#X
 * d. fix or reject non-valid attributes
 *
 * Every match of MW_CHAR_REFS_REGEX is handed to
 * Sanitizer::normalizeCharReferencesCallback().
 *
 * @param string $text
 * @return string
 * @private
 */
static function normalizeCharReferences( $text ) {
	$handler = array( 'Sanitizer', 'normalizeCharReferencesCallback' );
	return preg_replace_callback( MW_CHAR_REFS_REGEX, $handler, $text );
}
/**
 * Regex replace callback for normalizeCharReferences().
 *
 * Dispatches on which group of MW_CHAR_REFS_REGEX matched: a named
 * entity, a decimal reference, or a hex reference. If none matched (a
 * bare ampersand) or the reference is rejected, the matched text is
 * returned HTML-escaped.
 *
 * @param array $matches
 * @return string
 */
static function normalizeCharReferencesCallback( $matches ) {
	if( $matches[1] != '' ) {
		$normalized = Sanitizer::normalizeEntity( $matches[1] );
	} elseif( $matches[2] != '' ) {
		$normalized = Sanitizer::decCharReference( $matches[2] );
	} elseif( $matches[3] != '' ) {
		$normalized = Sanitizer::hexCharReference( $matches[3] );
	} elseif( $matches[4] != '' ) {
		$normalized = Sanitizer::hexCharReference( $matches[4] );
	} else {
		$normalized = null;
	}

	return is_null( $normalized )
		? htmlspecialchars( $matches[0] )
		: $normalized;
}
893
/**
 * Return the named entity reference unchanged if the name is listed in
 * $wgHtmlEntities. Otherwise return the pseudo-entity source with its
 * ampersand escaped (eg &amp;foo;)
 *
 * @param string $name entity name without & and ;
 * @return string
 * @static
 */
static function normalizeEntity( $name ) {
	global $wgHtmlEntities;
	$prefix = isset( $wgHtmlEntities[$name] ) ? '&' : '&amp;';
	return "$prefix$name;";
}
911
/**
 * Normalize a decimal numeric character reference.
 *
 * @param string $codepoint decimal digits of the reference
 * @return mixed '&#N;' for a codepoint accepted by
 *               Sanitizer::validateCodepoint(), null otherwise
 */
static function decCharReference( $codepoint ) {
	$point = intval( $codepoint );
	if( !Sanitizer::validateCodepoint( $point ) ) {
		return null;
	}
	return sprintf( '&#%d;', $point );
}
920
/**
 * Normalize a hexadecimal numeric character reference.
 *
 * @param string $codepoint hex digits of the reference
 * @return mixed '&#xH;' with lower-case hex digits for a codepoint
 *               accepted by Sanitizer::validateCodepoint(), null otherwise
 */
static function hexCharReference( $codepoint ) {
	$point = hexdec( $codepoint );
	if( !Sanitizer::validateCodepoint( $point ) ) {
		return null;
	}
	return sprintf( '&#x%x;', $point );
}
929
/**
 * Returns true if a given Unicode codepoint is a valid character in XML.
 *
 * Accepted are tab, line feed, carriage return, and the ranges
 * U+0020-U+D7FF, U+E000-U+FFFD and U+10000-U+10FFFF.
 *
 * @param int $codepoint
 * @return bool
 */
private static function validateCodepoint( $codepoint ) {
	# The three permitted control characters
	if( $codepoint == 0x09 || $codepoint == 0x0a || $codepoint == 0x0d ) {
		return true;
	}
	# Up to, but not including, the surrogate block
	if( $codepoint >= 0x20 && $codepoint <= 0xd7ff ) {
		return true;
	}
	# Rest of the BMP without U+FFFE and U+FFFF
	if( $codepoint >= 0xe000 && $codepoint <= 0xfffd ) {
		return true;
	}
	# Supplementary planes
	return $codepoint >= 0x10000 && $codepoint <= 0x10ffff;
}
943
/**
 * Decode any character references, numeric or named entities,
 * in the text and return a UTF-8 string.
 *
 * Every match of MW_CHAR_REFS_REGEX is handed to
 * Sanitizer::decodeCharReferencesCallback().
 *
 * @param string $text
 * @return string
 * @public
 * @static
 */
public static function decodeCharReferences( $text ) {
	$handler = array( 'Sanitizer', 'decodeCharReferencesCallback' );
	return preg_replace_callback( MW_CHAR_REFS_REGEX, $handler, $text );
}
959
/**
 * Regex replace callback for decodeCharReferences().
 *
 * Dispatches on which group of MW_CHAR_REFS_REGEX matched: a named
 * entity, a decimal reference, or a hex reference. If none matched, the
 * matched text is returned unchanged.
 *
 * @param array $matches
 * @return string
 */
static function decodeCharReferencesCallback( $matches ) {
	if( $matches[1] != '' ) {
		return Sanitizer::decodeEntity( $matches[1] );
	}
	if( $matches[2] != '' ) {
		return Sanitizer::decodeChar( intval( $matches[2] ) );
	}
	# Groups 3 and 4 hold the digits of &#x...; and &#X...; respectively
	foreach( array( 3, 4 ) as $hexGroup ) {
		if( $matches[$hexGroup] != '' ) {
			return Sanitizer::decodeChar( hexdec( $matches[$hexGroup] ) );
		}
	}
	# Last case should be an ampersand by itself
	return $matches[0];
}
977
/**
 * Return UTF-8 string for a codepoint if that is a valid
 * character reference, otherwise U+FFFD REPLACEMENT CHARACTER.
 * Validity is decided by Sanitizer::validateCodepoint().
 * @param int $codepoint
 * @return string
 * @private
 */
static function decodeChar( $codepoint ) {
	return Sanitizer::validateCodepoint( $codepoint )
		? codepointToUtf8( $codepoint )
		: UTF8_REPLACEMENT;
}
992
/**
 * Return the UTF-8 encoding of the character a named entity stands for
 * if the name is listed in $wgHtmlEntities. Otherwise return the
 * pseudo-entity source (eg &foo;)
 *
 * @param string $name entity name without & and ;
 * @return string
 */
static function decodeEntity( $name ) {
	global $wgHtmlEntities;
	if( !isset( $wgHtmlEntities[$name] ) ) {
		# Unknown name: hand the source text back untouched
		return "&$name;";
	}
	return codepointToUtf8( $wgHtmlEntities[$name] );
}
1009
/**
 * Fetch the whitelist of acceptable attributes for a given
 * element name.
 *
 * The table from Sanitizer::setupAttributeWhitelist() is built on the
 * first call and kept for later ones. An element that is not in the
 * table gets an empty list.
 *
 * @param string $element
 * @return array
 */
static function attributeWhitelist( $element ) {
	static $whitelists;
	if( !isset( $whitelists ) ) {
		$whitelists = Sanitizer::setupAttributeWhitelist();
	}
	if( isset( $whitelists[$element] ) ) {
		return $whitelists[$element];
	}
	return array();
}
1026
/**
 * Build the table of attributes accepted on each element.
 *
 * @return array element name => list of allowed attribute names
 */
static function setupAttributeWhitelist() {
	# Attribute groups shared between elements
	$common = array( 'id', 'class', 'lang', 'dir', 'title', 'style' );
	$block = array_merge( $common, array( 'align' ) );
	$tablealign = array( 'align', 'char', 'charoff', 'valign' );
	$tablecell = array(
		'abbr', 'axis', 'headers', 'scope', 'rowspan', 'colspan',
		'nowrap', 'width', 'height', 'bgcolor' # the last four are deprecated
	);

	$edit = array_merge( $common, array( 'cite', 'datetime' ) );
	$rowgroup = array_merge( $common, $tablealign );
	$column = array_merge( $common, array( 'span', 'width' ), $tablealign );
	$cell = array_merge( $common, $tablecell, $tablealign );

	# Numbers refer to sections in HTML 4.01 standard describing the element.
	# See: http://www.w3.org/TR/html4/
	$whitelist = array();

	# 7.5.4
	$whitelist['div'] = $block;
	$whitelist['center'] = $common; # deprecated
	$whitelist['span'] = $block; # ??

	# 7.5.5
	foreach( array( 'h1', 'h2', 'h3', 'h4', 'h5', 'h6' ) as $heading ) {
		$whitelist[$heading] = $block;
	}

	# 7.5.6 address and 8.2.4 bdo are not supported

	# 9.2.1 (dfn, samp, kbd, abbr and acronym are not supported)
	foreach( array( 'em', 'strong', 'cite', 'code', 'var' ) as $phrase ) {
		$whitelist[$phrase] = $common;
	}

	# 9.2.2 (q is not supported)
	$whitelist['blockquote'] = array_merge( $common, array( 'cite' ) );

	# 9.2.3
	$whitelist['sub'] = $common;
	$whitelist['sup'] = $common;

	# 9.3.1
	$whitelist['p'] = $block;

	# 9.3.2
	$whitelist['br'] = array( 'id', 'class', 'title', 'style', 'clear' );

	# 9.3.4
	$whitelist['pre'] = array_merge( $common, array( 'width' ) );

	# 9.4
	$whitelist['ins'] = $edit;
	$whitelist['del'] = $edit;

	# 10.2
	$whitelist['ul'] = array_merge( $common, array( 'type' ) );
	$whitelist['ol'] = array_merge( $common, array( 'type', 'start' ) );
	$whitelist['li'] = array_merge( $common, array( 'type', 'value' ) );

	# 10.3
	$whitelist['dl'] = $common;
	$whitelist['dd'] = $common;
	$whitelist['dt'] = $common;

	# 11.2.1
	$whitelist['table'] = array_merge( $common, array(
		'summary', 'width', 'border', 'frame',
		'rules', 'cellspacing', 'cellpadding',
		'align', 'bgcolor',
	) );

	# 11.2.2
	$whitelist['caption'] = array_merge( $common, array( 'align' ) );

	# 11.2.3
	$whitelist['thead'] = $rowgroup;
	$whitelist['tfoot'] = $rowgroup;
	$whitelist['tbody'] = $rowgroup;

	# 11.2.4
	$whitelist['colgroup'] = $column;
	$whitelist['col'] = $column;

	# 11.2.5
	$whitelist['tr'] = array_merge( $common, array( 'bgcolor' ), $tablealign );

	# 11.2.6
	$whitelist['td'] = $cell;
	$whitelist['th'] = $cell;

	# 15.2.1
	foreach( array( 'tt', 'b', 'i', 'big', 'small', 'strike', 's', 'u' ) as $fontStyle ) {
		$whitelist[$fontStyle] = $common;
	}

	# 15.2.2 (basefont is not supported)
	$whitelist['font'] = array_merge( $common, array( 'size', 'color', 'face' ) );

	# 15.3
	$whitelist['hr'] = array_merge( $common, array( 'noshade', 'size', 'width' ) );

	# XHTML Ruby annotation text module, simple ruby only.
	# http://www.w3c.org/TR/ruby/
	# (rbc and rtc are not supported; rt does not get 'rbspan')
	$whitelist['ruby'] = $common;
	$whitelist['rb'] = $common;
	$whitelist['rt'] = $common;
	$whitelist['rp'] = $common;

	return $whitelist;
}
1166
/**
 * Take a fragment of (potentially invalid) HTML and return
 * a version with any tags removed, encoded suitably for literal
 * inclusion in an attribute value.
 *
 * @param string $text HTML fragment
 * @return string
 */
static function stripAllTags( $text ) {
	# Drop everything from a '<' up to the matching '>'
	$withoutTags = StringUtils::delimiterReplace( '<', '>', '', $text );

	# Normalize &entities and whitespace
	$normalized = Sanitizer::normalizeAttributeValue( $withoutTags );

	# The result goes into "double-quoted" attributes, so escape
	# whatever markup characters are still left.
	return str_replace(
		array( '<', '>', '"' ),
		array( '&lt;', '&gt;', '&quot;' ),
		$normalized );
}
1191
/**
 * Hack up a private DOCTYPE with HTML's standard entity declarations.
 * PHP 4 seemed to know these if you gave it an HTML doctype, but
 * PHP 5.1 doesn't.
 *
 * One <!ENTITY> declaration is written for every entry of
 * $wgHtmlEntities.
 *
 * Use for passing XHTML fragments to PHP's XML parsing functions
 *
 * @return string
 * @static
 */
static function hackDocType() {
	global $wgHtmlEntities;
	$declarations = '';
	foreach( $wgHtmlEntities as $entity => $codepoint ) {
		$declarations .= '<!ENTITY ' . $entity . ' "&#' . $codepoint . ';">';
	}
	return "<!DOCTYPE html [\n" . $declarations . "]>\n";
}
1211
/**
 * Clean up a URL for use in an external link.
 *
 * Character references are decoded; square brackets, angle brackets,
 * double quotes, control characters and spaces are then URL-encoded.
 * If the result has the shape "protocol:[//host]rest", characters that
 * are ignored in internationalized domain names are removed from the
 * host part.
 *
 * @param string $url
 * @param bool $hostname not used by this method
 * @return string
 */
static function cleanUrl( $url, $hostname=true ) {
	# Normalize any HTML entities in input. They will be
	# re-escaped by makeExternalLink().
	$url = Sanitizer::decodeCharReferences( $url );

	# Escape any control characters introduced by the above step.
	# A callback is used instead of the /e regex modifier: /e is not
	# available in current PHP versions, and it passed the match through
	# addslashes() first, so a double quote came out as %5C%22.
	$url = preg_replace_callback( '/[\][<>"\\x00-\\x20\\x7F]/',
		array( 'Sanitizer', 'cleanUrlCallback' ), $url );

	# Validate hostname portion
	$matches = array();
	if( preg_match( '!^([^:]+:)(//[^/]+)?(.*)$!iD', $url, $matches ) ) {
		list( /* $whole */, $protocol, $host, $rest ) = $matches;

		// Characters that will be ignored in IDNs.
		// http://tools.ietf.org/html/3454#section-3.1
		// Strip them before further processing so blacklists and such work.
		$strip = "/
			\\s|          # general whitespace
			\xc2\xad|     # 00ad SOFT HYPHEN
			\xe1\xa0\x86| # 1806 MONGOLIAN TODO SOFT HYPHEN
			\xe2\x80\x8b| # 200b ZERO WIDTH SPACE
			\xe2\x81\xa0| # 2060 WORD JOINER
			\xef\xbb\xbf| # feff ZERO WIDTH NO-BREAK SPACE
			\xcd\x8f|     # 034f COMBINING GRAPHEME JOINER
			\xe1\xa0\x8b| # 180b MONGOLIAN FREE VARIATION SELECTOR ONE
			\xe1\xa0\x8c| # 180c MONGOLIAN FREE VARIATION SELECTOR TWO
			\xe1\xa0\x8d| # 180d MONGOLIAN FREE VARIATION SELECTOR THREE
			\xe2\x80\x8c| # 200c ZERO WIDTH NON-JOINER
			\xe2\x80\x8d| # 200d ZERO WIDTH JOINER
			[\xef\xb8\x80-\xef\xb8\x8f] # fe00-fe00f VARIATION SELECTOR-1-16
			/xuD";

		$host = preg_replace( $strip, '', $host );

		// @fixme: validate hostnames here

		return $protocol . $host . $rest;
	} else {
		return $url;
	}
}

/**
 * Regex replace callback for cleanUrl(): URL-encodes the matched character.
 * @param array $matches
 * @return string
 * @private
 */
private static function cleanUrlCallback( $matches ) {
	return urlencode( $matches[0] );
}
1253
1254 }
1255
1256 ?>