40016d93b2d144f505ba1ec6e924615ed5da0e6c
[lhc/web/wiklou.git] / includes / Sanitizer.php
1 <?php
2
3 /**
4 * (X)HTML sanitizer for MediaWiki
5 *
6 * Copyright (C) 2002-2005 Brion Vibber <brion@pobox.com> et al
7 * http://www.mediawiki.org/
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write to the Free Software Foundation, Inc.,
21 * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
22 * http://www.gnu.org/copyleft/gpl.html
23 *
24 * @package MediaWiki
25 * @subpackage Parser
26 */
27
/**
 * Regular expression to match various types of character references in
 * Sanitizer::normalizeCharReferences and Sanitizer::decodeCharReferences.
 *
 * Capture groups:
 *  1. name of a named entity reference (&name;)
 *  2. digits of a decimal reference (&#123;)
 *  3. digits of a hexadecimal reference with a lowercase prefix (&#x7b;)
 *  4. digits of a hexadecimal reference with an uppercase prefix (&#X7B;)
 *  5. an ampersand that does not start any of the above
 *
 * The hexadecimal groups accept real hex digits only. hexdec() silently
 * skips characters it does not understand, so letting g-z through would
 * make a malformed reference such as &#xG41; resolve to U+0041.
 */
define( 'MW_CHAR_REFS_REGEX',
	'/&([A-Za-z0-9]+);
	 |&\#([0-9]+);
	 |&\#x([0-9A-Fa-f]+);
	 |&\#X([0-9A-Fa-f]+);
	 |(&)/x' );
38
/**
 * Regular expression to match HTML/XML attribute pairs within a tag.
 * Allows some... latitude.
 * Used in Sanitizer::fixTagAttributes and Sanitizer::decodeTagAttributes
 *
 * Capture groups:
 *  1. attribute name
 *  2. the whole "= value" part, absent for a valueless attribute
 *  3. double-quoted value
 *  4. single-quoted value
 *  5. unquoted value
 *  6. unquoted #hex colour value
 *
 * The pattern is compiled with the x (extended) flag, so the line breaks
 * and the '#' comments embedded in it are ignored by PCRE.
 */
$attrib = '[A-Za-z0-9]';
$space = '[\x09\x0a\x0d\x20]';
define( 'MW_ATTRIBS_REGEX', implode( "\n", array(
	'/(?:^|' . $space . ')(' . $attrib . '+)',
	'(' . $space . '*=' . $space . '*',
	'(?:',
	'# The attribute value: quoted or alone',
	'"([^<"]*)"',
	'| \'([^<\']*)\'',
	'| ([a-zA-Z0-9!#$%&()*,\\-.\\/:;<>?@[\\]^_`{|}~]+)',
	'| (\\#[0-9a-fA-F]+) # Technically wrong, but lots of',
	'# colors are specified like this.',
	'# We\'ll be normalizing it.',
	')',
	')?(?=' . $space . '|$)/sx'
) ) );
59
/**
 * Named character entities defined in HTML 4.01, mapped from entity name
 * (without the leading '&' and trailing ';') to Unicode code point.
 * Names are case-sensitive.
 * http://www.w3.org/TR/html4/sgml/entities.html
 * @access private
 */
global $wgHtmlEntities;
$wgHtmlEntities = array(
	'Aacute' => 193,
	'aacute' => 225,
	'Acirc' => 194,
	'acirc' => 226,
	'acute' => 180,
	'AElig' => 198,
	'aelig' => 230,
	'Agrave' => 192,
	'agrave' => 224,
	'alefsym' => 8501,
	'Alpha' => 913,
	'alpha' => 945,
	'amp' => 38,
	'and' => 8743,
	'ang' => 8736,
	'Aring' => 197,
	'aring' => 229,
	'asymp' => 8776,
	'Atilde' => 195,
	'atilde' => 227,
	'Auml' => 196,
	'auml' => 228,
	'bdquo' => 8222,
	'Beta' => 914,
	'beta' => 946,
	'brvbar' => 166,
	'bull' => 8226,
	'cap' => 8745,
	'Ccedil' => 199,
	'ccedil' => 231,
	'cedil' => 184,
	'cent' => 162,
	'Chi' => 935,
	'chi' => 967,
	'circ' => 710,
	'clubs' => 9827,
	'cong' => 8773,
	'copy' => 169,
	'crarr' => 8629,
	'cup' => 8746,
	'curren' => 164,
	'dagger' => 8224,
	'Dagger' => 8225,
	'darr' => 8595,
	'dArr' => 8659,
	'deg' => 176,
	'Delta' => 916,
	'delta' => 948,
	'diams' => 9830,
	'divide' => 247,
	'Eacute' => 201,
	'eacute' => 233,
	'Ecirc' => 202,
	'ecirc' => 234,
	'Egrave' => 200,
	'egrave' => 232,
	'empty' => 8709,
	'emsp' => 8195,
	'ensp' => 8194,
	'Epsilon' => 917,
	'epsilon' => 949,
	'equiv' => 8801,
	'Eta' => 919,
	'eta' => 951,
	'ETH' => 208,
	'eth' => 240,
	'Euml' => 203,
	'euml' => 235,
	'euro' => 8364,
	'exist' => 8707,
	'fnof' => 402,
	'forall' => 8704,
	'frac12' => 189,
	'frac14' => 188,
	'frac34' => 190,
	'frasl' => 8260,
	'Gamma' => 915,
	'gamma' => 947,
	'ge' => 8805,
	'gt' => 62,
	'harr' => 8596,
	'hArr' => 8660,
	'hearts' => 9829,
	'hellip' => 8230,
	'Iacute' => 205,
	'iacute' => 237,
	'Icirc' => 206,
	'icirc' => 238,
	'iexcl' => 161,
	'Igrave' => 204,
	'igrave' => 236,
	'image' => 8465,
	'infin' => 8734,
	'int' => 8747,
	'Iota' => 921,
	'iota' => 953,
	'iquest' => 191,
	'isin' => 8712,
	'Iuml' => 207,
	'iuml' => 239,
	'Kappa' => 922,
	'kappa' => 954,
	'Lambda' => 923,
	'lambda' => 955,
	'lang' => 9001,
	'laquo' => 171,
	'larr' => 8592,
	'lArr' => 8656,
	'lceil' => 8968,
	'ldquo' => 8220,
	'le' => 8804,
	'lfloor' => 8970,
	'lowast' => 8727,
	'loz' => 9674,
	'lrm' => 8206,
	'lsaquo' => 8249,
	'lsquo' => 8216,
	'lt' => 60,
	'macr' => 175,
	'mdash' => 8212,
	'micro' => 181,
	'middot' => 183,
	'minus' => 8722,
	'Mu' => 924,
	'mu' => 956,
	'nabla' => 8711,
	'nbsp' => 160,
	'ndash' => 8211,
	'ne' => 8800,
	'ni' => 8715,
	'not' => 172,
	'notin' => 8713,
	'nsub' => 8836,
	'Ntilde' => 209,
	'ntilde' => 241,
	'Nu' => 925,
	'nu' => 957,
	'Oacute' => 211,
	'oacute' => 243,
	'Ocirc' => 212,
	'ocirc' => 244,
	'OElig' => 338,
	'oelig' => 339,
	'Ograve' => 210,
	'ograve' => 242,
	'oline' => 8254,
	'Omega' => 937,
	'omega' => 969,
	'Omicron' => 927,
	'omicron' => 959,
	'oplus' => 8853,
	'or' => 8744,
	'ordf' => 170,
	'ordm' => 186,
	'Oslash' => 216,
	'oslash' => 248,
	'Otilde' => 213,
	'otilde' => 245,
	'otimes' => 8855,
	'Ouml' => 214,
	'ouml' => 246,
	'para' => 182,
	'part' => 8706,
	'permil' => 8240,
	'perp' => 8869,
	'Phi' => 934,
	'phi' => 966,
	'Pi' => 928,
	'pi' => 960,
	'piv' => 982,
	'plusmn' => 177,
	'pound' => 163,
	'prime' => 8242,
	'Prime' => 8243,
	'prod' => 8719,
	'prop' => 8733,
	'Psi' => 936,
	'psi' => 968,
	'quot' => 34,
	'radic' => 8730,
	'rang' => 9002,
	'raquo' => 187,
	'rarr' => 8594,
	'rArr' => 8658,
	'rceil' => 8969,
	'rdquo' => 8221,
	'real' => 8476,
	'reg' => 174,
	'rfloor' => 8971,
	'Rho' => 929,
	'rho' => 961,
	'rlm' => 8207,
	'rsaquo' => 8250,
	'rsquo' => 8217,
	'sbquo' => 8218,
	'Scaron' => 352,
	'scaron' => 353,
	'sdot' => 8901,
	'sect' => 167,
	'shy' => 173,
	'Sigma' => 931,
	'sigma' => 963,
	'sigmaf' => 962,
	'sim' => 8764,
	'spades' => 9824,
	'sub' => 8834,
	'sube' => 8838,
	'sum' => 8721,
	'sup' => 8835,
	'sup1' => 185,
	'sup2' => 178,
	'sup3' => 179,
	'supe' => 8839,
	'szlig' => 223,
	'Tau' => 932,
	'tau' => 964,
	'there4' => 8756,
	'Theta' => 920,
	'theta' => 952,
	'thetasym' => 977,
	'thinsp' => 8201,
	'THORN' => 222,
	'thorn' => 254,
	'tilde' => 732,
	'times' => 215,
	'trade' => 8482,
	'Uacute' => 218,
	'uacute' => 250,
	'uarr' => 8593,
	'uArr' => 8657,
	'Ucirc' => 219,
	'ucirc' => 251,
	'Ugrave' => 217,
	'ugrave' => 249,
	'uml' => 168,
	'upsih' => 978,
	'Upsilon' => 933,
	'upsilon' => 965,
	'Uuml' => 220,
	'uuml' => 252,
	'weierp' => 8472,
	'Xi' => 926,
	'xi' => 958,
	'Yacute' => 221,
	'yacute' => 253,
	'yen' => 165,
	'Yuml' => 376,
	'yuml' => 255,
	'Zeta' => 918,
	'zeta' => 950,
	'zwj' => 8205,
	'zwnj' => 8204 );
319
320 class Sanitizer {
/**
 * Cleans up HTML, removes dangerous tags and attributes, and
 * removes HTML comments.
 *
 * The text is split on '<'. A fragment that parses as a tag whose name is
 * in the allowed set is re-emitted with its attributes filtered through
 * Sanitizer::fixTagAttributes(); any other fragment is emitted with its
 * '<' and '>' escaped. The allowed set is empty unless $wgUserHtml is on.
 *
 * When $wgUseTidy is off, nesting is tracked as well: a closing tag that
 * does not match the innermost open tag, a table cell/row outside a table
 * and a repeated tag that may not nest are all escaped, and tags still
 * open at the end of the text are closed.
 *
 * @access private
 * @param string $text
 * @return string
 */
function removeHTMLtags( $text ) {
	global $wgUseTidy, $wgUserHtml;
	$fname = 'Sanitizer::removeHTMLtags';
	wfProfileIn( $fname );

	if( $wgUserHtml ) {
		$htmlpairs = array( # Tags that must be closed
			'b', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
			'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
			'strike', 'strong', 'tt', 'var', 'div', 'center',
			'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
			'ruby', 'rt' , 'rb' , 'rp', 'p', 'span'
		);
		$htmlsingle = array(
			'br', 'hr', 'li', 'dt', 'dd'
		);
		$htmlnest = array( # Tags that can be nested--??
			'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
			'dl', 'font', 'big', 'small', 'sub', 'sup', 'span'
		);
		$tabletags = array( # Can only appear inside table
			'td', 'th', 'tr'
		);
	} else {
		$htmlpairs = array();
		$htmlsingle = array();
		$htmlnest = array();
		$tabletags = array();
	}

	# Table rows and cells are not tracked on the tag stack either
	$htmlsingle = array_merge( $tabletags, $htmlsingle );
	$htmlelements = array_merge( $htmlsingle, $htmlpairs );

	# Remove HTML comments
	$text = Sanitizer::removeHTMLcomments( $text );

	# Groups: 1 = closing slash, 2 = tag name, 3 = attributes,
	# 4 = closing bracket, 5 = text up to the next '<'
	$tagRegex = '/^(\\/?)(\\w+)([^>]*)(\\/{0,1}>)([^<]*)$/';

	$bits = explode( '<', $text );
	$text = array_shift( $bits );
	if( !$wgUseTidy ) {
		$tagstack = array(); $tablestack = array();
		foreach ( $bits as $x ) {
			# A fragment that is not a tag at all is escaped below; checking
			# the match result avoids reading an empty $regs.
			$isTag = preg_match( $tagRegex, $x, $regs );
			$t = $isTag ? strtolower( $regs[2] ) : '';

			if ( $isTag && in_array( $t, $htmlelements ) ) {
				$slash = $regs[1];
				$params = $regs[3];
				$brace = $regs[4];
				$rest = $regs[5];
				$badtag = 0;
				$newparams = '';

				# Check our stack
				if ( $slash ) {
					# Closing a tag...
					if ( !in_array( $t, $htmlsingle ) ) {
						$ot = array_pop( $tagstack );
						if ( $ot != $t ) {
							# Not the innermost open tag: put it back
							# (unless the stack was empty) and reject.
							if ( !is_null( $ot ) ) {
								array_push( $tagstack, $ot );
							}
							$badtag = 1;
						}
					}
					if ( !$badtag && $t == 'table' ) {
						# Resume the stack that was active before the table
						$tagstack = array_pop( $tablestack );
					}
				} else {
					# Keep track for later
					if ( in_array( $t, $tabletags ) &&
					! in_array( 'table', $tagstack ) ) {
						$badtag = 1;
					} else if ( in_array( $t, $tagstack ) &&
					! in_array ( $t , $htmlnest ) ) {
						$badtag = 1 ;
					} else if ( ! in_array( $t, $htmlsingle ) ) {
						if ( $t == 'table' ) {
							# A table starts a fresh nesting context
							array_push( $tablestack, $tagstack );
							$tagstack = array();
						}
						array_push( $tagstack, $t );
					}
					# Strip non-approved attributes from the tag
					$newparams = Sanitizer::fixTagAttributes( $params, $t );
				}
				if ( ! $badtag ) {
					$rest = str_replace( '>', '&gt;', $rest );
					$text .= "<$slash$t$newparams$brace$rest";
					continue;
				}
			}
			$text .= '&lt;' . str_replace( '>', '&gt;', $x);
		}
		# Close off any remaining tags
		while ( is_array( $tagstack ) && ($t = array_pop( $tagstack )) ) {
			$text .= "</$t>\n";
			if ( $t == 'table' ) { $tagstack = array_pop( $tablestack ); }
		}
	} else {
		# this might be possible using tidy itself
		foreach ( $bits as $x ) {
			$isTag = preg_match( $tagRegex, $x, $regs );
			$t = $isTag ? strtolower( $regs[2] ) : '';
			if ( $isTag && in_array( $t, $htmlelements ) ) {
				$newparams = Sanitizer::fixTagAttributes( $regs[3], $t );
				$rest = str_replace( '>', '&gt;', $regs[5] );
				$text .= "<{$regs[1]}$t$newparams{$regs[4]}$rest";
			} else {
				$text .= '&lt;' . str_replace( '>', '&gt;', $x);
			}
		}
	}
	wfProfileOut( $fname );
	return $text;
}
439
/**
 * Remove '<!--', '-->', and everything between.
 * To avoid leaving blank lines, when a comment is both preceded
 * and followed by a newline (ignoring spaces), trim leading and
 * trailing spaces and one of the newlines.
 *
 * An unterminated comment ('<!--' with no later '-->') is left in
 * place, together with everything after it.
 *
 * @access private
 * @param string $text
 * @return string
 */
function removeHTMLcomments( $text ) {
	$fname = 'Sanitizer::removeHTMLcomments';
	wfProfileIn( $fname );
	while ( ( $start = strpos( $text, '<!--' ) ) !== false ) {
		$end = strpos( $text, '-->', $start + 4 );
		if ( $end === false ) {
			# Unterminated comment; bail out
			break;
		}

		# Make $end point just past the closing '-->'
		$end += 3;

		# Trim space and newline if the comment is both
		# preceded and followed by a newline.
		# $spaceStart walks left over spaces before the comment,
		# $spaceLen grows right over spaces after it.
		$spaceStart = max( $start - 1, 0 );
		$spaceLen = $end - $spaceStart;
		while ( substr( $text, $spaceStart, 1 ) === ' ' && $spaceStart > 0 ) {
			$spaceStart--;
			$spaceLen++;
		}
		while ( substr( $text, $spaceStart + $spaceLen, 1 ) === ' ' ) {
			$spaceLen++;
		}

		$newlineBefore = substr( $text, $spaceStart, 1 ) === "\n";
		$newlineAfter = substr( $text, $spaceStart + $spaceLen, 1 ) === "\n";
		if ( $newlineBefore && $newlineAfter ) {
			# Remove the comment, leading and trailing
			# spaces, and leave only one newline.
			$text = substr_replace( $text, "\n", $spaceStart, $spaceLen + 1 );
		} else {
			# Remove just the comment.
			$text = substr_replace( $text, '', $start, $end - $start );
		}
	}
	wfProfileOut( $fname );
	return $text;
}
485
/**
 * Take a tag soup fragment listing an HTML element's attributes
 * and normalize it to well-formed XML, discarding unwanted attributes.
 *
 * - Attribute names are lowercased
 * - Attributes not whitelisted for the element are dropped
 * - Values go through Sanitizer::normalizeAttributeValue() and are
 *   always emitted double-quoted
 * - A valueless attribute gets its own name as value
 * - Only the first accepted occurrence of an attribute is kept
 * - A style attribute whose decoded value matches the unsafe-CSS
 *   pattern below is dropped
 * - The result starts with a space when any attribute survives
 *
 * @param string $text
 * @param string $element
 * @return string
 *
 * @todo Check for legal values where the DTD limits things.
 * @todo Check for unique id attribute :P
 */
function fixTagAttributes( $text, $element ) {
	if( trim( $text ) == '' ) {
		return '';
	}

	$found = preg_match_all( MW_ATTRIBS_REGEX, $text, $pairs, PREG_SET_ORDER );
	if( !$found ) {
		return '';
	}

	# Flipped so membership is a key lookup
	$allowed = array_flip( Sanitizer::attributeWhitelist( $element ) );
	$kept = array();
	foreach( $pairs as $match ) {
		$name = strtolower( $match[1] );
		if( !isset( $allowed[$name] ) ) {
			continue;
		}

		$value = Sanitizer::normalizeAttributeValue(
			Sanitizer::getTagAttributeCallback( $match ) );

		if( $name == 'style' ) {
			# Strip javascript "expression" from stylesheets.
			# http://msdn.microsoft.com/workshop/author/dhtml/overview/recalc.asp
			$css = Sanitizer::decodeCharReferences( $value );
			if( preg_match( '/(expression|tps*:\/\/|url\\s*\().*/is', $css ) ) {
				# haxx0r
				continue;
			}
		}

		# First occurrence wins
		if( !isset( $kept[$name] ) ) {
			$kept[$name] = "$name=\"$value\"";
		}
	}

	return empty( $kept ) ? '' : ' ' . implode( ' ', $kept );
}
552
/**
 * Return an associative array of attribute names and values from
 * a partial tag string. Attribute names are forced to lowercase,
 * character references are decoded to UTF-8 text. When a name occurs
 * more than once, the last occurrence is the one returned.
 *
 * @param string $text
 * @return array
 */
function decodeTagAttributes( $text ) {
	$decoded = array();

	if( trim( $text ) != ''
		&& preg_match_all( MW_ATTRIBS_REGEX, $text, $pairs, PREG_SET_ORDER ) ) {
		foreach( $pairs as $match ) {
			$name = strtolower( $match[1] );
			$raw = Sanitizer::getTagAttributeCallback( $match );
			$decoded[$name] = Sanitizer::decodeCharReferences( $raw );
		}
	}

	return $decoded;
}
583
/**
 * Pick the appropriate attribute value from a match set from the
 * MW_ATTRIBS_REGEX matches.
 *
 * The highest-numbered group present in the set is the one that matched:
 * 6 = unquoted #XXXXXX colour, 5 = unquoted, 4 = single-quoted,
 * 3 = double-quoted. A set with no "= value" part (group 2 absent)
 * yields the attribute name itself.
 *
 * @param array $set
 * @return string
 * @access private
 */
function getTagAttributeCallback( $set ) {
	for( $group = 6; $group >= 3; $group-- ) {
		if( isset( $set[$group] ) ) {
			return $set[$group];
		}
	}

	if( isset( $set[2] ) ) {
		# A "= value" part without any value group
		wfDebugDieBacktrace( "Tag conditions not met. This should never happen and is a bug." );
	} else {
		# In XHTML, attributes must have a value.
		# For 'reduced' form, return explicitly the attribute name here.
		return $set[1];
	}
}
613
/**
 * Normalize whitespace and character references in an XML source-
 * encoded text for an attribute value.
 *
 * Character references are normalized first, then every CR LF pair and
 * every single space, CR, LF or tab becomes one space, and finally each
 * double quote is replaced by &quot;.
 *
 * See http://www.w3.org/TR/REC-xml/#AVNormalize for background,
 * but note that we're not returning the value, but are returning
 * XML source fragments that will be slapped into output.
 *
 * @param string $text
 * @return string
 * @access private
 */
function normalizeAttributeValue( $text ) {
	$withRefs = Sanitizer::normalizeCharReferences( $text );
	$spaced = preg_replace( '/\r\n|[\x20\x0d\x0a\x09]/', ' ', $withRefs );
	return str_replace( '"', '&quot;', $spaced );
}
633
/**
 * Ensure that any entities and character references are legal
 * for XML and XHTML specifically. Any stray bits will be
 * &amp;-escaped to result in a valid text fragment.
 *
 * a. any named char refs must be known in XHTML
 * b. any numeric char refs must be legal chars, not invalid or forbidden
 * c. use &#x, not &#X
 * d. fix or reject non-valid attributes
 *
 * Each match of MW_CHAR_REFS_REGEX is rewritten by
 * Sanitizer::normalizeCharReferencesCallback().
 *
 * @param string $text
 * @return string
 * @access private
 */
function normalizeCharReferences( $text ) {
	$rewrite = array( 'Sanitizer', 'normalizeCharReferencesCallback' );
	return preg_replace_callback( MW_CHAR_REFS_REGEX, $rewrite, $text );
}
/**
 * Rewrite one MW_CHAR_REFS_REGEX match into its normalized form.
 * A match that no handler accepts (an invalid numeric reference or a
 * bare ampersand) is returned HTML-escaped.
 *
 * @param array $matches match groups from MW_CHAR_REFS_REGEX
 * @return string
 */
function normalizeCharReferencesCallback( $matches ) {
	if( $matches[1] != '' ) {
		$normalized = Sanitizer::normalizeEntity( $matches[1] );
	} elseif( $matches[2] != '' ) {
		$normalized = Sanitizer::decCharReference( $matches[2] );
	} elseif( $matches[3] != '' ) {
		$normalized = Sanitizer::hexCharReference( $matches[3] );
	} elseif( $matches[4] != '' ) {
		$normalized = Sanitizer::hexCharReference( $matches[4] );
	} else {
		$normalized = null;
	}

	return is_null( $normalized )
		? htmlspecialchars( $matches[0] )
		: $normalized;
}
675
/**
 * If the named entity is listed in $wgHtmlEntities, return the named
 * entity reference as is. Otherwise, return the pseudo-entity source
 * with its ampersand escaped (eg &amp;foo;).
 *
 * @param string $name entity name without '&' and ';'
 * @return string
 */
function normalizeEntity( $name ) {
	global $wgHtmlEntities;
	$lead = isset( $wgHtmlEntities[$name] ) ? '&' : '&amp;';
	return $lead . $name . ';';
}
692
/**
 * Normalize the digits of a decimal character reference.
 *
 * @param string $codepoint decimal digits
 * @return string|null '&#N;' source, or null when
 *   Sanitizer::validateCodepoint() rejects the code point
 */
function decCharReference( $codepoint ) {
	$point = intval( $codepoint );
	return Sanitizer::validateCodepoint( $point )
		? sprintf( '&#%d;', $point )
		: null;
}
701
/**
 * Normalize the digits of a hexadecimal character reference. The result
 * always uses the lowercase '&#x' prefix and lowercase hex digits.
 *
 * @param string $codepoint hexadecimal digits
 * @return string|null '&#xN;' source, or null when
 *   Sanitizer::validateCodepoint() rejects the code point
 */
function hexCharReference( $codepoint ) {
	$point = hexdec( $codepoint );
	return Sanitizer::validateCodepoint( $point )
		? sprintf( '&#x%x;', $point )
		: null;
}
710
/**
 * Returns true if a given Unicode codepoint is a valid character in XML:
 * tab, line feed, carriage return, or anything in the ranges
 * 0x20-0xd7ff, 0xe000-0xfffd and 0x10000-0x10ffff.
 *
 * @param int $codepoint
 * @return bool
 */
function validateCodepoint( $codepoint ) {
	# The three permitted control characters
	if( $codepoint == 0x09 || $codepoint == 0x0a || $codepoint == 0x0d ) {
		return true;
	}

	# Inclusive ranges; the gaps exclude surrogates, 0xfffe and 0xffff
	$ranges = array(
		array( 0x20, 0xd7ff ),
		array( 0xe000, 0xfffd ),
		array( 0x10000, 0x10ffff )
	);
	foreach( $ranges as $range ) {
		if( $codepoint >= $range[0] && $codepoint <= $range[1] ) {
			return true;
		}
	}
	return false;
}
724
/**
 * Decode any character references, numeric or named entities,
 * in the text and return a UTF-8 string. Each match of
 * MW_CHAR_REFS_REGEX is replaced by the result of
 * Sanitizer::decodeCharReferencesCallback().
 *
 * @param string $text
 * @return string
 * @access public
 */
function decodeCharReferences( $text ) {
	$decode = array( 'Sanitizer', 'decodeCharReferencesCallback' );
	return preg_replace_callback( MW_CHAR_REFS_REGEX, $decode, $text );
}
739
/**
 * Decode one MW_CHAR_REFS_REGEX match: named entities go through
 * Sanitizer::decodeEntity(), numeric references through
 * Sanitizer::decodeChar(). Anything else is returned unchanged.
 *
 * @param array $matches match groups from MW_CHAR_REFS_REGEX
 * @return string
 */
function decodeCharReferencesCallback( $matches ) {
	if( $matches[1] != '' ) {
		$decoded = Sanitizer::decodeEntity( $matches[1] );
	} elseif( $matches[2] != '' ) {
		$decoded = Sanitizer::decodeChar( intval( $matches[2] ) );
	} elseif( $matches[3] != '' ) {
		$decoded = Sanitizer::decodeChar( hexdec( $matches[3] ) );
	} elseif( $matches[4] != '' ) {
		$decoded = Sanitizer::decodeChar( hexdec( $matches[4] ) );
	} else {
		# Last case should be an ampersand by itself
		$decoded = $matches[0];
	}
	return $decoded;
}
757
/**
 * Return UTF-8 string for a codepoint if that is a valid
 * character reference, otherwise U+FFFD REPLACEMENT CHARACTER.
 * Validity is decided by Sanitizer::validateCodepoint().
 *
 * @param int $codepoint
 * @return string
 * @access private
 */
function decodeChar( $codepoint ) {
	if( !Sanitizer::validateCodepoint( $codepoint ) ) {
		return UTF8_REPLACEMENT;
	}
	return codepointToUtf8( $codepoint );
}
772
/**
 * If the named entity is listed in $wgHtmlEntities, return the UTF-8
 * encoding of its code point. Otherwise, return the pseudo-entity
 * source unchanged (eg &foo;).
 *
 * @param string $name entity name without '&' and ';'
 * @return string
 */
function decodeEntity( $name ) {
	global $wgHtmlEntities;
	if( !isset( $wgHtmlEntities[$name] ) ) {
		return "&$name;";
	}
	return codepointToUtf8( $wgHtmlEntities[$name] );
}
789
/**
 * Fetch the whitelist of acceptable attributes for a given
 * element name. The full table is built once by
 * Sanitizer::setupAttributeWhitelist() and kept in a static variable;
 * an element without an entry gets an empty list.
 *
 * @param string $element
 * @return array
 */
function attributeWhitelist( $element ) {
	static $list;
	if( !isset( $list ) ) {
		$list = Sanitizer::setupAttributeWhitelist();
	}
	if( isset( $list[$element] ) ) {
		return $list[$element];
	}
	return array();
}
806
/**
 * Build the table of attributes accepted on each element.
 *
 * @return array map of element name => list of allowed attribute
 *   names; each list is free of duplicates
 */
function setupAttributeWhitelist() {
	$common = array( 'id', 'class', 'lang', 'dir', 'title', 'style' );
	$block = array_merge( $common, array( 'align' ) );
	$tablealign = array( 'align', 'char', 'charoff', 'valign' );
	$tablecell = array( 'abbr',
		'axis',
		'headers',
		'scope',
		'rowspan',
		'colspan',
		'nowrap', # deprecated
		'width', # deprecated
		'height', # deprecated
		'bgcolor' # deprecated
	);

	# Numbers refer to sections in HTML 4.01 standard describing the element.
	# See: http://www.w3.org/TR/html4/
	$whitelist = array (
		# 7.5.4
		'div' => $block,
		'center' => $common, # deprecated
		'span' => $block, # ??

		# 7.5.5
		'h1' => $block,
		'h2' => $block,
		'h3' => $block,
		'h4' => $block,
		'h5' => $block,
		'h6' => $block,

		# 7.5.6
		# address

		# 8.2.4
		# bdo

		# 9.2.1
		'em' => $common,
		'strong' => $common,
		'cite' => $common,
		# dfn
		'code' => $common,
		# samp
		# kbd
		'var' => $common,
		# abbr
		# acronym

		# 9.2.2
		'blockquote' => array_merge( $common, array( 'cite' ) ),
		# q

		# 9.2.3
		'sub' => $common,
		'sup' => $common,

		# 9.3.1
		'p' => $block,

		# 9.3.2
		'br' => array( 'id', 'class', 'title', 'style', 'clear' ),

		# 9.3.4
		'pre' => array_merge( $common, array( 'width' ) ),

		# 9.4
		'ins' => array_merge( $common, array( 'cite', 'datetime' ) ),
		'del' => array_merge( $common, array( 'cite', 'datetime' ) ),

		# 10.2
		'ul' => array_merge( $common, array( 'type' ) ),
		'ol' => array_merge( $common, array( 'type', 'start' ) ),
		'li' => array_merge( $common, array( 'type', 'value' ) ),

		# 10.3
		'dl' => $common,
		'dd' => $common,
		'dt' => $common,

		# 11.2.1
		# (each attribute listed once; 'frame', 'rules' and 'border'
		# used to be repeated here)
		'table' => array_merge( $common,
			array( 'summary', 'width', 'border', 'frame',
				'rules', 'cellspacing', 'cellpadding',
				'align', 'bgcolor' ) ),

		# 11.2.2
		'caption' => array_merge( $common, array( 'align' ) ),

		# 11.2.3
		'thead' => array_merge( $common, $tablealign ),
		'tfoot' => array_merge( $common, $tablealign ),
		'tbody' => array_merge( $common, $tablealign ),

		# 11.2.4
		'colgroup' => array_merge( $common, array( 'span', 'width' ), $tablealign ),
		'col' => array_merge( $common, array( 'span', 'width' ), $tablealign ),

		# 11.2.5
		'tr' => array_merge( $common, array( 'bgcolor' ), $tablealign ),

		# 11.2.6
		'td' => array_merge( $common, $tablecell, $tablealign ),
		'th' => array_merge( $common, $tablecell, $tablealign ),

		# 15.2.1
		'tt' => $common,
		'b' => $common,
		'i' => $common,
		'big' => $common,
		'small' => $common,
		'strike' => $common,
		's' => $common,
		'u' => $common,

		# 15.2.2
		'font' => array_merge( $common, array( 'size', 'color', 'face' ) ),
		# basefont

		# 15.3
		'hr' => array_merge( $common, array( 'noshade', 'size', 'width' ) ),

		# XHTML Ruby annotation text module, simple ruby only.
		# http://www.w3c.org/TR/ruby/
		'ruby' => $common,
		# rbc
		# rtc
		'rb' => $common,
		'rt' => $common, #array_merge( $common, array( 'rbspan' ) ),
		'rp' => $common
	);
	return $whitelist;
}
945
/**
 * Take a fragment of (potentially invalid) HTML and return
 * a version with any tags removed, encoded suitably for literal
 * inclusion in an attribute value.
 *
 * Every '<...>' run is deleted, the remainder goes through
 * Sanitizer::normalizeAttributeValue(), and any '<', '>' or '"' still
 * present afterwards is replaced by its entity.
 *
 * @param string $text HTML fragment
 * @return string
 */
function stripAllTags( $text ) {
	# Actual <tags>
	$untagged = preg_replace( '/<[^>]*>/', '', $text );

	# Normalize &entities and whitespace
	$normalized = Sanitizer::normalizeAttributeValue( $untagged );

	# Will be placed into "double-quoted" attributes,
	# make sure remaining bits are safe.
	$unsafe = array('<', '>', '"');
	$escaped = array('&lt;', '&gt;', '&quot;');
	return str_replace( $unsafe, $escaped, $normalized );
}
970
971 }
972
973 ?>