includes/Sanitizer.php

   1 <?php
   2
   3 /**
   4  * (X)HTML sanitizer for MediaWiki
   5  *
   6  * Copyright (C) 2002-2005 Brion Vibber <brion@pobox.com> et al
   7  * http://www.mediawiki.org/
   8  *
   9  * This program is free software; you can redistribute it and/or modify
  10  * it under the terms of the GNU General Public License as published by
  11  * the Free Software Foundation; either version 2 of the License, or
  12  * (at your option) any later version.
  13  *
  14  * This program is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  17  * GNU General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU General Public License along
  20  * with this program; if not, write to the Free Software Foundation, Inc.,
  21  * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
  22  * http://www.gnu.org/copyleft/gpl.html
  23  *
  24  * @package MediaWiki
  25  * @subpackage Parser
  26  */
  27
  /**
   * Regular expression to match various types of character references in
   * Sanitizer::normalizeCharReferences and Sanitizer::decodeCharReferences.
   *
   * Capture groups, as read by the callbacks of those two methods:
   *   1 - name of a named entity reference (&name;)
   *   2 - digits of a decimal reference (&#123;)
   *   3 - digits of a hexadecimal reference written with a lowercase x (&#x7b;)
   *   4 - digits of a hexadecimal reference written with an uppercase X (&#X7B;)
   *   5 - an ampersand that does not start any of the above
   *
   * The hexadecimal alternatives accept real hex digits only. hexdec()
   * skips characters that are not hex digits, so letting g-z through here
   * would make text such as "&#x41g;" come out as the reference &#x41;.
   * With the stricter classes such text falls through to group 5 and only
   * its ampersand is handled.
   *
   * The pattern uses the /x modifier: the line breaks and indentation inside
   * the literal are ignored, and '#' has to be written as '\#'.
   */
  define( 'MW_CHAR_REFS_REGEX',
      '/&([A-Za-z0-9]+);
       |&\#([0-9]+);
       |&\#x([0-9A-Fa-f]+);
       |&\#X([0-9A-Fa-f]+);
       |(&)/x' );
  38
  39 /**
  40  * Regular expression to match HTML/XML attribute pairs within a tag.
  41  * Allows some... latitude.
  42  * Used in Sanitizer::fixTagAttributes and Sanitizer::decodeTagAttributes
      *
      * Capture groups (Sanitizer::getTagAttributeCallback picks the value
      * from them):
      *   1 - attribute name
      *   2 - the whole "= value" part; not present for a bare attribute name
      *   3 - value written in double quotes (quotes not included)
      *   4 - value written in single quotes (quotes not included)
      *   5 - value written without quotes
      *   6 - unquoted value made of '#' followed by hex digits
      *
      * An attribute only matches when it starts at the beginning of the text
      * or after a whitespace character, and is followed by whitespace or the
      * end of the text.
      *
      * NOTE(review): the character class of group 5 already contains '#', and
      * that alternative is tried before group 6, so group 6 looks unreachable
      * - confirm.
  43  */
     # One character of an attribute name: ASCII letters and digits only.
  44 $attrib = '[A-Za-z0-9]';
     # One whitespace character: tab, line feed, carriage return or space.
  45 $space = '[\x09\x0a\x0d\x20]';
     # The pattern carries the /x modifier, so the line breaks, the indentation
     # and the '#' comments inside the string literal below are not part of
     # what is matched. $space and $attrib are interpolated by PHP because the
     # literal is double-quoted.
  46 define( 'MW_ATTRIBS_REGEX',
  47         "/(?:^|$space)($attrib+)
  48           ($space*=$space*
  49                 (?:
  50                  # The attribute value: quoted or alone
  51                   \"([^<\"]*)\"
  52                  | '([^<']*)'
  53                  |  ([a-zA-Z0-9!#$%&()*,\\-.\\/:;<>?@[\\]^_`{|}~]+)
  54                  |  (\#[0-9a-fA-F]+) # Technically wrong, but lots of
  55                                                          # colors are specified like this.
  56                                                          # We'll be normalizing it.
  57                 )
  58            )?(?=$space|\$)/sx" );
  59
  60 /**
  61  * List of all named character entities defined in HTML 4.01
  62  * http://www.w3.org/TR/html4/sgml/entities.html
      *
      * Keys are the entity names without the leading '&' and trailing ';'.
      * They are case-sensitive: 'Aacute' and 'aacute' are separate entries.
      * Values are the Unicode code points the entities stand for, written
      * in decimal.
      *
      * Sanitizer::normalizeEntity() tests whether a name is a key of this
      * table to decide if a named reference is kept or escaped.
  63  * @access private
  64  */
     # NOTE(review): presumably declared global so that the table ends up in
     # the global scope even if this file is included from inside a function
     # - confirm.
  65 global $wgHtmlEntities;
  66 $wgHtmlEntities = array(
  67         'Aacute'   => 193,
  68         'aacute'   => 225,
  69         'Acirc'    => 194,
  70         'acirc'    => 226,
  71         'acute'    => 180,
  72         'AElig'    => 198,
  73         'aelig'    => 230,
  74         'Agrave'   => 192,
  75         'agrave'   => 224,
  76         'alefsym'  => 8501,
  77         'Alpha'    => 913,
  78         'alpha'    => 945,
  79         'amp'      => 38,
  80         'and'      => 8743,
  81         'ang'      => 8736,
  82         'Aring'    => 197,
  83         'aring'    => 229,
  84         'asymp'    => 8776,
  85         'Atilde'   => 195,
  86         'atilde'   => 227,
  87         'Auml'     => 196,
  88         'auml'     => 228,
  89         'bdquo'    => 8222,
  90         'Beta'     => 914,
  91         'beta'     => 946,
  92         'brvbar'   => 166,
  93         'bull'     => 8226,
  94         'cap'      => 8745,
  95         'Ccedil'   => 199,
  96         'ccedil'   => 231,
  97         'cedil'    => 184,
  98         'cent'     => 162,
  99         'Chi'      => 935,
 100         'chi'      => 967,
 101         'circ'     => 710,
 102         'clubs'    => 9827,
 103         'cong'     => 8773,
 104         'copy'     => 169,
 105         'crarr'    => 8629,
 106         'cup'      => 8746,
 107         'curren'   => 164,
 108         'dagger'   => 8224,
 109         'Dagger'   => 8225,
 110         'darr'     => 8595,
 111         'dArr'     => 8659,
 112         'deg'      => 176,
 113         'Delta'    => 916,
 114         'delta'    => 948,
 115         'diams'    => 9830,
 116         'divide'   => 247,
 117         'Eacute'   => 201,
 118         'eacute'   => 233,
 119         'Ecirc'    => 202,
 120         'ecirc'    => 234,
 121         'Egrave'   => 200,
 122         'egrave'   => 232,
 123         'empty'    => 8709,
 124         'emsp'     => 8195,
 125         'ensp'     => 8194,
 126         'Epsilon'  => 917,
 127         'epsilon'  => 949,
 128         'equiv'    => 8801,
 129         'Eta'      => 919,
 130         'eta'      => 951,
 131         'ETH'      => 208,
 132         'eth'      => 240,
 133         'Euml'     => 203,
 134         'euml'     => 235,
 135         'euro'     => 8364,
 136         'exist'    => 8707,
 137         'fnof'     => 402,
 138         'forall'   => 8704,
 139         'frac12'   => 189,
 140         'frac14'   => 188,
 141         'frac34'   => 190,
 142         'frasl'    => 8260,
 143         'Gamma'    => 915,
 144         'gamma'    => 947,
 145         'ge'       => 8805,
 146         'gt'       => 62,
 147         'harr'     => 8596,
 148         'hArr'     => 8660,
 149         'hearts'   => 9829,
 150         'hellip'   => 8230,
 151         'Iacute'   => 205,
 152         'iacute'   => 237,
 153         'Icirc'    => 206,
 154         'icirc'    => 238,
 155         'iexcl'    => 161,
 156         'Igrave'   => 204,
 157         'igrave'   => 236,
 158         'image'    => 8465,
 159         'infin'    => 8734,
 160         'int'      => 8747,
 161         'Iota'     => 921,
 162         'iota'     => 953,
 163         'iquest'   => 191,
 164         'isin'     => 8712,
 165         'Iuml'     => 207,
 166         'iuml'     => 239,
 167         'Kappa'    => 922,
 168         'kappa'    => 954,
 169         'Lambda'   => 923,
 170         'lambda'   => 955,
 171         'lang'     => 9001,
 172         'laquo'    => 171,
 173         'larr'     => 8592,
 174         'lArr'     => 8656,
 175         'lceil'    => 8968,
 176         'ldquo'    => 8220,
 177         'le'       => 8804,
 178         'lfloor'   => 8970,
 179         'lowast'   => 8727,
 180         'loz'      => 9674,
 181         'lrm'      => 8206,
 182         'lsaquo'   => 8249,
 183         'lsquo'    => 8216,
 184         'lt'       => 60,
 185         'macr'     => 175,
 186         'mdash'    => 8212,
 187         'micro'    => 181,
 188         'middot'   => 183,
 189         'minus'    => 8722,
 190         'Mu'       => 924,
 191         'mu'       => 956,
 192         'nabla'    => 8711,
 193         'nbsp'     => 160,
 194         'ndash'    => 8211,
 195         'ne'       => 8800,
 196         'ni'       => 8715,
 197         'not'      => 172,
 198         'notin'    => 8713,
 199         'nsub'     => 8836,
 200         'Ntilde'   => 209,
 201         'ntilde'   => 241,
 202         'Nu'       => 925,
 203         'nu'       => 957,
 204         'Oacute'   => 211,
 205         'oacute'   => 243,
 206         'Ocirc'    => 212,
 207         'ocirc'    => 244,
 208         'OElig'    => 338,
 209         'oelig'    => 339,
 210         'Ograve'   => 210,
 211         'ograve'   => 242,
 212         'oline'    => 8254,
 213         'Omega'    => 937,
 214         'omega'    => 969,
 215         'Omicron'  => 927,
 216         'omicron'  => 959,
 217         'oplus'    => 8853,
 218         'or'       => 8744,
 219         'ordf'     => 170,
 220         'ordm'     => 186,
 221         'Oslash'   => 216,
 222         'oslash'   => 248,
 223         'Otilde'   => 213,
 224         'otilde'   => 245,
 225         'otimes'   => 8855,
 226         'Ouml'     => 214,
 227         'ouml'     => 246,
 228         'para'     => 182,
 229         'part'     => 8706,
 230         'permil'   => 8240,
 231         'perp'     => 8869,
 232         'Phi'      => 934,
 233         'phi'      => 966,
 234         'Pi'       => 928,
 235         'pi'       => 960,
 236         'piv'      => 982,
 237         'plusmn'   => 177,
 238         'pound'    => 163,
 239         'prime'    => 8242,
 240         'Prime'    => 8243,
 241         'prod'     => 8719,
 242         'prop'     => 8733,
 243         'Psi'      => 936,
 244         'psi'      => 968,
 245         'quot'     => 34,
 246         'radic'    => 8730,
 247         'rang'     => 9002,
 248         'raquo'    => 187,
 249         'rarr'     => 8594,
 250         'rArr'     => 8658,
 251         'rceil'    => 8969,
 252         'rdquo'    => 8221,
 253         'real'     => 8476,
 254         'reg'      => 174,
 255         'rfloor'   => 8971,
 256         'Rho'      => 929,
 257         'rho'      => 961,
 258         'rlm'      => 8207,
 259         'rsaquo'   => 8250,
 260         'rsquo'    => 8217,
 261         'sbquo'    => 8218,
 262         'Scaron'   => 352,
 263         'scaron'   => 353,
 264         'sdot'     => 8901,
 265         'sect'     => 167,
 266         'shy'      => 173,
 267         'Sigma'    => 931,
 268         'sigma'    => 963,
 269         'sigmaf'   => 962,
 270         'sim'      => 8764,
 271         'spades'   => 9824,
 272         'sub'      => 8834,
 273         'sube'     => 8838,
 274         'sum'      => 8721,
 275         'sup'      => 8835,
 276         'sup1'     => 185,
 277         'sup2'     => 178,
 278         'sup3'     => 179,
 279         'supe'     => 8839,
 280         'szlig'    => 223,
 281         'Tau'      => 932,
 282         'tau'      => 964,
 283         'there4'   => 8756,
 284         'Theta'    => 920,
 285         'theta'    => 952,
 286         'thetasym' => 977,
 287         'thinsp'   => 8201,
 288         'THORN'    => 222,
 289         'thorn'    => 254,
 290         'tilde'    => 732,
 291         'times'    => 215,
 292         'trade'    => 8482,
 293         'Uacute'   => 218,
 294         'uacute'   => 250,
 295         'uarr'     => 8593,
 296         'uArr'     => 8657,
 297         'Ucirc'    => 219,
 298         'ucirc'    => 251,
 299         'Ugrave'   => 217,
 300         'ugrave'   => 249,
 301         'uml'      => 168,
 302         'upsih'    => 978,
 303         'Upsilon'  => 933,
 304         'upsilon'  => 965,
 305         'Uuml'     => 220,
 306         'uuml'     => 252,
 307         'weierp'   => 8472,
 308         'Xi'       => 926,
 309         'xi'       => 958,
 310         'Yacute'   => 221,
 311         'yacute'   => 253,
 312         'yen'      => 165,
 313         'Yuml'     => 376,
 314         'yuml'     => 255,
 315         'Zeta'     => 918,
 316         'zeta'     => 950,
 317         'zwj'      => 8205,
 318         'zwnj'     => 8204 );
 319
 320 class Sanitizer {
 /**
  * Cleans up HTML, removes dangerous tags and attributes, and
  * removes HTML comments.
  *
  * After comment removal the text is split at every '<'. A piece that
  * parses as a tag whose name is on the whitelist is emitted again with
  * its attributes filtered through Sanitizer::fixTagAttributes(); every
  * other piece is emitted as text, with its '<' and any '>' turned into
  * &lt; and &gt;. The whitelist is empty unless $wgUserHtml is set.
  *
  * When $wgUseTidy is off, a stack of open elements is kept as well.
  * Closing tags that do not match the innermost open element, table
  * rows and cells outside a table, and non-nestable elements opened
  * inside themselves are escaped instead of emitted, and a closing tag
  * is appended for every element still open at the end of the text.
  * When $wgUseTidy is on, none of this balancing is done here.
  *
  * @access private
  * @param string $text
  * @return string
  */
 function removeHTMLtags( $text ) {
     global $wgUseTidy, $wgUserHtml;
     # NOTE(review): the profiling label says Parser although this method
     # belongs to Sanitizer; left as it is in case profiling data is keyed
     # on the label - confirm.
     $fname = 'Parser::removeHTMLtags';
     wfProfileIn( $fname );

     if( $wgUserHtml ) {
         $htmlpairs = array( # Tags that must be closed
             'b', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
             'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
             'strike', 'strong', 'tt', 'var', 'div', 'center',
             'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
             'ruby', 'rt' , 'rb' , 'rp', 'p', 'span'
         );
         $htmlsingle = array( # Tags that are not tracked on the stack
             'br', 'hr', 'li', 'dt', 'dd'
         );
         $htmlnest = array( # Tags that can be nested--??
             'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
             'dl', 'font', 'big', 'small', 'sub', 'sup', 'span'
         );
         $tabletags = array( # Can only appear inside table
             'td', 'th', 'tr'
         );
     } else {
         $htmlpairs = array();
         $htmlsingle = array();
         $htmlnest = array();
         $tabletags = array();
     }

     # Table rows and cells are treated like the other unpaired tags
     $htmlsingle = array_merge( $tabletags, $htmlsingle );
     $htmlelements = array_merge( $htmlsingle, $htmlpairs );

     # Remove HTML comments
     $text = Sanitizer::removeHTMLcomments( $text );

     # 1: optional '/', 2: tag name, 3: attributes, 4: closing bracket,
     # 5: the text that follows the tag
     $tagPattern = '/^(\\/?)(\\w+)([^>]*)(\\/{0,1}>)([^<]*)$/';

     $bits = explode( '<', $text );
     $text = array_shift( $bits );
     if( !$wgUseTidy ) {
         $tagstack = array();
         $tablestack = array();
         foreach ( $bits as $x ) {
             # A piece that is no tag at all simply fails this test; there
             # is no need to silence PHP errors around the match.
             $isTag = preg_match( $tagPattern, $x, $regs );
             $t = $isTag ? strtolower( $regs[2] ) : '';

             if ( $isTag && in_array( $t, $htmlelements ) ) {
                 $slash  = $regs[1];
                 $params = $regs[3];
                 $brace  = $regs[4];
                 $rest   = $regs[5];

                 $badtag = false;
                 $newparams = '';
                 if ( $slash ) {
                     # Closing a tag...
                     if ( !in_array( $t, $htmlsingle ) ) {
                         $ot = array_pop( $tagstack );
                         if ( $ot != $t ) {
                             # Not the innermost open element: put back
                             # what was taken off (nothing, if the stack
                             # was empty) and escape the tag.
                             if ( !is_null( $ot ) ) {
                                 array_push( $tagstack, $ot );
                             }
                             $badtag = true;
                         }
                     }
                     if ( !$badtag && $t == 'table' ) {
                         # Back to the stack that was in use before the
                         # table was opened
                         $tagstack = array_pop( $tablestack );
                     }
                 } else {
                     # Keep track for later
                     if ( in_array( $t, $tabletags ) &&
                         !in_array( 'table', $tagstack ) ) {
                         $badtag = true;
                     } elseif ( in_array( $t, $tagstack ) &&
                         !in_array( $t, $htmlnest ) ) {
                         $badtag = true;
                     } elseif ( !in_array( $t, $htmlsingle ) ) {
                         if ( $t == 'table' ) {
                             # A table starts with a fresh stack
                             array_push( $tablestack, $tagstack );
                             $tagstack = array();
                         }
                         array_push( $tagstack, $t );
                     }
                     # Strip non-approved attributes from the tag
                     $newparams = Sanitizer::fixTagAttributes( $params, $t );
                 }
                 if ( !$badtag ) {
                     $rest = str_replace( '>', '&gt;', $rest );
                     $text .= "<$slash$t$newparams$brace$rest";
                     continue;
                 }
             }
             $text .= '&lt;' . str_replace( '>', '&gt;', $x );
         }
         # Close off any remaining tags
         while ( is_array( $tagstack ) && ( $t = array_pop( $tagstack ) ) ) {
             $text .= "</$t>\n";
             if ( $t == 'table' ) {
                 $tagstack = array_pop( $tablestack );
             }
         }
     } else {
         # this might be possible using tidy itself
         foreach ( $bits as $x ) {
             if ( preg_match( $tagPattern, $x, $regs ) &&
                 in_array( $t = strtolower( $regs[2] ), $htmlelements ) ) {
                 $slash = $regs[1];
                 $brace = $regs[4];
                 $newparams = Sanitizer::fixTagAttributes( $regs[3], $t );
                 $rest = str_replace( '>', '&gt;', $regs[5] );
                 $text .= "<$slash$t$newparams$brace$rest";
             } else {
                 $text .= '&lt;' . str_replace( '>', '&gt;', $x );
             }
         }
     }
     wfProfileOut( $fname );
     return $text;
 }
 439
 /**
  * Strip every complete HTML comment ('<!--' up to and including the
  * next '-->') from the text.
  *
  * A comment that sits between two newlines, with nothing but spaces
  * around it, is removed together with those spaces and one of the two
  * newlines, so that no blank line is left behind. Any other comment is
  * cut out on its own. Processing stops at the first '<!--' that has no
  * matching '-->'; that text is left in place.
  *
  * @access private
  * @param string $text
  * @return string
  */
 function removeHTMLcomments( $text ) {
     $fname='Parser::removeHTMLcomments';
     wfProfileIn( $fname );
     for ( ;; ) {
         $open = strpos( $text, '<!--' );
         if ( $open === false ) {
             break;
         }
         $close = strpos( $text, '-->', $open + 4 );
         if ( $close === false ) {
             # Unterminated comment; bail out
             break;
         }
         # Offset of the first character after the comment
         $close += 3;

         # Start one character before the comment and widen the span to
         # the left and to the right for as long as there are spaces.
         $left = max( $open - 1, 0 );
         $width = $close - $left;
         while ( $left > 0 && substr( $text, $left, 1 ) === ' ' ) {
             $left--;
             $width++;
         }
         while ( substr( $text, $left + $width, 1 ) === ' ' ) {
             $width++;
         }

         $newlineBefore = ( substr( $text, $left, 1 ) === "\n" );
         $newlineAfter = ( substr( $text, $left + $width, 1 ) === "\n" );
         if ( $newlineBefore && $newlineAfter ) {
             # Comment, surrounding spaces and both newlines go;
             # a single newline takes their place.
             $text = substr_replace( $text, "\n", $left, $width + 1 );
         } else {
             # Only the comment itself goes.
             $text = substr_replace( $text, '', $open, $close - $open );
         }
     }
     wfProfileOut( $fname );
     return $text;
 }
 485
 /**
  * Take a tag soup fragment listing an HTML element's attributes
  * and normalize it to well-formed XML, discarding unwanted attributes.
  *
  * - Normalizes attribute names to lowercase
  * - Discards attributes not on a whitelist for the given element
  * - Turns broken or invalid entities into plaintext
  * - Double-quotes all attribute values
  * - Attributes without values are given the name as attribute
  * - Double attributes are discarded (the first accepted one is kept)
  * - Unsafe style attributes are discarded. A style value is unsafe when
  *   "expression", "tp://" / "tps://" or "url(" shows up in it, either
  *   as written or after CSS backslash escapes have been resolved and
  *   CSS comments have been taken out
  * - Characters and words that later parsing could expand ('{', '[',
  *   "''", ISBN, RFC, PMID and "protocol:") are partly turned into
  *   numeric character references
  * - Prepends space if there are attributes.
  *
  * @param string $text
  * @param string $element
  * @return string
  *
  * @todo Check for legal values where the DTD limits things.
  * @todo Check for unique id attribute :P
  */
 function fixTagAttributes( $text, $element ) {
     if( trim( $text ) == '' ) {
         return '';
     }

     # Unquoted attribute
     # Since we quote this later, this can be anything distinguishable
     # from the end of the attribute
     if( !preg_match_all(
         MW_ATTRIBS_REGEX,
         $text,
         $pairs,
         PREG_SET_ORDER ) ) {
         return '';
     }

     $whitelist = array_flip( Sanitizer::attributeWhitelist( $element ) );
     $attribs = array();
     foreach( $pairs as $set ) {
         $attribute = strtolower( $set[1] );
         if( !isset( $whitelist[$attribute] ) ) {
             continue;
         }

         $raw   = Sanitizer::getTagAttributeCallback( $set );
         $value = Sanitizer::normalizeAttributeValue( $raw );

         # Strip javascript "expression" from stylesheets.
         # http://msdn.microsoft.com/workshop/author/dhtml/overview/recalc.asp
         if( $attribute == 'style' ) {
             $unsafeCss = '/(expression|tps*:\/\/|url\\s*\().*/is';
             $css = Sanitizer::decodeCharReferences( $value );

             # The same value with CSS escapes resolved, so that a keyword
             # cannot be hidden as "\65xpression" or "e\xpression".
             # preg_split() with PREG_SPLIT_DELIM_CAPTURE returns plain
             # text at the even positions and the hex digits of each
             # "\hhhhhh" escape at the odd positions.
             $pieces = preg_split(
                 '/\\\\([0-9A-Fa-f]{1,6})[ \\n\\r\\t\\f]?/',
                 $css, -1, PREG_SPLIT_DELIM_CAPTURE );
             $plain = '';
             foreach( $pieces as $index => $piece ) {
                 $plain .= ( $index % 2 )
                     ? Sanitizer::decodeChar( hexdec( $piece ) )
                     : $piece;
             }
             # A backslash in front of any other character stands for
             # that character
             $plain = str_replace( '\\', '', $plain );
             # Comments are taken out last, because resolving escapes can
             # produce comment delimiters. They are removed without a
             # replacement so that "exp/**/ression" is caught too.
             $plain = preg_replace( '!/\\*.*?\\*/!s', '', $plain );

             # The value as written is still tested, so nothing that was
             # rejected before gets through now.
             if( preg_match( $unsafeCss, $css ) || preg_match( $unsafeCss, $plain ) ) {
                 # haxx0r
                 continue;
             }
         }

         # Templates and links may be expanded in later parsing,
         # creating invalid or dangerous output. Suppress this.
         $value = strtr( $value, array(
             '{'    => '&#123;',
             '['    => '&#91;',
             "''"   => '&#39;&#39;',
             'ISBN' => '&#73;SBN',
             'RFC'  => '&#82;FC',
             'PMID' => '&#80;MID',
         ) );
         $value = preg_replace(
             '/(' . URL_PROTOCOLS . '):/',
             '\\1&#58;', $value );

         if( !isset( $attribs[$attribute] ) ) {
             $attribs[$attribute] = "$attribute=\"$value\"";
         }
     }
     if( empty( $attribs ) ) {
         return '';
     } else {
         return ' ' . implode( ' ', $attribs );
     }
 }
 566
 /**
  * Return an associative array of attribute names and values from
  * a partial tag string. Attribute names are forced to lowercase,
  * character references in the values are decoded with
  * Sanitizer::decodeCharReferences().
  *
  * When the same name occurs more than once, the last value is the one
  * that ends up in the result. Text that is empty, blank or without any
  * recognizable attribute gives an empty array.
  *
  * @param string $text
  * @return array
  */
 function decodeTagAttributes( $text ) {
     $decoded = array();

     if( trim( $text ) != '' ) {
         $found = preg_match_all(
             MW_ATTRIBS_REGEX,
             $text,
             $matches,
             PREG_SET_ORDER );
         if( $found ) {
             foreach( $matches as $match ) {
                 $name = strtolower( $match[1] );
                 $source = Sanitizer::getTagAttributeCallback( $match );
                 $decoded[$name] = Sanitizer::decodeCharReferences( $source );
             }
         }
     }

     return $decoded;
 }
 597
 /**
  * Pick the attribute value out of one match set produced by
  * MW_ATTRIBS_REGEX.
  *
  * The value is taken from the highest-numbered capture group that is
  * present: 6 (illegal unquoted #XXXXXX color), 5 (no quotes),
  * 4 (single-quoted) or 3 (double-quoted). A match without the "= value"
  * part (group 2) gives the attribute name itself, because in XHTML
  * attributes must have a value.
  *
  * @param array $set
  * @return string
  * @access private
  */
 function getTagAttributeCallback( $set ) {
     for( $group = 6; $group >= 3; $group-- ) {
         if( isset( $set[$group] ) ) {
             return $set[$group];
         }
     }

     if( isset( $set[2] ) ) {
         # A "= value" part matched, but none of the value groups did
         wfDebugDieBacktrace( "Tag conditions not met. This should never happen and is a bug." );
     } else {
         # 'Reduced' form: the name doubles as the value
         return $set[1];
     }
 }
 627
 /**
  * Normalize whitespace and character references in an XML source-
  * encoded text for an attribute value.
  *
  * Three steps, in this order: character references are normalized with
  * Sanitizer::normalizeCharReferences(); every CR LF pair and every
  * single space, CR, LF or tab becomes one space; every double quote
  * becomes &quot;.
  *
  * See http://www.w3.org/TR/REC-xml/#AVNormalize for background,
  * but note that the result is not the attribute's value: it is an XML
  * source fragment meant to be put into output.
  *
  * @param string $text
  * @return string
  * @access private
  */
 function normalizeAttributeValue( $text ) {
     $references = Sanitizer::normalizeCharReferences( $text );
     $spaced = preg_replace( '/\r\n|[\x20\x0d\x0a\x09]/', ' ', $references );
     return str_replace( '"', '&quot;', $spaced );
 }
 647
 /**
  * Ensure that any entities and character references are legal
  * for XML and XHTML specifically. Any stray bits will be
  * &amp;-escaped to result in a valid text fragment.
  *
  * Every match of MW_CHAR_REFS_REGEX in the text is replaced by what
  * Sanitizer::normalizeCharReferencesCallback() returns for it:
  *
  * a. named char refs are kept only if their name is a known entity
  * b. numeric char refs are kept only if they name a legal character
  * c. hex char refs are written with &#x, not &#X
  *
  * @param string $text
  * @return string
  * @access private
  */
 function normalizeCharReferences( $text ) {
     $callback = array( 'Sanitizer', 'normalizeCharReferencesCallback' );
     return preg_replace_callback( MW_CHAR_REFS_REGEX, $callback, $text );
 }
 /**
  * Replacement callback for Sanitizer::normalizeCharReferences().
  *
  * Hands the matched reference to normalizeEntity(), decCharReference()
  * or hexCharReference(), depending on which capture group of
  * MW_CHAR_REFS_REGEX is filled. If none is filled (a bare ampersand),
  * or if the helper returns null, the matched text is returned
  * HTML-escaped.
  *
  * @param array $matches match set of MW_CHAR_REFS_REGEX
  * @return string
  */
 function normalizeCharReferencesCallback( $matches ) {
     # Groups are tested in ascending order; a match set ends at the
     # last group that took part, so a later index is only read after
     # all earlier groups turned out to be empty.
     if( $matches[1] != '' ) {
         $normalized = Sanitizer::normalizeEntity( $matches[1] );
     } elseif( $matches[2] != '' ) {
         $normalized = Sanitizer::decCharReference( $matches[2] );
     } elseif( $matches[3] != '' ) {
         $normalized = Sanitizer::hexCharReference( $matches[3] );
     } elseif( $matches[4] != '' ) {
         $normalized = Sanitizer::hexCharReference( $matches[4] );
     } else {
         $normalized = null;
     }

     return is_null( $normalized )
         ? htmlspecialchars( $matches[0] )
         : $normalized;
 }
 689
 /**
  * Give back the source text for a named entity reference.
  *
  * A name that is a key of $wgHtmlEntities comes back as the reference
  * itself (&name;). Any other name comes back with its ampersand
  * escaped (&amp;name;), so that it shows up as plain text.
  *
  * @param string $name entity name without '&' and ';'
  * @return string
  */
 function normalizeEntity( $name ) {
     global $wgHtmlEntities;

     $ampersand = isset( $wgHtmlEntities[$name] ) ? '&' : '&amp;';
     return $ampersand . $name . ';';
 }
 706
 /**
  * Turn the digits of a decimal character reference into a normalized
  * reference.
  *
  * @param string $codepoint the digits found between '&#' and ';'
  * @return string|null "&#N;" with N printed as a plain decimal number,
  *   or null when Sanitizer::validateCodepoint() rejects the code point
  */
 function decCharReference( $codepoint ) {
     $number = intval( $codepoint );
     if( !Sanitizer::validateCodepoint( $number ) ) {
         return null;
     }
     return sprintf( '&#%d;', $number );
 }
 715
 /**
  * Turn the digits of a hexadecimal character reference into a
  * normalized reference.
  *
  * hexdec() skips every character that is not a hex digit, so "41g"
  * would be read as 0x41. Input that is not made up of hex digits only
  * is therefore rejected before it is converted.
  *
  * @param string $codepoint the digits found between '&#x' and ';'
  * @return string|null "&#xN;" with N in lowercase hex, or null when the
  *   input is not a hex number or Sanitizer::validateCodepoint() rejects
  *   the code point
  */
 function hexCharReference( $codepoint ) {
     # /D keeps '$' from matching in front of a trailing newline
     if( !preg_match( '/^[0-9A-Fa-f]+$/D', $codepoint ) ) {
         return null;
     }
     $point = hexdec( $codepoint );
     if( Sanitizer::validateCodepoint( $point ) ) {
         return sprintf( '&#x%x;', $point );
     } else {
         return null;
     }
 }
 724
 /**
  * Returns true if a given Unicode codepoint is a valid character in XML.
  *
  * Accepted are tab (0x09), line feed (0x0a), carriage return (0x0d) and
  * the ranges 0x20-0xd7ff, 0xe000-0xfffd and 0x10000-0x10ffff.
  *
  * @param int $codepoint
  * @return bool
  */
 function validateCodepoint( $codepoint ) {
     if( $codepoint == 0x09 || $codepoint == 0x0a || $codepoint == 0x0d ) {
         return true;
     }

     $ranges = array(
         array(    0x20,   0xd7ff ),
         array(  0xe000,   0xfffd ),
         array( 0x10000, 0x10ffff ),
     );
     foreach( $ranges as $range ) {
         if( $codepoint >= $range[0] && $codepoint <= $range[1] ) {
             return true;
         }
     }
     return false;
 }
 738
 739         /**
 740          * Decode any character references, numeric or named entities,
 741          * in the text and return a UTF-8 string.
 742          *
 743          * @param string $text
 744          * @return string
 745          * @access public
 746          */
 747         function decodeCharReferences( $text ) {
 748                 return preg_replace_callback(
 749                         MW_CHAR_REFS_REGEX,
 750                         array( 'Sanitizer', 'decodeCharReferencesCallback' ),
 751                         $text );
 752         }
 753
 754         /**
 755          * @param string $matches
 756          * @return string
 757          */
 758         function decodeCharReferencesCallback( $matches ) {
 759                 if( $matches[1] != '' ) {
 760                         return Sanitizer::decodeEntity( $matches[1] );
 761                 } elseif( $matches[2] != '' ) {
 762                         return  Sanitizer::decodeChar( intval( $matches[2] ) );
 763                 } elseif( $matches[3] != ''  ) {
 764                         return  Sanitizer::decodeChar( hexdec( $matches[3] ) );
 765                 } elseif( $matches[4] != '' ) {
 766                         return  Sanitizer::decodeChar( hexdec( $matches[4] ) );
 767                 }
 768                 # Last case should be an ampersand by itself
 769                 return $matches[0];
 770         }
 771
 772         /**
 773          * Return UTF-8 string for a codepoint if that is a valid
 774          * character reference, otherwise U+FFFD REPLACEMENT CHARACTER.
 775          * @param int $codepoint
 776          * @return string
 777          * @access private
 778          */
 779         function decodeChar( $codepoint ) {
 780                 if( Sanitizer::validateCodepoint( $codepoint ) ) {
 781                         return codepointToUtf8( $codepoint );
 782                 } else {
 783                         return UTF8_REPLACEMENT;
 784                 }
 785         }
 786
 787         /**
 788          * If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD,
 789          * return the UTF-8 encoding of that character. Otherwise, returns
 790          * pseudo-entity source (eg &foo;)
 791          *
 792          * @param string $name
 793          * @return string
 794          */
 795         function decodeEntity( $name ) {
 796                 global $wgHtmlEntities;
 797                 if( isset( $wgHtmlEntities[$name] ) ) {
 798                         return codepointToUtf8( $wgHtmlEntities[$name] );
 799                 } else {
 800                         return "&$name;";
 801                 }
 802         }
 803
 804         /**
 805          * Fetch the whitelist of acceptable attributes for a given
 806          * element name.
 807          *
 808          * @param string $element
 809          * @return array
 810          */
 811         function attributeWhitelist( $element ) {
 812                 static $list;
 813                 if( !isset( $list ) ) {
 814                         $list = Sanitizer::setupAttributeWhitelist();
 815                 }
 816                 return isset( $list[$element] )
 817                         ? $list[$element]
 818                         : array();
 819         }
 820
 821         /**
 822          * @return array
 823          */
 824         function setupAttributeWhitelist() {
 825                 $common = array( 'id', 'class', 'lang', 'dir', 'title', 'style' );
 826                 $block = array_merge( $common, array( 'align' ) );
 827                 $tablealign = array( 'align', 'char', 'charoff', 'valign' );
 828                 $tablecell = array( 'abbr',
 829                                     'axis',
 830                                     'headers',
 831                                     'scope',
 832                                     'rowspan',
 833                                     'colspan',
 834                                     'nowrap', # deprecated
 835                                     'width',  # deprecated
 836                                     'height', # deprecated
 837                                     'bgcolor' # deprecated
 838                                     );
 839
 840                 # Numbers refer to sections in HTML 4.01 standard describing the element.
 841                 # See: http://www.w3.org/TR/html4/
 842                 $whitelist = array (
 843                         # 7.5.4
 844                         'div'        => $block,
 845                         'center'     => $common, # deprecated
 846                         'span'       => $block, # ??
 847
 848                         # 7.5.5
 849                         'h1'         => $block,
 850                         'h2'         => $block,
 851                         'h3'         => $block,
 852                         'h4'         => $block,
 853                         'h5'         => $block,
 854                         'h6'         => $block,
 855
 856                         # 7.5.6
 857                         # address
 858
 859                         # 8.2.4
 860                         # bdo
 861
 862                         # 9.2.1
 863                         'em'         => $common,
 864                         'strong'     => $common,
 865                         'cite'       => $common,
 866                         # dfn
 867                         'code'       => $common,
 868                         # samp
 869                         # kbd
 870                         'var'        => $common,
 871                         # abbr
 872                         # acronym
 873
 874                         # 9.2.2
 875                         'blockquote' => array_merge( $common, array( 'cite' ) ),
 876                         # q
 877
 878                         # 9.2.3
 879                         'sub'        => $common,
 880                         'sup'        => $common,
 881
 882                         # 9.3.1
 883                         'p'          => $block,
 884
 885                         # 9.3.2
 886                         'br'         => array( 'id', 'class', 'title', 'style', 'clear' ),
 887
 888                         # 9.3.4
 889                         'pre'        => array_merge( $common, array( 'width' ) ),
 890
 891                         # 9.4
 892                         'ins'        => array_merge( $common, array( 'cite', 'datetime' ) ),
 893                         'del'        => array_merge( $common, array( 'cite', 'datetime' ) ),
 894
 895                         # 10.2
 896                         'ul'         => array_merge( $common, array( 'type' ) ),
 897                         'ol'         => array_merge( $common, array( 'type', 'start' ) ),
 898                         'li'         => array_merge( $common, array( 'type', 'value' ) ),
 899
 900                         # 10.3
 901                         'dl'         => $common,
 902                         'dd'         => $common,
 903                         'dt'         => $common,
 904
 905                         # 11.2.1
 906                         'table'      => array_merge( $common,
 907                                                                 array( 'summary', 'width', 'border', 'frame',
 908                                                                                          'rules', 'cellspacing', 'cellpadding',
 909                                                                                          'align', 'bgcolor', 'frame', 'rules',
 910                                                                                          'border' ) ),
 911
 912                         # 11.2.2
 913                         'caption'    => array_merge( $common, array( 'align' ) ),
 914
 915                         # 11.2.3
 916                         'thead'      => array_merge( $common, $tablealign ),
 917                         'tfoot'      => array_merge( $common, $tablealign ),
 918                         'tbody'      => array_merge( $common, $tablealign ),
 919
 920                         # 11.2.4
 921                         'colgroup'   => array_merge( $common, array( 'span', 'width' ), $tablealign ),
 922                         'col'        => array_merge( $common, array( 'span', 'width' ), $tablealign ),
 923
 924                         # 11.2.5
 925                         'tr'         => array_merge( $common, array( 'bgcolor' ), $tablealign ),
 926
 927                         # 11.2.6
 928                         'td'         => array_merge( $common, $tablecell, $tablealign ),
 929                         'th'         => array_merge( $common, $tablecell, $tablealign ),
 930
 931                         # 15.2.1
 932                         'tt'         => $common,
 933                         'b'          => $common,
 934                         'i'          => $common,
 935                         'big'        => $common,
 936                         'small'      => $common,
 937                         'strike'     => $common,
 938                         's'          => $common,
 939                         'u'          => $common,
 940
 941                         # 15.2.2
 942                         'font'       => array_merge( $common, array( 'size', 'color', 'face' ) ),
 943                         # basefont
 944
 945                         # 15.3
 946                         'hr'         => array_merge( $common, array( 'noshade', 'size', 'width' ) ),
 947
 948                         # XHTML Ruby annotation text module, simple ruby only.
 949                         # http://www.w3c.org/TR/ruby/
 950                         'ruby'       => $common,
 951                         # rbc
 952                         # rtc
 953                         'rb'         => $common,
 954                         'rt'         => $common, #array_merge( $common, array( 'rbspan' ) ),
 955                         'rp'         => $common,
 956                         );
 957                 return $whitelist;
 958         }
 959
 960         /**
 961          * Take a fragment of (potentially invalid) HTML and return
 962          * a version with any tags removed, encoded suitably for literal
 963          * inclusion in an attribute value.
 964          *
 965          * @param string $text HTML fragment
 966          * @return string
 967          */
 968         function stripAllTags( $text ) {
 969                 # Actual <tags>
 970                 $text = preg_replace( '/<[^>]*>/', '', $text );
 971
 972                 # Normalize &entities and whitespace
 973                 $text = Sanitizer::normalizeAttributeValue( $text );
 974
 975                 # Will be placed into "double-quoted" attributes,
 976                 # make sure remaining bits are safe.
 977                 $text = str_replace(
 978                         array('<', '>', '"'),
 979                         array('&lt;', '&gt;', '&quot;'),
 980                         $text );
 981
 982                 return $text;
 983         }
 984
 985 }
 986
 987 ?>