includes/Sanitizer.php

   1 <?php
   2 /**
   3  * XHTML sanitizer for MediaWiki
   4  *
   5  * Copyright (C) 2002-2005 Brion Vibber <brion@pobox.com> et al
   6  * http://www.mediawiki.org/
   7  *
   8  * This program is free software; you can redistribute it and/or modify
   9  * it under the terms of the GNU General Public License as published by
  10  * the Free Software Foundation; either version 2 of the License, or
  11  * (at your option) any later version.
  12  *
  13  * This program is distributed in the hope that it will be useful,
  14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  16  * GNU General Public License for more details.
  17  *
  18  * You should have received a copy of the GNU General Public License along
  19  * with this program; if not, write to the Free Software Foundation, Inc.,
  20  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  21  * http://www.gnu.org/copyleft/gpl.html
  22  *
  23  * @file
  24  * @ingroup Parser
  25  */
  26
  27 /**
  28  * Regular expression to match various types of character references in
  29  * Sanitizer::normalizeCharReferences and Sanitizer::decodeCharReferences
  30  */
  31 define( 'MW_CHAR_REFS_REGEX',
  32         '/&([A-Za-z0-9\x80-\xff]+);
  33          |&\#([0-9]+);
  34          |&\#x([0-9A-Za-z]+);
  35          |&\#X([0-9A-Za-z]+);
  36          |(&)/x' );
  37
  38 /**
  39  * Regular expression to match HTML/XML attribute pairs within a tag.
  40  * Allows some... latitude.
  41  * Used in Sanitizer::fixTagAttributes and Sanitizer::decodeTagAttributes
  42  */
  43 $attrib = '[A-Za-z0-9]';
  44 $space = '[\x09\x0a\x0d\x20]';
  45 define( 'MW_ATTRIBS_REGEX',
  46         "/(?:^|$space)((?:xml:|xmlns:)?$attrib+)
  47           ($space*=$space*
  48                 (?:
  49                  # The attribute value: quoted or alone
  50                   \"([^<\"]*)\"
  51                  | '([^<']*)'
  52                  |  ([a-zA-Z0-9!#$%&()*,\\-.\\/:;<>?@[\\]^_`{|}~]+)
  53                  |  (\#[0-9a-fA-F]+) # Technically wrong, but lots of
  54                                                          # colors are specified like this.
  55                                                          # We'll be normalizing it.
  56                 )
  57            )?(?=$space|\$)/sx" );
  58
  59 /**
  60  * Regular expression to match URIs that could trigger script execution
  61  */
  62 define( 'MW_EVIL_URI_PATTERN', '!(^|\s|\*/\s*)(javascript|vbscript)([^\w]|$)!i' );
  63
  64 /**
  65  * Regular expression to match namespace attributes
  66  */
  67 define( 'MW_XMLNS_ATTRIBUTE_PATTRN', "/^xmlns:$attrib+$/" );
  68
  69 /**
  70  * List of all named character entities defined in HTML 4.01
  71  * http://www.w3.org/TR/html4/sgml/entities.html
  72  * @private
  73  */
  74 global $wgHtmlEntities;
  75 $wgHtmlEntities = array(
  76         'Aacute'   => 193,
  77         'aacute'   => 225,
  78         'Acirc'    => 194,
  79         'acirc'    => 226,
  80         'acute'    => 180,
  81         'AElig'    => 198,
  82         'aelig'    => 230,
  83         'Agrave'   => 192,
  84         'agrave'   => 224,
  85         'alefsym'  => 8501,
  86         'Alpha'    => 913,
  87         'alpha'    => 945,
  88         'amp'      => 38,
  89         'and'      => 8743,
  90         'ang'      => 8736,
  91         'Aring'    => 197,
  92         'aring'    => 229,
  93         'asymp'    => 8776,
  94         'Atilde'   => 195,
  95         'atilde'   => 227,
  96         'Auml'     => 196,
  97         'auml'     => 228,
  98         'bdquo'    => 8222,
  99         'Beta'     => 914,
 100         'beta'     => 946,
 101         'brvbar'   => 166,
 102         'bull'     => 8226,
 103         'cap'      => 8745,
 104         'Ccedil'   => 199,
 105         'ccedil'   => 231,
 106         'cedil'    => 184,
 107         'cent'     => 162,
 108         'Chi'      => 935,
 109         'chi'      => 967,
 110         'circ'     => 710,
 111         'clubs'    => 9827,
 112         'cong'     => 8773,
 113         'copy'     => 169,
 114         'crarr'    => 8629,
 115         'cup'      => 8746,
 116         'curren'   => 164,
 117         'dagger'   => 8224,
 118         'Dagger'   => 8225,
 119         'darr'     => 8595,
 120         'dArr'     => 8659,
 121         'deg'      => 176,
 122         'Delta'    => 916,
 123         'delta'    => 948,
 124         'diams'    => 9830,
 125         'divide'   => 247,
 126         'Eacute'   => 201,
 127         'eacute'   => 233,
 128         'Ecirc'    => 202,
 129         'ecirc'    => 234,
 130         'Egrave'   => 200,
 131         'egrave'   => 232,
 132         'empty'    => 8709,
 133         'emsp'     => 8195,
 134         'ensp'     => 8194,
 135         'Epsilon'  => 917,
 136         'epsilon'  => 949,
 137         'equiv'    => 8801,
 138         'Eta'      => 919,
 139         'eta'      => 951,
 140         'ETH'      => 208,
 141         'eth'      => 240,
 142         'Euml'     => 203,
 143         'euml'     => 235,
 144         'euro'     => 8364,
 145         'exist'    => 8707,
 146         'fnof'     => 402,
 147         'forall'   => 8704,
 148         'frac12'   => 189,
 149         'frac14'   => 188,
 150         'frac34'   => 190,
 151         'frasl'    => 8260,
 152         'Gamma'    => 915,
 153         'gamma'    => 947,
 154         'ge'       => 8805,
 155         'gt'       => 62,
 156         'harr'     => 8596,
 157         'hArr'     => 8660,
 158         'hearts'   => 9829,
 159         'hellip'   => 8230,
 160         'Iacute'   => 205,
 161         'iacute'   => 237,
 162         'Icirc'    => 206,
 163         'icirc'    => 238,
 164         'iexcl'    => 161,
 165         'Igrave'   => 204,
 166         'igrave'   => 236,
 167         'image'    => 8465,
 168         'infin'    => 8734,
 169         'int'      => 8747,
 170         'Iota'     => 921,
 171         'iota'     => 953,
 172         'iquest'   => 191,
 173         'isin'     => 8712,
 174         'Iuml'     => 207,
 175         'iuml'     => 239,
 176         'Kappa'    => 922,
 177         'kappa'    => 954,
 178         'Lambda'   => 923,
 179         'lambda'   => 955,
 180         'lang'     => 9001,
 181         'laquo'    => 171,
 182         'larr'     => 8592,
 183         'lArr'     => 8656,
 184         'lceil'    => 8968,
 185         'ldquo'    => 8220,
 186         'le'       => 8804,
 187         'lfloor'   => 8970,
 188         'lowast'   => 8727,
 189         'loz'      => 9674,
 190         'lrm'      => 8206,
 191         'lsaquo'   => 8249,
 192         'lsquo'    => 8216,
 193         'lt'       => 60,
 194         'macr'     => 175,
 195         'mdash'    => 8212,
 196         'micro'    => 181,
 197         'middot'   => 183,
 198         'minus'    => 8722,
 199         'Mu'       => 924,
 200         'mu'       => 956,
 201         'nabla'    => 8711,
 202         'nbsp'     => 160,
 203         'ndash'    => 8211,
 204         'ne'       => 8800,
 205         'ni'       => 8715,
 206         'not'      => 172,
 207         'notin'    => 8713,
 208         'nsub'     => 8836,
 209         'Ntilde'   => 209,
 210         'ntilde'   => 241,
 211         'Nu'       => 925,
 212         'nu'       => 957,
 213         'Oacute'   => 211,
 214         'oacute'   => 243,
 215         'Ocirc'    => 212,
 216         'ocirc'    => 244,
 217         'OElig'    => 338,
 218         'oelig'    => 339,
 219         'Ograve'   => 210,
 220         'ograve'   => 242,
 221         'oline'    => 8254,
 222         'Omega'    => 937,
 223         'omega'    => 969,
 224         'Omicron'  => 927,
 225         'omicron'  => 959,
 226         'oplus'    => 8853,
 227         'or'       => 8744,
 228         'ordf'     => 170,
 229         'ordm'     => 186,
 230         'Oslash'   => 216,
 231         'oslash'   => 248,
 232         'Otilde'   => 213,
 233         'otilde'   => 245,
 234         'otimes'   => 8855,
 235         'Ouml'     => 214,
 236         'ouml'     => 246,
 237         'para'     => 182,
 238         'part'     => 8706,
 239         'permil'   => 8240,
 240         'perp'     => 8869,
 241         'Phi'      => 934,
 242         'phi'      => 966,
 243         'Pi'       => 928,
 244         'pi'       => 960,
 245         'piv'      => 982,
 246         'plusmn'   => 177,
 247         'pound'    => 163,
 248         'prime'    => 8242,
 249         'Prime'    => 8243,
 250         'prod'     => 8719,
 251         'prop'     => 8733,
 252         'Psi'      => 936,
 253         'psi'      => 968,
 254         'quot'     => 34,
 255         'radic'    => 8730,
 256         'rang'     => 9002,
 257         'raquo'    => 187,
 258         'rarr'     => 8594,
 259         'rArr'     => 8658,
 260         'rceil'    => 8969,
 261         'rdquo'    => 8221,
 262         'real'     => 8476,
 263         'reg'      => 174,
 264         'rfloor'   => 8971,
 265         'Rho'      => 929,
 266         'rho'      => 961,
 267         'rlm'      => 8207,
 268         'rsaquo'   => 8250,
 269         'rsquo'    => 8217,
 270         'sbquo'    => 8218,
 271         'Scaron'   => 352,
 272         'scaron'   => 353,
 273         'sdot'     => 8901,
 274         'sect'     => 167,
 275         'shy'      => 173,
 276         'Sigma'    => 931,
 277         'sigma'    => 963,
 278         'sigmaf'   => 962,
 279         'sim'      => 8764,
 280         'spades'   => 9824,
 281         'sub'      => 8834,
 282         'sube'     => 8838,
 283         'sum'      => 8721,
 284         'sup'      => 8835,
 285         'sup1'     => 185,
 286         'sup2'     => 178,
 287         'sup3'     => 179,
 288         'supe'     => 8839,
 289         'szlig'    => 223,
 290         'Tau'      => 932,
 291         'tau'      => 964,
 292         'there4'   => 8756,
 293         'Theta'    => 920,
 294         'theta'    => 952,
 295         'thetasym' => 977,
 296         'thinsp'   => 8201,
 297         'THORN'    => 222,
 298         'thorn'    => 254,
 299         'tilde'    => 732,
 300         'times'    => 215,
 301         'trade'    => 8482,
 302         'Uacute'   => 218,
 303         'uacute'   => 250,
 304         'uarr'     => 8593,
 305         'uArr'     => 8657,
 306         'Ucirc'    => 219,
 307         'ucirc'    => 251,
 308         'Ugrave'   => 217,
 309         'ugrave'   => 249,
 310         'uml'      => 168,
 311         'upsih'    => 978,
 312         'Upsilon'  => 933,
 313         'upsilon'  => 965,
 314         'Uuml'     => 220,
 315         'uuml'     => 252,
 316         'weierp'   => 8472,
 317         'Xi'       => 926,
 318         'xi'       => 958,
 319         'Yacute'   => 221,
 320         'yacute'   => 253,
 321         'yen'      => 165,
 322         'Yuml'     => 376,
 323         'yuml'     => 255,
 324         'Zeta'     => 918,
 325         'zeta'     => 950,
 326         'zwj'      => 8205,
 327         'zwnj'     => 8204 );
 328
 329 /**
 330  * Character entity aliases accepted by MediaWiki
 331  */
 332 global $wgHtmlEntityAliases;
 333 $wgHtmlEntityAliases = array(
 334         'רלמ' => 'rlm',
 335         'رلم' => 'rlm',
 336 );
 337
 338
 339 /**
 340  * XHTML sanitizer for MediaWiki
 341  * @ingroup Parser
 342  */
 343 class Sanitizer {
 344         /**
 345          * Cleans up HTML, removes dangerous tags and attributes, and
 346          * removes HTML comments
 347          * @private
 348          * @param $text String
 349          * @param $processCallback Callback to do any variable or parameter replacements in HTML attribute values
 350          * @param $args Array for the processing callback
 351          * @param $extratags Array for any extra tags to include
 352          * @param $removetags Array for any tags (default or extra) to exclude
 353          * @return string
 354          */
 355         static function removeHTMLtags( $text, $processCallback = null, $args = array(), $extratags = array(), $removetags = array() ) {
 356                 global $wgUseTidy;
 357
 358                 static $htmlpairsStatic, $htmlsingle, $htmlsingleonly, $htmlnest, $tabletags,
 359                         $htmllist, $listtags, $htmlsingleallowed, $htmlelementsStatic, $staticInitialised;
 360
 361                 wfProfileIn( __METHOD__ );
 362
 363                 if ( !$staticInitialised ) {
 364
 365                         $htmlpairsStatic = array( # Tags that must be closed
 366                                 'b', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
 367                                 'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
 368                                 'strike', 'strong', 'tt', 'var', 'div', 'center',
 369                                 'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
 370                                 'ruby', 'rt' , 'rb' , 'rp', 'p', 'span', 'u', 'abbr'
 371                         );
 372                         $htmlsingle = array(
 373                                 'br', 'hr', 'li', 'dt', 'dd'
 374                         );
 375                         $htmlsingleonly = array( # Elements that cannot have close tags
 376                                 'br', 'hr'
 377                         );
 378                         $htmlnest = array( # Tags that can be nested--??
 379                                 'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
 380                                 'dl', 'font', 'big', 'small', 'sub', 'sup', 'span'
 381                         );
 382                         $tabletags = array( # Can only appear inside table, we will close them
 383                                 'td', 'th', 'tr',
 384                         );
 385                         $htmllist = array( # Tags used by list
 386                                 'ul','ol',
 387                         );
 388                         $listtags = array( # Tags that can appear in a list
 389                                 'li',
 390                         );
 391
 392                         $htmlsingleallowed = array_unique( array_merge( $htmlsingle, $tabletags ) );
 393                         $htmlelementsStatic = array_unique( array_merge( $htmlsingle, $htmlpairsStatic, $htmlnest ) );
 394
 395                         # Convert them all to hashtables for faster lookup
 396                         $vars = array( 'htmlpairsStatic', 'htmlsingle', 'htmlsingleonly', 'htmlnest', 'tabletags',
 397                                 'htmllist', 'listtags', 'htmlsingleallowed', 'htmlelementsStatic' );
 398                         foreach ( $vars as $var ) {
 399                                 $$var = array_flip( $$var );
 400                         }
 401                         $staticInitialised = true;
 402                 }
 403                 # Populate $htmlpairs and $htmlelements with the $extratags and $removetags arrays
 404                 $extratags = array_flip( $extratags );
 405                 $removetags = array_flip( $removetags );
 406                 $htmlpairs = array_merge( $extratags, $htmlpairsStatic );
 407                 $htmlelements = array_diff_key( array_merge( $extratags, $htmlelementsStatic ) , $removetags );
 408
 409                 # Remove HTML comments
 410                 $text = Sanitizer::removeHTMLcomments( $text );
 411                 $bits = explode( '<', $text );
 412                 $text = str_replace( '>', '&gt;', array_shift( $bits ) );
 413                 if(!$wgUseTidy) {
 414                         $tagstack = $tablestack = array();
 415                         foreach ( $bits as $x ) {
 416                                 $regs = array();
 417                                 if( preg_match( '!^(/?)(\\w+)([^>]*?)(/{0,1}>)([^<]*)$!', $x, $regs ) ) {
 418                                         list( /* $qbar */, $slash, $t, $params, $brace, $rest ) = $regs;
 419                                 } else {
 420                                         $slash = $t = $params = $brace = $rest = null;
 421                                 }
 422
 423                                 $badtag = 0 ;
 424                                 if ( isset( $htmlelements[$t = strtolower( $t )] ) ) {
 425                                         # Check our stack
 426                                         if ( $slash ) {
 427                                                 # Closing a tag...
 428                                                 if( isset( $htmlsingleonly[$t] ) ) {
 429                                                         $badtag = 1;
 430                                                 } elseif ( ( $ot = @array_pop( $tagstack ) ) != $t ) {
 431                                                         if ( isset( $htmlsingleallowed[$ot] ) ) {
 432                                                                 # Pop all elements with an optional close tag
 433                                                                 # and see if we find a match below them
 434                                                                 $optstack = array();
 435                                                                 array_push ($optstack, $ot);
 436                                                                 while ( ( ( $ot = @array_pop( $tagstack ) ) != $t ) &&
 437                                                                                 isset( $htmlsingleallowed[$ot] ) )
 438                                                                 {
 439                                                                         array_push ($optstack, $ot);
 440                                                                 }
 441                                                                 if ( $t != $ot ) {
 442                                                                         # No match. Push the optinal elements back again
 443                                                                         $badtag = 1;
 444                                                                         while ( $ot = @array_pop( $optstack ) ) {
 445                                                                                 array_push( $tagstack, $ot );
 446                                                                         }
 447                                                                 }
 448                                                         } else {
 449                                                                 @array_push( $tagstack, $ot );
 450                                                                 # <li> can be nested in <ul> or <ol>, skip those cases:
 451                                                                 if(!(isset( $htmllist[$ot] ) && isset( $listtags[$t] ) )) {
 452                                                                         $badtag = 1;
 453                                                                 }
 454                                                         }
 455                                                 } else {
 456                                                         if ( $t == 'table' ) {
 457                                                                 $tagstack = array_pop( $tablestack );
 458                                                         }
 459                                                 }
 460                                                 $newparams = '';
 461                                         } else {
 462                                                 # Keep track for later
 463                                                 if ( isset( $tabletags[$t] ) &&
 464                                                 ! in_array( 'table', $tagstack ) ) {
 465                                                         $badtag = 1;
 466                                                 } else if ( in_array( $t, $tagstack ) &&
 467                                                 ! isset( $htmlnest [$t ] ) ) {
 468                                                         $badtag = 1 ;
 469                                                 # Is it a self closed htmlpair ? (bug 5487)
 470                                                 } else if( $brace == '/>' &&
 471                                                 isset( $htmlpairs[$t] ) ) {
 472                                                         $badtag = 1;
 473                                                 } elseif( isset( $htmlsingleonly[$t] ) ) {
 474                                                         # Hack to force empty tag for uncloseable elements
 475                                                         $brace = '/>';
 476                                                 } else if( isset( $htmlsingle[$t] ) ) {
 477                                                         # Hack to not close $htmlsingle tags
 478                                                         $brace = NULL;
 479                                                 } else if( isset( $tabletags[$t] )
 480                                                 &&  in_array($t ,$tagstack) ) {
 481                                                         // New table tag but forgot to close the previous one
 482                                                         $text .= "</$t>";
 483                                                 } else {
 484                                                         if ( $t == 'table' ) {
 485                                                                 array_push( $tablestack, $tagstack );
 486                                                                 $tagstack = array();
 487                                                         }
 488                                                         array_push( $tagstack, $t );
 489                                                 }
 490
 491                                                 # Replace any variables or template parameters with
 492                                                 # plaintext results.
 493                                                 if( is_callable( $processCallback ) ) {
 494                                                         call_user_func_array( $processCallback, array( &$params, $args ) );
 495                                                 }
 496
 497                                                 # Strip non-approved attributes from the tag
 498                                                 $newparams = Sanitizer::fixTagAttributes( $params, $t );
 499                                         }
 500                                         if ( ! $badtag ) {
 501                                                 $rest = str_replace( '>', '&gt;', $rest );
 502                                                 $close = ( $brace == '/>' && !$slash ) ? ' /' : '';
 503                                                 $text .= "<$slash$t$newparams$close>$rest";
 504                                                 continue;
 505                                         }
 506                                 }
 507                                 $text .= '&lt;' . str_replace( '>', '&gt;', $x);
 508                         }
 509                         # Close off any remaining tags
 510                         while ( is_array( $tagstack ) && ($t = array_pop( $tagstack )) ) {
 511                                 $text .= "</$t>\n";
 512                                 if ( $t == 'table' ) { $tagstack = array_pop( $tablestack ); }
 513                         }
 514                 } else {
 515                         # this might be possible using tidy itself
 516                         foreach ( $bits as $x ) {
 517                                 preg_match( '/^(\\/?)(\\w+)([^>]*?)(\\/{0,1}>)([^<]*)$/',
 518                                 $x, $regs );
 519                                 @list( /* $qbar */, $slash, $t, $params, $brace, $rest ) = $regs;
 520                                 if ( isset( $htmlelements[$t = strtolower( $t )] ) ) {
 521                                         if( is_callable( $processCallback ) ) {
 522                                                 call_user_func_array( $processCallback, array( &$params, $args ) );
 523                                         }
 524                                         $newparams = Sanitizer::fixTagAttributes( $params, $t );
 525                                         $rest = str_replace( '>', '&gt;', $rest );
 526                                         $text .= "<$slash$t$newparams$brace$rest";
 527                                 } else {
 528                                         $text .= '&lt;' . str_replace( '>', '&gt;', $x);
 529                                 }
 530                         }
 531                 }
 532                 wfProfileOut( __METHOD__ );
 533                 return $text;
 534         }
 535
 536         /**
 537          * Remove '<!--', '-->', and everything between.
 538          * To avoid leaving blank lines, when a comment is both preceded
 539          * and followed by a newline (ignoring spaces), trim leading and
 540          * trailing spaces and one of the newlines.
 541          *
 542          * @private
 543          * @param $text String
 544          * @return string
 545          */
 546         static function removeHTMLcomments( $text ) {
 547                 wfProfileIn( __METHOD__ );
 548                 while (($start = strpos($text, '<!--')) !== false) {
 549                         $end = strpos($text, '-->', $start + 4);
 550                         if ($end === false) {
 551                                 # Unterminated comment; bail out
 552                                 break;
 553                         }
 554
 555                         $end += 3;
 556
 557                         # Trim space and newline if the comment is both
 558                         # preceded and followed by a newline
 559                         $spaceStart = max($start - 1, 0);
 560                         $spaceLen = $end - $spaceStart;
 561                         while (substr($text, $spaceStart, 1) === ' ' && $spaceStart > 0) {
 562                                 $spaceStart--;
 563                                 $spaceLen++;
 564                         }
 565                         while (substr($text, $spaceStart + $spaceLen, 1) === ' ')
 566                                 $spaceLen++;
 567                         if (substr($text, $spaceStart, 1) === "\n" and substr($text, $spaceStart + $spaceLen, 1) === "\n") {
 568                                 # Remove the comment, leading and trailing
 569                                 # spaces, and leave only one newline.
 570                                 $text = substr_replace($text, "\n", $spaceStart, $spaceLen + 1);
 571                         }
 572                         else {
 573                                 # Remove just the comment.
 574                                 $text = substr_replace($text, '', $start, $end - $start);
 575                         }
 576                 }
 577                 wfProfileOut( __METHOD__ );
 578                 return $text;
 579         }
 580
 581         /**
 582          * Take an array of attribute names and values and normalize or discard
 583          * illegal values for the given element type.
 584          *
 585          * - Discards attributes not on a whitelist for the given element
 586          * - Unsafe style attributes are discarded
 587          * - Invalid id attributes are reencoded
 588          *
 589          * @param $attribs Array
 590          * @param $element String
 591          * @return Array
 592          *
 593          * @todo Check for legal values where the DTD limits things.
 594          * @todo Check for unique id attribute :P
 595          */
 596         static function validateTagAttributes( $attribs, $element ) {
 597                 return Sanitizer::validateAttributes( $attribs,
 598                         Sanitizer::attributeWhitelist( $element ) );
 599         }
 600
 601         /**
 602          * Take an array of attribute names and values and normalize or discard
 603          * illegal values for the given whitelist.
 604          *
 605          * - Discards attributes not the given whitelist
 606          * - Unsafe style attributes are discarded
 607          * - Invalid id attributes are reencoded
 608          *
 609          * @param $attribs Array
 610          * @param $whitelist Array: list of allowed attribute names
 611          * @return Array
 612          *
 613          * @todo Check for legal values where the DTD limits things.
 614          * @todo Check for unique id attribute :P
 615          */
 616         static function validateAttributes( $attribs, $whitelist ) {
 617                 global $wgAllowRdfaAttributes;
 618
 619                 $whitelist = array_flip( $whitelist );
 620                 $hrefExp = '/^(' . wfUrlProtocols() . ')[^\s]+$/';
 621
 622                 $out = array();
 623                 foreach( $attribs as $attribute => $value ) {
 624                         #allow XML namespace declaration if RDFa is enabled
 625                         if ( $wgAllowRdfaAttributes && preg_match( MW_XMLNS_ATTRIBUTE_PATTRN, $attribute ) ) {
 626                                 if ( !preg_match( MW_EVIL_URI_PATTERN, $value ) ) {
 627                                         $out[$attribute] = $value;
 628                                 }
 629
 630                                 continue;
 631                         }
 632
 633                         if( !isset( $whitelist[$attribute] ) ) {
 634                                 continue;
 635                         }
 636
 637                         # Strip javascript "expression" from stylesheets.
 638                         # http://msdn.microsoft.com/workshop/author/dhtml/overview/recalc.asp
 639                         if( $attribute == 'style' ) {
 640                                 $value = Sanitizer::checkCss( $value );
 641                                 if( $value === false ) {
 642                                         # haxx0r
 643                                         continue;
 644                                 }
 645                         }
 646
 647                         if ( $attribute === 'id' ) {
 648                                 global $wgEnforceHtmlIds;
 649                                 $value = Sanitizer::escapeId( $value,
 650                                         $wgEnforceHtmlIds ? 'noninitial' : 'xml' );
 651                         }
 652
 653                         //RDFa and microdata properties allow URLs, URIs and/or CURIs. check them for sanity
 654                         if ( $attribute === 'rel' || $attribute === 'rev' ||
 655                                 $attribute === 'about' || $attribute === 'property' || $attribute === 'resource' || #RDFa
 656                                 $attribute === 'datatype' || $attribute === 'typeof' ||                             #RDFa
 657                                 $attribute === 'itemid' || $attribute === 'itemprop' || $attribute === 'itemref' || #HTML5 microdata
 658                                 $attribute === 'itemscope' || $attribute === 'itemtype' ) {                         #HTML5 microdata
 659
 660                                 //Paranoia. Allow "simple" values but suppress javascript
 661                                 if ( preg_match( MW_EVIL_URI_PATTERN, $value ) ) {
 662                                         continue;
 663                                 }
 664                         }
 665
 666                         # NOTE: even though elements using href/src are not allowed directly, supply
 667                         #       validation code that can be used by tag hook handlers, etc
 668                         if ( $attribute === 'href' || $attribute === 'src' ) {
 669                                 if ( !preg_match( $hrefExp, $value ) ) {
 670                                         continue; //drop any href or src attributes not using an allowed protocol.
 671                                                   //NOTE: this also drops all relative URLs
 672                                 }
 673                         }
 674
 675                         // If this attribute was previously set, override it.
 676                         // Output should only have one attribute of each name.
 677                         $out[$attribute] = $value;
 678                 }
 679                 return $out;
 680         }
 681
 682         /**
 683          * Combine two HTML attribute arrays; values in $b win on collision, but
 684          * two differing string 'class' values are joined and de-duplicated.
 685          *
 686          * @todo implement merging for other attributes such as style
 687          * @param $a Array
 688          * @param $b Array
 689          * @return array
 690          */
 691         static function mergeAttributes( $a, $b ) {
 692                 $merged = array_merge( $a, $b );
 693                 if ( !isset( $a['class'] ) || !isset( $b['class'] ) ) {
 694                         return $merged;
 695                 }
 696                 list( $first, $second ) = array( $a['class'], $b['class'] );
 697                 if ( is_string( $first ) && is_string( $second ) && $first !== $second ) {
 698                         $names = preg_split( '/\s+/', "$first $second", -1, PREG_SPLIT_NO_EMPTY );
 699                         $merged['class'] = implode( ' ', array_unique( $names ) );
 700                 }
 701                 return $merged;
 702         }
 703
 704         /**
 705          * Pick apart some CSS and check it for forbidden or unsafe structures.
 706          * Currently URL references, 'expression', 'tps' are forbidden.
 707          *
 708          * @param $value String
 709          * @return Mixed: decoded, comment-free CSS, or false if it was too evil
 710          */
 711         static function checkCss( $value ) {
 712                 // Remove any comments; IE gets token splitting wrong
 713                 $value = StringUtils::delimiterReplace( '/*', '*/', ' ',
 714                         Sanitizer::decodeCharReferences( $value ) );
 715
 716                 // Resolve backslash escapes on a scratch copy so they cannot hide
 717                 // a forbidden token. A callback replaces the old /e modifier,
 718                 // which evaluated the replacement string as PHP code.
 719                 $stripped = preg_replace_callback(
 720                         '!\\\\([0-9A-Fa-f]{1,6})[ \\n\\r\\t\\f]?!',
 721                         array( 'Sanitizer', 'cssDecodeCallback' ), $value );
 722                 $stripped = str_replace( '\\', '', $stripped );
 723                 if ( preg_match( '/(?:expression|tps*:\/\/|url\\s*\().*/is', $stripped ) ) {
 724                         return false; # haxx0r
 725                 }
 726                 return $value;
 727         }
 728
 729         /** Callback for checkCss(): turns the hex digits in $matches[1] into UTF-8. */
 730         private static function cssDecodeCallback( $matches ) {
 731                 return codepointToUtf8( hexdec( $matches[1] ) );
 732         }
 733
 734         /**
 735          * Clean up the attribute portion of an HTML tag. The tag soup in $text
 736          * is decoded, filtered against the whitelist for $element, and written
 737          * back out as well-formed XML that is safe for further wikitext
 738          * processing.
 739          *
 740          * - Attribute names are lowercased
 741          * - Attributes not whitelisted for the element are dropped
 742          * - Broken or invalid entities become plaintext
 743          * - Every value is wrapped in double quotes
 744          * - A valueless attribute gets its own name as value
 745          * - Only one attribute of each name is emitted
 746          * - Unsafe style attributes are dropped
 747          * - A leading space is added when any attribute remains
 748          *
 749          * @param $text String
 750          * @param $element String
 751          * @return String
 752          */
 753         static function fixTagAttributes( $text, $element ) {
 754                 if ( trim( $text ) == '' ) {
 755                         return '';
 756                 }
 757
 758                 $decoded = Sanitizer::decodeTagAttributes( $text );
 759                 $allowed = Sanitizer::validateTagAttributes( $decoded, $element );
 760
 761                 // Build ' name="value"' pieces; each already carries its leading
 762                 // space, so an empty attribute set naturally yields ''.
 763                 $html = '';
 764                 foreach ( $allowed as $name => $val ) {
 765                         $html .= ' ' . htmlspecialchars( $name ) . '="'
 766                                 . Sanitizer::safeEncodeAttribute( $val ) . '"';
 767                 }
 768                 return $html;
 769         }
 770
 771         /**
 772          * Escape a string for use as an HTML attribute value. Newline, carriage
 773          * return and tab are written as numeric character references.
 774          * @param $text String
 775          * @return HTML-encoded text fragment
 776          */
 777         static function encodeAttribute( $text ) {
 778                 // Attribute decoding normalizes whitespace, so newline, carriage
 779                 // return and tab only survive when armored as references first.
 780                 static $armor = array(
 781                         "\n" => '&#10;',
 782                         "\r" => '&#13;',
 783                         "\t" => '&#9;',
 784                 );
 785
 786                 $escaped = htmlspecialchars( $text, ENT_QUOTES );
 787
 788                 return strtr( $escaped, $armor );
 789         }
 790
 791         /**
 792          * Escape an attribute value for an HTML tag and additionally armor it
 793          * so that later wiki parsing passes leave it alone.
 794          * @param $text String
 795          * @return HTML-encoded text fragment
 796          */
 797         static function safeEncodeAttribute( $text ) {
 798                 # Sequences that later parsing could expand into templates or
 799                 # links, creating invalid or dangerous output.
 800                 static $wikiArmor = array(
 801                         '<'    => '&lt;',   // This should never happen,
 802                         '>'    => '&gt;',   // we've received invalid input
 803                         '"'    => '&quot;', // which should have been escaped.
 804                         '{'    => '&#123;',
 805                         '['    => '&#91;',
 806                         "''"   => '&#39;&#39;',
 807                         'ISBN' => '&#73;SBN',
 808                         'RFC'  => '&#82;FC',
 809                         'PMID' => '&#80;MID',
 810                         '|'    => '&#124;',
 811                         '__'   => '&#95;_',
 812                 );
 813
 814                 $armored = strtr( Sanitizer::encodeAttribute( $text ), $wikiArmor );
 815
 816                 # Stupid hack: armor anything matching a known URL protocol
 817                 # against further processing as well.
 818                 $protocolPattern = '/(' . wfUrlProtocols() . ')/';
 819                 $callback = array( 'Sanitizer', 'armorLinksCallback' );
 820
 821                 return preg_replace_callback( $protocolPattern, $callback, $armored );
 822         }
 823
 824         /**
 825          * Escape a value so that it can be used in an id attribute. The result
 826          * is not validated any further (see first link).
 827          *
 828          * @see http://www.w3.org/TR/html401/types.html#type-name Valid characters
 829          *                                                          in the id and
 830          *                                                          name attributes
 831          * @see http://www.w3.org/TR/html401/struct/links.html#h-12.2.3 Anchors with the id attribute
 832          *
 833          * @param $id String: id to escape
 834          * @param $options Mixed: string or array of strings (default is array()):
 835          *   'noninitial': The value is a non-initial fragment of an id rather
 836          *       than a full id, so an invalid first character is left alone
 837          *       instead of being prefixed.
 838          *   'xml': Skip the HTML4-compatible dot-encoding and follow the XML
 839          *       name rules instead: any alphabetic character is allowed, and
 840          *       each run of invalid characters (mostly whitespace) or
 841          *       underscores is compressed into a single underscore. Leading
 842          *       and trailing underscores are trimmed.
 843          * @return String
 844          */
 845         static function escapeId( $id, $options = array() ) {
 846                 $options = (array)$options;
 847                 $isFragment = in_array( 'noninitial', $options );
 848
 849                 if ( in_array( 'xml', $options ) ) {
 850                         # XML-style escaping.  For the patterns used, see the XML 1.0
 851                         # standard, 5th edition, NameStartChar and NameChar:
 852                         # <http://www.w3.org/TR/REC-xml/>
 853                         $nameStartChar = ':a-zA-Z_\xC0-\xD6\xD8-\xF6\xF8-\x{2FF}\x{370}-\x{37D}'
 854                                 . '\x{37F}-\x{1FFF}\x{200C}-\x{200D}\x{2070}-\x{218F}\x{2C00}-\x{2FEF}'
 855                                 . '\x{3001}-\x{D7FF}\x{F900}-\x{FDCF}\x{FDF0}-\x{FFFD}\x{10000}-\x{EFFFF}';
 856                         $nameChar = $nameStartChar . '.\-0-9\xB7\x{0300}-\x{036F}'
 857                                 . '\x{203F}-\x{2040}';
 858
 859                         # Squash each run of invalid characters to one underscore;
 860                         # _ itself is included so no consecutive underscores remain.
 861                         $id = preg_replace( "/([^$nameChar]|_)+/u", '_', $id );
 862                         $id = trim( $id, '_' );
 863
 864                         $validStart = preg_match( "/^[$nameStartChar]/u", $id );
 865                         if ( $validStart || $isFragment ) {
 866                                 return $id;
 867                         }
 868                         return "_$id";
 869                 }
 870
 871                 # HTML4-style escaping: URL-encode the id, then turn the percent
 872                 # signs into dots, except that an encoded colon is restored.
 873                 $id = strtr( $id, ' ', '_' );
 874                 $id = urlencode( Sanitizer::decodeCharReferences( $id ) );
 875                 $id = str_replace( array( '%3A', '%' ), array( ':', '.' ), $id );
 876
 877                 if ( $isFragment || preg_match( '/^[a-zA-Z]/', $id ) ) {
 878                         return $id;
 879                 }
 880
 881                 // Initial character must be a letter!
 882                 return "x$id";
 883         }
 884
 885         /**
 886          * Escape a value so that it can be used as a CSS class name. A leading
 887          * digit or hyphen, control characters, spaces, most ASCII punctuation
 888          * and non-breaking spaces become underscores; runs of underscores
 889          * collapse to one and trailing underscores are removed.
 890          *
 891          * @todo For extra validity, input should be validated UTF-8.
 892          * @see http://www.w3.org/TR/CSS21/syndata.html Valid characters/format
 893          *
 894          * @param $class String
 895          * @return String
 896          */
 897         static function escapeClass( $class ) {
 898                 $ugly = '/(^[0-9\\-])|[\\x00-\\x20!"#$%&\'()*+,.\\/:;<=>?@[\\]^`{|}~]|\\xC2\\xA0/';
 899                 $class = preg_replace( $ugly, '_', $class );
 900                 $class = preg_replace( '/_+/', '_', $class );
 901                 return rtrim( $class, '_' );
 902         }
 903
 904         /**
 905          * Escape HTML input with htmlspecialchars, then restore the entities it
 906          * contained so that (generally harmless) ones like &nbsp; survive.
 907          *
 908          * @param $html String to escape
 909          * @return String: escaped input
 910          */
 911         static function escapeHtmlAllowEntities( $html ) {
 912                 # Quotes of both kinds are escaped as a matter of course; it
 913                 # can't hurt.
 914                 $escaped = htmlspecialchars( $html, ENT_QUOTES );
 915                 # Undo the escaping of ampersands, then let normalization
 916                 # re-escape whichever ones do not start a valid reference.
 917                 return Sanitizer::normalizeCharReferences( str_replace( '&amp;', '&', $escaped ) );
 918         }
 919
 920         /**
 921          * Regex callback: armor a matched link by encoding its colons as &#58;.
 922          * @param $matches Array
 923          * @return string
 924          */
 925         private static function armorLinksCallback( $matches ) {
 926                 return implode( '&#58;', explode( ':', $matches[1] ) );
 927         }
 928
 929         /**
 930          * Parse a partial tag string into an associative array of attribute
 931          * names and values. Names are forced to lowercase; whitespace in values
 932          * is collapsed and character references are decoded to UTF-8 text.
 933          *
 934          * @param $text String
 935          * @return Array
 936          */
 937         public static function decodeTagAttributes( $text ) {
 938                 if ( trim( $text ) == '' ) {
 939                         return array();
 940                 }
 941
 942                 // One match set per attribute; no match at all means no
 943                 // attributes.
 944                 $matchSets = array();
 945                 $found = preg_match_all(
 946                         MW_ATTRIBS_REGEX, $text, $matchSets, PREG_SET_ORDER );
 947                 if ( !$found ) {
 948                         return array();
 949                 }
 950
 951                 $decoded = array();
 952                 foreach ( $matchSets as $match ) {
 953                         $name = strtolower( $match[1] );
 954                         $raw = Sanitizer::getTagAttributeCallback( $match );
 955
 956                         // Normalize whitespace: collapse runs and trim both ends
 957                         $clean = trim( preg_replace( '/[\t\r\n ]+/', ' ', $raw ) );
 958
 959                         // Decode character references; a repeated name overwrites
 960                         // the earlier entry.
 961                         $decoded[$name] = Sanitizer::decodeCharReferences( $clean );
 962                 }
 963
 964                 return $decoded;
 965         }
 966
 967         /**
 968          * Select the attribute value out of one MW_ATTRIBS_REGEX match set.
 969          * The highest-numbered value group present in the set is used.
 970          *
 971          * @param $set Array
 972          * @return String
 973          */
 974         private static function getTagAttributeCallback( $set ) {
 975                 # Value groups, checked from last to first:
 976                 #   6 - illegal #XXXXXX color with no quotes
 977                 #   5 - no quotes
 978                 #   4 - single-quoted
 979                 #   3 - double-quoted
 980                 for ( $group = 6; $group >= 3; $group-- ) {
 981                         if ( isset( $set[$group] ) ) {
 982                                 return $set[$group];
 983                         }
 984                 }
 985
 986                 # Group 2 is the whole "= value" part; having it without any
 987                 # value group should be impossible.
 988                 if ( isset( $set[2] ) ) {
 989                         throw new MWException( "Tag conditions not met. This should never happen and is a bug." );
 990                 }
 991                 # In XHTML, attributes must have a value.
 992                 # For 'reduced' form, return explicitly the attribute name here.
 993                 return $set[1];
 994         }
 995
 996         /**
 997          * Normalize character references, whitespace and double quotes in an
 998          * XML source-encoded text for an attribute value.
 999          *
1000          * See http://www.w3.org/TR/REC-xml/#AVNormalize for background,
1001          * but note that the result is not the attribute's value; it is an
1002          * XML source fragment that will be slapped into output.
1003          *
1004          * @param $text String
1005          * @return String
1006          */
1007         private static function normalizeAttributeValue( $text ) {
1008                 $text = Sanitizer::normalizeCharReferences( $text );
1009                 $text = self::normalizeWhitespace( $text );
1010                 return str_replace( '"', '&quot;', $text );
1011         }
1012
1013         /** Replace each CRLF pair, space, CR, LF or tab in $text by one space. */
1014         private static function normalizeWhitespace( $text ) {
1015                 $blank = '/\r\n|[\x20\x0d\x0a\x09]/';
1016                 $flat = preg_replace( $blank, ' ', $text );
1017                 return $flat;
1018         }
1019
1020         /**
1021          * Make sure every entity and character reference in the text is legal
1022          * for XML, and for XHTML specifically. Anything that does not qualify
1023          * is &amp;-escaped so that the result is a valid text fragment.
1024          *
1025          * a. any named char refs must be known in XHTML
1026          * b. any numeric char refs must be legal chars, not invalid or forbidden
1027          * c. use &#x, not &#X
1028          * d. fix or reject non-valid attributes
1029          *
1030          * @param $text String
1031          * @return String
1032          * @private
1033          */
1034         static function normalizeCharReferences( $text ) {
1035                 // Each candidate matched by MW_CHAR_REFS_REGEX is rewritten by
1036                 // normalizeCharReferencesCallback().
1037                 $rewrite = array( 'Sanitizer', 'normalizeCharReferencesCallback' );
1038                 return preg_replace_callback( MW_CHAR_REFS_REGEX, $rewrite, $text );
1039         }
1040         /**
1041          * Callback for normalizeCharReferences(): normalize a single match.
1042          * @param $matches Array: match groups from MW_CHAR_REFS_REGEX
1043          * @return String
1044          */
1045         static function normalizeCharReferencesCallback( $matches ) {
1046                 static $handlers = array(
1047                         1 => 'normalizeEntity',  # named entity
1048                         2 => 'decCharReference', # decimal digits after &#
1049                         3 => 'hexCharReference', # hex digits after &#x
1050                         4 => 'hexCharReference', # hex digits after &#X
1051                 );
1052                 foreach ( $handlers as $group => $method ) {
1053                         if ( $matches[$group] != '' ) {
1054                                 $ret = call_user_func( array( 'Sanitizer', $method ), $matches[$group] );
1055                                 return is_null( $ret ) ? htmlspecialchars( $matches[0] ) : $ret;
1056                         }
1057                 }
1058                 # None of the groups above matched: an ampersand by itself
1059                 return htmlspecialchars( $matches[0] );
1060         }
1061
1062         /**
1063          * Normalize a named entity. A MediaWiki-specific alias is replaced by
1064          * its HTML equivalent, an entity defined in the HTML 4.0/XHTML 1.0 DTD
1065          * is returned as is, and anything else comes back as HTML-escaped
1066          * pseudo-entity source (eg &amp;foo;).
1067          *
1068          * @param $name String
1069          * @return String
1070          */
1071         static function normalizeEntity( $name ) {
1072                 global $wgHtmlEntities, $wgHtmlEntityAliases;
1073
1074                 if ( isset( $wgHtmlEntityAliases[$name] ) ) {
1075                         return '&' . $wgHtmlEntityAliases[$name] . ';';
1076                 }
1077                 $prefix = isset( $wgHtmlEntities[$name] ) ? '&' : '&amp;';
1078
1079                 return $prefix . $name . ';';
1080         }
1081
1082         /** Return the decimal reference &#N; for a valid codepoint, else null. */
1083         static function decCharReference( $codepoint ) {
1084                 $point = intval( $codepoint );
1085                 $valid = Sanitizer::validateCodepoint( $point );
1086
1087                 # null tells the caller to escape the original text instead
1088                 return $valid ? sprintf( '&#%d;', $point ) : null;
1089         }
1090
1091         /** Return the lowercase hex reference &#xN; for a valid codepoint, else null. */
1092         static function hexCharReference( $codepoint ) {
1093                 $point = hexdec( $codepoint );
1094                 if ( !Sanitizer::validateCodepoint( $point ) ) {
1095                         return null;
1096                 }
1097                 return sprintf( '&#x%x;', $point );
1098         }
1099
1100         /**
1101          * Tell whether a Unicode codepoint is a valid character in XML.
1102          * @param $codepoint Integer
1103          * @return Boolean
1104          */
1105         private static function validateCodepoint( $codepoint ) {
1106                 if ( $codepoint == 0x09 || $codepoint == 0x0a || $codepoint == 0x0d ) {
1107                         return true; # tab, line feed, carriage return
1108                 }
1109                 return ( $codepoint >= 0x20 && $codepoint <= 0xd7ff )
1110                         || ( $codepoint >= 0xe000 && $codepoint <= 0xfffd )
1111                         || ( $codepoint >= 0x10000 && $codepoint <= 0x10ffff );
1112         }
1113
1114         /**
1115          * Replace every character reference in the text, numeric or named
1116          * entity, by the character itself and return the UTF-8 result.
1117          *
1118          * @param $text String
1119          * @return String
1120          */
1121         public static function decodeCharReferences( $text ) {
1122                 // Each candidate matched by MW_CHAR_REFS_REGEX is resolved by
1123                 // decodeCharReferencesCallback().
1124                 $decode = array( 'Sanitizer', 'decodeCharReferencesCallback' );
1125                 return preg_replace_callback( MW_CHAR_REFS_REGEX, $decode, $text );
1126         }
1127
1128         /**
1129          * Callback for decodeCharReferences(): decode a single match.
1130          * @param $matches Array: match groups from MW_CHAR_REFS_REGEX
1131          * @return String
1132          */
1133         static function decodeCharReferencesCallback( $matches ) {
1134                 if ( $matches[1] != '' ) {
1135                         return Sanitizer::decodeEntity( $matches[1] );
1136                 }
1137                 if ( $matches[2] != '' ) {
1138                         return Sanitizer::decodeChar( intval( $matches[2] ) );
1139                 }
1140                 # Groups 3 and 4 are hex digits after &#x and &#X respectively
1141                 $hex = ( $matches[3] != '' ) ? $matches[3] : $matches[4];
1142                 # No digits at all: the match is an ampersand by itself
1143                 return ( $hex != '' ) ? Sanitizer::decodeChar( hexdec( $hex ) ) : $matches[0];
1144         }
1145
1146         /**
1147          * Convert a codepoint to its UTF-8 string. A codepoint that is not a
1148          * valid character reference yields U+FFFD REPLACEMENT CHARACTER.
1149          * @param $codepoint Integer
1150          * @return String
1151          * @private
1152          */
1153         static function decodeChar( $codepoint ) {
1154                 if ( !Sanitizer::validateCodepoint( $codepoint ) ) {
1155                         return UTF8_REPLACEMENT;
1156                 }
1157
1158                 return codepointToUtf8( $codepoint );
1159         }
1160
1161         /**
1162          * Decode a named entity. If the name, after resolving a MediaWiki
1163          * alias, is defined in the HTML 4.0/XHTML 1.0 DTD, the UTF-8 encoding
1164          * of that character is returned; otherwise pseudo-entity source
1165          * (eg &foo;) is returned.
1166          *
1167          * @param $name String
1168          * @return String
1169          */
1170         static function decodeEntity( $name ) {
1171                 global $wgHtmlEntities, $wgHtmlEntityAliases;
1172
1173                 $resolved = isset( $wgHtmlEntityAliases[$name] )
1174                         ? $wgHtmlEntityAliases[$name]
1175                         : $name;
1176                 return isset( $wgHtmlEntities[$resolved] )
1177                         ? codepointToUtf8( $wgHtmlEntities[$resolved] )
1178                         : "&$resolved;";
1179         }
1180
1181         /**
1182          * Look up the attributes allowed on a given element. The full table
1183          * is built on first use and kept for later calls.
1184          * @param $element String
1185          * @return Array: allowed attribute names, empty for unknown elements
1186          */
1187         static function attributeWhitelist( $element ) {
1188                 static $table = null;
1189                 if ( $table === null ) {
1190                         $table = Sanitizer::setupAttributeWhitelist();
1191                 }
1192
1193                 $known = isset( $table[$element] );
1194                 return $known ? $table[$element] : array();
1195         }
1196
1197         /**
1198          * Build the attribute whitelist: an array keyed by allowed HTML
1199          * element name whose values list the attributes allowed on it.
1200          * @return Array
1201          */
1202         static function setupAttributeWhitelist() {
1203                 global $wgAllowRdfaAttributes, $wgHtml5, $wgAllowMicrodataAttributes;
1204
1205                 # Attributes shared by most of the elements below
1206                 $core = array( 'id', 'class', 'lang', 'dir', 'title', 'style', 'xml:lang' );
1207
1208                 if ( $wgAllowRdfaAttributes ) {
1209                         # RDFa attributes as specified in section 9 of
1210                         # http://www.w3.org/TR/2008/REC-rdfa-syntax-20081014
1211                         array_push( $core, 'about', 'property', 'resource', 'datatype', 'typeof' );
1212                 }
1213
1214                 if ( $wgHtml5 && $wgAllowMicrodataAttributes ) {
1215                         # HTML5 microdata attributes as specified by
1216                         # http://www.whatwg.org/specs/web-apps/current-work/multipage/microdata.html#the-microdata-model
1217                         array_push( $core, 'itemid', 'itemprop', 'itemref', 'itemscope', 'itemtype' );
1218                 }
1219
1220                 # Recurring combinations
1221                 $aligned = array_merge( $core, array( 'align' ) );
1222                 $edit = array_merge( $core, array( 'cite', 'datetime' ) );
1223                 $tableAlign = array( 'align', 'char', 'charoff', 'valign' );
1224                 $rowGroup = array_merge( $core, $tableAlign );
1225                 $column = array_merge( $core, array( 'span', 'width' ), $tableAlign );
1226                 $cell = array_merge(
1227                         $core,
1228                         array(
1229                                 'abbr', 'axis', 'headers', 'scope', 'rowspan', 'colspan',
1230                                 # the remaining four are deprecated
1231                                 'nowrap', 'width', 'height', 'bgcolor',
1232                         ),
1233                         $tableAlign );
1234
1235                 # Numbers refer to sections in HTML 4.01 standard describing the element.
1236                 # See: http://www.w3.org/TR/html4/
1237                 return array(
1238                         # 7.5.4
1239                         'div'        => $aligned,
1240                         'center'     => $core, # deprecated
1241                         'span'       => $aligned, # ??
1242
1243                         # 7.5.5
1244                         'h1'         => $aligned,
1245                         'h2'         => $aligned,
1246                         'h3'         => $aligned,
1247                         'h4'         => $aligned,
1248                         'h5'         => $aligned,
1249                         'h6'         => $aligned,
1250
1251                         # 7.5.6
1252                         # address
1253
1254                         # 8.2.4
1255                         # bdo
1256
1257                         # 9.2.1
1258                         'em'         => $core,
1259                         'strong'     => $core,
1260                         'cite'       => $core,
1261                         # dfn
1262                         'code'       => $core,
1263                         # samp
1264                         # kbd
1265                         'var'        => $core,
1266                         'abbr'       => $core,
1267                         # acronym
1268
1269                         # 9.2.2
1270                         'blockquote' => array_merge( $core, array( 'cite' ) ),
1271                         # q
1272
1273                         # 9.2.3
1274                         'sub'        => $core,
1275                         'sup'        => $core,
1276
1277                         # 9.3.1
1278                         'p'          => $aligned,
1279
1280                         # 9.3.2
1281                         'br'         => array( 'id', 'class', 'title', 'style', 'clear' ),
1282
1283                         # 9.3.4
1284                         'pre'        => array_merge( $core, array( 'width' ) ),
1285
1286                         # 9.4
1287                         'ins'        => $edit,
1288                         'del'        => $edit,
1289
1290                         # 10.2
1291                         'ul'         => array_merge( $core, array( 'type' ) ),
1292                         'ol'         => array_merge( $core, array( 'type', 'start' ) ),
1293                         'li'         => array_merge( $core, array( 'type', 'value' ) ),
1294
1295                         # 10.3
1296                         'dl'         => $core,
1297                         'dd'         => $core,
1298                         'dt'         => $core,
1299
1300                         # 11.2.1
1301                         'table'      => array_merge( $core, array(
1302                                 'summary', 'width', 'border', 'frame',
1303                                 'rules', 'cellspacing', 'cellpadding',
1304                                 'align', 'bgcolor',
1305                         ) ),
1306
1307                         # 11.2.2
1308                         'caption'    => $aligned,
1309
1310                         # 11.2.3
1311                         'thead'      => $rowGroup,
1312                         'tfoot'      => $rowGroup,
1313                         'tbody'      => $rowGroup,
1314
1315                         # 11.2.4
1316                         'colgroup'   => $column,
1317                         'col'        => $column,
1318
1319                         # 11.2.5
1320                         'tr'         => array_merge( $core, array( 'bgcolor' ), $tableAlign ),
1321
1322                         # 11.2.6
1323                         'td'         => $cell,
1324                         'th'         => $cell,
1325
1326                         # 12.2 # NOTE: <a> is not allowed directly, but the attrib
1327                         # whitelist is used from the Parser object
1328                         'a'          => array_merge( $core, array( 'href', 'rel', 'rev' ) ), # rel/rev esp. for RDFa
1329
1330                         # 13.2
1331                         # Not usually allowed, but may be used for extension-style hooks
1332                         # such as <math> when it is rasterized
1333                         'img'        => array_merge( $core, array( 'alt' ) ),
1334
1335                         # 15.2.1
1336                         'tt'         => $core,
1337                         'b'          => $core,
1338                         'i'          => $core,
1339                         'big'        => $core,
1340                         'small'      => $core,
1341                         'strike'     => $core,
1342                         's'          => $core,
1343                         'u'          => $core,
1344
1345                         # 15.2.2
1346                         'font'       => array_merge( $core, array( 'size', 'color', 'face' ) ),
1347                         # basefont
1348
1349                         # 15.3
1350                         'hr'         => array_merge( $core, array( 'noshade', 'size', 'width' ) ),
1351
1352                         # XHTML Ruby annotation text module, simple ruby only.
1353                         # http://www.w3c.org/TR/ruby/
1354                         'ruby'       => $core,
1355                         # rbc
1356                         # rtc
1357                         'rb'         => $core,
1358                         'rt'         => $core, # 'rbspan' is not whitelisted
1359                         'rp'         => $core,
1360
1361                         # MathML root element, where used for extensions
1362                         # 'title' may not be 100% valid here; it's XHTML
1363                         # http://www.w3.org/TR/REC-MathML/
1364                         'math'       => array( 'class', 'style', 'id', 'title' ),
1365                 );
1366         }
1367
1368         /**
1369          * Take a fragment of (potentially invalid) HTML and return
1370          * a version with any tags removed, encoded as plain text.
1371          *
1372          * Warning: this return value must be further escaped for literal
1373          * inclusion in HTML output as of 1.10!
1374          *
1375          * @param $text String: HTML fragment
1376          * @return String
1377          */
1378         static function stripAllTags( $text ) {
1379                 # Actual <tags>
1380                 $text = StringUtils::delimiterReplace( '<', '>', '', $text );
1381
1382                 # Normalize &entities and whitespace
1383                 $text = self::decodeCharReferences( $text );
1384                 $text = self::normalizeWhitespace( $text );
1385
1386                 return $text;
1387         }
1388
1389         /**
1390          * Hack up a private DOCTYPE with HTML's standard entity declarations.
1391          * PHP 4 seemed to know these if you gave it an HTML doctype, but
1392          * PHP 5.1 doesn't.
1393          *
1394          * Use for passing XHTML fragments to PHP's XML parsing functions
1395          *
1396          * @return String
1397          */
1398         static function hackDocType() {
1399                 global $wgHtmlEntities;
1400                 $out = "<!DOCTYPE html [\n";
1401                 foreach( $wgHtmlEntities as $entity => $codepoint ) {
1402                         $out .= "<!ENTITY $entity \"&#$codepoint;\">";
1403                 }
1404                 $out .= "]>\n";
1405                 return $out;
1406         }
1407
1408         static function cleanUrl( $url ) {
1409                 # Normalize any HTML entities in input. They will be
1410                 # re-escaped by makeExternalLink().
1411                 $url = Sanitizer::decodeCharReferences( $url );
1412
1413                 # Escape any control characters introduced by the above step
1414                 $url = preg_replace( '/[\][<>"\\x00-\\x20\\x7F]/e', "urlencode('\\0')", $url );
1415
1416                 # Validate hostname portion
1417                 $matches = array();
1418                 if( preg_match( '!^([^:]+:)(//[^/]+)?(.*)$!iD', $url, $matches ) ) {
1419                         list( /* $whole */, $protocol, $host, $rest ) = $matches;
1420
1421                         // Characters that will be ignored in IDNs.
1422                         // http://tools.ietf.org/html/3454#section-3.1
1423                         // Strip them before further processing so blacklists and such work.
1424                         $strip = "/
1425                                 \\s|          # general whitespace
1426                                 \xc2\xad|     # 00ad SOFT HYPHEN
1427                                 \xe1\xa0\x86| # 1806 MONGOLIAN TODO SOFT HYPHEN
1428                                 \xe2\x80\x8b| # 200b ZERO WIDTH SPACE
1429                                 \xe2\x81\xa0| # 2060 WORD JOINER
1430                                 \xef\xbb\xbf| # feff ZERO WIDTH NO-BREAK SPACE
1431                                 \xcd\x8f|     # 034f COMBINING GRAPHEME JOINER
1432                                 \xe1\xa0\x8b| # 180b MONGOLIAN FREE VARIATION SELECTOR ONE
1433                                 \xe1\xa0\x8c| # 180c MONGOLIAN FREE VARIATION SELECTOR TWO
1434                                 \xe1\xa0\x8d| # 180d MONGOLIAN FREE VARIATION SELECTOR THREE
1435                                 \xe2\x80\x8c| # 200c ZERO WIDTH NON-JOINER
1436                                 \xe2\x80\x8d| # 200d ZERO WIDTH JOINER
1437                                 [\xef\xb8\x80-\xef\xb8\x8f] # fe00-fe00f VARIATION SELECTOR-1-16
1438                                 /xuD";
1439
1440                         $host = preg_replace( $strip, '', $host );
1441
1442                         // @fixme: validate hostnames here
1443
1444                         return $protocol . $host . $rest;
1445                 } else {
1446                         return $url;
1447                 }
1448         }
1449
1450 }