includes/Sanitizer.php

   1 <?php
   2 /**
   3  * XHTML sanitizer for MediaWiki
   4  *
   5  * Copyright (C) 2002-2005 Brion Vibber <brion@pobox.com> et al
   6  * http://www.mediawiki.org/
   7  *
   8  * This program is free software; you can redistribute it and/or modify
   9  * it under the terms of the GNU General Public License as published by
  10  * the Free Software Foundation; either version 2 of the License, or
  11  * (at your option) any later version.
  12  *
  13  * This program is distributed in the hope that it will be useful,
  14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  16  * GNU General Public License for more details.
  17  *
  18  * You should have received a copy of the GNU General Public License along
  19  * with this program; if not, write to the Free Software Foundation, Inc.,
  20  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  21  * http://www.gnu.org/copyleft/gpl.html
  22  *
  23  * @file
  24  * @ingroup Parser
  25  */
  26
   /**
    * Regular expression to match various types of character references in
    * Sanitizer::normalizeCharReferences and Sanitizer::decodeCharReferences.
    *
    * The pattern is written in extended (/x) mode, so the line breaks and
    * indentation inside the literal are not significant. Exactly one of the
    * following capture groups is filled per match:
    *  - 1: a named reference, e.g. "&amp;" (letters, digits or bytes 0x80-0xff)
    *  - 2: a decimal numeric reference, e.g. "&#38;"
    *  - 3: a numeric reference introduced by a lower-case "x", e.g. "&#x26;"
    *  - 4: a numeric reference introduced by an upper-case "X", e.g. "&#X26;"
    *  - 5: a lone "&" that does not start any of the above
    *
    * NOTE(review): groups 3 and 4 accept any ASCII letter or digit, not only
    * hexadecimal digits, so the consumers of this pattern have to cope with
    * values such as "&#xZZ;" -- confirm against those callers before tightening.
    */
   define( 'MW_CHAR_REFS_REGEX',
        '/&([A-Za-z0-9\x80-\xff]+);
         |&\#([0-9]+);
         |&\#x([0-9A-Za-z]+);
         |&\#X([0-9A-Za-z]+);
         |(&)/x' );
  37
   /**
    * Regular expression to match HTML/XML attribute pairs within a tag.
    * Allows some... latitude.
    * Used in Sanitizer::fixTagAttributes and Sanitizer::decodeTagAttributes
    *
    * $attrib and $space are regex fragments interpolated into the pattern
    * below; $attrib is also reused by MW_XMLNS_ATTRIBUTE_PATTRN.
    *
    * The pattern uses the /s and /x modifiers; the "#" remarks inside the
    * literal are regex comments, not PHP comments. Capture groups:
    *  - 1: attribute name, optionally prefixed with "xml:" or "xmlns:"
    *  - 2: the optional "= value" part, including the surrounding whitespace
    *  - 3: value given in double quotes (may not contain "<" or a double quote)
    *  - 4: value given in single quotes (may not contain "<" or a single quote)
    *  - 5: unquoted value
    *  - 6: unquoted "#" followed by hex digits (colour values)
    * A match must start at the beginning of the subject or after whitespace
    * and must be followed by whitespace or the end of the subject.
    *
    * NOTE(review): the character class of group 5 already contains "#", so
    * group 6 looks like it can never be the alternative that matches -- confirm.
    */
   # One character allowed in an attribute name
   $attrib = '[A-Za-z0-9]';
   # One whitespace character: tab, line feed, carriage return or space
   $space = '[\x09\x0a\x0d\x20]';
   define( 'MW_ATTRIBS_REGEX',
        "/(?:^|$space)((?:xml:|xmlns:)?$attrib+)
          ($space*=$space*
                (?:
                 # The attribute value: quoted or alone
                  \"([^<\"]*)\"
                 | '([^<']*)'
                 |  ([a-zA-Z0-9!#$%&()*,\\-.\\/:;<>?@[\\]^_`{|}~]+)
                 |  (\#[0-9a-fA-F]+) # Technically wrong, but lots of
                                                         # colors are specified like this.
                                                         # We'll be normalizing it.
                )
           )?(?=$space|\$)/sx" );
  58
   /**
    * Regular expression to match URIs that could trigger script execution.
    *
    * Matches, case-insensitively, the word "javascript" or "vbscript" when it
    * appears at the start of the value, after whitespace, or after a "*" "/"
    * pair (optionally followed by whitespace), and is itself followed by a
    * non-word character or the end of the value.
    */
   define( 'MW_EVIL_URI_PATTERN', '!(^|\s|\*/\s*)(javascript|vbscript)([^\w]|$)!i' );
  63
   /**
    * Regular expression to match namespace attributes, i.e. an attribute name
    * consisting of "xmlns:" followed by one or more $attrib characters.
    *
    * Relies on the global $attrib fragment defined alongside MW_ATTRIBS_REGEX.
    * The constant name is misspelled ("PATTRN"); it is kept as-is because
    * Sanitizer::validateAttributes refers to it under this name.
    */
   define( 'MW_XMLNS_ATTRIBUTE_PATTRN', "/^xmlns:$attrib+$/" );
  68
   /**
    * List of all named character entities defined in HTML 4.01
    * http://www.w3.org/TR/html4/sgml/entities.html
    *
    * Maps the case-sensitive entity name (without the leading "&" and the
    * trailing ";") to the decimal number of the character it stands for,
    * e.g. 'amp' => 38. Names that differ only in case are distinct entries
    * (compare 'dagger' and 'Dagger').
    * @private
    */
   global $wgHtmlEntities;
   $wgHtmlEntities = array(
        'Aacute'   => 193,
        'aacute'   => 225,
        'Acirc'    => 194,
        'acirc'    => 226,
        'acute'    => 180,
        'AElig'    => 198,
        'aelig'    => 230,
        'Agrave'   => 192,
        'agrave'   => 224,
        'alefsym'  => 8501,
        'Alpha'    => 913,
        'alpha'    => 945,
        'amp'      => 38,
        'and'      => 8743,
        'ang'      => 8736,
        'Aring'    => 197,
        'aring'    => 229,
        'asymp'    => 8776,
        'Atilde'   => 195,
        'atilde'   => 227,
        'Auml'     => 196,
        'auml'     => 228,
        'bdquo'    => 8222,
        'Beta'     => 914,
        'beta'     => 946,
        'brvbar'   => 166,
        'bull'     => 8226,
        'cap'      => 8745,
        'Ccedil'   => 199,
        'ccedil'   => 231,
        'cedil'    => 184,
        'cent'     => 162,
        'Chi'      => 935,
        'chi'      => 967,
        'circ'     => 710,
        'clubs'    => 9827,
        'cong'     => 8773,
        'copy'     => 169,
        'crarr'    => 8629,
        'cup'      => 8746,
        'curren'   => 164,
        'dagger'   => 8224,
        'Dagger'   => 8225,
        'darr'     => 8595,
        'dArr'     => 8659,
        'deg'      => 176,
        'Delta'    => 916,
        'delta'    => 948,
        'diams'    => 9830,
        'divide'   => 247,
        'Eacute'   => 201,
        'eacute'   => 233,
        'Ecirc'    => 202,
        'ecirc'    => 234,
        'Egrave'   => 200,
        'egrave'   => 232,
        'empty'    => 8709,
        'emsp'     => 8195,
        'ensp'     => 8194,
        'Epsilon'  => 917,
        'epsilon'  => 949,
        'equiv'    => 8801,
        'Eta'      => 919,
        'eta'      => 951,
        'ETH'      => 208,
        'eth'      => 240,
        'Euml'     => 203,
        'euml'     => 235,
        'euro'     => 8364,
        'exist'    => 8707,
        'fnof'     => 402,
        'forall'   => 8704,
        'frac12'   => 189,
        'frac14'   => 188,
        'frac34'   => 190,
        'frasl'    => 8260,
        'Gamma'    => 915,
        'gamma'    => 947,
        'ge'       => 8805,
        'gt'       => 62,
        'harr'     => 8596,
        'hArr'     => 8660,
        'hearts'   => 9829,
        'hellip'   => 8230,
        'Iacute'   => 205,
        'iacute'   => 237,
        'Icirc'    => 206,
        'icirc'    => 238,
        'iexcl'    => 161,
        'Igrave'   => 204,
        'igrave'   => 236,
        'image'    => 8465,
        'infin'    => 8734,
        'int'      => 8747,
        'Iota'     => 921,
        'iota'     => 953,
        'iquest'   => 191,
        'isin'     => 8712,
        'Iuml'     => 207,
        'iuml'     => 239,
        'Kappa'    => 922,
        'kappa'    => 954,
        'Lambda'   => 923,
        'lambda'   => 955,
        'lang'     => 9001,
        'laquo'    => 171,
        'larr'     => 8592,
        'lArr'     => 8656,
        'lceil'    => 8968,
        'ldquo'    => 8220,
        'le'       => 8804,
        'lfloor'   => 8970,
        'lowast'   => 8727,
        'loz'      => 9674,
        'lrm'      => 8206,
        'lsaquo'   => 8249,
        'lsquo'    => 8216,
        'lt'       => 60,
        'macr'     => 175,
        'mdash'    => 8212,
        'micro'    => 181,
        'middot'   => 183,
        'minus'    => 8722,
        'Mu'       => 924,
        'mu'       => 956,
        'nabla'    => 8711,
        'nbsp'     => 160,
        'ndash'    => 8211,
        'ne'       => 8800,
        'ni'       => 8715,
        'not'      => 172,
        'notin'    => 8713,
        'nsub'     => 8836,
        'Ntilde'   => 209,
        'ntilde'   => 241,
        'Nu'       => 925,
        'nu'       => 957,
        'Oacute'   => 211,
        'oacute'   => 243,
        'Ocirc'    => 212,
        'ocirc'    => 244,
        'OElig'    => 338,
        'oelig'    => 339,
        'Ograve'   => 210,
        'ograve'   => 242,
        'oline'    => 8254,
        'Omega'    => 937,
        'omega'    => 969,
        'Omicron'  => 927,
        'omicron'  => 959,
        'oplus'    => 8853,
        'or'       => 8744,
        'ordf'     => 170,
        'ordm'     => 186,
        'Oslash'   => 216,
        'oslash'   => 248,
        'Otilde'   => 213,
        'otilde'   => 245,
        'otimes'   => 8855,
        'Ouml'     => 214,
        'ouml'     => 246,
        'para'     => 182,
        'part'     => 8706,
        'permil'   => 8240,
        'perp'     => 8869,
        'Phi'      => 934,
        'phi'      => 966,
        'Pi'       => 928,
        'pi'       => 960,
        'piv'      => 982,
        'plusmn'   => 177,
        'pound'    => 163,
        'prime'    => 8242,
        'Prime'    => 8243,
        'prod'     => 8719,
        'prop'     => 8733,
        'Psi'      => 936,
        'psi'      => 968,
        'quot'     => 34,
        'radic'    => 8730,
        'rang'     => 9002,
        'raquo'    => 187,
        'rarr'     => 8594,
        'rArr'     => 8658,
        'rceil'    => 8969,
        'rdquo'    => 8221,
        'real'     => 8476,
        'reg'      => 174,
        'rfloor'   => 8971,
        'Rho'      => 929,
        'rho'      => 961,
        'rlm'      => 8207,
        'rsaquo'   => 8250,
        'rsquo'    => 8217,
        'sbquo'    => 8218,
        'Scaron'   => 352,
        'scaron'   => 353,
        'sdot'     => 8901,
        'sect'     => 167,
        'shy'      => 173,
        'Sigma'    => 931,
        'sigma'    => 963,
        'sigmaf'   => 962,
        'sim'      => 8764,
        'spades'   => 9824,
        'sub'      => 8834,
        'sube'     => 8838,
        'sum'      => 8721,
        'sup'      => 8835,
        'sup1'     => 185,
        'sup2'     => 178,
        'sup3'     => 179,
        'supe'     => 8839,
        'szlig'    => 223,
        'Tau'      => 932,
        'tau'      => 964,
        'there4'   => 8756,
        'Theta'    => 920,
        'theta'    => 952,
        'thetasym' => 977,
        'thinsp'   => 8201,
        'THORN'    => 222,
        'thorn'    => 254,
        'tilde'    => 732,
        'times'    => 215,
        'trade'    => 8482,
        'Uacute'   => 218,
        'uacute'   => 250,
        'uarr'     => 8593,
        'uArr'     => 8657,
        'Ucirc'    => 219,
        'ucirc'    => 251,
        'Ugrave'   => 217,
        'ugrave'   => 249,
        'uml'      => 168,
        'upsih'    => 978,
        'Upsilon'  => 933,
        'upsilon'  => 965,
        'Uuml'     => 220,
        'uuml'     => 252,
        'weierp'   => 8472,
        'Xi'       => 926,
        'xi'       => 958,
        'Yacute'   => 221,
        'yacute'   => 253,
        'yen'      => 165,
        'Yuml'     => 376,
        'yuml'     => 255,
        'Zeta'     => 918,
        'zeta'     => 950,
        'zwj'      => 8205,
        'zwnj'     => 8204 );
 328
  /**
   * Character entity aliases accepted by MediaWiki.
   *
   * Maps an alternative entity name to the name of an entry in
   * $wgHtmlEntities. Both aliases listed here are non-Latin-script names
   * (Hebrew and Arabic letters) that stand for 'rlm'.
   */
  global $wgHtmlEntityAliases;
  $wgHtmlEntityAliases = array(
        'רלמ' => 'rlm',
        'رلم' => 'rlm',
  );
 337
 338
 339 /**
 340  * XHTML sanitizer for MediaWiki
 341  * @ingroup Parser
 342  */
 343 class Sanitizer {
 344         /**
 345          * Cleans up HTML, removes dangerous tags and attributes, and
 346          * removes HTML comments
 347          * @private
 348          * @param $text String
 349          * @param $processCallback Callback to do any variable or parameter replacements in HTML attribute values
 350          * @param $args Array for the processing callback
 351          * @param $extratags Array for any extra tags to include
 352          * @param $removetags Array for any tags (default or extra) to exclude
 353          * @return string
 354          */
 355         static function removeHTMLtags( $text, $processCallback = null, $args = array(), $extratags = array(), $removetags = array() ) {
 356                 global $wgUseTidy;
 357
 358                 static $htmlpairsStatic, $htmlsingle, $htmlsingleonly, $htmlnest, $tabletags,
 359                         $htmllist, $listtags, $htmlsingleallowed, $htmlelementsStatic, $staticInitialised;
 360
 361                 wfProfileIn( __METHOD__ );
 362
 363                 if ( !$staticInitialised ) {
 364
 365                         $htmlpairsStatic = array( # Tags that must be closed
 366                                 'b', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
 367                                 'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
 368                                 'strike', 'strong', 'tt', 'var', 'div', 'center',
 369                                 'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
 370                                 'ruby', 'rt' , 'rb' , 'rp', 'p', 'span', 'u', 'abbr'
 371                         );
 372                         $htmlsingle = array(
 373                                 'br', 'hr', 'li', 'dt', 'dd'
 374                         );
 375                         $htmlsingleonly = array( # Elements that cannot have close tags
 376                                 'br', 'hr'
 377                         );
 378                         $htmlnest = array( # Tags that can be nested--??
 379                                 'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
 380                                 'dl', 'font', 'big', 'small', 'sub', 'sup', 'span'
 381                         );
 382                         $tabletags = array( # Can only appear inside table, we will close them
 383                                 'td', 'th', 'tr',
 384                         );
 385                         $htmllist = array( # Tags used by list
 386                                 'ul','ol',
 387                         );
 388                         $listtags = array( # Tags that can appear in a list
 389                                 'li',
 390                         );
 391
 392                         $htmlsingleallowed = array_unique( array_merge( $htmlsingle, $tabletags ) );
 393                         $htmlelementsStatic = array_unique( array_merge( $htmlsingle, $htmlpairsStatic, $htmlnest ) );
 394
 395                         # Convert them all to hashtables for faster lookup
 396                         $vars = array( 'htmlpairsStatic', 'htmlsingle', 'htmlsingleonly', 'htmlnest', 'tabletags',
 397                                 'htmllist', 'listtags', 'htmlsingleallowed', 'htmlelementsStatic' );
 398                         foreach ( $vars as $var ) {
 399                                 $$var = array_flip( $$var );
 400                         }
 401                         $staticInitialised = true;
 402                 }
 403                 # Populate $htmlpairs and $htmlelements with the $extratags and $removetags arrays
 404                 $extratags = array_flip( $extratags );
 405                 $removetags = array_flip( $removetags );
 406                 $htmlpairs = array_merge( $extratags, $htmlpairsStatic );
 407                 $htmlelements = array_diff_key( array_merge( $extratags, $htmlelementsStatic ) , $removetags );
 408
 409                 # Remove HTML comments
 410                 $text = Sanitizer::removeHTMLcomments( $text );
 411                 $bits = explode( '<', $text );
 412                 $text = str_replace( '>', '&gt;', array_shift( $bits ) );
 413                 if(!$wgUseTidy) {
 414                         $tagstack = $tablestack = array();
 415                         foreach ( $bits as $x ) {
 416                                 $regs = array();
 417                                 if( preg_match( '!^(/?)(\\w+)([^>]*?)(/{0,1}>)([^<]*)$!', $x, $regs ) ) {
 418                                         list( /* $qbar */, $slash, $t, $params, $brace, $rest ) = $regs;
 419                                 } else {
 420                                         $slash = $t = $params = $brace = $rest = null;
 421                                 }
 422
 423                                 $badtag = 0 ;
 424                                 if ( isset( $htmlelements[$t = strtolower( $t )] ) ) {
 425                                         # Check our stack
 426                                         if ( $slash ) {
 427                                                 # Closing a tag...
 428                                                 if( isset( $htmlsingleonly[$t] ) ) {
 429                                                         $badtag = 1;
 430                                                 } elseif ( ( $ot = @array_pop( $tagstack ) ) != $t ) {
 431                                                         if ( isset( $htmlsingleallowed[$ot] ) ) {
 432                                                                 # Pop all elements with an optional close tag
 433                                                                 # and see if we find a match below them
 434                                                                 $optstack = array();
 435                                                                 array_push ($optstack, $ot);
 436                                                                 while ( ( ( $ot = @array_pop( $tagstack ) ) != $t ) &&
 437                                                                                 isset( $htmlsingleallowed[$ot] ) )
 438                                                                 {
 439                                                                         array_push ($optstack, $ot);
 440                                                                 }
 441                                                                 if ( $t != $ot ) {
 442                                                                         # No match. Push the optinal elements back again
 443                                                                         $badtag = 1;
 444                                                                         while ( $ot = @array_pop( $optstack ) ) {
 445                                                                                 array_push( $tagstack, $ot );
 446                                                                         }
 447                                                                 }
 448                                                         } else {
 449                                                                 @array_push( $tagstack, $ot );
 450                                                                 # <li> can be nested in <ul> or <ol>, skip those cases:
 451                                                                 if(!(isset( $htmllist[$ot] ) && isset( $listtags[$t] ) )) {
 452                                                                         $badtag = 1;
 453                                                                 }
 454                                                         }
 455                                                 } else {
 456                                                         if ( $t == 'table' ) {
 457                                                                 $tagstack = array_pop( $tablestack );
 458                                                         }
 459                                                 }
 460                                                 $newparams = '';
 461                                         } else {
 462                                                 # Keep track for later
 463                                                 if ( isset( $tabletags[$t] ) &&
 464                                                 ! in_array( 'table', $tagstack ) ) {
 465                                                         $badtag = 1;
 466                                                 } else if ( in_array( $t, $tagstack ) &&
 467                                                 ! isset( $htmlnest [$t ] ) ) {
 468                                                         $badtag = 1 ;
 469                                                 # Is it a self closed htmlpair ? (bug 5487)
 470                                                 } else if( $brace == '/>' &&
 471                                                 isset( $htmlpairs[$t] ) ) {
 472                                                         $badtag = 1;
 473                                                 } elseif( isset( $htmlsingleonly[$t] ) ) {
 474                                                         # Hack to force empty tag for uncloseable elements
 475                                                         $brace = '/>';
 476                                                 } else if( isset( $htmlsingle[$t] ) ) {
 477                                                         # Hack to not close $htmlsingle tags
 478                                                         $brace = NULL;
 479                                                 } else if( isset( $tabletags[$t] )
 480                                                 &&  in_array($t ,$tagstack) ) {
 481                                                         // New table tag but forgot to close the previous one
 482                                                         $text .= "</$t>";
 483                                                 } else {
 484                                                         if ( $t == 'table' ) {
 485                                                                 array_push( $tablestack, $tagstack );
 486                                                                 $tagstack = array();
 487                                                         }
 488                                                         array_push( $tagstack, $t );
 489                                                 }
 490
 491                                                 # Replace any variables or template parameters with
 492                                                 # plaintext results.
 493                                                 if( is_callable( $processCallback ) ) {
 494                                                         call_user_func_array( $processCallback, array( &$params, $args ) );
 495                                                 }
 496
 497                                                 # Strip non-approved attributes from the tag
 498                                                 $newparams = Sanitizer::fixTagAttributes( $params, $t );
 499                                         }
 500                                         if ( ! $badtag ) {
 501                                                 $rest = str_replace( '>', '&gt;', $rest );
 502                                                 $close = ( $brace == '/>' && !$slash ) ? ' /' : '';
 503                                                 $text .= "<$slash$t$newparams$close>$rest";
 504                                                 continue;
 505                                         }
 506                                 }
 507                                 $text .= '&lt;' . str_replace( '>', '&gt;', $x);
 508                         }
 509                         # Close off any remaining tags
 510                         while ( is_array( $tagstack ) && ($t = array_pop( $tagstack )) ) {
 511                                 $text .= "</$t>\n";
 512                                 if ( $t == 'table' ) { $tagstack = array_pop( $tablestack ); }
 513                         }
 514                 } else {
 515                         # this might be possible using tidy itself
 516                         foreach ( $bits as $x ) {
 517                                 preg_match( '/^(\\/?)(\\w+)([^>]*?)(\\/{0,1}>)([^<]*)$/',
 518                                 $x, $regs );
 519                                 @list( /* $qbar */, $slash, $t, $params, $brace, $rest ) = $regs;
 520                                 if ( isset( $htmlelements[$t = strtolower( $t )] ) ) {
 521                                         if( is_callable( $processCallback ) ) {
 522                                                 call_user_func_array( $processCallback, array( &$params, $args ) );
 523                                         }
 524                                         $newparams = Sanitizer::fixTagAttributes( $params, $t );
 525                                         $rest = str_replace( '>', '&gt;', $rest );
 526                                         $text .= "<$slash$t$newparams$brace$rest";
 527                                 } else {
 528                                         $text .= '&lt;' . str_replace( '>', '&gt;', $x);
 529                                 }
 530                         }
 531                 }
 532                 wfProfileOut( __METHOD__ );
 533                 return $text;
 534         }
 535
 536         /**
 537          * Remove '<!--', '-->', and everything between.
 538          * To avoid leaving blank lines, when a comment is both preceded
 539          * and followed by a newline (ignoring spaces), trim leading and
 540          * trailing spaces and one of the newlines.
 541          *
 542          * @private
 543          * @param $text String
 544          * @return string
 545          */
 546         static function removeHTMLcomments( $text ) {
 547                 wfProfileIn( __METHOD__ );
 548                 while (($start = strpos($text, '<!--')) !== false) {
 549                         $end = strpos($text, '-->', $start + 4);
 550                         if ($end === false) {
 551                                 # Unterminated comment; bail out
 552                                 break;
 553                         }
 554
 555                         $end += 3;
 556
 557                         # Trim space and newline if the comment is both
 558                         # preceded and followed by a newline
 559                         $spaceStart = max($start - 1, 0);
 560                         $spaceLen = $end - $spaceStart;
 561                         while (substr($text, $spaceStart, 1) === ' ' && $spaceStart > 0) {
 562                                 $spaceStart--;
 563                                 $spaceLen++;
 564                         }
 565                         while (substr($text, $spaceStart + $spaceLen, 1) === ' ')
 566                                 $spaceLen++;
 567                         if (substr($text, $spaceStart, 1) === "\n" and substr($text, $spaceStart + $spaceLen, 1) === "\n") {
 568                                 # Remove the comment, leading and trailing
 569                                 # spaces, and leave only one newline.
 570                                 $text = substr_replace($text, "\n", $spaceStart, $spaceLen + 1);
 571                         }
 572                         else {
 573                                 # Remove just the comment.
 574                                 $text = substr_replace($text, '', $start, $end - $start);
 575                         }
 576                 }
 577                 wfProfileOut( __METHOD__ );
 578                 return $text;
 579         }
 580
 581         /**
 582          * Take an array of attribute names and values and normalize or discard
 583          * illegal values for the given element type.
 584          *
 585          * - Discards attributes not on a whitelist for the given element
 586          * - Unsafe style attributes are discarded
 587          * - Invalid id attributes are reencoded
 588          *
 589          * @param $attribs Array
 590          * @param $element String
 591          * @return Array
 592          *
 593          * @todo Check for legal values where the DTD limits things.
 594          * @todo Check for unique id attribute :P
 595          */
 596         static function validateTagAttributes( $attribs, $element ) {
 597                 return Sanitizer::validateAttributes( $attribs,
 598                         Sanitizer::attributeWhitelist( $element ) );
 599         }
 600
 601         /**
 602          * Take an array of attribute names and values and normalize or discard
 603          * illegal values for the given whitelist.
 604          *
 605          * - Discards attributes not the given whitelist
 606          * - Unsafe style attributes are discarded
 607          * - Invalid id attributes are reencoded
 608          *
 609          * @param $attribs Array
 610          * @param $whitelist Array: list of allowed attribute names
 611          * @return Array
 612          *
 613          * @todo Check for legal values where the DTD limits things.
 614          * @todo Check for unique id attribute :P
 615          */
 616         static function validateAttributes( $attribs, $whitelist ) {
 617                 $whitelist = array_flip( $whitelist );
 618                 $hrefExp = '/^(' . wfUrlProtocols() . ')[^\s]+$/';
 619
 620                 $out = array();
 621                 foreach( $attribs as $attribute => $value ) {
 622                         #allow XML namespace declaration. Useful especially with RDFa
 623                         if ( preg_match( MW_XMLNS_ATTRIBUTE_PATTRN, $attribute ) ) {
 624                                 if ( !preg_match( MW_EVIL_URI_PATTERN, $value ) ) {
 625                                         $out[$attribute] = $value;
 626                                 }
 627
 628                                 continue;
 629                         }
 630
 631                         if( !isset( $whitelist[$attribute] ) ) {
 632                                 continue;
 633                         }
 634
 635                         # Strip javascript "expression" from stylesheets.
 636                         # http://msdn.microsoft.com/workshop/author/dhtml/overview/recalc.asp
 637                         if( $attribute == 'style' ) {
 638                                 $value = Sanitizer::checkCss( $value );
 639                                 if( $value === false ) {
 640                                         # haxx0r
 641                                         continue;
 642                                 }
 643                         }
 644
 645                         if ( $attribute === 'id' ) {
 646                                 global $wgEnforceHtmlIds;
 647                                 $value = Sanitizer::escapeId( $value,
 648                                         $wgEnforceHtmlIds ? 'noninitial' : 'xml' );
 649                         }
 650
 651                         //RDFa and microdata properties allow URIs. check them
 652                         if ( $attribute === 'rel' || $attribute === 'rev' ||
 653                                 $attribute === 'about' || $attribute === 'property' || $attribute === 'resource' ||
 654                                 $attribute === 'datatype' || $attribute === 'typeof' ||
 655                                 $attribute === 'item' || $attribute === 'itemprop' || $attribute === 'subject' ) {
 656
 657                                 //Paranoia. Allow "simple" values but suppress javascript
 658                                 if ( preg_match( MW_EVIL_URI_PATTERN, $value ) ) {
 659                                         continue;
 660                                 }
 661                         }
 662
 663                         # NOTE: even though elements using href/src are not allowed directly, supply
 664                         #       validation code that can be used by tag hook handlers, etc
 665                         if ( $attribute === 'href' || $attribute === 'src' ) {
 666                                 if ( !preg_match( $hrefExp, $value ) ) {
 667                                         continue; //drop any href or src attributes not using an allowed protocol.
 668                                                   //NOTE: this also drops all relative URLs
 669                                 }
 670                         }
 671
 672                         // If this attribute was previously set, override it.
 673                         // Output should only have one attribute of each name.
 674                         $out[$attribute] = $value;
 675                 }
 676                 return $out;
 677         }
 678
 679         /**
 680          * Merge two sets of HTML attributes.  Conflicting items in the second set
 681          * will override those in the first, except for 'class' attributes which
 682          * will be combined (if they're both strings).
 683          *
 684          * @todo implement merging for other attributes such as style
 685          * @param $a Array
 686          * @param $b Array
 687          * @return array
 688          */
 689         static function mergeAttributes( $a, $b ) {
 690                 $out = array_merge( $a, $b );
 691                 if( isset( $a['class'] ) && isset( $b['class'] )
 692                 && is_string( $a['class'] ) && is_string( $b['class'] )
 693                 && $a['class'] !== $b['class'] ) {
 694                         $classes = preg_split( '/\s+/', "{$a['class']} {$b['class']}",
 695                                 -1, PREG_SPLIT_NO_EMPTY );
 696                         $out['class'] = implode( ' ', array_unique( $classes ) );
 697                 }
 698                 return $out;
 699         }
 700
 701         /**
 702          * Pick apart some CSS and check it for forbidden or unsafe structures.
 703          * Returns a sanitized string, or false if it was just too evil.
 704          *
 705          * Currently URL references, 'expression', 'tps' are forbidden.
 706          *
 707          * @param $value String
 708          * @return Mixed
 709          */
 710         static function checkCss( $value ) {
 711                 $stripped = Sanitizer::decodeCharReferences( $value );
 712
 713                 // Remove any comments; IE gets token splitting wrong
 714                 $stripped = StringUtils::delimiterReplace( '/*', '*/', ' ', $stripped );
 715
 716                 $value = $stripped;
 717
 718                 // ... and continue checks
 719                 $stripped = preg_replace( '!\\\\([0-9A-Fa-f]{1,6})[ \\n\\r\\t\\f]?!e',
 720                         'codepointToUtf8(hexdec("$1"))', $stripped );
 721                 $stripped = str_replace( '\\', '', $stripped );
 722                 if( preg_match( '/(?:expression|tps*:\/\/|url\\s*\().*/is',
 723                                 $stripped ) ) {
 724                         # haxx0r
 725                         return false;
 726                 }
 727
 728                 return $value;
 729         }
 730
 731         /**
 732          * Take a tag soup fragment listing an HTML element's attributes
 733          * and normalize it to well-formed XML, discarding unwanted attributes.
 734          * Output is safe for further wikitext processing, with escaping of
 735          * values that could trigger problems.
 736          *
 737          * - Normalizes attribute names to lowercase
 738          * - Discards attributes not on a whitelist for the given element
 739          * - Turns broken or invalid entities into plaintext
 740          * - Double-quotes all attribute values
 741          * - Attributes without values are given the name as attribute
 742          * - Double attributes are discarded
 743          * - Unsafe style attributes are discarded
 744          * - Prepends space if there are attributes.
 745          *
 746          * @param $text String
 747          * @param $element String
 748          * @return String
 749          */
 750         static function fixTagAttributes( $text, $element ) {
 751                 if( trim( $text ) == '' ) {
 752                         return '';
 753                 }
 754
 755                 $stripped = Sanitizer::validateTagAttributes(
 756                         Sanitizer::decodeTagAttributes( $text ), $element );
 757
 758                 $attribs = array();
 759                 foreach( $stripped as $attribute => $value ) {
 760                         $encAttribute = htmlspecialchars( $attribute );
 761                         $encValue = Sanitizer::safeEncodeAttribute( $value );
 762
 763                         $attribs[] = "$encAttribute=\"$encValue\"";
 764                 }
 765                 return count( $attribs ) ? ' ' . implode( ' ', $attribs ) : '';
 766         }
 767
 768         /**
 769          * Encode an attribute value for HTML output.
 770          * @param $text String
 771          * @return HTML-encoded text fragment
 772          */
 773         static function encodeAttribute( $text ) {
 774                 $encValue = htmlspecialchars( $text, ENT_QUOTES );
 775
 776                 // Whitespace is normalized during attribute decoding,
 777                 // so if we've been passed non-spaces we must encode them
 778                 // ahead of time or they won't be preserved.
 779                 $encValue = strtr( $encValue, array(
 780                         "\n" => '&#10;',
 781                         "\r" => '&#13;',
 782                         "\t" => '&#9;',
 783                 ) );
 784
 785                 return $encValue;
 786         }
 787
 788         /**
 789          * Encode an attribute value for HTML tags, with extra armoring
 790          * against further wiki processing.
 791          * @param $text String
 792          * @return HTML-encoded text fragment
 793          */
 794         static function safeEncodeAttribute( $text ) {
 795                 $encValue = Sanitizer::encodeAttribute( $text );
 796
 797                 # Templates and links may be expanded in later parsing,
 798                 # creating invalid or dangerous output. Suppress this.
 799                 $encValue = strtr( $encValue, array(
 800                         '<'    => '&lt;',   // This should never happen,
 801                         '>'    => '&gt;',   // we've received invalid input
 802                         '"'    => '&quot;', // which should have been escaped.
 803                         '{'    => '&#123;',
 804                         '['    => '&#91;',
 805                         "''"   => '&#39;&#39;',
 806                         'ISBN' => '&#73;SBN',
 807                         'RFC'  => '&#82;FC',
 808                         'PMID' => '&#80;MID',
 809                         '|'    => '&#124;',
 810                         '__'   => '&#95;_',
 811                 ) );
 812
 813                 # Stupid hack
 814                 $encValue = preg_replace_callback(
 815                         '/(' . wfUrlProtocols() . ')/',
 816                         array( 'Sanitizer', 'armorLinksCallback' ),
 817                         $encValue );
 818                 return $encValue;
 819         }
 820
 821         /**
 822          * Given a value escape it so that it can be used in an id attribute and
 823          * return it, this does not validate the value however (see first link)
 824          *
 825          * @see http://www.w3.org/TR/html401/types.html#type-name Valid characters
 826          *                                                          in the id and
 827          *                                                          name attributes
 828          * @see http://www.w3.org/TR/html401/struct/links.html#h-12.2.3 Anchors with the id attribute
 829          *
 830          * @param $id String: id to validate
 831          * @param $options Mixed: string or array of strings (default is array()):
 832          *   'noninitial': This is a non-initial fragment of an id, not a full id,
 833          *       so don't pay attention if the first character isn't valid at the
 834          *       beginning of an id.
 835          *   'xml': Don't restrict the id to be HTML4-compatible.  This option
 836          *       allows any alphabetic character to be used, per the XML standard.
 837          *       Therefore, it also completely changes the type of escaping: instead
 838          *       of weird dot-encoding, runs of invalid characters (mostly
 839          *       whitespace) are just compressed into a single underscore.
 840          * @return String
 841          */
 842         static function escapeId( $id, $options = array() ) {
 843                 $options = (array)$options;
 844
 845                 if ( !in_array( 'xml', $options ) ) {
 846                         # HTML4-style escaping
 847                         static $replace = array(
 848                                 '%3A' => ':',
 849                                 '%' => '.'
 850                         );
 851
 852                         $id = urlencode( Sanitizer::decodeCharReferences( strtr( $id, ' ', '_' ) ) );
 853                         $id = str_replace( array_keys( $replace ), array_values( $replace ), $id );
 854
 855                         if ( !preg_match( '/^[a-zA-Z]/', $id )
 856                         && !in_array( 'noninitial', $options ) )  {
 857                                 // Initial character must be a letter!
 858                                 $id = "x$id";
 859                         }
 860                         return $id;
 861                 }
 862
 863                 # XML-style escaping.  For the patterns used, see the XML 1.0 standard,
 864                 # 5th edition, NameStartChar and NameChar: <http://www.w3.org/TR/REC-xml/>
 865                 $nameStartChar = ':a-zA-Z_\xC0-\xD6\xD8-\xF6\xF8-\x{2FF}\x{370}-\x{37D}'
 866                         . '\x{37F}-\x{1FFF}\x{200C}-\x{200D}\x{2070}-\x{218F}\x{2C00}-\x{2FEF}'
 867                         . '\x{3001}-\x{D7FF}\x{F900}-\x{FDCF}\x{FDF0}-\x{FFFD}\x{10000}-\x{EFFFF}';
 868                 $nameChar = $nameStartChar . '.\-0-9\xB7\x{0300}-\x{036F}'
 869                         . '\x{203F}-\x{2040}';
 870                 # Replace _ as well so we don't get multiple consecutive underscores
 871                 $id = preg_replace( "/([^$nameChar]|_)+/u", '_', $id );
 872                 $id = trim( $id, '_' );
 873
 874                 if ( !preg_match( "/^[$nameStartChar]/u", $id )
 875                 && !in_array( 'noninitial', $options ) ) {
 876                         $id = "_$id";
 877                 }
 878
 879                 return $id;
 880         }
 881
 882         /**
 883          * Given a value, escape it so that it can be used as a CSS class and
 884          * return it.
 885          *
 886          * @todo For extra validity, input should be validated UTF-8.
 887          *
 888          * @see http://www.w3.org/TR/CSS21/syndata.html Valid characters/format
 889          *
 890          * @param $class String
 891          * @return String
 892          */
 893         static function escapeClass( $class ) {
 894                 // Convert ugly stuff to underscores and kill underscores in ugly places
 895                 return rtrim(preg_replace(
 896                         array('/(^[0-9\\-])|[\\x00-\\x20!"#$%&\'()*+,.\\/:;<=>?@[\\]^`{|}~]|\\xC2\\xA0/','/_+/'),
 897                         '_',
 898                         $class ), '_');
 899         }
 900
 901         /**
 902          * Given HTML input, escape with htmlspecialchars but un-escape entites.
 903          * This allows (generally harmless) entities like &nbsp; to survive.
 904          *
 905          * @param $html String to escape
 906          * @return String: escaped input
 907          */
 908         static function escapeHtmlAllowEntities( $html ) {
 909                 # It seems wise to escape ' as well as ", as a matter of course.  Can't
 910                 # hurt.
 911                 $html = htmlspecialchars( $html, ENT_QUOTES );
 912                 $html = str_replace( '&amp;', '&', $html );
 913                 $html = Sanitizer::normalizeCharReferences( $html );
 914                 return $html;
 915         }
 916
 917         /**
 918          * Regex replace callback for armoring links against further processing.
 919          * @param $matches Array
 920          * @return string
 921          */
 922         private static function armorLinksCallback( $matches ) {
 923                 return str_replace( ':', '&#58;', $matches[1] );
 924         }
 925
 926         /**
 927          * Return an associative array of attribute names and values from
 928          * a partial tag string. Attribute names are forces to lowercase,
 929          * character references are decoded to UTF-8 text.
 930          *
 931          * @param $text String
 932          * @return Array
 933          */
 934         public static function decodeTagAttributes( $text ) {
 935                 $attribs = array();
 936
 937                 if( trim( $text ) == '' ) {
 938                         return $attribs;
 939                 }
 940
 941                 $pairs = array();
 942                 if( !preg_match_all(
 943                         MW_ATTRIBS_REGEX,
 944                         $text,
 945                         $pairs,
 946                         PREG_SET_ORDER ) ) {
 947                         return $attribs;
 948                 }
 949
 950                 foreach( $pairs as $set ) {
 951                         $attribute = strtolower( $set[1] );
 952                         $value = Sanitizer::getTagAttributeCallback( $set );
 953
 954                         // Normalize whitespace
 955                         $value = preg_replace( '/[\t\r\n ]+/', ' ', $value );
 956                         $value = trim( $value );
 957
 958                         // Decode character references
 959                         $attribs[$attribute] = Sanitizer::decodeCharReferences( $value );
 960                 }
 961                 return $attribs;
 962         }
 963
 964         /**
 965          * Pick the appropriate attribute value from a match set from the
 966          * MW_ATTRIBS_REGEX matches.
 967          *
 968          * @param $set Array
 969          * @return String
 970          */
 971         private static function getTagAttributeCallback( $set ) {
 972                 if( isset( $set[6] ) ) {
 973                         # Illegal #XXXXXX color with no quotes.
 974                         return $set[6];
 975                 } elseif( isset( $set[5] ) ) {
 976                         # No quotes.
 977                         return $set[5];
 978                 } elseif( isset( $set[4] ) ) {
 979                         # Single-quoted
 980                         return $set[4];
 981                 } elseif( isset( $set[3] ) ) {
 982                         # Double-quoted
 983                         return $set[3];
 984                 } elseif( !isset( $set[2] ) ) {
 985                         # In XHTML, attributes must have a value.
 986                         # For 'reduced' form, return explicitly the attribute name here.
 987                         return $set[1];
 988                 } else {
 989                         throw new MWException( "Tag conditions not met. This should never happen and is a bug." );
 990                 }
 991         }
 992
 993         /**
 994          * Normalize whitespace and character references in an XML source-
 995          * encoded text for an attribute value.
 996          *
 997          * See http://www.w3.org/TR/REC-xml/#AVNormalize for background,
 998          * but note that we're not returning the value, but are returning
 999          * XML source fragments that will be slapped into output.
1000          *
1001          * @param $text String
1002          * @return String
1003          */
1004         private static function normalizeAttributeValue( $text ) {
1005                 return str_replace( '"', '&quot;',
1006                         self::normalizeWhitespace(
1007                                 Sanitizer::normalizeCharReferences( $text ) ) );
1008         }
1009
1010         private static function normalizeWhitespace( $text ) {
1011                 return preg_replace(
1012                         '/\r\n|[\x20\x0d\x0a\x09]/',
1013                         ' ',
1014                         $text );
1015         }
1016
1017         /**
1018          * Ensure that any entities and character references are legal
1019          * for XML and XHTML specifically. Any stray bits will be
1020          * &amp;-escaped to result in a valid text fragment.
1021          *
1022          * a. any named char refs must be known in XHTML
1023          * b. any numeric char refs must be legal chars, not invalid or forbidden
1024          * c. use &#x, not &#X
1025          * d. fix or reject non-valid attributes
1026          *
1027          * @param $text String
1028          * @return String
1029          * @private
1030          */
1031         static function normalizeCharReferences( $text ) {
1032                 return preg_replace_callback(
1033                         MW_CHAR_REFS_REGEX,
1034                         array( 'Sanitizer', 'normalizeCharReferencesCallback' ),
1035                         $text );
1036         }
1037         /**
1038          * @param $matches String
1039          * @return String
1040          */
1041         static function normalizeCharReferencesCallback( $matches ) {
1042                 $ret = null;
1043                 if( $matches[1] != '' ) {
1044                         $ret = Sanitizer::normalizeEntity( $matches[1] );
1045                 } elseif( $matches[2] != '' ) {
1046                         $ret = Sanitizer::decCharReference( $matches[2] );
1047                 } elseif( $matches[3] != ''  ) {
1048                         $ret = Sanitizer::hexCharReference( $matches[3] );
1049                 } elseif( $matches[4] != '' ) {
1050                         $ret = Sanitizer::hexCharReference( $matches[4] );
1051                 }
1052                 if( is_null( $ret ) ) {
1053                         return htmlspecialchars( $matches[0] );
1054                 } else {
1055                         return $ret;
1056                 }
1057         }
1058
1059         /**
1060          * If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD,
1061          * return the named entity reference as is. If the entity is a
1062          * MediaWiki-specific alias, returns the HTML equivalent. Otherwise,
1063          * returns HTML-escaped text of pseudo-entity source (eg &amp;foo;)
1064          *
1065          * @param $name String
1066          * @return String
1067          */
1068         static function normalizeEntity( $name ) {
1069                 global $wgHtmlEntities, $wgHtmlEntityAliases;
1070                 if ( isset( $wgHtmlEntityAliases[$name] ) ) {
1071                         return "&{$wgHtmlEntityAliases[$name]};";
1072                 } elseif( isset( $wgHtmlEntities[$name] ) ) {
1073                         return "&$name;";
1074                 } else {
1075                         return "&amp;$name;";
1076                 }
1077         }
1078
1079         static function decCharReference( $codepoint ) {
1080                 $point = intval( $codepoint );
1081                 if( Sanitizer::validateCodepoint( $point ) ) {
1082                         return sprintf( '&#%d;', $point );
1083                 } else {
1084                         return null;
1085                 }
1086         }
1087
1088         static function hexCharReference( $codepoint ) {
1089                 $point = hexdec( $codepoint );
1090                 if( Sanitizer::validateCodepoint( $point ) ) {
1091                         return sprintf( '&#x%x;', $point );
1092                 } else {
1093                         return null;
1094                 }
1095         }
1096
1097         /**
1098          * Returns true if a given Unicode codepoint is a valid character in XML.
1099          * @param $codepoint Integer
1100          * @return Boolean
1101          */
1102         private static function validateCodepoint( $codepoint ) {
1103                 return ($codepoint ==    0x09)
1104                         || ($codepoint ==    0x0a)
1105                         || ($codepoint ==    0x0d)
1106                         || ($codepoint >=    0x20 && $codepoint <=   0xd7ff)
1107                         || ($codepoint >=  0xe000 && $codepoint <=   0xfffd)
1108                         || ($codepoint >= 0x10000 && $codepoint <= 0x10ffff);
1109         }
1110
1111         /**
1112          * Decode any character references, numeric or named entities,
1113          * in the text and return a UTF-8 string.
1114          *
1115          * @param $text String
1116          * @return String
1117          */
1118         public static function decodeCharReferences( $text ) {
1119                 return preg_replace_callback(
1120                         MW_CHAR_REFS_REGEX,
1121                         array( 'Sanitizer', 'decodeCharReferencesCallback' ),
1122                         $text );
1123         }
1124
1125         /**
1126          * @param $matches String
1127          * @return String
1128          */
1129         static function decodeCharReferencesCallback( $matches ) {
1130                 if( $matches[1] != '' ) {
1131                         return Sanitizer::decodeEntity( $matches[1] );
1132                 } elseif( $matches[2] != '' ) {
1133                         return  Sanitizer::decodeChar( intval( $matches[2] ) );
1134                 } elseif( $matches[3] != ''  ) {
1135                         return  Sanitizer::decodeChar( hexdec( $matches[3] ) );
1136                 } elseif( $matches[4] != '' ) {
1137                         return  Sanitizer::decodeChar( hexdec( $matches[4] ) );
1138                 }
1139                 # Last case should be an ampersand by itself
1140                 return $matches[0];
1141         }
1142
1143         /**
1144          * Return UTF-8 string for a codepoint if that is a valid
1145          * character reference, otherwise U+FFFD REPLACEMENT CHARACTER.
1146          * @param $codepoint Integer
1147          * @return String
1148          * @private
1149          */
1150         static function decodeChar( $codepoint ) {
1151                 if( Sanitizer::validateCodepoint( $codepoint ) ) {
1152                         return codepointToUtf8( $codepoint );
1153                 } else {
1154                         return UTF8_REPLACEMENT;
1155                 }
1156         }
1157
1158         /**
1159          * If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD,
1160          * return the UTF-8 encoding of that character. Otherwise, returns
1161          * pseudo-entity source (eg &foo;)
1162          *
1163          * @param $name Strings
1164          * @return String
1165          */
1166         static function decodeEntity( $name ) {
1167                 global $wgHtmlEntities, $wgHtmlEntityAliases;
1168                 if ( isset( $wgHtmlEntityAliases[$name] ) ) {
1169                         $name = $wgHtmlEntityAliases[$name];
1170                 }
1171                 if( isset( $wgHtmlEntities[$name] ) ) {
1172                         return codepointToUtf8( $wgHtmlEntities[$name] );
1173                 } else {
1174                         return "&$name;";
1175                 }
1176         }
1177
1178         /**
1179          * Fetch the whitelist of acceptable attributes for a given element name.
1180          *
1181          * @param $element String
1182          * @return Array
1183          */
1184         static function attributeWhitelist( $element ) {
1185                 static $list;
1186                 if( !isset( $list ) ) {
1187                         $list = Sanitizer::setupAttributeWhitelist();
1188                 }
1189                 return isset( $list[$element] )
1190                         ? $list[$element]
1191                         : array();
1192         }
1193
             /**
              * Build the attribute whitelist: for each array key (an allowed HTML
              * element), an array of the attribute names allowed on that element.
              *
              * The shared $common list is extended with the RDFa attributes when
              * $wgAllowRdfaAttributes is set, and with the microdata attributes
              * when both $wgHtml5 and $wgAllowItemAttributes are set.
              *
              * @return Array: element name => array of allowed attribute names
              */
             static function setupAttributeWhitelist() {
                     global $wgAllowRdfaAttributes, $wgHtml5, $wgAllowItemAttributes;

                     # Attributes accepted on nearly every element below
                     $common = array( 'id', 'class', 'lang', 'dir', 'title', 'style', 'xml:lang' );

                     if ( $wgAllowRdfaAttributes ) {
                             # RDFa attributes as specified in section 9 of http://www.w3.org/TR/2008/REC-rdfa-syntax-20081014
                             $common = array_merge( $common, array(
                                 'about', 'property', 'resource', 'datatype', 'typeof',
                             ) );
                     }

                     if ( $wgHtml5 && $wgAllowItemAttributes ) {
                             # Add HTML5 microdata attributes as specified by http://www.w3.org/TR/html5/microdata.html
                             $common = array_merge( $common, array(
                                 'item', 'itemprop', 'subject'
                             ) );
                     }

                     # Reusable groups: block elements, table alignment, table cells
                     $block = array_merge( $common, array( 'align' ) );
                     $tablealign = array( 'align', 'char', 'charoff', 'valign' );
                     $tablecell = array( 'abbr',
                                         'axis',
                                         'headers',
                                         'scope',
                                         'rowspan',
                                         'colspan',
                                         'nowrap', # deprecated
                                         'width',  # deprecated
                                         'height', # deprecated
                                         'bgcolor' # deprecated
                                         );

                     # Numbers refer to sections in HTML 4.01 standard describing the element.
                     # See: http://www.w3.org/TR/html4/
                     # Element names that appear only as comments have no entry.
                     $whitelist = array (
                             # 7.5.4
                             'div'        => $block,
                             'center'     => $common, # deprecated
                             'span'       => $block, # ??

                             # 7.5.5
                             'h1'         => $block,
                             'h2'         => $block,
                             'h3'         => $block,
                             'h4'         => $block,
                             'h5'         => $block,
                             'h6'         => $block,

                             # 7.5.6
                             # address

                             # 8.2.4
                             # bdo

                             # 9.2.1
                             'em'         => $common,
                             'strong'     => $common,
                             'cite'       => $common,
                             # dfn
                             'code'       => $common,
                             # samp
                             # kbd
                             'var'        => $common,
                             'abbr'       => $common,
                             # acronym

                             # 9.2.2
                             'blockquote' => array_merge( $common, array( 'cite' ) ),
                             # q

                             # 9.2.3
                             'sub'        => $common,
                             'sup'        => $common,

                             # 9.3.1
                             'p'          => $block,

                             # 9.3.2
                             # Note: 'br' has its own explicit list rather than $common
                             'br'         => array( 'id', 'class', 'title', 'style', 'clear' ),

                             # 9.3.4
                             'pre'        => array_merge( $common, array( 'width' ) ),

                             # 9.4
                             'ins'        => array_merge( $common, array( 'cite', 'datetime' ) ),
                             'del'        => array_merge( $common, array( 'cite', 'datetime' ) ),

                             # 10.2
                             'ul'         => array_merge( $common, array( 'type' ) ),
                             'ol'         => array_merge( $common, array( 'type', 'start' ) ),
                             'li'         => array_merge( $common, array( 'type', 'value' ) ),

                             # 10.3
                             'dl'         => $common,
                             'dd'         => $common,
                             'dt'         => $common,

                             # 11.2.1
                             'table'      => array_merge( $common,
                                                                     array( 'summary', 'width', 'border', 'frame',
                                                                                     'rules', 'cellspacing', 'cellpadding',
                                                                                     'align', 'bgcolor',
                                                                     ) ),

                             # 11.2.2
                             'caption'    => array_merge( $common, array( 'align' ) ),

                             # 11.2.3
                             'thead'      => array_merge( $common, $tablealign ),
                             'tfoot'      => array_merge( $common, $tablealign ),
                             'tbody'      => array_merge( $common, $tablealign ),

                             # 11.2.4
                             'colgroup'   => array_merge( $common, array( 'span', 'width' ), $tablealign ),
                             'col'        => array_merge( $common, array( 'span', 'width' ), $tablealign ),

                             # 11.2.5
                             'tr'         => array_merge( $common, array( 'bgcolor' ), $tablealign ),

                             # 11.2.6
                             'td'         => array_merge( $common, $tablecell, $tablealign ),
                             'th'         => array_merge( $common, $tablecell, $tablealign ),

                             # 12.2 # NOTE: <a> is not allowed directly, but the attrib whitelist is used from the Parser object
                             'a'          => array_merge( $common, array( 'href', 'rel', 'rev' ) ), # rel/rev esp. for RDFa

                             # 13.2
                             # Not usually allowed, but may be used for extension-style hooks
                             # such as <math> when it is rasterized
                             'img'        => array_merge( $common, array( 'alt' ) ),

                             # 15.2.1
                             'tt'         => $common,
                             'b'          => $common,
                             'i'          => $common,
                             'big'        => $common,
                             'small'      => $common,
                             'strike'     => $common,
                             's'          => $common,
                             'u'          => $common,

                             # 15.2.2
                             'font'       => array_merge( $common, array( 'size', 'color', 'face' ) ),
                             # basefont

                             # 15.3
                             'hr'         => array_merge( $common, array( 'noshade', 'size', 'width' ) ),

                             # XHTML Ruby annotation text module, simple ruby only.
                             # http://www.w3c.org/TR/ruby/
                             'ruby'       => $common,
                             # rbc
                             # rtc
                             'rb'         => $common,
                             'rt'         => $common, #array_merge( $common, array( 'rbspan' ) ),
                             'rp'         => $common,

                             # MathML root element, where used for extensions
                             # 'title' may not be 100% valid here; it's XHTML
                             # http://www.w3.org/TR/REC-MathML/
                             'math'       => array( 'class', 'style', 'id', 'title' ),
                             );
                     return $whitelist;
             }
1364
1365         /**
1366          * Take a fragment of (potentially invalid) HTML and return
1367          * a version with any tags removed, encoded as plain text.
1368          *
1369          * Warning: this return value must be further escaped for literal
1370          * inclusion in HTML output as of 1.10!
1371          *
1372          * @param $text String: HTML fragment
1373          * @return String
1374          */
1375         static function stripAllTags( $text ) {
1376                 # Actual <tags>
1377                 $text = StringUtils::delimiterReplace( '<', '>', '', $text );
1378
1379                 # Normalize &entities and whitespace
1380                 $text = self::decodeCharReferences( $text );
1381                 $text = self::normalizeWhitespace( $text );
1382
1383                 return $text;
1384         }
1385
1386         /**
1387          * Hack up a private DOCTYPE with HTML's standard entity declarations.
1388          * PHP 4 seemed to know these if you gave it an HTML doctype, but
1389          * PHP 5.1 doesn't.
1390          *
1391          * Use for passing XHTML fragments to PHP's XML parsing functions
1392          *
1393          * @return String
1394          */
1395         static function hackDocType() {
1396                 global $wgHtmlEntities;
1397                 $out = "<!DOCTYPE html [\n";
1398                 foreach( $wgHtmlEntities as $entity => $codepoint ) {
1399                         $out .= "<!ENTITY $entity \"&#$codepoint;\">";
1400                 }
1401                 $out .= "]>\n";
1402                 return $out;
1403         }
1404
1405         static function cleanUrl( $url ) {
1406                 # Normalize any HTML entities in input. They will be
1407                 # re-escaped by makeExternalLink().
1408                 $url = Sanitizer::decodeCharReferences( $url );
1409
1410                 # Escape any control characters introduced by the above step
1411                 $url = preg_replace( '/[\][<>"\\x00-\\x20\\x7F]/e', "urlencode('\\0')", $url );
1412
1413                 # Validate hostname portion
1414                 $matches = array();
1415                 if( preg_match( '!^([^:]+:)(//[^/]+)?(.*)$!iD', $url, $matches ) ) {
1416                         list( /* $whole */, $protocol, $host, $rest ) = $matches;
1417
1418                         // Characters that will be ignored in IDNs.
1419                         // http://tools.ietf.org/html/3454#section-3.1
1420                         // Strip them before further processing so blacklists and such work.
1421                         $strip = "/
1422                                 \\s|          # general whitespace
1423                                 \xc2\xad|     # 00ad SOFT HYPHEN
1424                                 \xe1\xa0\x86| # 1806 MONGOLIAN TODO SOFT HYPHEN
1425                                 \xe2\x80\x8b| # 200b ZERO WIDTH SPACE
1426                                 \xe2\x81\xa0| # 2060 WORD JOINER
1427                                 \xef\xbb\xbf| # feff ZERO WIDTH NO-BREAK SPACE
1428                                 \xcd\x8f|     # 034f COMBINING GRAPHEME JOINER
1429                                 \xe1\xa0\x8b| # 180b MONGOLIAN FREE VARIATION SELECTOR ONE
1430                                 \xe1\xa0\x8c| # 180c MONGOLIAN FREE VARIATION SELECTOR TWO
1431                                 \xe1\xa0\x8d| # 180d MONGOLIAN FREE VARIATION SELECTOR THREE
1432                                 \xe2\x80\x8c| # 200c ZERO WIDTH NON-JOINER
1433                                 \xe2\x80\x8d| # 200d ZERO WIDTH JOINER
1434                                 [\xef\xb8\x80-\xef\xb8\x8f] # fe00-fe00f VARIATION SELECTOR-1-16
1435                                 /xuD";
1436
1437                         $host = preg_replace( $strip, '', $host );
1438
1439                         // @fixme: validate hostnames here
1440
1441                         return $protocol . $host . $rest;
1442                 } else {
1443                         return $url;
1444                 }
1445         }
1446
1447 }