includes/Sanitizer.php

   1 <?php
   2 /**
   3  * XHTML sanitizer for MediaWiki
   4  *
   5  * Copyright (C) 2002-2005 Brion Vibber <brion@pobox.com> et al
   6  * http://www.mediawiki.org/
   7  *
   8  * This program is free software; you can redistribute it and/or modify
   9  * it under the terms of the GNU General Public License as published by
  10  * the Free Software Foundation; either version 2 of the License, or
  11  * (at your option) any later version.
  12  *
  13  * This program is distributed in the hope that it will be useful,
  14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  16  * GNU General Public License for more details.
  17  *
  18  * You should have received a copy of the GNU General Public License along
  19  * with this program; if not, write to the Free Software Foundation, Inc.,
  20  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  21  * http://www.gnu.org/copyleft/gpl.html
  22  *
  23  * @file
  24  * @ingroup Parser
  25  */
  26
  27 /**
  28  * Regular expression to match various types of character references in
  29  * Sanitizer::normalizeCharReferences and Sanitizer::decodeCharReferences
  30  */
  31 define( 'MW_CHAR_REFS_REGEX',
             # One capture group per alternative; the /x flag lets the pattern
             # be laid out over several lines:
             #   1: named reference  - ASCII letters/digits or bytes 0x80-0xFF
             #   2: decimal numeric reference
             #   3: numeric reference introduced by a lowercase "x"
             #   4: numeric reference introduced by an uppercase "X"
             #   5: a bare ampersand that starts none of the above
             # Groups 3 and 4 accept any ASCII letter or digit, not only hex
             # digits, so a match here is not yet proof of a valid reference.
  32         '/&([A-Za-z0-9\x80-\xff]+);
  33          |&\#([0-9]+);
  34          |&\#x([0-9A-Za-z]+);
  35          |&\#X([0-9A-Za-z]+);
  36          |(&)/x' );
  37
  38 /**
  39  * Regular expression to match HTML/XML attribute pairs within a tag.
  40  * Allows some... latitude.
  41  * Used in Sanitizer::fixTagAttributes and Sanitizer::decodeTagAttributes
  42  */
  43 $attrib = '[A-Za-z0-9]'; # one character of an attribute name
  44 $space = '[\x09\x0a\x0d\x20]'; # tab, line feed, carriage return or space
  45 define( 'MW_ATTRIBS_REGEX',
             # $space and $attrib are interpolated into the double-quoted pattern.
             # Capture groups:
             #   1: attribute name
             #   2: the whole "= value" part (optional, so bare names match too)
             #   3: value inside double quotes (may not contain '<')
             #   4: value inside single quotes (may not contain '<')
             #   5: unquoted value
             #   6: unquoted '#' followed by hex digits
             # A match must start at the beginning of the subject or after
             # whitespace, and must be followed by whitespace or the end.
             # Flags: s (dot matches newline), x (extended layout with comments).
  46         "/(?:^|$space)($attrib+)
  47           ($space*=$space*
  48                 (?:
  49                  # The attribute value: quoted or alone
  50                   \"([^<\"]*)\"
  51                  | '([^<']*)'
  52                  |  ([a-zA-Z0-9!#$%&()*,\\-.\\/:;<>?@[\\]^_`{|}~]+)
  53                  |  (\#[0-9a-fA-F]+) # Technically wrong, but lots of
  54                                                          # colors are specified like this.
  55                                                          # We'll be normalizing it.
  56                 )
  57            )?(?=$space|\$)/sx" );
  58
  59 /**
  60  * Regular expression to match URIs that could trigger script execution
  61  */
  62 define( 'MW_SCRIPT_URL_PATTERN', '/(^|\s)(javascript|vbscript)[^\w]/i' );
     # Case-insensitive. Matches "javascript" or "vbscript" at the start of the
     # value or after whitespace, when followed by one non-word character.
  63
  64 /**
  65  * List of all named character entities defined in HTML 4.01
  66  * http://www.w3.org/TR/html4/sgml/entities.html
  67  * @private
  68  */
  69 global $wgHtmlEntities;
     # Maps entity name (without the leading '&' and trailing ';') to its
     # decimal code point. Names are case-sensitive: 'Aacute' (193) and
     # 'aacute' (225) are distinct entries.
  70 $wgHtmlEntities = array(
  71         'Aacute'   => 193,
  72         'aacute'   => 225,
  73         'Acirc'    => 194,
  74         'acirc'    => 226,
  75         'acute'    => 180,
  76         'AElig'    => 198,
  77         'aelig'    => 230,
  78         'Agrave'   => 192,
  79         'agrave'   => 224,
  80         'alefsym'  => 8501,
  81         'Alpha'    => 913,
  82         'alpha'    => 945,
  83         'amp'      => 38,
  84         'and'      => 8743,
  85         'ang'      => 8736,
  86         'Aring'    => 197,
  87         'aring'    => 229,
  88         'asymp'    => 8776,
  89         'Atilde'   => 195,
  90         'atilde'   => 227,
  91         'Auml'     => 196,
  92         'auml'     => 228,
  93         'bdquo'    => 8222,
  94         'Beta'     => 914,
  95         'beta'     => 946,
  96         'brvbar'   => 166,
  97         'bull'     => 8226,
  98         'cap'      => 8745,
  99         'Ccedil'   => 199,
 100         'ccedil'   => 231,
 101         'cedil'    => 184,
 102         'cent'     => 162,
 103         'Chi'      => 935,
 104         'chi'      => 967,
 105         'circ'     => 710,
 106         'clubs'    => 9827,
 107         'cong'     => 8773,
 108         'copy'     => 169,
 109         'crarr'    => 8629,
 110         'cup'      => 8746,
 111         'curren'   => 164,
 112         'dagger'   => 8224,
 113         'Dagger'   => 8225,
 114         'darr'     => 8595,
 115         'dArr'     => 8659,
 116         'deg'      => 176,
 117         'Delta'    => 916,
 118         'delta'    => 948,
 119         'diams'    => 9830,
 120         'divide'   => 247,
 121         'Eacute'   => 201,
 122         'eacute'   => 233,
 123         'Ecirc'    => 202,
 124         'ecirc'    => 234,
 125         'Egrave'   => 200,
 126         'egrave'   => 232,
 127         'empty'    => 8709,
 128         'emsp'     => 8195,
 129         'ensp'     => 8194,
 130         'Epsilon'  => 917,
 131         'epsilon'  => 949,
 132         'equiv'    => 8801,
 133         'Eta'      => 919,
 134         'eta'      => 951,
 135         'ETH'      => 208,
 136         'eth'      => 240,
 137         'Euml'     => 203,
 138         'euml'     => 235,
 139         'euro'     => 8364,
 140         'exist'    => 8707,
 141         'fnof'     => 402,
 142         'forall'   => 8704,
 143         'frac12'   => 189,
 144         'frac14'   => 188,
 145         'frac34'   => 190,
 146         'frasl'    => 8260,
 147         'Gamma'    => 915,
 148         'gamma'    => 947,
 149         'ge'       => 8805,
 150         'gt'       => 62,
 151         'harr'     => 8596,
 152         'hArr'     => 8660,
 153         'hearts'   => 9829,
 154         'hellip'   => 8230,
 155         'Iacute'   => 205,
 156         'iacute'   => 237,
 157         'Icirc'    => 206,
 158         'icirc'    => 238,
 159         'iexcl'    => 161,
 160         'Igrave'   => 204,
 161         'igrave'   => 236,
 162         'image'    => 8465,
 163         'infin'    => 8734,
 164         'int'      => 8747,
 165         'Iota'     => 921,
 166         'iota'     => 953,
 167         'iquest'   => 191,
 168         'isin'     => 8712,
 169         'Iuml'     => 207,
 170         'iuml'     => 239,
 171         'Kappa'    => 922,
 172         'kappa'    => 954,
 173         'Lambda'   => 923,
 174         'lambda'   => 955,
 175         'lang'     => 9001,
 176         'laquo'    => 171,
 177         'larr'     => 8592,
 178         'lArr'     => 8656,
 179         'lceil'    => 8968,
 180         'ldquo'    => 8220,
 181         'le'       => 8804,
 182         'lfloor'   => 8970,
 183         'lowast'   => 8727,
 184         'loz'      => 9674,
 185         'lrm'      => 8206,
 186         'lsaquo'   => 8249,
 187         'lsquo'    => 8216,
 188         'lt'       => 60,
 189         'macr'     => 175,
 190         'mdash'    => 8212,
 191         'micro'    => 181,
 192         'middot'   => 183,
 193         'minus'    => 8722,
 194         'Mu'       => 924,
 195         'mu'       => 956,
 196         'nabla'    => 8711,
 197         'nbsp'     => 160,
 198         'ndash'    => 8211,
 199         'ne'       => 8800,
 200         'ni'       => 8715,
 201         'not'      => 172,
 202         'notin'    => 8713,
 203         'nsub'     => 8836,
 204         'Ntilde'   => 209,
 205         'ntilde'   => 241,
 206         'Nu'       => 925,
 207         'nu'       => 957,
 208         'Oacute'   => 211,
 209         'oacute'   => 243,
 210         'Ocirc'    => 212,
 211         'ocirc'    => 244,
 212         'OElig'    => 338,
 213         'oelig'    => 339,
 214         'Ograve'   => 210,
 215         'ograve'   => 242,
 216         'oline'    => 8254,
 217         'Omega'    => 937,
 218         'omega'    => 969,
 219         'Omicron'  => 927,
 220         'omicron'  => 959,
 221         'oplus'    => 8853,
 222         'or'       => 8744,
 223         'ordf'     => 170,
 224         'ordm'     => 186,
 225         'Oslash'   => 216,
 226         'oslash'   => 248,
 227         'Otilde'   => 213,
 228         'otilde'   => 245,
 229         'otimes'   => 8855,
 230         'Ouml'     => 214,
 231         'ouml'     => 246,
 232         'para'     => 182,
 233         'part'     => 8706,
 234         'permil'   => 8240,
 235         'perp'     => 8869,
 236         'Phi'      => 934,
 237         'phi'      => 966,
 238         'Pi'       => 928,
 239         'pi'       => 960,
 240         'piv'      => 982,
 241         'plusmn'   => 177,
 242         'pound'    => 163,
 243         'prime'    => 8242,
 244         'Prime'    => 8243,
 245         'prod'     => 8719,
 246         'prop'     => 8733,
 247         'Psi'      => 936,
 248         'psi'      => 968,
 249         'quot'     => 34,
 250         'radic'    => 8730,
 251         'rang'     => 9002,
 252         'raquo'    => 187,
 253         'rarr'     => 8594,
 254         'rArr'     => 8658,
 255         'rceil'    => 8969,
 256         'rdquo'    => 8221,
 257         'real'     => 8476,
 258         'reg'      => 174,
 259         'rfloor'   => 8971,
 260         'Rho'      => 929,
 261         'rho'      => 961,
 262         'rlm'      => 8207,
 263         'rsaquo'   => 8250,
 264         'rsquo'    => 8217,
 265         'sbquo'    => 8218,
 266         'Scaron'   => 352,
 267         'scaron'   => 353,
 268         'sdot'     => 8901,
 269         'sect'     => 167,
 270         'shy'      => 173,
 271         'Sigma'    => 931,
 272         'sigma'    => 963,
 273         'sigmaf'   => 962,
 274         'sim'      => 8764,
 275         'spades'   => 9824,
 276         'sub'      => 8834,
 277         'sube'     => 8838,
 278         'sum'      => 8721,
 279         'sup'      => 8835,
 280         'sup1'     => 185,
 281         'sup2'     => 178,
 282         'sup3'     => 179,
 283         'supe'     => 8839,
 284         'szlig'    => 223,
 285         'Tau'      => 932,
 286         'tau'      => 964,
 287         'there4'   => 8756,
 288         'Theta'    => 920,
 289         'theta'    => 952,
 290         'thetasym' => 977,
 291         'thinsp'   => 8201,
 292         'THORN'    => 222,
 293         'thorn'    => 254,
 294         'tilde'    => 732,
 295         'times'    => 215,
 296         'trade'    => 8482,
 297         'Uacute'   => 218,
 298         'uacute'   => 250,
 299         'uarr'     => 8593,
 300         'uArr'     => 8657,
 301         'Ucirc'    => 219,
 302         'ucirc'    => 251,
 303         'Ugrave'   => 217,
 304         'ugrave'   => 249,
 305         'uml'      => 168,
 306         'upsih'    => 978,
 307         'Upsilon'  => 933,
 308         'upsilon'  => 965,
 309         'Uuml'     => 220,
 310         'uuml'     => 252,
 311         'weierp'   => 8472,
 312         'Xi'       => 926,
 313         'xi'       => 958,
 314         'Yacute'   => 221,
 315         'yacute'   => 253,
 316         'yen'      => 165,
 317         'Yuml'     => 376,
 318         'yuml'     => 255,
 319         'Zeta'     => 918,
 320         'zeta'     => 950,
 321         'zwj'      => 8205,
 322         'zwnj'     => 8204 );
 323
 324 /**
 325  * Character entity aliases accepted by MediaWiki
 326  */
 327 global $wgHtmlEntityAliases;
     # Maps an alternative entity name to the name it stands for. Both
     # entries (one in Hebrew letters, one in Arabic letters) resolve to
     # 'rlm', which $wgHtmlEntities defines as code point 8207.
 328 $wgHtmlEntityAliases = array(
 329         'רלמ' => 'rlm',
 330         'رلم' => 'rlm',
 331 );
 332
 333
 334 /**
 335  * XHTML sanitizer for MediaWiki
 336  * @ingroup Parser
 337  */
 338 class Sanitizer {
 339         /**
 340          * Cleans up HTML, removes dangerous tags and attributes, and
 341          * removes HTML comments
 342          * @private
 343          * @param $text String
 344          * @param $processCallback Callback to do any variable or parameter replacements in HTML attribute values
 345          * @param $args Array for the processing callback
 346          * @param $extratags Array for any extra tags to include
 347          * @param $removetags Array for any tags (default or extra) to exclude
 348          * @return string
 349          */
 350         static function removeHTMLtags( $text, $processCallback = null, $args = array(), $extratags = array(), $removetags = array() ) {
 351                 global $wgUseTidy;
 352
 353                 static $htmlpairsStatic, $htmlsingle, $htmlsingleonly, $htmlnest, $tabletags,
 354                         $htmllist, $listtags, $htmlsingleallowed, $htmlelementsStatic, $staticInitialised;
 355
 356                 wfProfileIn( __METHOD__ );
 357
 358                 if ( !$staticInitialised ) {
 359
 360                         $htmlpairsStatic = array( # Tags that must be closed
 361                                 'b', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
 362                                 'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
 363                                 'strike', 'strong', 'tt', 'var', 'div', 'center',
 364                                 'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
 365                                 'ruby', 'rt' , 'rb' , 'rp', 'p', 'span', 'u', 'abbr'
 366                         );
 367                         $htmlsingle = array(
 368                                 'br', 'hr', 'li', 'dt', 'dd'
 369                         );
 370                         $htmlsingleonly = array( # Elements that cannot have close tags
 371                                 'br', 'hr'
 372                         );
 373                         $htmlnest = array( # Tags that can be nested--??
 374                                 'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
 375                                 'dl', 'font', 'big', 'small', 'sub', 'sup', 'span'
 376                         );
 377                         $tabletags = array( # Can only appear inside table, we will close them
 378                                 'td', 'th', 'tr',
 379                         );
 380                         $htmllist = array( # Tags used by list
 381                                 'ul','ol',
 382                         );
 383                         $listtags = array( # Tags that can appear in a list
 384                                 'li',
 385                         );
 386
 387                         $htmlsingleallowed = array_unique( array_merge( $htmlsingle, $tabletags ) );
 388                         $htmlelementsStatic = array_unique( array_merge( $htmlsingle, $htmlpairsStatic, $htmlnest ) );
 389
 390                         # Convert them all to hashtables for faster lookup
 391                         $vars = array( 'htmlpairsStatic', 'htmlsingle', 'htmlsingleonly', 'htmlnest', 'tabletags',
 392                                 'htmllist', 'listtags', 'htmlsingleallowed', 'htmlelementsStatic' );
 393                         foreach ( $vars as $var ) {
 394                                 $$var = array_flip( $$var );
 395                         }
 396                         $staticInitialised = true;
 397                 }
 398                 # Populate $htmlpairs and $htmlelements with the $extratags and $removetags arrays
 399                 $extratags = array_flip( $extratags );
 400                 $removetags = array_flip( $removetags );
 401                 $htmlpairs = array_merge( $extratags, $htmlpairsStatic );
 402                 $htmlelements = array_diff_key( array_merge( $extratags, $htmlelementsStatic ) , $removetags );
 403
 404                 # Remove HTML comments
 405                 $text = Sanitizer::removeHTMLcomments( $text );
 406                 $bits = explode( '<', $text );
 407                 $text = str_replace( '>', '&gt;', array_shift( $bits ) );
 408                 if(!$wgUseTidy) {
 409                         $tagstack = $tablestack = array();
 410                         foreach ( $bits as $x ) {
 411                                 $regs = array();
 412                                 if( preg_match( '!^(/?)(\\w+)([^>]*?)(/{0,1}>)([^<]*)$!', $x, $regs ) ) {
 413                                         list( /* $qbar */, $slash, $t, $params, $brace, $rest ) = $regs;
 414                                 } else {
 415                                         $slash = $t = $params = $brace = $rest = null;
 416                                 }
 417
 418                                 $badtag = 0 ;
 419                                 if ( isset( $htmlelements[$t = strtolower( $t )] ) ) {
 420                                         # Check our stack
 421                                         if ( $slash ) {
 422                                                 # Closing a tag...
 423                                                 if( isset( $htmlsingleonly[$t] ) ) {
 424                                                         $badtag = 1;
 425                                                 } elseif ( ( $ot = @array_pop( $tagstack ) ) != $t ) {
 426                                                         if ( isset( $htmlsingleallowed[$ot] ) ) {
 427                                                                 # Pop all elements with an optional close tag
 428                                                                 # and see if we find a match below them
 429                                                                 $optstack = array();
 430                                                                 array_push ($optstack, $ot);
 431                                                                 while ( ( ( $ot = @array_pop( $tagstack ) ) != $t ) &&
 432                                                                                 isset( $htmlsingleallowed[$ot] ) )
 433                                                                 {
 434                                                                         array_push ($optstack, $ot);
 435                                                                 }
 436                                                                 if ( $t != $ot ) {
 437                                                                         # No match. Push the optinal elements back again
 438                                                                         $badtag = 1;
 439                                                                         while ( $ot = @array_pop( $optstack ) ) {
 440                                                                                 array_push( $tagstack, $ot );
 441                                                                         }
 442                                                                 }
 443                                                         } else {
 444                                                                 @array_push( $tagstack, $ot );
 445                                                                 # <li> can be nested in <ul> or <ol>, skip those cases:
 446                                                                 if(!(isset( $htmllist[$ot] ) && isset( $listtags[$t] ) )) {
 447                                                                         $badtag = 1;
 448                                                                 }
 449                                                         }
 450                                                 } else {
 451                                                         if ( $t == 'table' ) {
 452                                                                 $tagstack = array_pop( $tablestack );
 453                                                         }
 454                                                 }
 455                                                 $newparams = '';
 456                                         } else {
 457                                                 # Keep track for later
 458                                                 if ( isset( $tabletags[$t] ) &&
 459                                                 ! in_array( 'table', $tagstack ) ) {
 460                                                         $badtag = 1;
 461                                                 } else if ( in_array( $t, $tagstack ) &&
 462                                                 ! isset( $htmlnest [$t ] ) ) {
 463                                                         $badtag = 1 ;
 464                                                 # Is it a self closed htmlpair ? (bug 5487)
 465                                                 } else if( $brace == '/>' &&
 466                                                 isset( $htmlpairs[$t] ) ) {
 467                                                         $badtag = 1;
 468                                                 } elseif( isset( $htmlsingleonly[$t] ) ) {
 469                                                         # Hack to force empty tag for uncloseable elements
 470                                                         $brace = '/>';
 471                                                 } else if( isset( $htmlsingle[$t] ) ) {
 472                                                         # Hack to not close $htmlsingle tags
 473                                                         $brace = NULL;
 474                                                 } else if( isset( $tabletags[$t] )
 475                                                 &&  in_array($t ,$tagstack) ) {
 476                                                         // New table tag but forgot to close the previous one
 477                                                         $text .= "</$t>";
 478                                                 } else {
 479                                                         if ( $t == 'table' ) {
 480                                                                 array_push( $tablestack, $tagstack );
 481                                                                 $tagstack = array();
 482                                                         }
 483                                                         array_push( $tagstack, $t );
 484                                                 }
 485
 486                                                 # Replace any variables or template parameters with
 487                                                 # plaintext results.
 488                                                 if( is_callable( $processCallback ) ) {
 489                                                         call_user_func_array( $processCallback, array( &$params, $args ) );
 490                                                 }
 491
 492                                                 # Strip non-approved attributes from the tag
 493                                                 $newparams = Sanitizer::fixTagAttributes( $params, $t );
 494                                         }
 495                                         if ( ! $badtag ) {
 496                                                 $rest = str_replace( '>', '&gt;', $rest );
 497                                                 $close = ( $brace == '/>' && !$slash ) ? ' /' : '';
 498                                                 $text .= "<$slash$t$newparams$close>$rest";
 499                                                 continue;
 500                                         }
 501                                 }
 502                                 $text .= '&lt;' . str_replace( '>', '&gt;', $x);
 503                         }
 504                         # Close off any remaining tags
 505                         while ( is_array( $tagstack ) && ($t = array_pop( $tagstack )) ) {
 506                                 $text .= "</$t>\n";
 507                                 if ( $t == 'table' ) { $tagstack = array_pop( $tablestack ); }
 508                         }
 509                 } else {
 510                         # this might be possible using tidy itself
 511                         foreach ( $bits as $x ) {
 512                                 preg_match( '/^(\\/?)(\\w+)([^>]*?)(\\/{0,1}>)([^<]*)$/',
 513                                 $x, $regs );
 514                                 @list( /* $qbar */, $slash, $t, $params, $brace, $rest ) = $regs;
 515                                 if ( isset( $htmlelements[$t = strtolower( $t )] ) ) {
 516                                         if( is_callable( $processCallback ) ) {
 517                                                 call_user_func_array( $processCallback, array( &$params, $args ) );
 518                                         }
 519                                         $newparams = Sanitizer::fixTagAttributes( $params, $t );
 520                                         $rest = str_replace( '>', '&gt;', $rest );
 521                                         $text .= "<$slash$t$newparams$brace$rest";
 522                                 } else {
 523                                         $text .= '&lt;' . str_replace( '>', '&gt;', $x);
 524                                 }
 525                         }
 526                 }
 527                 wfProfileOut( __METHOD__ );
 528                 return $text;
 529         }
 530
 531         /**
 532          * Remove '<!--', '-->', and everything between.
 533          * To avoid leaving blank lines, when a comment is both preceded
 534          * and followed by a newline (ignoring spaces), trim leading and
 535          * trailing spaces and one of the newlines.
 536          *
 537          * @private
 538          * @param $text String
 539          * @return string
 540          */
 541         static function removeHTMLcomments( $text ) {
 542                 wfProfileIn( __METHOD__ );
 543                 while (($start = strpos($text, '<!--')) !== false) {
 544                         $end = strpos($text, '-->', $start + 4);
 545                         if ($end === false) {
 546                                 # Unterminated comment; bail out
 547                                 break;
 548                         }
 549
 550                         $end += 3;
 551
 552                         # Trim space and newline if the comment is both
 553                         # preceded and followed by a newline
 554                         $spaceStart = max($start - 1, 0);
 555                         $spaceLen = $end - $spaceStart;
 556                         while (substr($text, $spaceStart, 1) === ' ' && $spaceStart > 0) {
 557                                 $spaceStart--;
 558                                 $spaceLen++;
 559                         }
 560                         while (substr($text, $spaceStart + $spaceLen, 1) === ' ')
 561                                 $spaceLen++;
 562                         if (substr($text, $spaceStart, 1) === "\n" and substr($text, $spaceStart + $spaceLen, 1) === "\n") {
 563                                 # Remove the comment, leading and trailing
 564                                 # spaces, and leave only one newline.
 565                                 $text = substr_replace($text, "\n", $spaceStart, $spaceLen + 1);
 566                         }
 567                         else {
 568                                 # Remove just the comment.
 569                                 $text = substr_replace($text, '', $start, $end - $start);
 570                         }
 571                 }
 572                 wfProfileOut( __METHOD__ );
 573                 return $text;
 574         }
 575
 576         /**
 577          * Take an array of attribute names and values and normalize or discard
 578          * illegal values for the given element type.
 579          *
 580          * - Discards attributes not on a whitelist for the given element
 581          * - Unsafe style attributes are discarded
 582          * - Invalid id attributes are reencoded
 583          *
 584          * @param $attribs Array
 585          * @param $element String
 586          * @return Array
 587          *
 588          * @todo Check for legal values where the DTD limits things.
 589          * @todo Check for unique id attribute :P
 590          */
 591         static function validateTagAttributes( $attribs, $element ) {
 592                 return Sanitizer::validateAttributes( $attribs,
 593                         Sanitizer::attributeWhitelist( $element ) );
 594         }
 595
 596         /**
 597          * Take an array of attribute names and values and normalize or discard
 598          * illegal values for the given whitelist.
 599          *
 600          * - Discards attributes not on the given whitelist
 601          * - Unsafe style attributes are discarded
 602          * - Invalid id attributes are reencoded
 603          *
 604          * @param $attribs Array
 605          * @param $whitelist Array: list of allowed attribute names
 606          * @return Array
 607          *
 608          * @todo Check for legal values where the DTD limits things.
 609          * @todo Check for unique id attribute :P
 610          */
 611         static function validateAttributes( $attribs, $whitelist ) {
 612                 $whitelist = array_flip( $whitelist );
 613                 $out = array();
 614                 foreach( $attribs as $attribute => $value ) {
 615                         if( !isset( $whitelist[$attribute] ) ) {
 616                                 continue;
 617                         }
 618                         # Strip javascript "expression" from stylesheets.
 619                         # http://msdn.microsoft.com/workshop/author/dhtml/overview/recalc.asp
 620                         if( $attribute == 'style' ) {
 621                                 $value = Sanitizer::checkCss( $value );
 622                                 if( $value === false ) {
 623                                         # haxx0r
 624                                         continue;
 625                                 }
 626                         }
 627
 628                         if ( $attribute === 'id' ) {
 629                                 global $wgEnforceHtmlIds;
 630                                 $value = Sanitizer::escapeId( $value,
 631                                         $wgEnforceHtmlIds ? 'noninitial' : 'xml' );
 632                         }
 633
 634                         //RDFa properties allow URIs. check them
 635                         if ( $attribute === 'rel' || $attribute === 'rev' ||
 636                                 $attribute === 'about' || $attribute === 'property' || $attribute === 'resource' ||
 637                                 $attribute === 'datatype' || $attribute === 'typeof' ) {
 638                                 //Paranoia. Allow "simple" values but suppress javascript
 639                                 if ( preg_match( MW_SCRIPT_URL_PATTERN, $value ) ) {
 640                                         continue;
 641                                 }
 642                         }
 643
 644                         // If this attribute was previously set, override it.
 645                         // Output should only have one attribute of each name.
 646                         $out[$attribute] = $value;
 647                 }
 648                 return $out;
 649         }
 650
 651         /**
 652          * Merge two sets of HTML attributes.  Conflicting items in the second set
 653          * will override those in the first, except for 'class' attributes which
 654          * will be combined (if they're both strings).
 655          *
 656          * @todo implement merging for other attributes such as style
 657          * @param $a Array
 658          * @param $b Array
 659          * @return array
 660          */
 661         static function mergeAttributes( $a, $b ) {
 662                 $out = array_merge( $a, $b );
 663                 if( isset( $a['class'] ) && isset( $b['class'] )
 664                 && is_string( $a['class'] ) && is_string( $b['class'] )
 665                 && $a['class'] !== $b['class'] ) {
 666                         $classes = preg_split( '/\s+/', "{$a['class']} {$b['class']}",
 667                                 -1, PREG_SPLIT_NO_EMPTY );
 668                         $out['class'] = implode( ' ', array_unique( $classes ) );
 669                 }
 670                 return $out;
 671         }
 672
 673         /**
 674          * Pick apart some CSS and check it for forbidden or unsafe structures.
 675          * Returns a sanitized string, or false if it was just too evil.
 676          *
 677          * Currently URL references, 'expression', 'tps' are forbidden.
 678          *
 679          * @param $value String
 680          * @return Mixed
 681          */
 682         static function checkCss( $value ) {
 683                 $stripped = Sanitizer::decodeCharReferences( $value );
 684
 685                 // Remove any comments; IE gets token splitting wrong
 686                 $stripped = StringUtils::delimiterReplace( '/*', '*/', ' ', $stripped );
 687
 688                 $value = $stripped;
 689
 690                 // ... and continue checks
 691                 $stripped = preg_replace( '!\\\\([0-9A-Fa-f]{1,6})[ \\n\\r\\t\\f]?!e',
 692                         'codepointToUtf8(hexdec("$1"))', $stripped );
 693                 $stripped = str_replace( '\\', '', $stripped );
 694                 if( preg_match( '/(?:expression|tps*:\/\/|url\\s*\().*/is',
 695                                 $stripped ) ) {
 696                         # haxx0r
 697                         return false;
 698                 }
 699
 700                 return $value;
 701         }
 702
 703         /**
 704          * Take a tag soup fragment listing an HTML element's attributes
 705          * and normalize it to well-formed XML, discarding unwanted attributes.
 706          * Output is safe for further wikitext processing, with escaping of
 707          * values that could trigger problems.
 708          *
 709          * - Normalizes attribute names to lowercase
 710          * - Discards attributes not on a whitelist for the given element
 711          * - Turns broken or invalid entities into plaintext
 712          * - Double-quotes all attribute values
 713          * - Attributes without values are given the name as attribute
 714          * - Double attributes are discarded
 715          * - Unsafe style attributes are discarded
 716          * - Prepends space if there are attributes.
 717          *
 718          * @param $text String
 719          * @param $element String
 720          * @return String
 721          */
 722         static function fixTagAttributes( $text, $element ) {
 723                 if( trim( $text ) == '' ) {
 724                         return '';
 725                 }
 726
 727                 $stripped = Sanitizer::validateTagAttributes(
 728                         Sanitizer::decodeTagAttributes( $text ), $element );
 729
 730                 $attribs = array();
 731                 foreach( $stripped as $attribute => $value ) {
 732                         $encAttribute = htmlspecialchars( $attribute );
 733                         $encValue = Sanitizer::safeEncodeAttribute( $value );
 734
 735                         $attribs[] = "$encAttribute=\"$encValue\"";
 736                 }
 737                 return count( $attribs ) ? ' ' . implode( ' ', $attribs ) : '';
 738         }
 739
 740         /**
 741          * Encode an attribute value for HTML output.
 742          * @param $text String
 743          * @return HTML-encoded text fragment
 744          */
 745         static function encodeAttribute( $text ) {
 746                 $encValue = htmlspecialchars( $text, ENT_QUOTES );
 747
 748                 // Whitespace is normalized during attribute decoding,
 749                 // so if we've been passed non-spaces we must encode them
 750                 // ahead of time or they won't be preserved.
 751                 $encValue = strtr( $encValue, array(
 752                         "\n" => '&#10;',
 753                         "\r" => '&#13;',
 754                         "\t" => '&#9;',
 755                 ) );
 756
 757                 return $encValue;
 758         }
 759
 760         /**
 761          * Encode an attribute value for HTML tags, with extra armoring
 762          * against further wiki processing.
 763          * @param $text String
 764          * @return HTML-encoded text fragment
 765          */
 766         static function safeEncodeAttribute( $text ) {
 767                 $encValue = Sanitizer::encodeAttribute( $text );
 768
 769                 # Templates and links may be expanded in later parsing,
 770                 # creating invalid or dangerous output. Suppress this.
 771                 $encValue = strtr( $encValue, array(
 772                         '<'    => '&lt;',   // This should never happen,
 773                         '>'    => '&gt;',   // we've received invalid input
 774                         '"'    => '&quot;', // which should have been escaped.
 775                         '{'    => '&#123;',
 776                         '['    => '&#91;',
 777                         "''"   => '&#39;&#39;',
 778                         'ISBN' => '&#73;SBN',
 779                         'RFC'  => '&#82;FC',
 780                         'PMID' => '&#80;MID',
 781                         '|'    => '&#124;',
 782                         '__'   => '&#95;_',
 783                 ) );
 784
 785                 # Stupid hack
 786                 $encValue = preg_replace_callback(
 787                         '/(' . wfUrlProtocols() . ')/',
 788                         array( 'Sanitizer', 'armorLinksCallback' ),
 789                         $encValue );
 790                 return $encValue;
 791         }
 792
 793         /**
 794          * Given a value escape it so that it can be used in an id attribute and
 795          * return it, this does not validate the value however (see first link)
 796          *
 797          * @see http://www.w3.org/TR/html401/types.html#type-name Valid characters
 798          *                                                          in the id and
 799          *                                                          name attributes
 800          * @see http://www.w3.org/TR/html401/struct/links.html#h-12.2.3 Anchors with the id attribute
 801          *
 802          * @param $id String: id to validate
 803          * @param $options Mixed: string or array of strings (default is array()):
 804          *   'noninitial': This is a non-initial fragment of an id, not a full id,
 805          *       so don't pay attention if the first character isn't valid at the
 806          *       beginning of an id.
 807          *   'xml': Don't restrict the id to be HTML4-compatible.  This option
 808          *       allows any alphabetic character to be used, per the XML standard.
 809          *       Therefore, it also completely changes the type of escaping: instead
 810          *       of weird dot-encoding, runs of invalid characters (mostly
 811          *       whitespace) are just compressed into a single underscore.
 812          * @return String
 813          */
 814         static function escapeId( $id, $options = array() ) {
 815                 $options = (array)$options;
 816
 817                 if ( !in_array( 'xml', $options ) ) {
 818                         # HTML4-style escaping
 819                         static $replace = array(
 820                                 '%3A' => ':',
 821                                 '%' => '.'
 822                         );
 823
 824                         $id = urlencode( Sanitizer::decodeCharReferences( strtr( $id, ' ', '_' ) ) );
 825                         $id = str_replace( array_keys( $replace ), array_values( $replace ), $id );
 826
 827                         if ( !preg_match( '/^[a-zA-Z]/', $id )
 828                         && !in_array( 'noninitial', $options ) )  {
 829                                 // Initial character must be a letter!
 830                                 $id = "x$id";
 831                         }
 832                         return $id;
 833                 }
 834
 835                 # XML-style escaping.  For the patterns used, see the XML 1.0 standard,
 836                 # 5th edition, NameStartChar and NameChar: <http://www.w3.org/TR/REC-xml/>
 837                 $nameStartChar = ':a-zA-Z_\xC0-\xD6\xD8-\xF6\xF8-\x{2FF}\x{370}-\x{37D}'
 838                         . '\x{37F}-\x{1FFF}\x{200C}-\x{200D}\x{2070}-\x{218F}\x{2C00}-\x{2FEF}'
 839                         . '\x{3001}-\x{D7FF}\x{F900}-\x{FDCF}\x{FDF0}-\x{FFFD}\x{10000}-\x{EFFFF}';
 840                 $nameChar = $nameStartChar . '.\-0-9\xB7\x{0300}-\x{036F}'
 841                         . '\x{203F}-\x{2040}';
 842                 # Replace _ as well so we don't get multiple consecutive underscores
 843                 $id = preg_replace( "/([^$nameChar]|_)+/u", '_', $id );
 844                 $id = trim( $id, '_' );
 845
 846                 if ( !preg_match( "/^[$nameStartChar]/u", $id )
 847                 && !in_array( 'noninitial', $options ) ) {
 848                         $id = "_$id";
 849                 }
 850
 851                 return $id;
 852         }
 853
 854         /**
 855          * Given a value, escape it so that it can be used as a CSS class and
 856          * return it.
 857          *
 858          * @todo For extra validity, input should be validated UTF-8.
 859          *
 860          * @see http://www.w3.org/TR/CSS21/syndata.html Valid characters/format
 861          *
 862          * @param $class String
 863          * @return String
 864          */
 865         static function escapeClass( $class ) {
 866                 // Convert ugly stuff to underscores and kill underscores in ugly places
 867                 return rtrim(preg_replace(
 868                         array('/(^[0-9\\-])|[\\x00-\\x20!"#$%&\'()*+,.\\/:;<=>?@[\\]^`{|}~]|\\xC2\\xA0/','/_+/'),
 869                         '_',
 870                         $class ), '_');
 871         }
 872
 873         /**
 874          * Given HTML input, escape with htmlspecialchars but un-escape entites.
 875          * This allows (generally harmless) entities like &nbsp; to survive.
 876          *
 877          * @param $html String to escape
 878          * @return String: escaped input
 879          */
 880         static function escapeHtmlAllowEntities( $html ) {
 881                 # It seems wise to escape ' as well as ", as a matter of course.  Can't
 882                 # hurt.
 883                 $html = htmlspecialchars( $html, ENT_QUOTES );
 884                 $html = str_replace( '&amp;', '&', $html );
 885                 $html = Sanitizer::normalizeCharReferences( $html );
 886                 return $html;
 887         }
 888
 889         /**
 890          * Regex replace callback for armoring links against further processing.
 891          * @param $matches Array
 892          * @return string
 893          */
 894         private static function armorLinksCallback( $matches ) {
 895                 return str_replace( ':', '&#58;', $matches[1] );
 896         }
 897
 898         /**
 899          * Return an associative array of attribute names and values from
 900          * a partial tag string. Attribute names are forces to lowercase,
 901          * character references are decoded to UTF-8 text.
 902          *
 903          * @param $text String
 904          * @return Array
 905          */
 906         public static function decodeTagAttributes( $text ) {
 907                 $attribs = array();
 908
 909                 if( trim( $text ) == '' ) {
 910                         return $attribs;
 911                 }
 912
 913                 $pairs = array();
 914                 if( !preg_match_all(
 915                         MW_ATTRIBS_REGEX,
 916                         $text,
 917                         $pairs,
 918                         PREG_SET_ORDER ) ) {
 919                         return $attribs;
 920                 }
 921
 922                 foreach( $pairs as $set ) {
 923                         $attribute = strtolower( $set[1] );
 924                         $value = Sanitizer::getTagAttributeCallback( $set );
 925
 926                         // Normalize whitespace
 927                         $value = preg_replace( '/[\t\r\n ]+/', ' ', $value );
 928                         $value = trim( $value );
 929
 930                         // Decode character references
 931                         $attribs[$attribute] = Sanitizer::decodeCharReferences( $value );
 932                 }
 933                 return $attribs;
 934         }
 935
 936         /**
 937          * Pick the appropriate attribute value from a match set from the
 938          * MW_ATTRIBS_REGEX matches.
 939          *
 940          * @param $set Array
 941          * @return String
 942          */
 943         private static function getTagAttributeCallback( $set ) {
 944                 if( isset( $set[6] ) ) {
 945                         # Illegal #XXXXXX color with no quotes.
 946                         return $set[6];
 947                 } elseif( isset( $set[5] ) ) {
 948                         # No quotes.
 949                         return $set[5];
 950                 } elseif( isset( $set[4] ) ) {
 951                         # Single-quoted
 952                         return $set[4];
 953                 } elseif( isset( $set[3] ) ) {
 954                         # Double-quoted
 955                         return $set[3];
 956                 } elseif( !isset( $set[2] ) ) {
 957                         # In XHTML, attributes must have a value.
 958                         # For 'reduced' form, return explicitly the attribute name here.
 959                         return $set[1];
 960                 } else {
 961                         throw new MWException( "Tag conditions not met. This should never happen and is a bug." );
 962                 }
 963         }
 964
 965         /**
 966          * Normalize whitespace and character references in an XML source-
 967          * encoded text for an attribute value.
 968          *
 969          * See http://www.w3.org/TR/REC-xml/#AVNormalize for background,
 970          * but note that we're not returning the value, but are returning
 971          * XML source fragments that will be slapped into output.
 972          *
 973          * @param $text String
 974          * @return String
 975          */
 976         private static function normalizeAttributeValue( $text ) {
 977                 return str_replace( '"', '&quot;',
 978                         self::normalizeWhitespace(
 979                                 Sanitizer::normalizeCharReferences( $text ) ) );
 980         }
 981
 982         private static function normalizeWhitespace( $text ) {
 983                 return preg_replace(
 984                         '/\r\n|[\x20\x0d\x0a\x09]/',
 985                         ' ',
 986                         $text );
 987         }
 988
 989         /**
 990          * Ensure that any entities and character references are legal
 991          * for XML and XHTML specifically. Any stray bits will be
 992          * &amp;-escaped to result in a valid text fragment.
 993          *
 994          * a. any named char refs must be known in XHTML
 995          * b. any numeric char refs must be legal chars, not invalid or forbidden
 996          * c. use &#x, not &#X
 997          * d. fix or reject non-valid attributes
 998          *
 999          * @param $text String
1000          * @return String
1001          * @private
1002          */
1003         static function normalizeCharReferences( $text ) {
1004                 return preg_replace_callback(
1005                         MW_CHAR_REFS_REGEX,
1006                         array( 'Sanitizer', 'normalizeCharReferencesCallback' ),
1007                         $text );
1008         }
1009         /**
1010          * @param $matches String
1011          * @return String
1012          */
1013         static function normalizeCharReferencesCallback( $matches ) {
1014                 $ret = null;
1015                 if( $matches[1] != '' ) {
1016                         $ret = Sanitizer::normalizeEntity( $matches[1] );
1017                 } elseif( $matches[2] != '' ) {
1018                         $ret = Sanitizer::decCharReference( $matches[2] );
1019                 } elseif( $matches[3] != ''  ) {
1020                         $ret = Sanitizer::hexCharReference( $matches[3] );
1021                 } elseif( $matches[4] != '' ) {
1022                         $ret = Sanitizer::hexCharReference( $matches[4] );
1023                 }
1024                 if( is_null( $ret ) ) {
1025                         return htmlspecialchars( $matches[0] );
1026                 } else {
1027                         return $ret;
1028                 }
1029         }
1030
1031         /**
1032          * If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD,
1033          * return the named entity reference as is. If the entity is a
1034          * MediaWiki-specific alias, returns the HTML equivalent. Otherwise,
1035          * returns HTML-escaped text of pseudo-entity source (eg &amp;foo;)
1036          *
1037          * @param $name String
1038          * @return String
1039          */
1040         static function normalizeEntity( $name ) {
1041                 global $wgHtmlEntities, $wgHtmlEntityAliases;
1042                 if ( isset( $wgHtmlEntityAliases[$name] ) ) {
1043                         return "&{$wgHtmlEntityAliases[$name]};";
1044                 } elseif( isset( $wgHtmlEntities[$name] ) ) {
1045                         return "&$name;";
1046                 } else {
1047                         return "&amp;$name;";
1048                 }
1049         }
1050
1051         static function decCharReference( $codepoint ) {
1052                 $point = intval( $codepoint );
1053                 if( Sanitizer::validateCodepoint( $point ) ) {
1054                         return sprintf( '&#%d;', $point );
1055                 } else {
1056                         return null;
1057                 }
1058         }
1059
1060         static function hexCharReference( $codepoint ) {
1061                 $point = hexdec( $codepoint );
1062                 if( Sanitizer::validateCodepoint( $point ) ) {
1063                         return sprintf( '&#x%x;', $point );
1064                 } else {
1065                         return null;
1066                 }
1067         }
1068
1069         /**
1070          * Returns true if a given Unicode codepoint is a valid character in XML.
1071          * @param $codepoint Integer
1072          * @return Boolean
1073          */
1074         private static function validateCodepoint( $codepoint ) {
1075                 return ($codepoint ==    0x09)
1076                         || ($codepoint ==    0x0a)
1077                         || ($codepoint ==    0x0d)
1078                         || ($codepoint >=    0x20 && $codepoint <=   0xd7ff)
1079                         || ($codepoint >=  0xe000 && $codepoint <=   0xfffd)
1080                         || ($codepoint >= 0x10000 && $codepoint <= 0x10ffff);
1081         }
1082
1083         /**
1084          * Decode any character references, numeric or named entities,
1085          * in the text and return a UTF-8 string.
1086          *
1087          * @param $text String
1088          * @return String
1089          */
1090         public static function decodeCharReferences( $text ) {
1091                 return preg_replace_callback(
1092                         MW_CHAR_REFS_REGEX,
1093                         array( 'Sanitizer', 'decodeCharReferencesCallback' ),
1094                         $text );
1095         }
1096
1097         /**
1098          * @param $matches String
1099          * @return String
1100          */
1101         static function decodeCharReferencesCallback( $matches ) {
1102                 if( $matches[1] != '' ) {
1103                         return Sanitizer::decodeEntity( $matches[1] );
1104                 } elseif( $matches[2] != '' ) {
1105                         return  Sanitizer::decodeChar( intval( $matches[2] ) );
1106                 } elseif( $matches[3] != ''  ) {
1107                         return  Sanitizer::decodeChar( hexdec( $matches[3] ) );
1108                 } elseif( $matches[4] != '' ) {
1109                         return  Sanitizer::decodeChar( hexdec( $matches[4] ) );
1110                 }
1111                 # Last case should be an ampersand by itself
1112                 return $matches[0];
1113         }
1114
1115         /**
1116          * Return UTF-8 string for a codepoint if that is a valid
1117          * character reference, otherwise U+FFFD REPLACEMENT CHARACTER.
1118          * @param $codepoint Integer
1119          * @return String
1120          * @private
1121          */
1122         static function decodeChar( $codepoint ) {
1123                 if( Sanitizer::validateCodepoint( $codepoint ) ) {
1124                         return codepointToUtf8( $codepoint );
1125                 } else {
1126                         return UTF8_REPLACEMENT;
1127                 }
1128         }
1129
1130         /**
1131          * If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD,
1132          * return the UTF-8 encoding of that character. Otherwise, returns
1133          * pseudo-entity source (eg &foo;)
1134          *
1135          * @param $name Strings
1136          * @return String
1137          */
1138         static function decodeEntity( $name ) {
1139                 global $wgHtmlEntities, $wgHtmlEntityAliases;
1140                 if ( isset( $wgHtmlEntityAliases[$name] ) ) {
1141                         $name = $wgHtmlEntityAliases[$name];
1142                 }
1143                 if( isset( $wgHtmlEntities[$name] ) ) {
1144                         return codepointToUtf8( $wgHtmlEntities[$name] );
1145                 } else {
1146                         return "&$name;";
1147                 }
1148         }
1149
1150         /**
1151          * Fetch the whitelist of acceptable attributes for a given element name.
1152          *
1153          * @param $element String
1154          * @return Array
1155          */
1156         static function attributeWhitelist( $element ) {
1157                 static $list;
1158                 if( !isset( $list ) ) {
1159                         $list = Sanitizer::setupAttributeWhitelist();
1160                 }
1161                 return isset( $list[$element] )
1162                         ? $list[$element]
1163                         : array();
1164         }
1165
1166         /**
1167          * Foreach array key (an allowed HTML element), return an array
1168          * of allowed attributes
1169          * @return Array
1170          */
1171         static function setupAttributeWhitelist() {
1172                 $common = array( 'id', 'class', 'lang', 'dir', 'title', 'style',
1173                                  #RDFa attributes as specified in section 9 of http://www.w3.org/TR/2008/REC-rdfa-syntax-20081014
1174                                  'about', 'property', 'resource', 'datatype', 'typeof',
1175                                 );
1176
1177                 $block = array_merge( $common, array( 'align' ) );
1178                 $tablealign = array( 'align', 'char', 'charoff', 'valign' );
1179                 $tablecell = array( 'abbr',
1180                                     'axis',
1181                                     'headers',
1182                                     'scope',
1183                                     'rowspan',
1184                                     'colspan',
1185                                     'nowrap', # deprecated
1186                                     'width',  # deprecated
1187                                     'height', # deprecated
1188                                     'bgcolor' # deprecated
1189                                     );
1190
1191                 # Numbers refer to sections in HTML 4.01 standard describing the element.
1192                 # See: http://www.w3.org/TR/html4/
1193                 $whitelist = array (
1194                         # 7.5.4
1195                         'div'        => $block,
1196                         'center'     => $common, # deprecated
1197                         'span'       => $block, # ??
1198
1199                         # 7.5.5
1200                         'h1'         => $block,
1201                         'h2'         => $block,
1202                         'h3'         => $block,
1203                         'h4'         => $block,
1204                         'h5'         => $block,
1205                         'h6'         => $block,
1206
1207                         # 7.5.6
1208                         # address
1209
1210                         # 8.2.4
1211                         # bdo
1212
1213                         # 9.2.1
1214                         'em'         => $common,
1215                         'strong'     => $common,
1216                         'cite'       => $common,
1217                         # dfn
1218                         'code'       => $common,
1219                         # samp
1220                         # kbd
1221                         'var'        => $common,
1222                         'abbr'       => $common,
1223                         # acronym
1224
1225                         # 9.2.2
1226                         'blockquote' => array_merge( $common, array( 'cite' ) ),
1227                         # q
1228
1229                         # 9.2.3
1230                         'sub'        => $common,
1231                         'sup'        => $common,
1232
1233                         # 9.3.1
1234                         'p'          => $block,
1235
1236                         # 9.3.2
1237                         'br'         => array( 'id', 'class', 'title', 'style', 'clear' ),
1238
1239                         # 9.3.4
1240                         'pre'        => array_merge( $common, array( 'width' ) ),
1241
1242                         # 9.4
1243                         'ins'        => array_merge( $common, array( 'cite', 'datetime' ) ),
1244                         'del'        => array_merge( $common, array( 'cite', 'datetime' ) ),
1245
1246                         # 10.2
1247                         'ul'         => array_merge( $common, array( 'type' ) ),
1248                         'ol'         => array_merge( $common, array( 'type', 'start' ) ),
1249                         'li'         => array_merge( $common, array( 'type', 'value' ) ),
1250
1251                         # 10.3
1252                         'dl'         => $common,
1253                         'dd'         => $common,
1254                         'dt'         => $common,
1255
1256                         # 11.2.1
1257                         'table'      => array_merge( $common,
1258                                                                 array( 'summary', 'width', 'border', 'frame',
1259                                                                                 'rules', 'cellspacing', 'cellpadding',
1260                                                                                 'align', 'bgcolor',
1261                                                                 ) ),
1262
1263                         # 11.2.2
1264                         'caption'    => array_merge( $common, array( 'align' ) ),
1265
1266                         # 11.2.3
1267                         'thead'      => array_merge( $common, $tablealign ),
1268                         'tfoot'      => array_merge( $common, $tablealign ),
1269                         'tbody'      => array_merge( $common, $tablealign ),
1270
1271                         # 11.2.4
1272                         'colgroup'   => array_merge( $common, array( 'span', 'width' ), $tablealign ),
1273                         'col'        => array_merge( $common, array( 'span', 'width' ), $tablealign ),
1274
1275                         # 11.2.5
1276                         'tr'         => array_merge( $common, array( 'bgcolor' ), $tablealign ),
1277
1278                         # 11.2.6
1279                         'td'         => array_merge( $common, $tablecell, $tablealign ),
1280                         'th'         => array_merge( $common, $tablecell, $tablealign ),
1281
1282                         # 13.2
1283                         # Not usually allowed, but may be used for extension-style hooks
1284                         # such as <math> when it is rasterized
1285                         'img'        => array_merge( $common, array( 'alt' ) ),
1286
1287                         # 15.2.1
1288                         'tt'         => $common,
1289                         'b'          => $common,
1290                         'i'          => $common,
1291                         'big'        => $common,
1292                         'small'      => $common,
1293                         'strike'     => $common,
1294                         's'          => $common,
1295                         'u'          => $common,
1296
1297                         # 15.2.2
1298                         'font'       => array_merge( $common, array( 'size', 'color', 'face' ) ),
1299                         # basefont
1300
1301                         # 15.3
1302                         'hr'         => array_merge( $common, array( 'noshade', 'size', 'width' ) ),
1303
1304                         # XHTML Ruby annotation text module, simple ruby only.
1305                         # http://www.w3c.org/TR/ruby/
1306                         'ruby'       => $common,
1307                         # rbc
1308                         # rtc
1309                         'rb'         => $common,
1310                         'rt'         => $common, #array_merge( $common, array( 'rbspan' ) ),
1311                         'rp'         => $common,
1312
1313                         # MathML root element, where used for extensions
1314                         # 'title' may not be 100% valid here; it's XHTML
1315                         # http://www.w3.org/TR/REC-MathML/
1316                         'math'       => array( 'class', 'style', 'id', 'title' ),
1317                         );
1318                 return $whitelist;
1319         }
1320
1321         /**
1322          * Take a fragment of (potentially invalid) HTML and return
1323          * a version with any tags removed, encoded as plain text.
1324          *
1325          * Warning: this return value must be further escaped for literal
1326          * inclusion in HTML output as of 1.10!
1327          *
1328          * @param $text String: HTML fragment
1329          * @return String
1330          */
1331         static function stripAllTags( $text ) {
1332                 # Actual <tags>
1333                 $text = StringUtils::delimiterReplace( '<', '>', '', $text );
1334
1335                 # Normalize &entities and whitespace
1336                 $text = self::decodeCharReferences( $text );
1337                 $text = self::normalizeWhitespace( $text );
1338
1339                 return $text;
1340         }
1341
1342         /**
1343          * Hack up a private DOCTYPE with HTML's standard entity declarations.
1344          * PHP 4 seemed to know these if you gave it an HTML doctype, but
1345          * PHP 5.1 doesn't.
1346          *
1347          * Use for passing XHTML fragments to PHP's XML parsing functions
1348          *
1349          * @return String
1350          */
1351         static function hackDocType() {
1352                 global $wgHtmlEntities;
1353                 $out = "<!DOCTYPE html [\n";
1354                 foreach( $wgHtmlEntities as $entity => $codepoint ) {
1355                         $out .= "<!ENTITY $entity \"&#$codepoint;\">";
1356                 }
1357                 $out .= "]>\n";
1358                 return $out;
1359         }
1360
1361         static function cleanUrl( $url ) {
1362                 # Normalize any HTML entities in input. They will be
1363                 # re-escaped by makeExternalLink().
1364                 $url = Sanitizer::decodeCharReferences( $url );
1365
1366                 # Escape any control characters introduced by the above step
1367                 $url = preg_replace( '/[\][<>"\\x00-\\x20\\x7F]/e', "urlencode('\\0')", $url );
1368
1369                 # Validate hostname portion
1370                 $matches = array();
1371                 if( preg_match( '!^([^:]+:)(//[^/]+)?(.*)$!iD', $url, $matches ) ) {
1372                         list( /* $whole */, $protocol, $host, $rest ) = $matches;
1373
1374                         // Characters that will be ignored in IDNs.
1375                         // http://tools.ietf.org/html/3454#section-3.1
1376                         // Strip them before further processing so blacklists and such work.
1377                         $strip = "/
1378                                 \\s|          # general whitespace
1379                                 \xc2\xad|     # 00ad SOFT HYPHEN
1380                                 \xe1\xa0\x86| # 1806 MONGOLIAN TODO SOFT HYPHEN
1381                                 \xe2\x80\x8b| # 200b ZERO WIDTH SPACE
1382                                 \xe2\x81\xa0| # 2060 WORD JOINER
1383                                 \xef\xbb\xbf| # feff ZERO WIDTH NO-BREAK SPACE
1384                                 \xcd\x8f|     # 034f COMBINING GRAPHEME JOINER
1385                                 \xe1\xa0\x8b| # 180b MONGOLIAN FREE VARIATION SELECTOR ONE
1386                                 \xe1\xa0\x8c| # 180c MONGOLIAN FREE VARIATION SELECTOR TWO
1387                                 \xe1\xa0\x8d| # 180d MONGOLIAN FREE VARIATION SELECTOR THREE
1388                                 \xe2\x80\x8c| # 200c ZERO WIDTH NON-JOINER
1389                                 \xe2\x80\x8d| # 200d ZERO WIDTH JOINER
1390                                 [\xef\xb8\x80-\xef\xb8\x8f] # fe00-fe00f VARIATION SELECTOR-1-16
1391                                 /xuD";
1392
1393                         $host = preg_replace( $strip, '', $host );
1394
1395                         // @fixme: validate hostnames here
1396
1397                         return $protocol . $host . $rest;
1398                 } else {
1399                         return $url;
1400                 }
1401         }
1402
1403 }