includes/Sanitizer.php

   1 <?php
   2 /**
   3  * XHTML sanitizer for MediaWiki
   4  *
   5  * Copyright (C) 2002-2005 Brion Vibber <brion@pobox.com> et al
   6  * http://www.mediawiki.org/
   7  *
   8  * This program is free software; you can redistribute it and/or modify
   9  * it under the terms of the GNU General Public License as published by
  10  * the Free Software Foundation; either version 2 of the License, or
  11  * (at your option) any later version.
  12  *
  13  * This program is distributed in the hope that it will be useful,
  14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  16  * GNU General Public License for more details.
  17  *
  18  * You should have received a copy of the GNU General Public License along
  19  * with this program; if not, write to the Free Software Foundation, Inc.,
  20  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  21  * http://www.gnu.org/copyleft/gpl.html
  22  *
  23  * @file
  24  * @ingroup Parser
  25  */
  26
  27 /**
  28  * Regular expression to match various types of character references in
  29  * Sanitizer::normalizeCharReferences and Sanitizer::decodeCharReferences
  30  */
  31 define( 'MW_CHAR_REFS_REGEX',
  32         '/&([A-Za-z0-9\x80-\xff]+);
  33          |&\#([0-9]+);
  34          |&\#x([0-9A-Za-z]+);
  35          |&\#X([0-9A-Za-z]+);
  36          |(&)/x' );
  37
  38 /**
  39  * Regular expression to match HTML/XML attribute pairs within a tag.
  40  * Allows some... latitude.
  41  * Used in Sanitizer::fixTagAttributes and Sanitizer::decodeTagAttributes
  42  */
  43 $attrib = '[A-Za-z0-9]';
  44 $space = '[\x09\x0a\x0d\x20]';
  45 define( 'MW_ATTRIBS_REGEX',
  46         "/(?:^|$space)($attrib+)
  47           ($space*=$space*
  48                 (?:
  49                  # The attribute value: quoted or alone
  50                   \"([^<\"]*)\"
  51                  | '([^<']*)'
  52                  |  ([a-zA-Z0-9!#$%&()*,\\-.\\/:;<>?@[\\]^_`{|}~]+)
  53                  |  (\#[0-9a-fA-F]+) # Technically wrong, but lots of
  54                                                          # colors are specified like this.
  55                                                          # We'll be normalizing it.
  56                 )
  57            )?(?=$space|\$)/sx" );
  58
  59 /**
  60  * Regular expression to match URIs that could trigger script execution
  61  */
  62 define( 'MW_SCRIPT_URL_PATTERN', '/(^|\s)(javascript|vbscript)[^\w]/i' );
  63
  64 /**
  65  * List of all named character entities defined in HTML 4.01
  66  * http://www.w3.org/TR/html4/sgml/entities.html
  67  * @private
  68  */
  69 global $wgHtmlEntities;
  70 $wgHtmlEntities = array(
  71         'Aacute'   => 193,
  72         'aacute'   => 225,
  73         'Acirc'    => 194,
  74         'acirc'    => 226,
  75         'acute'    => 180,
  76         'AElig'    => 198,
  77         'aelig'    => 230,
  78         'Agrave'   => 192,
  79         'agrave'   => 224,
  80         'alefsym'  => 8501,
  81         'Alpha'    => 913,
  82         'alpha'    => 945,
  83         'amp'      => 38,
  84         'and'      => 8743,
  85         'ang'      => 8736,
  86         'Aring'    => 197,
  87         'aring'    => 229,
  88         'asymp'    => 8776,
  89         'Atilde'   => 195,
  90         'atilde'   => 227,
  91         'Auml'     => 196,
  92         'auml'     => 228,
  93         'bdquo'    => 8222,
  94         'Beta'     => 914,
  95         'beta'     => 946,
  96         'brvbar'   => 166,
  97         'bull'     => 8226,
  98         'cap'      => 8745,
  99         'Ccedil'   => 199,
 100         'ccedil'   => 231,
 101         'cedil'    => 184,
 102         'cent'     => 162,
 103         'Chi'      => 935,
 104         'chi'      => 967,
 105         'circ'     => 710,
 106         'clubs'    => 9827,
 107         'cong'     => 8773,
 108         'copy'     => 169,
 109         'crarr'    => 8629,
 110         'cup'      => 8746,
 111         'curren'   => 164,
 112         'dagger'   => 8224,
 113         'Dagger'   => 8225,
 114         'darr'     => 8595,
 115         'dArr'     => 8659,
 116         'deg'      => 176,
 117         'Delta'    => 916,
 118         'delta'    => 948,
 119         'diams'    => 9830,
 120         'divide'   => 247,
 121         'Eacute'   => 201,
 122         'eacute'   => 233,
 123         'Ecirc'    => 202,
 124         'ecirc'    => 234,
 125         'Egrave'   => 200,
 126         'egrave'   => 232,
 127         'empty'    => 8709,
 128         'emsp'     => 8195,
 129         'ensp'     => 8194,
 130         'Epsilon'  => 917,
 131         'epsilon'  => 949,
 132         'equiv'    => 8801,
 133         'Eta'      => 919,
 134         'eta'      => 951,
 135         'ETH'      => 208,
 136         'eth'      => 240,
 137         'Euml'     => 203,
 138         'euml'     => 235,
 139         'euro'     => 8364,
 140         'exist'    => 8707,
 141         'fnof'     => 402,
 142         'forall'   => 8704,
 143         'frac12'   => 189,
 144         'frac14'   => 188,
 145         'frac34'   => 190,
 146         'frasl'    => 8260,
 147         'Gamma'    => 915,
 148         'gamma'    => 947,
 149         'ge'       => 8805,
 150         'gt'       => 62,
 151         'harr'     => 8596,
 152         'hArr'     => 8660,
 153         'hearts'   => 9829,
 154         'hellip'   => 8230,
 155         'Iacute'   => 205,
 156         'iacute'   => 237,
 157         'Icirc'    => 206,
 158         'icirc'    => 238,
 159         'iexcl'    => 161,
 160         'Igrave'   => 204,
 161         'igrave'   => 236,
 162         'image'    => 8465,
 163         'infin'    => 8734,
 164         'int'      => 8747,
 165         'Iota'     => 921,
 166         'iota'     => 953,
 167         'iquest'   => 191,
 168         'isin'     => 8712,
 169         'Iuml'     => 207,
 170         'iuml'     => 239,
 171         'Kappa'    => 922,
 172         'kappa'    => 954,
 173         'Lambda'   => 923,
 174         'lambda'   => 955,
 175         'lang'     => 9001,
 176         'laquo'    => 171,
 177         'larr'     => 8592,
 178         'lArr'     => 8656,
 179         'lceil'    => 8968,
 180         'ldquo'    => 8220,
 181         'le'       => 8804,
 182         'lfloor'   => 8970,
 183         'lowast'   => 8727,
 184         'loz'      => 9674,
 185         'lrm'      => 8206,
 186         'lsaquo'   => 8249,
 187         'lsquo'    => 8216,
 188         'lt'       => 60,
 189         'macr'     => 175,
 190         'mdash'    => 8212,
 191         'micro'    => 181,
 192         'middot'   => 183,
 193         'minus'    => 8722,
 194         'Mu'       => 924,
 195         'mu'       => 956,
 196         'nabla'    => 8711,
 197         'nbsp'     => 160,
 198         'ndash'    => 8211,
 199         'ne'       => 8800,
 200         'ni'       => 8715,
 201         'not'      => 172,
 202         'notin'    => 8713,
 203         'nsub'     => 8836,
 204         'Ntilde'   => 209,
 205         'ntilde'   => 241,
 206         'Nu'       => 925,
 207         'nu'       => 957,
 208         'Oacute'   => 211,
 209         'oacute'   => 243,
 210         'Ocirc'    => 212,
 211         'ocirc'    => 244,
 212         'OElig'    => 338,
 213         'oelig'    => 339,
 214         'Ograve'   => 210,
 215         'ograve'   => 242,
 216         'oline'    => 8254,
 217         'Omega'    => 937,
 218         'omega'    => 969,
 219         'Omicron'  => 927,
 220         'omicron'  => 959,
 221         'oplus'    => 8853,
 222         'or'       => 8744,
 223         'ordf'     => 170,
 224         'ordm'     => 186,
 225         'Oslash'   => 216,
 226         'oslash'   => 248,
 227         'Otilde'   => 213,
 228         'otilde'   => 245,
 229         'otimes'   => 8855,
 230         'Ouml'     => 214,
 231         'ouml'     => 246,
 232         'para'     => 182,
 233         'part'     => 8706,
 234         'permil'   => 8240,
 235         'perp'     => 8869,
 236         'Phi'      => 934,
 237         'phi'      => 966,
 238         'Pi'       => 928,
 239         'pi'       => 960,
 240         'piv'      => 982,
 241         'plusmn'   => 177,
 242         'pound'    => 163,
 243         'prime'    => 8242,
 244         'Prime'    => 8243,
 245         'prod'     => 8719,
 246         'prop'     => 8733,
 247         'Psi'      => 936,
 248         'psi'      => 968,
 249         'quot'     => 34,
 250         'radic'    => 8730,
 251         'rang'     => 9002,
 252         'raquo'    => 187,
 253         'rarr'     => 8594,
 254         'rArr'     => 8658,
 255         'rceil'    => 8969,
 256         'rdquo'    => 8221,
 257         'real'     => 8476,
 258         'reg'      => 174,
 259         'rfloor'   => 8971,
 260         'Rho'      => 929,
 261         'rho'      => 961,
 262         'rlm'      => 8207,
 263         'rsaquo'   => 8250,
 264         'rsquo'    => 8217,
 265         'sbquo'    => 8218,
 266         'Scaron'   => 352,
 267         'scaron'   => 353,
 268         'sdot'     => 8901,
 269         'sect'     => 167,
 270         'shy'      => 173,
 271         'Sigma'    => 931,
 272         'sigma'    => 963,
 273         'sigmaf'   => 962,
 274         'sim'      => 8764,
 275         'spades'   => 9824,
 276         'sub'      => 8834,
 277         'sube'     => 8838,
 278         'sum'      => 8721,
 279         'sup'      => 8835,
 280         'sup1'     => 185,
 281         'sup2'     => 178,
 282         'sup3'     => 179,
 283         'supe'     => 8839,
 284         'szlig'    => 223,
 285         'Tau'      => 932,
 286         'tau'      => 964,
 287         'there4'   => 8756,
 288         'Theta'    => 920,
 289         'theta'    => 952,
 290         'thetasym' => 977,
 291         'thinsp'   => 8201,
 292         'THORN'    => 222,
 293         'thorn'    => 254,
 294         'tilde'    => 732,
 295         'times'    => 215,
 296         'trade'    => 8482,
 297         'Uacute'   => 218,
 298         'uacute'   => 250,
 299         'uarr'     => 8593,
 300         'uArr'     => 8657,
 301         'Ucirc'    => 219,
 302         'ucirc'    => 251,
 303         'Ugrave'   => 217,
 304         'ugrave'   => 249,
 305         'uml'      => 168,
 306         'upsih'    => 978,
 307         'Upsilon'  => 933,
 308         'upsilon'  => 965,
 309         'Uuml'     => 220,
 310         'uuml'     => 252,
 311         'weierp'   => 8472,
 312         'Xi'       => 926,
 313         'xi'       => 958,
 314         'Yacute'   => 221,
 315         'yacute'   => 253,
 316         'yen'      => 165,
 317         'Yuml'     => 376,
 318         'yuml'     => 255,
 319         'Zeta'     => 918,
 320         'zeta'     => 950,
 321         'zwj'      => 8205,
 322         'zwnj'     => 8204 );
 323
 324 /**
 325  * Character entity aliases accepted by MediaWiki
 326  */
 327 global $wgHtmlEntityAliases;
 328 $wgHtmlEntityAliases = array(
 329         'רלמ' => 'rlm',
 330         'رلم' => 'rlm',
 331 );
 332
 333
 334 /**
 335  * XHTML sanitizer for MediaWiki
 336  * @ingroup Parser
 337  */
 338 class Sanitizer {
 339         /**
 340          * Cleans up HTML, removes dangerous tags and attributes, and
 341          * removes HTML comments
 342          * @private
 343          * @param $text String
 344          * @param $processCallback Callback to do any variable or parameter replacements in HTML attribute values
 345          * @param $args Array for the processing callback
 346          * @param $extratags Array for any extra tags to include
 347          * @param $removetags Array for any tags (default or extra) to exclude
 348          * @return string
 349          */
 350         static function removeHTMLtags( $text, $processCallback = null, $args = array(), $extratags = array(), $removetags = array() ) {
 351                 global $wgUseTidy;
 352
 353                 static $htmlpairsStatic, $htmlsingle, $htmlsingleonly, $htmlnest, $tabletags,
 354                         $htmllist, $listtags, $htmlsingleallowed, $htmlelementsStatic, $staticInitialised;
 355
 356                 wfProfileIn( __METHOD__ );
 357
 358                 if ( !$staticInitialised ) {
 359
 360                         $htmlpairsStatic = array( # Tags that must be closed
 361                                 'b', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
 362                                 'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
 363                                 'strike', 'strong', 'tt', 'var', 'div', 'center',
 364                                 'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
 365                                 'ruby', 'rt' , 'rb' , 'rp', 'p', 'span', 'u', 'abbr'
 366                         );
 367                         $htmlsingle = array(
 368                                 'br', 'hr', 'li', 'dt', 'dd'
 369                         );
 370                         $htmlsingleonly = array( # Elements that cannot have close tags
 371                                 'br', 'hr'
 372                         );
 373                         $htmlnest = array( # Tags that can be nested--??
 374                                 'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
 375                                 'dl', 'font', 'big', 'small', 'sub', 'sup', 'span'
 376                         );
 377                         $tabletags = array( # Can only appear inside table, we will close them
 378                                 'td', 'th', 'tr',
 379                         );
 380                         $htmllist = array( # Tags used by list
 381                                 'ul','ol',
 382                         );
 383                         $listtags = array( # Tags that can appear in a list
 384                                 'li',
 385                         );
 386
 387                         $htmlsingleallowed = array_unique( array_merge( $htmlsingle, $tabletags ) );
 388                         $htmlelementsStatic = array_unique( array_merge( $htmlsingle, $htmlpairsStatic, $htmlnest ) );
 389
 390                         # Convert them all to hashtables for faster lookup
 391                         $vars = array( 'htmlpairsStatic', 'htmlsingle', 'htmlsingleonly', 'htmlnest', 'tabletags',
 392                                 'htmllist', 'listtags', 'htmlsingleallowed', 'htmlelementsStatic' );
 393                         foreach ( $vars as $var ) {
 394                                 $$var = array_flip( $$var );
 395                         }
 396                         $staticInitialised = true;
 397                 }
 398                 # Populate $htmlpairs and $htmlelements with the $extratags and $removetags arrays
 399                 $extratags = array_flip( $extratags );
 400                 $removetags = array_flip( $removetags );
 401                 $htmlpairs = array_merge( $extratags, $htmlpairsStatic );
 402                 $htmlelements = array_diff_key( array_merge( $extratags, $htmlelementsStatic ) , $removetags );
 403
 404                 # Remove HTML comments
 405                 $text = Sanitizer::removeHTMLcomments( $text );
 406                 $bits = explode( '<', $text );
 407                 $text = str_replace( '>', '&gt;', array_shift( $bits ) );
 408                 if(!$wgUseTidy) {
 409                         $tagstack = $tablestack = array();
 410                         foreach ( $bits as $x ) {
 411                                 $regs = array();
 412                                 if( preg_match( '!^(/?)(\\w+)([^>]*?)(/{0,1}>)([^<]*)$!', $x, $regs ) ) {
 413                                         list( /* $qbar */, $slash, $t, $params, $brace, $rest ) = $regs;
 414                                 } else {
 415                                         $slash = $t = $params = $brace = $rest = null;
 416                                 }
 417
 418                                 $badtag = 0 ;
 419                                 if ( isset( $htmlelements[$t = strtolower( $t )] ) ) {
 420                                         # Check our stack
 421                                         if ( $slash ) {
 422                                                 # Closing a tag...
 423                                                 if( isset( $htmlsingleonly[$t] ) ) {
 424                                                         $badtag = 1;
 425                                                 } elseif ( ( $ot = @array_pop( $tagstack ) ) != $t ) {
 426                                                         if ( isset( $htmlsingleallowed[$ot] ) ) {
 427                                                                 # Pop all elements with an optional close tag
 428                                                                 # and see if we find a match below them
 429                                                                 $optstack = array();
 430                                                                 array_push ($optstack, $ot);
 431                                                                 while ( ( ( $ot = @array_pop( $tagstack ) ) != $t ) &&
 432                                                                                 isset( $htmlsingleallowed[$ot] ) )
 433                                                                 {
 434                                                                         array_push ($optstack, $ot);
 435                                                                 }
 436                                                                 if ( $t != $ot ) {
 437                                                                         # No match. Push the optinal elements back again
 438                                                                         $badtag = 1;
 439                                                                         while ( $ot = @array_pop( $optstack ) ) {
 440                                                                                 array_push( $tagstack, $ot );
 441                                                                         }
 442                                                                 }
 443                                                         } else {
 444                                                                 @array_push( $tagstack, $ot );
 445                                                                 # <li> can be nested in <ul> or <ol>, skip those cases:
 446                                                                 if(!(isset( $htmllist[$ot] ) && isset( $listtags[$t] ) )) {
 447                                                                         $badtag = 1;
 448                                                                 }
 449                                                         }
 450                                                 } else {
 451                                                         if ( $t == 'table' ) {
 452                                                                 $tagstack = array_pop( $tablestack );
 453                                                         }
 454                                                 }
 455                                                 $newparams = '';
 456                                         } else {
 457                                                 # Keep track for later
 458                                                 if ( isset( $tabletags[$t] ) &&
 459                                                 ! in_array( 'table', $tagstack ) ) {
 460                                                         $badtag = 1;
 461                                                 } else if ( in_array( $t, $tagstack ) &&
 462                                                 ! isset( $htmlnest [$t ] ) ) {
 463                                                         $badtag = 1 ;
 464                                                 # Is it a self closed htmlpair ? (bug 5487)
 465                                                 } else if( $brace == '/>' &&
 466                                                 isset( $htmlpairs[$t] ) ) {
 467                                                         $badtag = 1;
 468                                                 } elseif( isset( $htmlsingleonly[$t] ) ) {
 469                                                         # Hack to force empty tag for uncloseable elements
 470                                                         $brace = '/>';
 471                                                 } else if( isset( $htmlsingle[$t] ) ) {
 472                                                         # Hack to not close $htmlsingle tags
 473                                                         $brace = NULL;
 474                                                 } else if( isset( $tabletags[$t] )
 475                                                 &&  in_array($t ,$tagstack) ) {
 476                                                         // New table tag but forgot to close the previous one
 477                                                         $text .= "</$t>";
 478                                                 } else {
 479                                                         if ( $t == 'table' ) {
 480                                                                 array_push( $tablestack, $tagstack );
 481                                                                 $tagstack = array();
 482                                                         }
 483                                                         array_push( $tagstack, $t );
 484                                                 }
 485
 486                                                 # Replace any variables or template parameters with
 487                                                 # plaintext results.
 488                                                 if( is_callable( $processCallback ) ) {
 489                                                         call_user_func_array( $processCallback, array( &$params, $args ) );
 490                                                 }
 491
 492                                                 # Strip non-approved attributes from the tag
 493                                                 $newparams = Sanitizer::fixTagAttributes( $params, $t );
 494                                         }
 495                                         if ( ! $badtag ) {
 496                                                 $rest = str_replace( '>', '&gt;', $rest );
 497                                                 $close = ( $brace == '/>' && !$slash ) ? ' /' : '';
 498                                                 $text .= "<$slash$t$newparams$close>$rest";
 499                                                 continue;
 500                                         }
 501                                 }
 502                                 $text .= '&lt;' . str_replace( '>', '&gt;', $x);
 503                         }
 504                         # Close off any remaining tags
 505                         while ( is_array( $tagstack ) && ($t = array_pop( $tagstack )) ) {
 506                                 $text .= "</$t>\n";
 507                                 if ( $t == 'table' ) { $tagstack = array_pop( $tablestack ); }
 508                         }
 509                 } else {
 510                         # this might be possible using tidy itself
 511                         foreach ( $bits as $x ) {
 512                                 preg_match( '/^(\\/?)(\\w+)([^>]*?)(\\/{0,1}>)([^<]*)$/',
 513                                 $x, $regs );
 514                                 @list( /* $qbar */, $slash, $t, $params, $brace, $rest ) = $regs;
 515                                 if ( isset( $htmlelements[$t = strtolower( $t )] ) ) {
 516                                         if( is_callable( $processCallback ) ) {
 517                                                 call_user_func_array( $processCallback, array( &$params, $args ) );
 518                                         }
 519                                         $newparams = Sanitizer::fixTagAttributes( $params, $t );
 520                                         $rest = str_replace( '>', '&gt;', $rest );
 521                                         $text .= "<$slash$t$newparams$brace$rest";
 522                                 } else {
 523                                         $text .= '&lt;' . str_replace( '>', '&gt;', $x);
 524                                 }
 525                         }
 526                 }
 527                 wfProfileOut( __METHOD__ );
 528                 return $text;
 529         }
 530
 531         /**
 532          * Remove '<!--', '-->', and everything between.
 533          * To avoid leaving blank lines, when a comment is both preceded
 534          * and followed by a newline (ignoring spaces), trim leading and
 535          * trailing spaces and one of the newlines.
 536          *
 537          * @private
 538          * @param $text String
 539          * @return string
 540          */
 541         static function removeHTMLcomments( $text ) {
 542                 wfProfileIn( __METHOD__ );
 543                 while (($start = strpos($text, '<!--')) !== false) {
 544                         $end = strpos($text, '-->', $start + 4);
 545                         if ($end === false) {
 546                                 # Unterminated comment; bail out
 547                                 break;
 548                         }
 549
 550                         $end += 3;
 551
 552                         # Trim space and newline if the comment is both
 553                         # preceded and followed by a newline
 554                         $spaceStart = max($start - 1, 0);
 555                         $spaceLen = $end - $spaceStart;
 556                         while (substr($text, $spaceStart, 1) === ' ' && $spaceStart > 0) {
 557                                 $spaceStart--;
 558                                 $spaceLen++;
 559                         }
 560                         while (substr($text, $spaceStart + $spaceLen, 1) === ' ')
 561                                 $spaceLen++;
 562                         if (substr($text, $spaceStart, 1) === "\n" and substr($text, $spaceStart + $spaceLen, 1) === "\n") {
 563                                 # Remove the comment, leading and trailing
 564                                 # spaces, and leave only one newline.
 565                                 $text = substr_replace($text, "\n", $spaceStart, $spaceLen + 1);
 566                         }
 567                         else {
 568                                 # Remove just the comment.
 569                                 $text = substr_replace($text, '', $start, $end - $start);
 570                         }
 571                 }
 572                 wfProfileOut( __METHOD__ );
 573                 return $text;
 574         }
 575
 576         /**
 577          * Take an array of attribute names and values and normalize or discard
 578          * illegal values for the given element type.
 579          *
 580          * - Discards attributes not on a whitelist for the given element
 581          * - Unsafe style attributes are discarded
 582          * - Invalid id attributes are reencoded
 583          *
 584          * @param $attribs Array
 585          * @param $element String
 586          * @return Array
 587          *
 588          * @todo Check for legal values where the DTD limits things.
 589          * @todo Check for unique id attribute :P
 590          */
 591         static function validateTagAttributes( $attribs, $element ) {
 592                 return Sanitizer::validateAttributes( $attribs,
 593                         Sanitizer::attributeWhitelist( $element ) );
 594         }
 595
 596         /**
 597          * Take an array of attribute names and values and normalize or discard
 598          * illegal values for the given whitelist.
 599          *
 600          * - Discards attributes not the given whitelist
 601          * - Unsafe style attributes are discarded
 602          * - Invalid id attributes are reencoded
 603          *
 604          * @param $attribs Array
 605          * @param $whitelist Array: list of allowed attribute names
 606          * @return Array
 607          *
 608          * @todo Check for legal values where the DTD limits things.
 609          * @todo Check for unique id attribute :P
 610          */
 611         static function validateAttributes( $attribs, $whitelist ) {
 612                 $whitelist = array_flip( $whitelist );
 613                 $hrefExp = '/^(' . wfUrlProtocols() . ')[^\s]+$/';
 614
 615                 $out = array();
 616                 foreach( $attribs as $attribute => $value ) {
 617                         if( !isset( $whitelist[$attribute] ) ) {
 618                                 continue;
 619                         }
 620                         # Strip javascript "expression" from stylesheets.
 621                         # http://msdn.microsoft.com/workshop/author/dhtml/overview/recalc.asp
 622                         if( $attribute == 'style' ) {
 623                                 $value = Sanitizer::checkCss( $value );
 624                                 if( $value === false ) {
 625                                         # haxx0r
 626                                         continue;
 627                                 }
 628                         }
 629
 630                         if ( $attribute === 'id' ) {
 631                                 global $wgEnforceHtmlIds;
 632                                 $value = Sanitizer::escapeId( $value,
 633                                         $wgEnforceHtmlIds ? 'noninitial' : 'xml' );
 634                         }
 635
 636                         //RDFa properties allow URIs. check them
 637                         if ( $attribute === 'rel' || $attribute === 'rev' ||
 638                                 $attribute === 'about' || $attribute === 'property' || $attribute === 'resource' ||
 639                                 $attribute === 'datatype' || $attribute === 'typeof' ) {
 640                                 //Paranoia. Allow "simple" values but suppress javascript
 641                                 if ( preg_match( MW_SCRIPT_URL_PATTERN, $value ) ) {
 642                                         continue;
 643                                 }
 644                         }
 645
 646                         # NOTE: even though elements using href/src are not allowed directly, supply
 647                         #       validation code that can be used by tag hook handlers, etc
 648                         if ( $attribute === 'href' || $attribute === 'src' ) {
 649                                 if ( !preg_match( $hrefExp, $value ) ) {
 650                                         continue; //drop any href or src attributes not using an allowed protocol.
 651                                                   //NOTE: this also drops all relative URLs
 652                                 }
 653                         }
 654
 655                         // If this attribute was previously set, override it.
 656                         // Output should only have one attribute of each name.
 657                         $out[$attribute] = $value;
 658                 }
 659                 return $out;
 660         }
 661
 662         /**
 663          * Merge two sets of HTML attributes.  Conflicting items in the second set
 664          * will override those in the first, except for 'class' attributes which
 665          * will be combined (if they're both strings).
 666          *
 667          * @todo implement merging for other attributes such as style
 668          * @param $a Array
 669          * @param $b Array
 670          * @return array
 671          */
 672         static function mergeAttributes( $a, $b ) {
 673                 $out = array_merge( $a, $b );
 674                 if( isset( $a['class'] ) && isset( $b['class'] )
 675                 && is_string( $a['class'] ) && is_string( $b['class'] )
 676                 && $a['class'] !== $b['class'] ) {
 677                         $classes = preg_split( '/\s+/', "{$a['class']} {$b['class']}",
 678                                 -1, PREG_SPLIT_NO_EMPTY );
 679                         $out['class'] = implode( ' ', array_unique( $classes ) );
 680                 }
 681                 return $out;
 682         }
 683
 684         /**
 685          * Pick apart some CSS and check it for forbidden or unsafe structures.
 686          * Returns a sanitized string, or false if it was just too evil.
 687          *
 688          * Currently URL references, 'expression', 'tps' are forbidden.
 689          *
 690          * @param $value String
 691          * @return Mixed
 692          */
 693         static function checkCss( $value ) {
 694                 $stripped = Sanitizer::decodeCharReferences( $value );
 695
 696                 // Remove any comments; IE gets token splitting wrong
 697                 $stripped = StringUtils::delimiterReplace( '/*', '*/', ' ', $stripped );
 698
 699                 $value = $stripped;
 700
 701                 // ... and continue checks
 702                 $stripped = preg_replace( '!\\\\([0-9A-Fa-f]{1,6})[ \\n\\r\\t\\f]?!e',
 703                         'codepointToUtf8(hexdec("$1"))', $stripped );
 704                 $stripped = str_replace( '\\', '', $stripped );
 705                 if( preg_match( '/(?:expression|tps*:\/\/|url\\s*\().*/is',
 706                                 $stripped ) ) {
 707                         # haxx0r
 708                         return false;
 709                 }
 710
 711                 return $value;
 712         }
 713
 714         /**
 715          * Take a tag soup fragment listing an HTML element's attributes
 716          * and normalize it to well-formed XML, discarding unwanted attributes.
 717          * Output is safe for further wikitext processing, with escaping of
 718          * values that could trigger problems.
 719          *
 720          * - Normalizes attribute names to lowercase
 721          * - Discards attributes not on a whitelist for the given element
 722          * - Turns broken or invalid entities into plaintext
 723          * - Double-quotes all attribute values
 724          * - Attributes without values are given the name as attribute
 725          * - Double attributes are discarded
 726          * - Unsafe style attributes are discarded
 727          * - Prepends space if there are attributes.
 728          *
 729          * @param $text String
 730          * @param $element String
 731          * @return String
 732          */
 733         static function fixTagAttributes( $text, $element ) {
 734                 if( trim( $text ) == '' ) {
 735                         return '';
 736                 }
 737
 738                 $stripped = Sanitizer::validateTagAttributes(
 739                         Sanitizer::decodeTagAttributes( $text ), $element );
 740
 741                 $attribs = array();
 742                 foreach( $stripped as $attribute => $value ) {
 743                         $encAttribute = htmlspecialchars( $attribute );
 744                         $encValue = Sanitizer::safeEncodeAttribute( $value );
 745
 746                         $attribs[] = "$encAttribute=\"$encValue\"";
 747                 }
 748                 return count( $attribs ) ? ' ' . implode( ' ', $attribs ) : '';
 749         }
 750
 751         /**
 752          * Encode an attribute value for HTML output.
 753          * @param $text String
 754          * @return HTML-encoded text fragment
 755          */
 756         static function encodeAttribute( $text ) {
 757                 $encValue = htmlspecialchars( $text, ENT_QUOTES );
 758
 759                 // Whitespace is normalized during attribute decoding,
 760                 // so if we've been passed non-spaces we must encode them
 761                 // ahead of time or they won't be preserved.
 762                 $encValue = strtr( $encValue, array(
 763                         "\n" => '&#10;',
 764                         "\r" => '&#13;',
 765                         "\t" => '&#9;',
 766                 ) );
 767
 768                 return $encValue;
 769         }
 770
 771         /**
 772          * Encode an attribute value for HTML tags, with extra armoring
 773          * against further wiki processing.
 774          * @param $text String
 775          * @return HTML-encoded text fragment
 776          */
 777         static function safeEncodeAttribute( $text ) {
 778                 $encValue = Sanitizer::encodeAttribute( $text );
 779
 780                 # Templates and links may be expanded in later parsing,
 781                 # creating invalid or dangerous output. Suppress this.
 782                 $encValue = strtr( $encValue, array(
 783                         '<'    => '&lt;',   // This should never happen,
 784                         '>'    => '&gt;',   // we've received invalid input
 785                         '"'    => '&quot;', // which should have been escaped.
 786                         '{'    => '&#123;',
 787                         '['    => '&#91;',
 788                         "''"   => '&#39;&#39;',
 789                         'ISBN' => '&#73;SBN',
 790                         'RFC'  => '&#82;FC',
 791                         'PMID' => '&#80;MID',
 792                         '|'    => '&#124;',
 793                         '__'   => '&#95;_',
 794                 ) );
 795
 796                 # Stupid hack
 797                 $encValue = preg_replace_callback(
 798                         '/(' . wfUrlProtocols() . ')/',
 799                         array( 'Sanitizer', 'armorLinksCallback' ),
 800                         $encValue );
 801                 return $encValue;
 802         }
 803
 804         /**
 805          * Given a value escape it so that it can be used in an id attribute and
 806          * return it, this does not validate the value however (see first link)
 807          *
 808          * @see http://www.w3.org/TR/html401/types.html#type-name Valid characters
 809          *                                                          in the id and
 810          *                                                          name attributes
 811          * @see http://www.w3.org/TR/html401/struct/links.html#h-12.2.3 Anchors with the id attribute
 812          *
 813          * @param $id String: id to validate
 814          * @param $options Mixed: string or array of strings (default is array()):
 815          *   'noninitial': This is a non-initial fragment of an id, not a full id,
 816          *       so don't pay attention if the first character isn't valid at the
 817          *       beginning of an id.
 818          *   'xml': Don't restrict the id to be HTML4-compatible.  This option
 819          *       allows any alphabetic character to be used, per the XML standard.
 820          *       Therefore, it also completely changes the type of escaping: instead
 821          *       of weird dot-encoding, runs of invalid characters (mostly
 822          *       whitespace) are just compressed into a single underscore.
 823          * @return String
 824          */
 825         static function escapeId( $id, $options = array() ) {
 826                 $options = (array)$options;
 827
 828                 if ( !in_array( 'xml', $options ) ) {
 829                         # HTML4-style escaping
 830                         static $replace = array(
 831                                 '%3A' => ':',
 832                                 '%' => '.'
 833                         );
 834
 835                         $id = urlencode( Sanitizer::decodeCharReferences( strtr( $id, ' ', '_' ) ) );
 836                         $id = str_replace( array_keys( $replace ), array_values( $replace ), $id );
 837
 838                         if ( !preg_match( '/^[a-zA-Z]/', $id )
 839                         && !in_array( 'noninitial', $options ) )  {
 840                                 // Initial character must be a letter!
 841                                 $id = "x$id";
 842                         }
 843                         return $id;
 844                 }
 845
 846                 # XML-style escaping.  For the patterns used, see the XML 1.0 standard,
 847                 # 5th edition, NameStartChar and NameChar: <http://www.w3.org/TR/REC-xml/>
 848                 $nameStartChar = ':a-zA-Z_\xC0-\xD6\xD8-\xF6\xF8-\x{2FF}\x{370}-\x{37D}'
 849                         . '\x{37F}-\x{1FFF}\x{200C}-\x{200D}\x{2070}-\x{218F}\x{2C00}-\x{2FEF}'
 850                         . '\x{3001}-\x{D7FF}\x{F900}-\x{FDCF}\x{FDF0}-\x{FFFD}\x{10000}-\x{EFFFF}';
 851                 $nameChar = $nameStartChar . '.\-0-9\xB7\x{0300}-\x{036F}'
 852                         . '\x{203F}-\x{2040}';
 853                 # Replace _ as well so we don't get multiple consecutive underscores
 854                 $id = preg_replace( "/([^$nameChar]|_)+/u", '_', $id );
 855                 $id = trim( $id, '_' );
 856
 857                 if ( !preg_match( "/^[$nameStartChar]/u", $id )
 858                 && !in_array( 'noninitial', $options ) ) {
 859                         $id = "_$id";
 860                 }
 861
 862                 return $id;
 863         }
 864
 865         /**
 866          * Given a value, escape it so that it can be used as a CSS class and
 867          * return it.
 868          *
 869          * @todo For extra validity, input should be validated UTF-8.
 870          *
 871          * @see http://www.w3.org/TR/CSS21/syndata.html Valid characters/format
 872          *
 873          * @param $class String
 874          * @return String
 875          */
 876         static function escapeClass( $class ) {
 877                 // Convert ugly stuff to underscores and kill underscores in ugly places
 878                 return rtrim(preg_replace(
 879                         array('/(^[0-9\\-])|[\\x00-\\x20!"#$%&\'()*+,.\\/:;<=>?@[\\]^`{|}~]|\\xC2\\xA0/','/_+/'),
 880                         '_',
 881                         $class ), '_');
 882         }
 883
 884         /**
 885          * Given HTML input, escape with htmlspecialchars but un-escape entites.
 886          * This allows (generally harmless) entities like &nbsp; to survive.
 887          *
 888          * @param $html String to escape
 889          * @return String: escaped input
 890          */
 891         static function escapeHtmlAllowEntities( $html ) {
 892                 # It seems wise to escape ' as well as ", as a matter of course.  Can't
 893                 # hurt.
 894                 $html = htmlspecialchars( $html, ENT_QUOTES );
 895                 $html = str_replace( '&amp;', '&', $html );
 896                 $html = Sanitizer::normalizeCharReferences( $html );
 897                 return $html;
 898         }
 899
 900         /**
 901          * Regex replace callback for armoring links against further processing.
 902          * @param $matches Array
 903          * @return string
 904          */
 905         private static function armorLinksCallback( $matches ) {
 906                 return str_replace( ':', '&#58;', $matches[1] );
 907         }
 908
 909         /**
 910          * Return an associative array of attribute names and values from
 911          * a partial tag string. Attribute names are forces to lowercase,
 912          * character references are decoded to UTF-8 text.
 913          *
 914          * @param $text String
 915          * @return Array
 916          */
 917         public static function decodeTagAttributes( $text ) {
 918                 $attribs = array();
 919
 920                 if( trim( $text ) == '' ) {
 921                         return $attribs;
 922                 }
 923
 924                 $pairs = array();
 925                 if( !preg_match_all(
 926                         MW_ATTRIBS_REGEX,
 927                         $text,
 928                         $pairs,
 929                         PREG_SET_ORDER ) ) {
 930                         return $attribs;
 931                 }
 932
 933                 foreach( $pairs as $set ) {
 934                         $attribute = strtolower( $set[1] );
 935                         $value = Sanitizer::getTagAttributeCallback( $set );
 936
 937                         // Normalize whitespace
 938                         $value = preg_replace( '/[\t\r\n ]+/', ' ', $value );
 939                         $value = trim( $value );
 940
 941                         // Decode character references
 942                         $attribs[$attribute] = Sanitizer::decodeCharReferences( $value );
 943                 }
 944                 return $attribs;
 945         }
 946
 947         /**
 948          * Pick the appropriate attribute value from a match set from the
 949          * MW_ATTRIBS_REGEX matches.
 950          *
 951          * @param $set Array
 952          * @return String
 953          */
 954         private static function getTagAttributeCallback( $set ) {
 955                 if( isset( $set[6] ) ) {
 956                         # Illegal #XXXXXX color with no quotes.
 957                         return $set[6];
 958                 } elseif( isset( $set[5] ) ) {
 959                         # No quotes.
 960                         return $set[5];
 961                 } elseif( isset( $set[4] ) ) {
 962                         # Single-quoted
 963                         return $set[4];
 964                 } elseif( isset( $set[3] ) ) {
 965                         # Double-quoted
 966                         return $set[3];
 967                 } elseif( !isset( $set[2] ) ) {
 968                         # In XHTML, attributes must have a value.
 969                         # For 'reduced' form, return explicitly the attribute name here.
 970                         return $set[1];
 971                 } else {
 972                         throw new MWException( "Tag conditions not met. This should never happen and is a bug." );
 973                 }
 974         }
 975
 976         /**
 977          * Normalize whitespace and character references in an XML source-
 978          * encoded text for an attribute value.
 979          *
 980          * See http://www.w3.org/TR/REC-xml/#AVNormalize for background,
 981          * but note that we're not returning the value, but are returning
 982          * XML source fragments that will be slapped into output.
 983          *
 984          * @param $text String
 985          * @return String
 986          */
 987         private static function normalizeAttributeValue( $text ) {
 988                 return str_replace( '"', '&quot;',
 989                         self::normalizeWhitespace(
 990                                 Sanitizer::normalizeCharReferences( $text ) ) );
 991         }
 992
 993         private static function normalizeWhitespace( $text ) {
 994                 return preg_replace(
 995                         '/\r\n|[\x20\x0d\x0a\x09]/',
 996                         ' ',
 997                         $text );
 998         }
 999
1000         /**
1001          * Ensure that any entities and character references are legal
1002          * for XML and XHTML specifically. Any stray bits will be
1003          * &amp;-escaped to result in a valid text fragment.
1004          *
1005          * a. any named char refs must be known in XHTML
1006          * b. any numeric char refs must be legal chars, not invalid or forbidden
1007          * c. use &#x, not &#X
1008          * d. fix or reject non-valid attributes
1009          *
1010          * @param $text String
1011          * @return String
1012          * @private
1013          */
1014         static function normalizeCharReferences( $text ) {
1015                 return preg_replace_callback(
1016                         MW_CHAR_REFS_REGEX,
1017                         array( 'Sanitizer', 'normalizeCharReferencesCallback' ),
1018                         $text );
1019         }
1020         /**
1021          * @param $matches String
1022          * @return String
1023          */
1024         static function normalizeCharReferencesCallback( $matches ) {
1025                 $ret = null;
1026                 if( $matches[1] != '' ) {
1027                         $ret = Sanitizer::normalizeEntity( $matches[1] );
1028                 } elseif( $matches[2] != '' ) {
1029                         $ret = Sanitizer::decCharReference( $matches[2] );
1030                 } elseif( $matches[3] != ''  ) {
1031                         $ret = Sanitizer::hexCharReference( $matches[3] );
1032                 } elseif( $matches[4] != '' ) {
1033                         $ret = Sanitizer::hexCharReference( $matches[4] );
1034                 }
1035                 if( is_null( $ret ) ) {
1036                         return htmlspecialchars( $matches[0] );
1037                 } else {
1038                         return $ret;
1039                 }
1040         }
1041
1042         /**
1043          * If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD,
1044          * return the named entity reference as is. If the entity is a
1045          * MediaWiki-specific alias, returns the HTML equivalent. Otherwise,
1046          * returns HTML-escaped text of pseudo-entity source (eg &amp;foo;)
1047          *
1048          * @param $name String
1049          * @return String
1050          */
1051         static function normalizeEntity( $name ) {
1052                 global $wgHtmlEntities, $wgHtmlEntityAliases;
1053                 if ( isset( $wgHtmlEntityAliases[$name] ) ) {
1054                         return "&{$wgHtmlEntityAliases[$name]};";
1055                 } elseif( isset( $wgHtmlEntities[$name] ) ) {
1056                         return "&$name;";
1057                 } else {
1058                         return "&amp;$name;";
1059                 }
1060         }
1061
1062         static function decCharReference( $codepoint ) {
1063                 $point = intval( $codepoint );
1064                 if( Sanitizer::validateCodepoint( $point ) ) {
1065                         return sprintf( '&#%d;', $point );
1066                 } else {
1067                         return null;
1068                 }
1069         }
1070
1071         static function hexCharReference( $codepoint ) {
1072                 $point = hexdec( $codepoint );
1073                 if( Sanitizer::validateCodepoint( $point ) ) {
1074                         return sprintf( '&#x%x;', $point );
1075                 } else {
1076                         return null;
1077                 }
1078         }
1079
1080         /**
1081          * Returns true if a given Unicode codepoint is a valid character in XML.
1082          * @param $codepoint Integer
1083          * @return Boolean
1084          */
1085         private static function validateCodepoint( $codepoint ) {
1086                 return ($codepoint ==    0x09)
1087                         || ($codepoint ==    0x0a)
1088                         || ($codepoint ==    0x0d)
1089                         || ($codepoint >=    0x20 && $codepoint <=   0xd7ff)
1090                         || ($codepoint >=  0xe000 && $codepoint <=   0xfffd)
1091                         || ($codepoint >= 0x10000 && $codepoint <= 0x10ffff);
1092         }
1093
1094         /**
1095          * Decode any character references, numeric or named entities,
1096          * in the text and return a UTF-8 string.
1097          *
1098          * @param $text String
1099          * @return String
1100          */
1101         public static function decodeCharReferences( $text ) {
1102                 return preg_replace_callback(
1103                         MW_CHAR_REFS_REGEX,
1104                         array( 'Sanitizer', 'decodeCharReferencesCallback' ),
1105                         $text );
1106         }
1107
1108         /**
1109          * @param $matches String
1110          * @return String
1111          */
1112         static function decodeCharReferencesCallback( $matches ) {
1113                 if( $matches[1] != '' ) {
1114                         return Sanitizer::decodeEntity( $matches[1] );
1115                 } elseif( $matches[2] != '' ) {
1116                         return  Sanitizer::decodeChar( intval( $matches[2] ) );
1117                 } elseif( $matches[3] != ''  ) {
1118                         return  Sanitizer::decodeChar( hexdec( $matches[3] ) );
1119                 } elseif( $matches[4] != '' ) {
1120                         return  Sanitizer::decodeChar( hexdec( $matches[4] ) );
1121                 }
1122                 # Last case should be an ampersand by itself
1123                 return $matches[0];
1124         }
1125
1126         /**
1127          * Return UTF-8 string for a codepoint if that is a valid
1128          * character reference, otherwise U+FFFD REPLACEMENT CHARACTER.
1129          * @param $codepoint Integer
1130          * @return String
1131          * @private
1132          */
1133         static function decodeChar( $codepoint ) {
1134                 if( Sanitizer::validateCodepoint( $codepoint ) ) {
1135                         return codepointToUtf8( $codepoint );
1136                 } else {
1137                         return UTF8_REPLACEMENT;
1138                 }
1139         }
1140
1141         /**
1142          * If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD,
1143          * return the UTF-8 encoding of that character. Otherwise, returns
1144          * pseudo-entity source (eg &foo;)
1145          *
1146          * @param $name Strings
1147          * @return String
1148          */
1149         static function decodeEntity( $name ) {
1150                 global $wgHtmlEntities, $wgHtmlEntityAliases;
1151                 if ( isset( $wgHtmlEntityAliases[$name] ) ) {
1152                         $name = $wgHtmlEntityAliases[$name];
1153                 }
1154                 if( isset( $wgHtmlEntities[$name] ) ) {
1155                         return codepointToUtf8( $wgHtmlEntities[$name] );
1156                 } else {
1157                         return "&$name;";
1158                 }
1159         }
1160
1161         /**
1162          * Fetch the whitelist of acceptable attributes for a given element name.
1163          *
1164          * @param $element String
1165          * @return Array
1166          */
1167         static function attributeWhitelist( $element ) {
1168                 static $list;
1169                 if( !isset( $list ) ) {
1170                         $list = Sanitizer::setupAttributeWhitelist();
1171                 }
1172                 return isset( $list[$element] )
1173                         ? $list[$element]
1174                         : array();
1175         }
1176
1177         /**
1178          * Foreach array key (an allowed HTML element), return an array
1179          * of allowed attributes
1180          * @return Array
1181          */
1182         static function setupAttributeWhitelist() {
1183                 $common = array( 'id', 'class', 'lang', 'dir', 'title', 'style',
1184                                  #RDFa attributes as specified in section 9 of http://www.w3.org/TR/2008/REC-rdfa-syntax-20081014
1185                                  'about', 'property', 'resource', 'datatype', 'typeof',
1186                                 );
1187
1188                 $block = array_merge( $common, array( 'align' ) );
1189                 $tablealign = array( 'align', 'char', 'charoff', 'valign' );
1190                 $tablecell = array( 'abbr',
1191                                     'axis',
1192                                     'headers',
1193                                     'scope',
1194                                     'rowspan',
1195                                     'colspan',
1196                                     'nowrap', # deprecated
1197                                     'width',  # deprecated
1198                                     'height', # deprecated
1199                                     'bgcolor' # deprecated
1200                                     );
1201
1202                 # Numbers refer to sections in HTML 4.01 standard describing the element.
1203                 # See: http://www.w3.org/TR/html4/
1204                 $whitelist = array (
1205                         # 7.5.4
1206                         'div'        => $block,
1207                         'center'     => $common, # deprecated
1208                         'span'       => $block, # ??
1209
1210                         # 7.5.5
1211                         'h1'         => $block,
1212                         'h2'         => $block,
1213                         'h3'         => $block,
1214                         'h4'         => $block,
1215                         'h5'         => $block,
1216                         'h6'         => $block,
1217
1218                         # 7.5.6
1219                         # address
1220
1221                         # 8.2.4
1222                         # bdo
1223
1224                         # 9.2.1
1225                         'em'         => $common,
1226                         'strong'     => $common,
1227                         'cite'       => $common,
1228                         # dfn
1229                         'code'       => $common,
1230                         # samp
1231                         # kbd
1232                         'var'        => $common,
1233                         'abbr'       => $common,
1234                         # acronym
1235
1236                         # 9.2.2
1237                         'blockquote' => array_merge( $common, array( 'cite' ) ),
1238                         # q
1239
1240                         # 9.2.3
1241                         'sub'        => $common,
1242                         'sup'        => $common,
1243
1244                         # 9.3.1
1245                         'p'          => $block,
1246
1247                         # 9.3.2
1248                         'br'         => array( 'id', 'class', 'title', 'style', 'clear' ),
1249
1250                         # 9.3.4
1251                         'pre'        => array_merge( $common, array( 'width' ) ),
1252
1253                         # 9.4
1254                         'ins'        => array_merge( $common, array( 'cite', 'datetime' ) ),
1255                         'del'        => array_merge( $common, array( 'cite', 'datetime' ) ),
1256
1257                         # 10.2
1258                         'ul'         => array_merge( $common, array( 'type' ) ),
1259                         'ol'         => array_merge( $common, array( 'type', 'start' ) ),
1260                         'li'         => array_merge( $common, array( 'type', 'value' ) ),
1261
1262                         # 10.3
1263                         'dl'         => $common,
1264                         'dd'         => $common,
1265                         'dt'         => $common,
1266
1267                         # 11.2.1
1268                         'table'      => array_merge( $common,
1269                                                                 array( 'summary', 'width', 'border', 'frame',
1270                                                                                 'rules', 'cellspacing', 'cellpadding',
1271                                                                                 'align', 'bgcolor',
1272                                                                 ) ),
1273
1274                         # 11.2.2
1275                         'caption'    => array_merge( $common, array( 'align' ) ),
1276
1277                         # 11.2.3
1278                         'thead'      => array_merge( $common, $tablealign ),
1279                         'tfoot'      => array_merge( $common, $tablealign ),
1280                         'tbody'      => array_merge( $common, $tablealign ),
1281
1282                         # 11.2.4
1283                         'colgroup'   => array_merge( $common, array( 'span', 'width' ), $tablealign ),
1284                         'col'        => array_merge( $common, array( 'span', 'width' ), $tablealign ),
1285
1286                         # 11.2.5
1287                         'tr'         => array_merge( $common, array( 'bgcolor' ), $tablealign ),
1288
1289                         # 11.2.6
1290                         'td'         => array_merge( $common, $tablecell, $tablealign ),
1291                         'th'         => array_merge( $common, $tablecell, $tablealign ),
1292
1293                         # 12.2 # NOTE: <a> is not allowed directly, but the attrib whitelist is used from the Parser object
1294                         'a'          => array_merge( $common, array( 'href', 'rel', 'rev' ) ), # rel/rev esp. for RDFa
1295
1296                         # 13.2
1297                         # Not usually allowed, but may be used for extension-style hooks
1298                         # such as <math> when it is rasterized
1299                         'img'        => array_merge( $common, array( 'alt' ) ),
1300
1301                         # 15.2.1
1302                         'tt'         => $common,
1303                         'b'          => $common,
1304                         'i'          => $common,
1305                         'big'        => $common,
1306                         'small'      => $common,
1307                         'strike'     => $common,
1308                         's'          => $common,
1309                         'u'          => $common,
1310
1311                         # 15.2.2
1312                         'font'       => array_merge( $common, array( 'size', 'color', 'face' ) ),
1313                         # basefont
1314
1315                         # 15.3
1316                         'hr'         => array_merge( $common, array( 'noshade', 'size', 'width' ) ),
1317
1318                         # XHTML Ruby annotation text module, simple ruby only.
1319                         # http://www.w3c.org/TR/ruby/
1320                         'ruby'       => $common,
1321                         # rbc
1322                         # rtc
1323                         'rb'         => $common,
1324                         'rt'         => $common, #array_merge( $common, array( 'rbspan' ) ),
1325                         'rp'         => $common,
1326
1327                         # MathML root element, where used for extensions
1328                         # 'title' may not be 100% valid here; it's XHTML
1329                         # http://www.w3.org/TR/REC-MathML/
1330                         'math'       => array( 'class', 'style', 'id', 'title' ),
1331                         );
1332                 return $whitelist;
1333         }
1334
1335         /**
1336          * Take a fragment of (potentially invalid) HTML and return
1337          * a version with any tags removed, encoded as plain text.
1338          *
1339          * Warning: this return value must be further escaped for literal
1340          * inclusion in HTML output as of 1.10!
1341          *
1342          * @param $text String: HTML fragment
1343          * @return String
1344          */
1345         static function stripAllTags( $text ) {
1346                 # Actual <tags>
1347                 $text = StringUtils::delimiterReplace( '<', '>', '', $text );
1348
1349                 # Normalize &entities and whitespace
1350                 $text = self::decodeCharReferences( $text );
1351                 $text = self::normalizeWhitespace( $text );
1352
1353                 return $text;
1354         }
1355
1356         /**
1357          * Hack up a private DOCTYPE with HTML's standard entity declarations.
1358          * PHP 4 seemed to know these if you gave it an HTML doctype, but
1359          * PHP 5.1 doesn't.
1360          *
1361          * Use for passing XHTML fragments to PHP's XML parsing functions
1362          *
1363          * @return String
1364          */
1365         static function hackDocType() {
1366                 global $wgHtmlEntities;
1367                 $out = "<!DOCTYPE html [\n";
1368                 foreach( $wgHtmlEntities as $entity => $codepoint ) {
1369                         $out .= "<!ENTITY $entity \"&#$codepoint;\">";
1370                 }
1371                 $out .= "]>\n";
1372                 return $out;
1373         }
1374
1375         static function cleanUrl( $url ) {
1376                 # Normalize any HTML entities in input. They will be
1377                 # re-escaped by makeExternalLink().
1378                 $url = Sanitizer::decodeCharReferences( $url );
1379
1380                 # Escape any control characters introduced by the above step
1381                 $url = preg_replace( '/[\][<>"\\x00-\\x20\\x7F]/e', "urlencode('\\0')", $url );
1382
1383                 # Validate hostname portion
1384                 $matches = array();
1385                 if( preg_match( '!^([^:]+:)(//[^/]+)?(.*)$!iD', $url, $matches ) ) {
1386                         list( /* $whole */, $protocol, $host, $rest ) = $matches;
1387
1388                         // Characters that will be ignored in IDNs.
1389                         // http://tools.ietf.org/html/3454#section-3.1
1390                         // Strip them before further processing so blacklists and such work.
1391                         $strip = "/
1392                                 \\s|          # general whitespace
1393                                 \xc2\xad|     # 00ad SOFT HYPHEN
1394                                 \xe1\xa0\x86| # 1806 MONGOLIAN TODO SOFT HYPHEN
1395                                 \xe2\x80\x8b| # 200b ZERO WIDTH SPACE
1396                                 \xe2\x81\xa0| # 2060 WORD JOINER
1397                                 \xef\xbb\xbf| # feff ZERO WIDTH NO-BREAK SPACE
1398                                 \xcd\x8f|     # 034f COMBINING GRAPHEME JOINER
1399                                 \xe1\xa0\x8b| # 180b MONGOLIAN FREE VARIATION SELECTOR ONE
1400                                 \xe1\xa0\x8c| # 180c MONGOLIAN FREE VARIATION SELECTOR TWO
1401                                 \xe1\xa0\x8d| # 180d MONGOLIAN FREE VARIATION SELECTOR THREE
1402                                 \xe2\x80\x8c| # 200c ZERO WIDTH NON-JOINER
1403                                 \xe2\x80\x8d| # 200d ZERO WIDTH JOINER
1404                                 [\xef\xb8\x80-\xef\xb8\x8f] # fe00-fe00f VARIATION SELECTOR-1-16
1405                                 /xuD";
1406
1407                         $host = preg_replace( $strip, '', $host );
1408
1409                         // @fixme: validate hostnames here
1410
1411                         return $protocol . $host . $rest;
1412                 } else {
1413                         return $url;
1414                 }
1415         }
1416
1417 }