includes/Sanitizer.php

   1 <?php
   2 /**
   3  * XHTML sanitizer for MediaWiki
   4  *
   5  * Copyright (C) 2002-2005 Brion Vibber <brion@pobox.com> et al
   6  * http://www.mediawiki.org/
   7  *
   8  * This program is free software; you can redistribute it and/or modify
   9  * it under the terms of the GNU General Public License as published by
  10  * the Free Software Foundation; either version 2 of the License, or
  11  * (at your option) any later version.
  12  *
  13  * This program is distributed in the hope that it will be useful,
  14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  16  * GNU General Public License for more details.
  17  *
  18  * You should have received a copy of the GNU General Public License along
  19  * with this program; if not, write to the Free Software Foundation, Inc.,
  20  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  21  * http://www.gnu.org/copyleft/gpl.html
  22  *
  23  * @file
  24  * @ingroup Parser
  25  */
  26
  27 /**
  28  * Regular expression to match various types of character references in
  29  * Sanitizer::normalizeCharReferences and Sanitizer::decodeCharReferences
  30  */
  31 define( 'MW_CHAR_REFS_REGEX',
  32         '/&([A-Za-z0-9\x80-\xff]+);
  33          |&\#([0-9]+);
  34          |&\#x([0-9A-Za-z]+);
  35          |&\#X([0-9A-Za-z]+);
  36          |(&)/x' );
  37
  38 /**
  39  * Regular expression to match HTML/XML attribute pairs within a tag.
  40  * Allows some... latitude.
  41  * Used in Sanitizer::fixTagAttributes and Sanitizer::decodeTagAttributes
  42  */
  43 $attrib = '[A-Za-z0-9]';
  44 $space = '[\x09\x0a\x0d\x20]';
  45 define( 'MW_ATTRIBS_REGEX',
  46         "/(?:^|$space)((?:xml:|xmlns:)?$attrib+)
  47           ($space*=$space*
  48                 (?:
  49                  # The attribute value: quoted or alone
  50                   \"([^<\"]*)\"
  51                  | '([^<']*)'
  52                  |  ([a-zA-Z0-9!#$%&()*,\\-.\\/:;<>?@[\\]^_`{|}~]+)
  53                  |  (\#[0-9a-fA-F]+) # Technically wrong, but lots of
  54                                                          # colors are specified like this.
  55                                                          # We'll be normalizing it.
  56                 )
  57            )?(?=$space|\$)/sx" );
  58
  59 /**
  60  * Regular expression to match URIs that could trigger script execution
  61  */
  62 define( 'MW_EVIL_URI_PATTERN', '!(^|\s|\*/\s*)(javascript|vbscript)([^\w]|$)!i' );
  63
  64 /**
  65  * Regular expression to match namespace attributes
  66  */
  67 define( 'MW_XMLNS_ATTRIBUTE_PATTRN', "/^xmlns:$attrib+$/" );
  68
  69 /**
  70  * List of all named character entities defined in HTML 4.01
  71  * http://www.w3.org/TR/html4/sgml/entities.html
  72  * @private
  73  */
  74 global $wgHtmlEntities;
  75 $wgHtmlEntities = array(
  76         'Aacute'   => 193,
  77         'aacute'   => 225,
  78         'Acirc'    => 194,
  79         'acirc'    => 226,
  80         'acute'    => 180,
  81         'AElig'    => 198,
  82         'aelig'    => 230,
  83         'Agrave'   => 192,
  84         'agrave'   => 224,
  85         'alefsym'  => 8501,
  86         'Alpha'    => 913,
  87         'alpha'    => 945,
  88         'amp'      => 38,
  89         'and'      => 8743,
  90         'ang'      => 8736,
  91         'Aring'    => 197,
  92         'aring'    => 229,
  93         'asymp'    => 8776,
  94         'Atilde'   => 195,
  95         'atilde'   => 227,
  96         'Auml'     => 196,
  97         'auml'     => 228,
  98         'bdquo'    => 8222,
  99         'Beta'     => 914,
 100         'beta'     => 946,
 101         'brvbar'   => 166,
 102         'bull'     => 8226,
 103         'cap'      => 8745,
 104         'Ccedil'   => 199,
 105         'ccedil'   => 231,
 106         'cedil'    => 184,
 107         'cent'     => 162,
 108         'Chi'      => 935,
 109         'chi'      => 967,
 110         'circ'     => 710,
 111         'clubs'    => 9827,
 112         'cong'     => 8773,
 113         'copy'     => 169,
 114         'crarr'    => 8629,
 115         'cup'      => 8746,
 116         'curren'   => 164,
 117         'dagger'   => 8224,
 118         'Dagger'   => 8225,
 119         'darr'     => 8595,
 120         'dArr'     => 8659,
 121         'deg'      => 176,
 122         'Delta'    => 916,
 123         'delta'    => 948,
 124         'diams'    => 9830,
 125         'divide'   => 247,
 126         'Eacute'   => 201,
 127         'eacute'   => 233,
 128         'Ecirc'    => 202,
 129         'ecirc'    => 234,
 130         'Egrave'   => 200,
 131         'egrave'   => 232,
 132         'empty'    => 8709,
 133         'emsp'     => 8195,
 134         'ensp'     => 8194,
 135         'Epsilon'  => 917,
 136         'epsilon'  => 949,
 137         'equiv'    => 8801,
 138         'Eta'      => 919,
 139         'eta'      => 951,
 140         'ETH'      => 208,
 141         'eth'      => 240,
 142         'Euml'     => 203,
 143         'euml'     => 235,
 144         'euro'     => 8364,
 145         'exist'    => 8707,
 146         'fnof'     => 402,
 147         'forall'   => 8704,
 148         'frac12'   => 189,
 149         'frac14'   => 188,
 150         'frac34'   => 190,
 151         'frasl'    => 8260,
 152         'Gamma'    => 915,
 153         'gamma'    => 947,
 154         'ge'       => 8805,
 155         'gt'       => 62,
 156         'harr'     => 8596,
 157         'hArr'     => 8660,
 158         'hearts'   => 9829,
 159         'hellip'   => 8230,
 160         'Iacute'   => 205,
 161         'iacute'   => 237,
 162         'Icirc'    => 206,
 163         'icirc'    => 238,
 164         'iexcl'    => 161,
 165         'Igrave'   => 204,
 166         'igrave'   => 236,
 167         'image'    => 8465,
 168         'infin'    => 8734,
 169         'int'      => 8747,
 170         'Iota'     => 921,
 171         'iota'     => 953,
 172         'iquest'   => 191,
 173         'isin'     => 8712,
 174         'Iuml'     => 207,
 175         'iuml'     => 239,
 176         'Kappa'    => 922,
 177         'kappa'    => 954,
 178         'Lambda'   => 923,
 179         'lambda'   => 955,
 180         'lang'     => 9001,
 181         'laquo'    => 171,
 182         'larr'     => 8592,
 183         'lArr'     => 8656,
 184         'lceil'    => 8968,
 185         'ldquo'    => 8220,
 186         'le'       => 8804,
 187         'lfloor'   => 8970,
 188         'lowast'   => 8727,
 189         'loz'      => 9674,
 190         'lrm'      => 8206,
 191         'lsaquo'   => 8249,
 192         'lsquo'    => 8216,
 193         'lt'       => 60,
 194         'macr'     => 175,
 195         'mdash'    => 8212,
 196         'micro'    => 181,
 197         'middot'   => 183,
 198         'minus'    => 8722,
 199         'Mu'       => 924,
 200         'mu'       => 956,
 201         'nabla'    => 8711,
 202         'nbsp'     => 160,
 203         'ndash'    => 8211,
 204         'ne'       => 8800,
 205         'ni'       => 8715,
 206         'not'      => 172,
 207         'notin'    => 8713,
 208         'nsub'     => 8836,
 209         'Ntilde'   => 209,
 210         'ntilde'   => 241,
 211         'Nu'       => 925,
 212         'nu'       => 957,
 213         'Oacute'   => 211,
 214         'oacute'   => 243,
 215         'Ocirc'    => 212,
 216         'ocirc'    => 244,
 217         'OElig'    => 338,
 218         'oelig'    => 339,
 219         'Ograve'   => 210,
 220         'ograve'   => 242,
 221         'oline'    => 8254,
 222         'Omega'    => 937,
 223         'omega'    => 969,
 224         'Omicron'  => 927,
 225         'omicron'  => 959,
 226         'oplus'    => 8853,
 227         'or'       => 8744,
 228         'ordf'     => 170,
 229         'ordm'     => 186,
 230         'Oslash'   => 216,
 231         'oslash'   => 248,
 232         'Otilde'   => 213,
 233         'otilde'   => 245,
 234         'otimes'   => 8855,
 235         'Ouml'     => 214,
 236         'ouml'     => 246,
 237         'para'     => 182,
 238         'part'     => 8706,
 239         'permil'   => 8240,
 240         'perp'     => 8869,
 241         'Phi'      => 934,
 242         'phi'      => 966,
 243         'Pi'       => 928,
 244         'pi'       => 960,
 245         'piv'      => 982,
 246         'plusmn'   => 177,
 247         'pound'    => 163,
 248         'prime'    => 8242,
 249         'Prime'    => 8243,
 250         'prod'     => 8719,
 251         'prop'     => 8733,
 252         'Psi'      => 936,
 253         'psi'      => 968,
 254         'quot'     => 34,
 255         'radic'    => 8730,
 256         'rang'     => 9002,
 257         'raquo'    => 187,
 258         'rarr'     => 8594,
 259         'rArr'     => 8658,
 260         'rceil'    => 8969,
 261         'rdquo'    => 8221,
 262         'real'     => 8476,
 263         'reg'      => 174,
 264         'rfloor'   => 8971,
 265         'Rho'      => 929,
 266         'rho'      => 961,
 267         'rlm'      => 8207,
 268         'rsaquo'   => 8250,
 269         'rsquo'    => 8217,
 270         'sbquo'    => 8218,
 271         'Scaron'   => 352,
 272         'scaron'   => 353,
 273         'sdot'     => 8901,
 274         'sect'     => 167,
 275         'shy'      => 173,
 276         'Sigma'    => 931,
 277         'sigma'    => 963,
 278         'sigmaf'   => 962,
 279         'sim'      => 8764,
 280         'spades'   => 9824,
 281         'sub'      => 8834,
 282         'sube'     => 8838,
 283         'sum'      => 8721,
 284         'sup'      => 8835,
 285         'sup1'     => 185,
 286         'sup2'     => 178,
 287         'sup3'     => 179,
 288         'supe'     => 8839,
 289         'szlig'    => 223,
 290         'Tau'      => 932,
 291         'tau'      => 964,
 292         'there4'   => 8756,
 293         'Theta'    => 920,
 294         'theta'    => 952,
 295         'thetasym' => 977,
 296         'thinsp'   => 8201,
 297         'THORN'    => 222,
 298         'thorn'    => 254,
 299         'tilde'    => 732,
 300         'times'    => 215,
 301         'trade'    => 8482,
 302         'Uacute'   => 218,
 303         'uacute'   => 250,
 304         'uarr'     => 8593,
 305         'uArr'     => 8657,
 306         'Ucirc'    => 219,
 307         'ucirc'    => 251,
 308         'Ugrave'   => 217,
 309         'ugrave'   => 249,
 310         'uml'      => 168,
 311         'upsih'    => 978,
 312         'Upsilon'  => 933,
 313         'upsilon'  => 965,
 314         'Uuml'     => 220,
 315         'uuml'     => 252,
 316         'weierp'   => 8472,
 317         'Xi'       => 926,
 318         'xi'       => 958,
 319         'Yacute'   => 221,
 320         'yacute'   => 253,
 321         'yen'      => 165,
 322         'Yuml'     => 376,
 323         'yuml'     => 255,
 324         'Zeta'     => 918,
 325         'zeta'     => 950,
 326         'zwj'      => 8205,
 327         'zwnj'     => 8204 );
 328
 329 /**
 330  * Character entity aliases accepted by MediaWiki
 331  */
 332 global $wgHtmlEntityAliases;
 333 $wgHtmlEntityAliases = array(
 334         'רלמ' => 'rlm',
 335         'رلم' => 'rlm',
 336 );
 337
 338
 339 /**
 340  * XHTML sanitizer for MediaWiki
 341  * @ingroup Parser
 342  */
 343 class Sanitizer {
 344         /**
 345          * Cleans up HTML, removes dangerous tags and attributes, and
 346          * removes HTML comments
 347          * @private
 348          * @param $text String
 349          * @param $processCallback Callback to do any variable or parameter replacements in HTML attribute values
 350          * @param $args Array for the processing callback
 351          * @param $extratags Array for any extra tags to include
 352          * @param $removetags Array for any tags (default or extra) to exclude
 353          * @return string
 354          */
 355         static function removeHTMLtags( $text, $processCallback = null, $args = array(), $extratags = array(), $removetags = array() ) {
 356                 global $wgUseTidy;
 357
 358                 static $htmlpairsStatic, $htmlsingle, $htmlsingleonly, $htmlnest, $tabletags,
 359                         $htmllist, $listtags, $htmlsingleallowed, $htmlelementsStatic, $staticInitialised;
 360
 361                 wfProfileIn( __METHOD__ );
 362
 363                 if ( !$staticInitialised ) {
 364
 365                         $htmlpairsStatic = array( # Tags that must be closed
 366                                 'b', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
 367                                 'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
 368                                 'strike', 'strong', 'tt', 'var', 'div', 'center',
 369                                 'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
 370                                 'ruby', 'rt' , 'rb' , 'rp', 'p', 'span', 'u', 'abbr'
 371                         );
 372                         $htmlsingle = array(
 373                                 'br', 'hr', 'li', 'dt', 'dd'
 374                         );
 375                         $htmlsingleonly = array( # Elements that cannot have close tags
 376                                 'br', 'hr'
 377                         );
 378                         $htmlnest = array( # Tags that can be nested--??
 379                                 'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
 380                                 'dl', 'font', 'big', 'small', 'sub', 'sup', 'span'
 381                         );
 382                         $tabletags = array( # Can only appear inside table, we will close them
 383                                 'td', 'th', 'tr',
 384                         );
 385                         $htmllist = array( # Tags used by list
 386                                 'ul','ol',
 387                         );
 388                         $listtags = array( # Tags that can appear in a list
 389                                 'li',
 390                         );
 391
 392                         $htmlsingleallowed = array_unique( array_merge( $htmlsingle, $tabletags ) );
 393                         $htmlelementsStatic = array_unique( array_merge( $htmlsingle, $htmlpairsStatic, $htmlnest ) );
 394
 395                         # Convert them all to hashtables for faster lookup
 396                         $vars = array( 'htmlpairsStatic', 'htmlsingle', 'htmlsingleonly', 'htmlnest', 'tabletags',
 397                                 'htmllist', 'listtags', 'htmlsingleallowed', 'htmlelementsStatic' );
 398                         foreach ( $vars as $var ) {
 399                                 $$var = array_flip( $$var );
 400                         }
 401                         $staticInitialised = true;
 402                 }
 403                 # Populate $htmlpairs and $htmlelements with the $extratags and $removetags arrays
 404                 $extratags = array_flip( $extratags );
 405                 $removetags = array_flip( $removetags );
 406                 $htmlpairs = array_merge( $extratags, $htmlpairsStatic );
 407                 $htmlelements = array_diff_key( array_merge( $extratags, $htmlelementsStatic ) , $removetags );
 408
 409                 # Remove HTML comments
 410                 $text = Sanitizer::removeHTMLcomments( $text );
 411                 $bits = explode( '<', $text );
 412                 $text = str_replace( '>', '&gt;', array_shift( $bits ) );
 413                 if ( !$wgUseTidy ) {
 414                         $tagstack = $tablestack = array();
 415                         foreach ( $bits as $x ) {
 416                                 $regs = array();
 417                                 # $slash: Does the current element start with a '/'?
 418                                 # $t: Current element name
 419                                 # $params: String between element name and >
 420                                 # $brace: Ending '>' or '/>'
 421                                 # $rest: Everything until the next element of $bits
 422                                 if( preg_match( '!^(/?)(\\w+)([^>]*?)(/{0,1}>)([^<]*)$!', $x, $regs ) ) {
 423                                         list( /* $qbar */, $slash, $t, $params, $brace, $rest ) = $regs;
 424                                 } else {
 425                                         $slash = $t = $params = $brace = $rest = null;
 426                                 }
 427
 428                                 $badtag = false;
 429                                 if ( isset( $htmlelements[$t = strtolower( $t )] ) ) {
 430                                         # Check our stack
 431                                         if ( $slash && isset( $htmlsingleonly[$t] ) ) {
 432                                                 $badtag = true;
 433                                         } elseif ( $slash ) {
 434                                                 # Closing a tag... is it the one we just opened?
 435                                                 $ot = @array_pop( $tagstack );
 436                                                 if ( $ot != $t ) {
 437                                                         if ( isset( $htmlsingleallowed[$ot] ) ) {
 438                                                                 # Pop all elements with an optional close tag
 439                                                                 # and see if we find a match below them
 440                                                                 $optstack = array();
 441                                                                 array_push( $optstack, $ot );
 442                                                                 $ot = @array_pop( $tagstack );
 443                                                                 while ( $ot != $t && isset( $htmlsingleallowed[$ot] ) ) {
 444                                                                         array_push( $optstack, $ot );
 445                                                                         $ot = @array_pop( $tagstack );
 446                                                                 }
 447                                                                 if ( $t != $ot ) {
 448                                                                         # No match. Push the optional elements back again
 449                                                                         $badtag = true;
 450                                                                         while ( $ot = @array_pop( $optstack ) ) {
 451                                                                                 array_push( $tagstack, $ot );
 452                                                                         }
 453                                                                 }
 454                                                         } else {
 455                                                                 @array_push( $tagstack, $ot );
 456                                                                 # <li> can be nested in <ul> or <ol>, skip those cases:
 457                                                                 if ( !isset( $htmllist[$ot] ) || !isset( $listtags[$t] ) ) {
 458                                                                         $badtag = true;
 459                                                                 }
 460                                                         }
 461                                                 } else {
 462                                                         if ( $t == 'table' ) {
 463                                                                 $tagstack = array_pop( $tablestack );
 464                                                         }
 465                                                 }
 466                                                 $newparams = '';
 467                                         } else {
 468                                                 # Keep track for later
 469                                                 if ( isset( $tabletags[$t] ) &&
 470                                                 !in_array( 'table', $tagstack ) ) {
 471                                                         $badtag = true;
 472                                                 } elseif ( in_array( $t, $tagstack ) &&
 473                                                 !isset( $htmlnest [$t ] ) ) {
 474                                                         $badtag = true;
 475                                                 # Is it a self closed htmlpair ? (bug 5487)
 476                                                 } elseif ( $brace == '/>' &&
 477                                                 isset( $htmlpairs[$t] ) ) {
 478                                                         $badtag = true;
 479                                                 } elseif ( isset( $htmlsingleonly[$t] ) ) {
 480                                                         # Hack to force empty tag for uncloseable elements
 481                                                         $brace = '/>';
 482                                                 } elseif ( isset( $htmlsingle[$t] ) ) {
 483                                                         # Hack to not close $htmlsingle tags
 484                                                         $brace = null;
 485                                                 } elseif ( isset( $tabletags[$t] )
 486                                                 && in_array( $t, $tagstack ) ) {
 487                                                         // New table tag but forgot to close the previous one
 488                                                         $text .= "</$t>";
 489                                                 } else {
 490                                                         if ( $t == 'table' ) {
 491                                                                 array_push( $tablestack, $tagstack );
 492                                                                 $tagstack = array();
 493                                                         }
 494                                                         array_push( $tagstack, $t );
 495                                                 }
 496
 497                                                 # Replace any variables or template parameters with
 498                                                 # plaintext results.
 499                                                 if( is_callable( $processCallback ) ) {
 500                                                         call_user_func_array( $processCallback, array( &$params, $args ) );
 501                                                 }
 502
 503                                                 # Strip non-approved attributes from the tag
 504                                                 $newparams = Sanitizer::fixTagAttributes( $params, $t );
 505                                         }
 506                                         if ( !$badtag ) {
 507                                                 $rest = str_replace( '>', '&gt;', $rest );
 508                                                 $close = ( $brace == '/>' && !$slash ) ? ' /' : '';
 509                                                 $text .= "<$slash$t$newparams$close>$rest";
 510                                                 continue;
 511                                         }
 512                                 }
 513                                 $text .= '&lt;' . str_replace( '>', '&gt;', $x);
 514                         }
 515                         # Close off any remaining tags
 516                         while ( is_array( $tagstack ) && ($t = array_pop( $tagstack )) ) {
 517                                 $text .= "</$t>\n";
 518                                 if ( $t == 'table' ) { $tagstack = array_pop( $tablestack ); }
 519                         }
 520                 } else {
 521                         # this might be possible using tidy itself
 522                         foreach ( $bits as $x ) {
 523                                 preg_match( '/^(\\/?)(\\w+)([^>]*?)(\\/{0,1}>)([^<]*)$/',
 524                                 $x, $regs );
 525                                 @list( /* $qbar */, $slash, $t, $params, $brace, $rest ) = $regs;
 526                                 if ( isset( $htmlelements[$t = strtolower( $t )] ) ) {
 527                                         if( is_callable( $processCallback ) ) {
 528                                                 call_user_func_array( $processCallback, array( &$params, $args ) );
 529                                         }
 530                                         $newparams = Sanitizer::fixTagAttributes( $params, $t );
 531                                         $rest = str_replace( '>', '&gt;', $rest );
 532                                         $text .= "<$slash$t$newparams$brace$rest";
 533                                 } else {
 534                                         $text .= '&lt;' . str_replace( '>', '&gt;', $x);
 535                                 }
 536                         }
 537                 }
 538                 wfProfileOut( __METHOD__ );
 539                 return $text;
 540         }
 541
 542         /**
 543          * Remove '<!--', '-->', and everything between.
 544          * To avoid leaving blank lines, when a comment is both preceded
 545          * and followed by a newline (ignoring spaces), trim leading and
 546          * trailing spaces and one of the newlines.
 547          *
 548          * @private
 549          * @param $text String
 550          * @return string
 551          */
 552         static function removeHTMLcomments( $text ) {
 553                 wfProfileIn( __METHOD__ );
 554                 while (($start = strpos($text, '<!--')) !== false) {
 555                         $end = strpos($text, '-->', $start + 4);
 556                         if ($end === false) {
 557                                 # Unterminated comment; bail out
 558                                 break;
 559                         }
 560
 561                         $end += 3;
 562
 563                         # Trim space and newline if the comment is both
 564                         # preceded and followed by a newline
 565                         $spaceStart = max($start - 1, 0);
 566                         $spaceLen = $end - $spaceStart;
 567                         while (substr($text, $spaceStart, 1) === ' ' && $spaceStart > 0) {
 568                                 $spaceStart--;
 569                                 $spaceLen++;
 570                         }
 571                         while (substr($text, $spaceStart + $spaceLen, 1) === ' ')
 572                                 $spaceLen++;
 573                         if (substr($text, $spaceStart, 1) === "\n" and substr($text, $spaceStart + $spaceLen, 1) === "\n") {
 574                                 # Remove the comment, leading and trailing
 575                                 # spaces, and leave only one newline.
 576                                 $text = substr_replace($text, "\n", $spaceStart, $spaceLen + 1);
 577                         }
 578                         else {
 579                                 # Remove just the comment.
 580                                 $text = substr_replace($text, '', $start, $end - $start);
 581                         }
 582                 }
 583                 wfProfileOut( __METHOD__ );
 584                 return $text;
 585         }
 586
 587         /**
 588          * Take an array of attribute names and values and normalize or discard
 589          * illegal values for the given element type.
 590          *
 591          * - Discards attributes not on a whitelist for the given element
 592          * - Unsafe style attributes are discarded
 593          * - Invalid id attributes are reencoded
 594          *
 595          * @param $attribs Array
 596          * @param $element String
 597          * @return Array
 598          *
 599          * @todo Check for legal values where the DTD limits things.
 600          * @todo Check for unique id attribute :P
 601          */
 602         static function validateTagAttributes( $attribs, $element ) {
 603                 return Sanitizer::validateAttributes( $attribs,
 604                         Sanitizer::attributeWhitelist( $element ) );
 605         }
 606
 607         /**
 608          * Take an array of attribute names and values and normalize or discard
 609          * illegal values for the given whitelist.
 610          *
 611          * - Discards attributes not on the given whitelist
 612          * - Unsafe style attributes are discarded
 613          * - Invalid id attributes are reencoded
 614          *
 615          * @param $attribs Array
 616          * @param $whitelist Array: list of allowed attribute names
 617          * @return Array
 618          *
 619          * @todo Check for legal values where the DTD limits things.
 620          * @todo Check for unique id attribute :P
 621          */
 622         static function validateAttributes( $attribs, $whitelist ) {
 623                 global $wgAllowRdfaAttributes, $wgAllowMicrodataAttributes;
 624
 625                 $whitelist = array_flip( $whitelist );
 626                 $hrefExp = '/^(' . wfUrlProtocols() . ')[^\s]+$/';
 627
 628                 $out = array();
 629                 foreach( $attribs as $attribute => $value ) {
 630                         #allow XML namespace declaration if RDFa is enabled
 631                         if ( $wgAllowRdfaAttributes && preg_match( MW_XMLNS_ATTRIBUTE_PATTRN, $attribute ) ) {
 632                                 if ( !preg_match( MW_EVIL_URI_PATTERN, $value ) ) {
 633                                         $out[$attribute] = $value;
 634                                 }
 635
 636                                 continue;
 637                         }
 638
 639                         if( !isset( $whitelist[$attribute] ) ) {
 640                                 continue;
 641                         }
 642
 643                         # Strip javascript "expression" from stylesheets.
 644                         # http://msdn.microsoft.com/workshop/author/dhtml/overview/recalc.asp
 645                         if( $attribute == 'style' ) {
 646                                 $value = Sanitizer::checkCss( $value );
 647                                 if( $value === false ) {
 648                                         # haxx0r
 649                                         continue;
 650                                 }
 651                         }
 652
 653                         if ( $attribute === 'id' ) {
 654                                 $value = Sanitizer::escapeId( $value, 'noninitial' );
 655                         }
 656
 657                         //RDFa and microdata properties allow URLs, URIs and/or CURIs. check them for sanity
 658                         if ( $attribute === 'rel' || $attribute === 'rev' ||
 659                                 $attribute === 'about' || $attribute === 'property' || $attribute === 'resource' || #RDFa
 660                                 $attribute === 'datatype' || $attribute === 'typeof' ||                             #RDFa
 661                                 $attribute === 'itemid' || $attribute === 'itemprop' || $attribute === 'itemref' || #HTML5 microdata
 662                                 $attribute === 'itemscope' || $attribute === 'itemtype' ) {                         #HTML5 microdata
 663
 664                                 //Paranoia. Allow "simple" values but suppress javascript
 665                                 if ( preg_match( MW_EVIL_URI_PATTERN, $value ) ) {
 666                                         continue;
 667                                 }
 668                         }
 669
 670                         # NOTE: even though elements using href/src are not allowed directly, supply
 671                         #       validation code that can be used by tag hook handlers, etc
 672                         if ( $attribute === 'href' || $attribute === 'src' ) {
 673                                 if ( !preg_match( $hrefExp, $value ) ) {
 674                                         continue; //drop any href or src attributes not using an allowed protocol.
 675                                                   //NOTE: this also drops all relative URLs
 676                                 }
 677                         }
 678
 679                         // If this attribute was previously set, override it.
 680                         // Output should only have one attribute of each name.
 681                         $out[$attribute] = $value;
 682                 }
 683
 684                 if ( $wgAllowMicrodataAttributes ) {
 685                         # There are some complicated validity constraints we need to
 686                         # enforce here.  First of all, we don't want to allow non-standard
 687                         # itemtypes.
 688                         $allowedTypes = array(
 689                                 'http://microformats.org/profile/hcard',
 690                                 'http://microformats.org/profile/hcalendar#vevent',
 691                                 'http://n.whatwg.org/work',
 692                         );
 693                         if ( isset( $out['itemtype'] ) && !in_array( $out['itemtype'],
 694                         $allowedTypes ) ) {
 695                                 # Kill everything
 696                                 unset( $out['itemscope'] );
 697                         }
 698                         # itemtype, itemid, itemref don't make sense without itemscope
 699                         if ( !array_key_exists( 'itemscope', $out ) ) {
 700                                 unset( $out['itemtype'] );
 701                                 unset( $out['itemid'] );
 702                                 unset( $out['itemref'] );
 703                         }
 704                         # TODO: Strip itemprop if we aren't descendants of an itemscope.
 705                 }
 706                 return $out;
 707         }
 708
 709         /**
 710          * Merge two sets of HTML attributes.  Conflicting items in the second set
 711          * will override those in the first, except for 'class' attributes which
 712          * will be combined (if they're both strings).
 713          *
 714          * @todo implement merging for other attributes such as style
 715          * @param $a Array
 716          * @param $b Array
 717          * @return array
 718          */
 719         static function mergeAttributes( $a, $b ) {
 720                 $out = array_merge( $a, $b );
 721                 if( isset( $a['class'] ) && isset( $b['class'] )
 722                 && is_string( $a['class'] ) && is_string( $b['class'] )
 723                 && $a['class'] !== $b['class'] ) {
 724                         $classes = preg_split( '/\s+/', "{$a['class']} {$b['class']}",
 725                                 -1, PREG_SPLIT_NO_EMPTY );
 726                         $out['class'] = implode( ' ', array_unique( $classes ) );
 727                 }
 728                 return $out;
 729         }
 730
 731         /**
 732          * Pick apart some CSS and check it for forbidden or unsafe structures.
 733          * Returns a sanitized string, or false if it was just too evil.
 734          *
 735          * Currently URL references, 'expression', 'tps' are forbidden.
 736          *
 737          * @param $value String
 738          * @return Mixed
 739          */
 740         static function checkCss( $value ) {
 741                 $value = Sanitizer::decodeCharReferences( $value );
 742
 743                 // Remove any comments; IE gets token splitting wrong
 744                 $value = StringUtils::delimiterReplace( '/*', '*/', ' ', $value );
 745
 746                 // Decode escape sequences and line continuation
 747                 // See the grammar in the CSS 2 spec, appendix D, Mozilla implements it accurately.
 748                 // IE 8 doesn't implement it at all, but there's no way to introduce url() into
 749                 // IE that doesn't hit Mozilla also.
 750                 static $decodeRegex;
 751                 if ( !$decodeRegex ) {
 752                         $space = '[\\x20\\t\\r\\n\\f]';
 753                         $nl = '(?:\\n|\\r\\n|\\r|\\f)';
 754                         $backslash = '\\\\';
 755                         $decodeRegex = "/ $backslash
 756                                 (?:
 757                                         ($nl) |  # 1. Line continuation
 758                                         ([0-9A-Fa-f]{1,6})$space? |  # 2. character number
 759                                         (.) # 3. backslash cancelling special meaning
 760                                 )/xu";
 761                 }
 762                 $decoded = preg_replace_callback( $decodeRegex,
 763                         array( __CLASS__, 'cssDecodeCallback' ), $value );
 764                 if ( preg_match( '!expression|https?://|url\s*\(!i', $decoded ) ) {
 765                         // Not allowed
 766                         return false;
 767                 } else {
 768                         // Allowed, return CSS with comments stripped
 769                         return $value;
 770                 }
 771         }
 772
 773         static function cssDecodeCallback( $matches ) {
 774                 if ( $matches[1] !== '' ) {
 775                         return '';
 776                 } elseif ( $matches[2] !== '' ) {
 777                         return codepointToUtf8( hexdec( $matches[2] ) );
 778                 } elseif ( $matches[3] !== '' ) {
 779                         return $matches[3];
 780                 } else {
 781                         throw new MWException( __METHOD__.': invalid match' );
 782                 }
 783         }
 784
 785         /**
 786          * Take a tag soup fragment listing an HTML element's attributes
 787          * and normalize it to well-formed XML, discarding unwanted attributes.
 788          * Output is safe for further wikitext processing, with escaping of
 789          * values that could trigger problems.
 790          *
 791          * - Normalizes attribute names to lowercase
 792          * - Discards attributes not on a whitelist for the given element
 793          * - Turns broken or invalid entities into plaintext
 794          * - Double-quotes all attribute values
 795          * - Attributes without values are given the name as attribute
 796          * - Double attributes are discarded
 797          * - Unsafe style attributes are discarded
 798          * - Prepends space if there are attributes.
 799          *
 800          * @param $text String
 801          * @param $element String
 802          * @return String
 803          */
 804         static function fixTagAttributes( $text, $element ) {
 805                 if( trim( $text ) == '' ) {
 806                         return '';
 807                 }
 808
 809                 $stripped = Sanitizer::validateTagAttributes(
 810                         Sanitizer::decodeTagAttributes( $text ), $element );
 811
 812                 $attribs = array();
 813                 foreach( $stripped as $attribute => $value ) {
 814                         $encAttribute = htmlspecialchars( $attribute );
 815                         $encValue = Sanitizer::safeEncodeAttribute( $value );
 816
 817                         $attribs[] = "$encAttribute=\"$encValue\"";
 818                 }
 819                 return count( $attribs ) ? ' ' . implode( ' ', $attribs ) : '';
 820         }
 821
 822         /**
 823          * Encode an attribute value for HTML output.
 824          * @param $text String
 825          * @return HTML-encoded text fragment
 826          */
 827         static function encodeAttribute( $text ) {
 828                 $encValue = htmlspecialchars( $text, ENT_QUOTES );
 829
 830                 // Whitespace is normalized during attribute decoding,
 831                 // so if we've been passed non-spaces we must encode them
 832                 // ahead of time or they won't be preserved.
 833                 $encValue = strtr( $encValue, array(
 834                         "\n" => '&#10;',
 835                         "\r" => '&#13;',
 836                         "\t" => '&#9;',
 837                 ) );
 838
 839                 return $encValue;
 840         }
 841
 842         /**
 843          * Encode an attribute value for HTML tags, with extra armoring
 844          * against further wiki processing.
 845          * @param $text String
 846          * @return HTML-encoded text fragment
 847          */
 848         static function safeEncodeAttribute( $text ) {
 849                 $encValue = Sanitizer::encodeAttribute( $text );
 850
 851                 # Templates and links may be expanded in later parsing,
 852                 # creating invalid or dangerous output. Suppress this.
 853                 $encValue = strtr( $encValue, array(
 854                         '<'    => '&lt;',   // This should never happen,
 855                         '>'    => '&gt;',   // we've received invalid input
 856                         '"'    => '&quot;', // which should have been escaped.
 857                         '{'    => '&#123;',
 858                         '['    => '&#91;',
 859                         "''"   => '&#39;&#39;',
 860                         'ISBN' => '&#73;SBN',
 861                         'RFC'  => '&#82;FC',
 862                         'PMID' => '&#80;MID',
 863                         '|'    => '&#124;',
 864                         '__'   => '&#95;_',
 865                 ) );
 866
 867                 # Stupid hack
 868                 $encValue = preg_replace_callback(
 869                         '/(' . wfUrlProtocols() . ')/',
 870                         array( 'Sanitizer', 'armorLinksCallback' ),
 871                         $encValue );
 872                 return $encValue;
 873         }
 874
 875         /**
 876          * Given a value, escape it so that it can be used in an id attribute and
 877          * return it.  This will use HTML5 validation if $wgExperimentalHtmlIds is
 878          * true, allowing anything but ASCII whitespace.  Otherwise it will use
 879          * HTML 4 rules, which means a narrow subset of ASCII, with bad characters
 880          * escaped with lots of dots.
 881          *
 882          * To ensure we don't have to bother escaping anything, we also strip ', ",
 883          * & even if $wgExperimentalIds is true.  TODO: Is this the best tactic?
 884          * We also strip # because it upsets IE6.
 885          *
 886          * @see http://www.w3.org/TR/html401/types.html#type-name Valid characters
 887          *                                                          in the id and
 888          *                                                          name attributes
 889          * @see http://www.w3.org/TR/html401/struct/links.html#h-12.2.3 Anchors with the id attribute
 890          * @see http://www.whatwg.org/specs/web-apps/current-work/multipage/elements.html#the-id-attribute
 891          *   HTML5 definition of id attribute
 892          *
 893          * @param $id String: id to escape
 894          * @param $options Mixed: string or array of strings (default is array()):
 895          *   'noninitial': This is a non-initial fragment of an id, not a full id,
 896          *       so don't pay attention if the first character isn't valid at the
 897          *       beginning of an id.  Only matters if $wgExperimentalHtmlIds is
 898          *       false.
 899          *   'legacy': Behave the way the old HTML 4-based ID escaping worked even
 900          *       if $wgExperimentalHtmlIds is used, so we can generate extra
 901          *       anchors and links won't break.
 902          * @return String
 903          */
 904         static function escapeId( $id, $options = array() ) {
 905                 global $wgHtml5, $wgExperimentalHtmlIds;
 906                 $options = (array)$options;
 907
 908                 if ( $wgHtml5 && $wgExperimentalHtmlIds && !in_array( 'legacy', $options ) ) {
 909                         $id = Sanitizer::decodeCharReferences( $id );
 910                         $id = preg_replace( '/[ \t\n\r\f_\'"&#]+/', '_', $id );
 911                         $id = trim( $id, '_' );
 912                         if ( $id === '' ) {
 913                                 # Must have been all whitespace to start with.
 914                                 return '_';
 915                         } else {
 916                                 return $id;
 917                         }
 918                 }
 919
 920                 # HTML4-style escaping
 921                 static $replace = array(
 922                         '%3A' => ':',
 923                         '%' => '.'
 924                 );
 925
 926                 $id = urlencode( Sanitizer::decodeCharReferences( strtr( $id, ' ', '_' ) ) );
 927                 $id = str_replace( array_keys( $replace ), array_values( $replace ), $id );
 928
 929                 if ( !preg_match( '/^[a-zA-Z]/', $id )
 930                 && !in_array( 'noninitial', $options ) )  {
 931                         // Initial character must be a letter!
 932                         $id = "x$id";
 933                 }
 934                 return $id;
 935         }
 936
 937         /**
 938          * Given a value, escape it so that it can be used as a CSS class and
 939          * return it.
 940          *
 941          * @todo For extra validity, input should be validated UTF-8.
 942          *
 943          * @see http://www.w3.org/TR/CSS21/syndata.html Valid characters/format
 944          *
 945          * @param $class String
 946          * @return String
 947          */
 948         static function escapeClass( $class ) {
 949                 // Convert ugly stuff to underscores and kill underscores in ugly places
 950                 return rtrim(preg_replace(
 951                         array('/(^[0-9\\-])|[\\x00-\\x20!"#$%&\'()*+,.\\/:;<=>?@[\\]^`{|}~]|\\xC2\\xA0/','/_+/'),
 952                         '_',
 953                         $class ), '_');
 954         }
 955
 956         /**
 957          * Given HTML input, escape with htmlspecialchars but un-escape entites.
 958          * This allows (generally harmless) entities like &nbsp; to survive.
 959          *
 960          * @param $html String to escape
 961          * @return String: escaped input
 962          */
 963         static function escapeHtmlAllowEntities( $html ) {
 964                 # It seems wise to escape ' as well as ", as a matter of course.  Can't
 965                 # hurt.
 966                 $html = htmlspecialchars( $html, ENT_QUOTES );
 967                 $html = str_replace( '&amp;', '&', $html );
 968                 $html = Sanitizer::normalizeCharReferences( $html );
 969                 return $html;
 970         }
 971
 972         /**
 973          * Regex replace callback for armoring links against further processing.
 974          * @param $matches Array
 975          * @return string
 976          */
 977         private static function armorLinksCallback( $matches ) {
 978                 return str_replace( ':', '&#58;', $matches[1] );
 979         }
 980
 981         /**
 982          * Return an associative array of attribute names and values from
 983          * a partial tag string. Attribute names are forces to lowercase,
 984          * character references are decoded to UTF-8 text.
 985          *
 986          * @param $text String
 987          * @return Array
 988          */
 989         public static function decodeTagAttributes( $text ) {
 990                 if( trim( $text ) == '' ) {
 991                         return array();
 992                 }
 993
 994                 $attribs = array();
 995                 $pairs = array();
 996                 if( !preg_match_all(
 997                         MW_ATTRIBS_REGEX,
 998                         $text,
 999                         $pairs,
1000                         PREG_SET_ORDER ) ) {
1001                         return $attribs;
1002                 }
1003
1004                 foreach( $pairs as $set ) {
1005                         $attribute = strtolower( $set[1] );
1006                         $value = Sanitizer::getTagAttributeCallback( $set );
1007
1008                         // Normalize whitespace
1009                         $value = preg_replace( '/[\t\r\n ]+/', ' ', $value );
1010                         $value = trim( $value );
1011
1012                         // Decode character references
1013                         $attribs[$attribute] = Sanitizer::decodeCharReferences( $value );
1014                 }
1015                 return $attribs;
1016         }
1017
1018         /**
1019          * Pick the appropriate attribute value from a match set from the
1020          * MW_ATTRIBS_REGEX matches.
1021          *
1022          * @param $set Array
1023          * @return String
1024          */
1025         private static function getTagAttributeCallback( $set ) {
1026                 if( isset( $set[6] ) ) {
1027                         # Illegal #XXXXXX color with no quotes.
1028                         return $set[6];
1029                 } elseif( isset( $set[5] ) ) {
1030                         # No quotes.
1031                         return $set[5];
1032                 } elseif( isset( $set[4] ) ) {
1033                         # Single-quoted
1034                         return $set[4];
1035                 } elseif( isset( $set[3] ) ) {
1036                         # Double-quoted
1037                         return $set[3];
1038                 } elseif( !isset( $set[2] ) ) {
1039                         # In XHTML, attributes must have a value.
1040                         # For 'reduced' form, return explicitly the attribute name here.
1041                         return $set[1];
1042                 } else {
1043                         throw new MWException( "Tag conditions not met. This should never happen and is a bug." );
1044                 }
1045         }
1046
1047         /**
1048          * Normalize whitespace and character references in an XML source-
1049          * encoded text for an attribute value.
1050          *
1051          * See http://www.w3.org/TR/REC-xml/#AVNormalize for background,
1052          * but note that we're not returning the value, but are returning
1053          * XML source fragments that will be slapped into output.
1054          *
1055          * @param $text String
1056          * @return String
1057          */
1058         private static function normalizeAttributeValue( $text ) {
1059                 return str_replace( '"', '&quot;',
1060                         self::normalizeWhitespace(
1061                                 Sanitizer::normalizeCharReferences( $text ) ) );
1062         }
1063
1064         private static function normalizeWhitespace( $text ) {
1065                 return preg_replace(
1066                         '/\r\n|[\x20\x0d\x0a\x09]/',
1067                         ' ',
1068                         $text );
1069         }
1070
1071         /**
1072          * Ensure that any entities and character references are legal
1073          * for XML and XHTML specifically. Any stray bits will be
1074          * &amp;-escaped to result in a valid text fragment.
1075          *
1076          * a. any named char refs must be known in XHTML
1077          * b. any numeric char refs must be legal chars, not invalid or forbidden
1078          * c. use &#x, not &#X
1079          * d. fix or reject non-valid attributes
1080          *
1081          * @param $text String
1082          * @return String
1083          * @private
1084          */
1085         static function normalizeCharReferences( $text ) {
1086                 return preg_replace_callback(
1087                         MW_CHAR_REFS_REGEX,
1088                         array( 'Sanitizer', 'normalizeCharReferencesCallback' ),
1089                         $text );
1090         }
1091         /**
1092          * @param $matches String
1093          * @return String
1094          */
1095         static function normalizeCharReferencesCallback( $matches ) {
1096                 $ret = null;
1097                 if( $matches[1] != '' ) {
1098                         $ret = Sanitizer::normalizeEntity( $matches[1] );
1099                 } elseif( $matches[2] != '' ) {
1100                         $ret = Sanitizer::decCharReference( $matches[2] );
1101                 } elseif( $matches[3] != ''  ) {
1102                         $ret = Sanitizer::hexCharReference( $matches[3] );
1103                 } elseif( $matches[4] != '' ) {
1104                         $ret = Sanitizer::hexCharReference( $matches[4] );
1105                 }
1106                 if( is_null( $ret ) ) {
1107                         return htmlspecialchars( $matches[0] );
1108                 } else {
1109                         return $ret;
1110                 }
1111         }
1112
1113         /**
1114          * If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD,
1115          * return the named entity reference as is. If the entity is a
1116          * MediaWiki-specific alias, returns the HTML equivalent. Otherwise,
1117          * returns HTML-escaped text of pseudo-entity source (eg &amp;foo;)
1118          *
1119          * @param $name String
1120          * @return String
1121          */
1122         static function normalizeEntity( $name ) {
1123                 global $wgHtmlEntities, $wgHtmlEntityAliases;
1124                 if ( isset( $wgHtmlEntityAliases[$name] ) ) {
1125                         return "&{$wgHtmlEntityAliases[$name]};";
1126                 } elseif( isset( $wgHtmlEntities[$name] ) ) {
1127                         return "&$name;";
1128                 } else {
1129                         return "&amp;$name;";
1130                 }
1131         }
1132
1133         static function decCharReference( $codepoint ) {
1134                 $point = intval( $codepoint );
1135                 if( Sanitizer::validateCodepoint( $point ) ) {
1136                         return sprintf( '&#%d;', $point );
1137                 } else {
1138                         return null;
1139                 }
1140         }
1141
1142         static function hexCharReference( $codepoint ) {
1143                 $point = hexdec( $codepoint );
1144                 if( Sanitizer::validateCodepoint( $point ) ) {
1145                         return sprintf( '&#x%x;', $point );
1146                 } else {
1147                         return null;
1148                 }
1149         }
1150
1151         /**
1152          * Returns true if a given Unicode codepoint is a valid character in XML.
1153          * @param $codepoint Integer
1154          * @return Boolean
1155          */
1156         private static function validateCodepoint( $codepoint ) {
1157                 return ($codepoint ==    0x09)
1158                         || ($codepoint ==    0x0a)
1159                         || ($codepoint ==    0x0d)
1160                         || ($codepoint >=    0x20 && $codepoint <=   0xd7ff)
1161                         || ($codepoint >=  0xe000 && $codepoint <=   0xfffd)
1162                         || ($codepoint >= 0x10000 && $codepoint <= 0x10ffff);
1163         }
1164
1165         /**
1166          * Decode any character references, numeric or named entities,
1167          * in the text and return a UTF-8 string.
1168          *
1169          * @param $text String
1170          * @return String
1171          */
1172         public static function decodeCharReferences( $text ) {
1173                 return preg_replace_callback(
1174                         MW_CHAR_REFS_REGEX,
1175                         array( 'Sanitizer', 'decodeCharReferencesCallback' ),
1176                         $text );
1177         }
1178
1179         /**
1180          * @param $matches String
1181          * @return String
1182          */
1183         static function decodeCharReferencesCallback( $matches ) {
1184                 if( $matches[1] != '' ) {
1185                         return Sanitizer::decodeEntity( $matches[1] );
1186                 } elseif( $matches[2] != '' ) {
1187                         return  Sanitizer::decodeChar( intval( $matches[2] ) );
1188                 } elseif( $matches[3] != ''  ) {
1189                         return  Sanitizer::decodeChar( hexdec( $matches[3] ) );
1190                 } elseif( $matches[4] != '' ) {
1191                         return  Sanitizer::decodeChar( hexdec( $matches[4] ) );
1192                 }
1193                 # Last case should be an ampersand by itself
1194                 return $matches[0];
1195         }
1196
1197         /**
1198          * Return UTF-8 string for a codepoint if that is a valid
1199          * character reference, otherwise U+FFFD REPLACEMENT CHARACTER.
1200          * @param $codepoint Integer
1201          * @return String
1202          * @private
1203          */
1204         static function decodeChar( $codepoint ) {
1205                 if( Sanitizer::validateCodepoint( $codepoint ) ) {
1206                         return codepointToUtf8( $codepoint );
1207                 } else {
1208                         return UTF8_REPLACEMENT;
1209                 }
1210         }
1211
1212         /**
1213          * If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD,
1214          * return the UTF-8 encoding of that character. Otherwise, returns
1215          * pseudo-entity source (eg &foo;)
1216          *
1217          * @param $name Strings
1218          * @return String
1219          */
1220         static function decodeEntity( $name ) {
1221                 global $wgHtmlEntities, $wgHtmlEntityAliases;
1222                 if ( isset( $wgHtmlEntityAliases[$name] ) ) {
1223                         $name = $wgHtmlEntityAliases[$name];
1224                 }
1225                 if( isset( $wgHtmlEntities[$name] ) ) {
1226                         return codepointToUtf8( $wgHtmlEntities[$name] );
1227                 } else {
1228                         return "&$name;";
1229                 }
1230         }
1231
1232         /**
1233          * Fetch the whitelist of acceptable attributes for a given element name.
1234          *
1235          * @param $element String
1236          * @return Array
1237          */
1238         static function attributeWhitelist( $element ) {
1239                 static $list;
1240                 if( !isset( $list ) ) {
1241                         $list = Sanitizer::setupAttributeWhitelist();
1242                 }
1243                 return isset( $list[$element] )
1244                         ? $list[$element]
1245                         : array();
1246         }
1247
1248         /**
1249          * Foreach array key (an allowed HTML element), return an array
1250          * of allowed attributes
1251          * @return Array
1252          */
1253         static function setupAttributeWhitelist() {
1254                 global $wgAllowRdfaAttributes, $wgHtml5, $wgAllowMicrodataAttributes;
1255
1256                 $common = array( 'id', 'class', 'lang', 'dir', 'title', 'style' );
1257
1258                 if ( $wgAllowRdfaAttributes ) {
1259                         #RDFa attributes as specified in section 9 of http://www.w3.org/TR/2008/REC-rdfa-syntax-20081014
1260                         $common = array_merge( $common, array(
1261                             'about', 'property', 'resource', 'datatype', 'typeof',
1262                         ) );
1263                 }
1264
1265                 if ( $wgHtml5 && $wgAllowMicrodataAttributes ) {
1266                         # add HTML5 microdata tags as specified by http://www.whatwg.org/specs/web-apps/current-work/multipage/microdata.html#the-microdata-model
1267                         $common = array_merge( $common, array(
1268                             'itemid', 'itemprop', 'itemref', 'itemscope', 'itemtype'
1269                         ) );
1270                 }
1271
1272                 $block = array_merge( $common, array( 'align' ) );
1273                 $tablealign = array( 'align', 'char', 'charoff', 'valign' );
1274                 $tablecell = array( 'abbr',
1275                                     'axis',
1276                                     'headers',
1277                                     'scope',
1278                                     'rowspan',
1279                                     'colspan',
1280                                     'nowrap', # deprecated
1281                                     'width',  # deprecated
1282                                     'height', # deprecated
1283                                     'bgcolor' # deprecated
1284                                     );
1285
1286                 # Numbers refer to sections in HTML 4.01 standard describing the element.
1287                 # See: http://www.w3.org/TR/html4/
1288                 $whitelist = array (
1289                         # 7.5.4
1290                         'div'        => $block,
1291                         'center'     => $common, # deprecated
1292                         'span'       => $block, # ??
1293
1294                         # 7.5.5
1295                         'h1'         => $block,
1296                         'h2'         => $block,
1297                         'h3'         => $block,
1298                         'h4'         => $block,
1299                         'h5'         => $block,
1300                         'h6'         => $block,
1301
1302                         # 7.5.6
1303                         # address
1304
1305                         # 8.2.4
1306                         # bdo
1307
1308                         # 9.2.1
1309                         'em'         => $common,
1310                         'strong'     => $common,
1311                         'cite'       => $common,
1312                         # dfn
1313                         'code'       => $common,
1314                         # samp
1315                         # kbd
1316                         'var'        => $common,
1317                         'abbr'       => $common,
1318                         # acronym
1319
1320                         # 9.2.2
1321                         'blockquote' => array_merge( $common, array( 'cite' ) ),
1322                         # q
1323
1324                         # 9.2.3
1325                         'sub'        => $common,
1326                         'sup'        => $common,
1327
1328                         # 9.3.1
1329                         'p'          => $block,
1330
1331                         # 9.3.2
1332                         'br'         => array( 'id', 'class', 'title', 'style', 'clear' ),
1333
1334                         # 9.3.4
1335                         'pre'        => array_merge( $common, array( 'width' ) ),
1336
1337                         # 9.4
1338                         'ins'        => array_merge( $common, array( 'cite', 'datetime' ) ),
1339                         'del'        => array_merge( $common, array( 'cite', 'datetime' ) ),
1340
1341                         # 10.2
1342                         'ul'         => array_merge( $common, array( 'type' ) ),
1343                         'ol'         => array_merge( $common, array( 'type', 'start' ) ),
1344                         'li'         => array_merge( $common, array( 'type', 'value' ) ),
1345
1346                         # 10.3
1347                         'dl'         => $common,
1348                         'dd'         => $common,
1349                         'dt'         => $common,
1350
1351                         # 11.2.1
1352                         'table'      => array_merge( $common,
1353                                                                 array( 'summary', 'width', 'border', 'frame',
1354                                                                                 'rules', 'cellspacing', 'cellpadding',
1355                                                                                 'align', 'bgcolor',
1356                                                                 ) ),
1357
1358                         # 11.2.2
1359                         'caption'    => array_merge( $common, array( 'align' ) ),
1360
1361                         # 11.2.3
1362                         'thead'      => array_merge( $common, $tablealign ),
1363                         'tfoot'      => array_merge( $common, $tablealign ),
1364                         'tbody'      => array_merge( $common, $tablealign ),
1365
1366                         # 11.2.4
1367                         'colgroup'   => array_merge( $common, array( 'span', 'width' ), $tablealign ),
1368                         'col'        => array_merge( $common, array( 'span', 'width' ), $tablealign ),
1369
1370                         # 11.2.5
1371                         'tr'         => array_merge( $common, array( 'bgcolor' ), $tablealign ),
1372
1373                         # 11.2.6
1374                         'td'         => array_merge( $common, $tablecell, $tablealign ),
1375                         'th'         => array_merge( $common, $tablecell, $tablealign ),
1376
1377                         # 12.2 # NOTE: <a> is not allowed directly, but the attrib whitelist is used from the Parser object
1378                         'a'          => array_merge( $common, array( 'href', 'rel', 'rev' ) ), # rel/rev esp. for RDFa
1379
1380                         # 13.2
1381                         # Not usually allowed, but may be used for extension-style hooks
1382                         # such as <math> when it is rasterized
1383                         'img'        => array_merge( $common, array( 'alt' ) ),
1384
1385                         # 15.2.1
1386                         'tt'         => $common,
1387                         'b'          => $common,
1388                         'i'          => $common,
1389                         'big'        => $common,
1390                         'small'      => $common,
1391                         'strike'     => $common,
1392                         's'          => $common,
1393                         'u'          => $common,
1394
1395                         # 15.2.2
1396                         'font'       => array_merge( $common, array( 'size', 'color', 'face' ) ),
1397                         # basefont
1398
1399                         # 15.3
1400                         'hr'         => array_merge( $common, array( 'noshade', 'size', 'width' ) ),
1401
1402                         # XHTML Ruby annotation text module, simple ruby only.
1403                         # http://www.w3c.org/TR/ruby/
1404                         'ruby'       => $common,
1405                         # rbc
1406                         # rtc
1407                         'rb'         => $common,
1408                         'rt'         => $common, #array_merge( $common, array( 'rbspan' ) ),
1409                         'rp'         => $common,
1410
1411                         # MathML root element, where used for extensions
1412                         # 'title' may not be 100% valid here; it's XHTML
1413                         # http://www.w3.org/TR/REC-MathML/
1414                         'math'       => array( 'class', 'style', 'id', 'title' ),
1415                         );
1416                 return $whitelist;
1417         }
1418
1419         /**
1420          * Take a fragment of (potentially invalid) HTML and return
1421          * a version with any tags removed, encoded as plain text.
1422          *
1423          * Warning: this return value must be further escaped for literal
1424          * inclusion in HTML output as of 1.10!
1425          *
1426          * @param $text String: HTML fragment
1427          * @return String
1428          */
1429         static function stripAllTags( $text ) {
1430                 # Actual <tags>
1431                 $text = StringUtils::delimiterReplace( '<', '>', '', $text );
1432
1433                 # Normalize &entities and whitespace
1434                 $text = self::decodeCharReferences( $text );
1435                 $text = self::normalizeWhitespace( $text );
1436
1437                 return $text;
1438         }
1439
1440         /**
1441          * Hack up a private DOCTYPE with HTML's standard entity declarations.
1442          * PHP 4 seemed to know these if you gave it an HTML doctype, but
1443          * PHP 5.1 doesn't.
1444          *
1445          * Use for passing XHTML fragments to PHP's XML parsing functions
1446          *
1447          * @return String
1448          */
1449         static function hackDocType() {
1450                 global $wgHtmlEntities;
1451                 $out = "<!DOCTYPE html [\n";
1452                 foreach( $wgHtmlEntities as $entity => $codepoint ) {
1453                         $out .= "<!ENTITY $entity \"&#$codepoint;\">";
1454                 }
1455                 $out .= "]>\n";
1456                 return $out;
1457         }
1458
1459         static function cleanUrl( $url ) {
1460                 # Normalize any HTML entities in input. They will be
1461                 # re-escaped by makeExternalLink().
1462                 $url = Sanitizer::decodeCharReferences( $url );
1463
1464                 # Escape any control characters introduced by the above step
1465                 $url = preg_replace( '/[\][<>"\\x00-\\x20\\x7F]/e', "urlencode('\\0')", $url );
1466
1467                 # Validate hostname portion
1468                 $matches = array();
1469                 if( preg_match( '!^([^:]+:)(//[^/]+)?(.*)$!iD', $url, $matches ) ) {
1470                         list( /* $whole */, $protocol, $host, $rest ) = $matches;
1471
1472                         // Characters that will be ignored in IDNs.
1473                         // http://tools.ietf.org/html/3454#section-3.1
1474                         // Strip them before further processing so blacklists and such work.
1475                         $strip = "/
1476                                 \\s|          # general whitespace
1477                                 \xc2\xad|     # 00ad SOFT HYPHEN
1478                                 \xe1\xa0\x86| # 1806 MONGOLIAN TODO SOFT HYPHEN
1479                                 \xe2\x80\x8b| # 200b ZERO WIDTH SPACE
1480                                 \xe2\x81\xa0| # 2060 WORD JOINER
1481                                 \xef\xbb\xbf| # feff ZERO WIDTH NO-BREAK SPACE
1482                                 \xcd\x8f|     # 034f COMBINING GRAPHEME JOINER
1483                                 \xe1\xa0\x8b| # 180b MONGOLIAN FREE VARIATION SELECTOR ONE
1484                                 \xe1\xa0\x8c| # 180c MONGOLIAN FREE VARIATION SELECTOR TWO
1485                                 \xe1\xa0\x8d| # 180d MONGOLIAN FREE VARIATION SELECTOR THREE
1486                                 \xe2\x80\x8c| # 200c ZERO WIDTH NON-JOINER
1487                                 \xe2\x80\x8d| # 200d ZERO WIDTH JOINER
1488                                 [\xef\xb8\x80-\xef\xb8\x8f] # fe00-fe00f VARIATION SELECTOR-1-16
1489                                 /xuD";
1490
1491                         $host = preg_replace( $strip, '', $host );
1492
1493                         // @todo Fixme: validate hostnames here
1494
1495                         return $protocol . $host . $rest;
1496                 } else {
1497                         return $url;
1498                 }
1499         }
1500
1501 }