includes/Sanitizer.php

   1 <?php
   2 /**
   3  * XHTML sanitizer for MediaWiki
   4  *
   5  * Copyright (C) 2002-2005 Brion Vibber <brion@pobox.com> et al
   6  * http://www.mediawiki.org/
   7  *
   8  * This program is free software; you can redistribute it and/or modify
   9  * it under the terms of the GNU General Public License as published by
  10  * the Free Software Foundation; either version 2 of the License, or
  11  * (at your option) any later version.
  12  *
  13  * This program is distributed in the hope that it will be useful,
  14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  16  * GNU General Public License for more details.
  17  *
  18  * You should have received a copy of the GNU General Public License along
  19  * with this program; if not, write to the Free Software Foundation, Inc.,
  20  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  21  * http://www.gnu.org/copyleft/gpl.html
  22  *
  23  * @addtogroup Parser
  24  */
  25
  26 /**
  27  * Regular expression to match various types of character references in
  28  * Sanitizer::normalizeCharReferences and Sanitizer::decodeCharReferences
  29  */
  30 define( 'MW_CHAR_REFS_REGEX',
  31         '/&([A-Za-z0-9]+);
  32          |&\#([0-9]+);
  33          |&\#x([0-9A-Za-z]+);
  34          |&\#X([0-9A-Za-z]+);
  35          |(&)/x' );
  36
  37 /**
  38  * Regular expression to match HTML/XML attribute pairs within a tag.
  39  * Allows some... latitude.
  40  * Used in Sanitizer::fixTagAttributes and Sanitizer::decodeTagAttributes
      *
      * The pattern is assembled from two helper fragments: $attrib (one
      * character of an attribute name: ASCII letters and digits only) and
      * $space (one whitespace character: tab, LF, CR or space). It is compiled
      * with /s and /x, so the layout and the "#" remarks inside the string are
      * regex-level comments, not part of what is matched.
      *
      * An attribute must be preceded by start-of-string or whitespace and be
      * followed by whitespace or end-of-string. Capture groups:
      *   1 - attribute name
      *   2 - the optional "= value" part, including the equals sign
      *   3 - value written in double quotes (without the quotes)
      *   4 - value written in single quotes (without the quotes)
      *   5 - unquoted value
      *   6 - unquoted "#" followed by hex digits (colour values)
      *
      * NOTE(review): the character class of group 5 already contains "#" and
      * all hex digits and is tried first, so it looks like group 6 can never
      * capture anything - confirm before relying on it.
  41  */
  42 $attrib = '[A-Za-z0-9]';
  43 $space = '[\x09\x0a\x0d\x20]';
  44 define( 'MW_ATTRIBS_REGEX',
  45         "/(?:^|$space)($attrib+)
  46           ($space*=$space*
  47                 (?:
  48                  # The attribute value: quoted or alone
  49                   \"([^<\"]*)\"
  50                  | '([^<']*)'
  51                  |  ([a-zA-Z0-9!#$%&()*,\\-.\\/:;<>?@[\\]^_`{|}~]+)
  52                  |  (\#[0-9a-fA-F]+) # Technically wrong, but lots of
  53                                                          # colors are specified like this.
  54                                                          # We'll be normalizing it.
  55                 )
  56            )?(?=$space|\$)/sx" );
  57
  58 /**
  59  * Map of all named character entities defined in HTML 4.01: entity name
  60  * (case-sensitive) => decimal code point. http://www.w3.org/TR/html4/sgml/entities.html
  61  * @private
  62  */
  63 global $wgHtmlEntities;
  64 $wgHtmlEntities = array(
  65         'Aacute'   => 193,
  66         'aacute'   => 225,
  67         'Acirc'    => 194,
  68         'acirc'    => 226,
  69         'acute'    => 180,
  70         'AElig'    => 198,
  71         'aelig'    => 230,
  72         'Agrave'   => 192,
  73         'agrave'   => 224,
  74         'alefsym'  => 8501,
  75         'Alpha'    => 913,
  76         'alpha'    => 945,
  77         'amp'      => 38,
  78         'and'      => 8743,
  79         'ang'      => 8736,
  80         'Aring'    => 197,
  81         'aring'    => 229,
  82         'asymp'    => 8776,
  83         'Atilde'   => 195,
  84         'atilde'   => 227,
  85         'Auml'     => 196,
  86         'auml'     => 228,
  87         'bdquo'    => 8222,
  88         'Beta'     => 914,
  89         'beta'     => 946,
  90         'brvbar'   => 166,
  91         'bull'     => 8226,
  92         'cap'      => 8745,
  93         'Ccedil'   => 199,
  94         'ccedil'   => 231,
  95         'cedil'    => 184,
  96         'cent'     => 162,
  97         'Chi'      => 935,
  98         'chi'      => 967,
  99         'circ'     => 710,
 100         'clubs'    => 9827,
 101         'cong'     => 8773,
 102         'copy'     => 169,
 103         'crarr'    => 8629,
 104         'cup'      => 8746,
 105         'curren'   => 164,
 106         'dagger'   => 8224,
 107         'Dagger'   => 8225,
 108         'darr'     => 8595,
 109         'dArr'     => 8659,
 110         'deg'      => 176,
 111         'Delta'    => 916,
 112         'delta'    => 948,
 113         'diams'    => 9830,
 114         'divide'   => 247,
 115         'Eacute'   => 201,
 116         'eacute'   => 233,
 117         'Ecirc'    => 202,
 118         'ecirc'    => 234,
 119         'Egrave'   => 200,
 120         'egrave'   => 232,
 121         'empty'    => 8709,
 122         'emsp'     => 8195,
 123         'ensp'     => 8194,
 124         'Epsilon'  => 917,
 125         'epsilon'  => 949,
 126         'equiv'    => 8801,
 127         'Eta'      => 919,
 128         'eta'      => 951,
 129         'ETH'      => 208,
 130         'eth'      => 240,
 131         'Euml'     => 203,
 132         'euml'     => 235,
 133         'euro'     => 8364,
 134         'exist'    => 8707,
 135         'fnof'     => 402,
 136         'forall'   => 8704,
 137         'frac12'   => 189,
 138         'frac14'   => 188,
 139         'frac34'   => 190,
 140         'frasl'    => 8260,
 141         'Gamma'    => 915,
 142         'gamma'    => 947,
 143         'ge'       => 8805,
 144         'gt'       => 62,
 145         'harr'     => 8596,
 146         'hArr'     => 8660,
 147         'hearts'   => 9829,
 148         'hellip'   => 8230,
 149         'Iacute'   => 205,
 150         'iacute'   => 237,
 151         'Icirc'    => 206,
 152         'icirc'    => 238,
 153         'iexcl'    => 161,
 154         'Igrave'   => 204,
 155         'igrave'   => 236,
 156         'image'    => 8465,
 157         'infin'    => 8734,
 158         'int'      => 8747,
 159         'Iota'     => 921,
 160         'iota'     => 953,
 161         'iquest'   => 191,
 162         'isin'     => 8712,
 163         'Iuml'     => 207,
 164         'iuml'     => 239,
 165         'Kappa'    => 922,
 166         'kappa'    => 954,
 167         'Lambda'   => 923,
 168         'lambda'   => 955,
 169         'lang'     => 9001,
 170         'laquo'    => 171,
 171         'larr'     => 8592,
 172         'lArr'     => 8656,
 173         'lceil'    => 8968,
 174         'ldquo'    => 8220,
 175         'le'       => 8804,
 176         'lfloor'   => 8970,
 177         'lowast'   => 8727,
 178         'loz'      => 9674,
 179         'lrm'      => 8206,
 180         'lsaquo'   => 8249,
 181         'lsquo'    => 8216,
 182         'lt'       => 60,
 183         'macr'     => 175,
 184         'mdash'    => 8212,
 185         'micro'    => 181,
 186         'middot'   => 183,
 187         'minus'    => 8722,
 188         'Mu'       => 924,
 189         'mu'       => 956,
 190         'nabla'    => 8711,
 191         'nbsp'     => 160,
 192         'ndash'    => 8211,
 193         'ne'       => 8800,
 194         'ni'       => 8715,
 195         'not'      => 172,
 196         'notin'    => 8713,
 197         'nsub'     => 8836,
 198         'Ntilde'   => 209,
 199         'ntilde'   => 241,
 200         'Nu'       => 925,
 201         'nu'       => 957,
 202         'Oacute'   => 211,
 203         'oacute'   => 243,
 204         'Ocirc'    => 212,
 205         'ocirc'    => 244,
 206         'OElig'    => 338,
 207         'oelig'    => 339,
 208         'Ograve'   => 210,
 209         'ograve'   => 242,
 210         'oline'    => 8254,
 211         'Omega'    => 937,
 212         'omega'    => 969,
 213         'Omicron'  => 927,
 214         'omicron'  => 959,
 215         'oplus'    => 8853,
 216         'or'       => 8744,
 217         'ordf'     => 170,
 218         'ordm'     => 186,
 219         'Oslash'   => 216,
 220         'oslash'   => 248,
 221         'Otilde'   => 213,
 222         'otilde'   => 245,
 223         'otimes'   => 8855,
 224         'Ouml'     => 214,
 225         'ouml'     => 246,
 226         'para'     => 182,
 227         'part'     => 8706,
 228         'permil'   => 8240,
 229         'perp'     => 8869,
 230         'Phi'      => 934,
 231         'phi'      => 966,
 232         'Pi'       => 928,
 233         'pi'       => 960,
 234         'piv'      => 982,
 235         'plusmn'   => 177,
 236         'pound'    => 163,
 237         'prime'    => 8242,
 238         'Prime'    => 8243,
 239         'prod'     => 8719,
 240         'prop'     => 8733,
 241         'Psi'      => 936,
 242         'psi'      => 968,
 243         'quot'     => 34,
 244         'radic'    => 8730,
 245         'rang'     => 9002,
 246         'raquo'    => 187,
 247         'rarr'     => 8594,
 248         'rArr'     => 8658,
 249         'rceil'    => 8969,
 250         'rdquo'    => 8221,
 251         'real'     => 8476,
 252         'reg'      => 174,
 253         'rfloor'   => 8971,
 254         'Rho'      => 929,
 255         'rho'      => 961,
 256         'rlm'      => 8207,
 257         'rsaquo'   => 8250,
 258         'rsquo'    => 8217,
 259         'sbquo'    => 8218,
 260         'Scaron'   => 352,
 261         'scaron'   => 353,
 262         'sdot'     => 8901,
 263         'sect'     => 167,
 264         'shy'      => 173,
 265         'Sigma'    => 931,
 266         'sigma'    => 963,
 267         'sigmaf'   => 962,
 268         'sim'      => 8764,
 269         'spades'   => 9824,
 270         'sub'      => 8834,
 271         'sube'     => 8838,
 272         'sum'      => 8721,
 273         'sup'      => 8835,
 274         'sup1'     => 185,
 275         'sup2'     => 178,
 276         'sup3'     => 179,
 277         'supe'     => 8839,
 278         'szlig'    => 223,
 279         'Tau'      => 932,
 280         'tau'      => 964,
 281         'there4'   => 8756,
 282         'Theta'    => 920,
 283         'theta'    => 952,
 284         'thetasym' => 977,
 285         'thinsp'   => 8201,
 286         'THORN'    => 222,
 287         'thorn'    => 254,
 288         'tilde'    => 732,
 289         'times'    => 215,
 290         'trade'    => 8482,
 291         'Uacute'   => 218,
 292         'uacute'   => 250,
 293         'uarr'     => 8593,
 294         'uArr'     => 8657,
 295         'Ucirc'    => 219,
 296         'ucirc'    => 251,
 297         'Ugrave'   => 217,
 298         'ugrave'   => 249,
 299         'uml'      => 168,
 300         'upsih'    => 978,
 301         'Upsilon'  => 933,
 302         'upsilon'  => 965,
 303         'Uuml'     => 220,
 304         'uuml'     => 252,
 305         'weierp'   => 8472,
 306         'Xi'       => 926,
 307         'xi'       => 958,
 308         'Yacute'   => 221,
 309         'yacute'   => 253,
 310         'yen'      => 165,
 311         'Yuml'     => 376,
 312         'yuml'     => 255,
 313         'Zeta'     => 918,
 314         'zeta'     => 950,
 315         'zwj'      => 8205,
 316         'zwnj'     => 8204 );
 317
 318 class Sanitizer {
 319         /**
 320          * Cleans up HTML, removes dangerous tags and attributes, and
 321          * removes HTML comments
 322          * @private
 323          * @param string $text
 324          * @param callback $processCallback to do any variable or parameter replacements in HTML attribute values
 325          * @param array $args for the processing callback
 326          * @return string
 327          */
 328         static function removeHTMLtags( $text, $processCallback = null, $args = array() ) {
 329                 global $wgUseTidy, $wgUserHtml;
 330
 331                 static $htmlpairs, $htmlsingle, $htmlsingleonly, $htmlnest, $tabletags,
 332                         $htmllist, $listtags, $htmlsingleallowed, $htmlelements, $staticInitialised;
 333
 334                 wfProfileIn( __METHOD__ );
 335
 336                 if ( !$staticInitialised ) {
 337                         if( $wgUserHtml ) {
 338                                 $htmlpairs = array( # Tags that must be closed
 339                                         'b', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
 340                                         'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
 341                                         'strike', 'strong', 'tt', 'var', 'div', 'center',
 342                                         'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
 343                                         'ruby', 'rt' , 'rb' , 'rp', 'p', 'span', 'u'
 344                                 );
 345                                 $htmlsingle = array(
 346                                         'br', 'hr', 'li', 'dt', 'dd'
 347                                 );
 348                                 $htmlsingleonly = array( # Elements that cannot have close tags
 349                                         'br', 'hr'
 350                                 );
 351                                 $htmlnest = array( # Tags that can be nested--??
 352                                         'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
 353                                         'dl', 'font', 'big', 'small', 'sub', 'sup', 'span'
 354                                 );
 355                                 $tabletags = array( # Can only appear inside table, we will close them
 356                                         'td', 'th', 'tr',
 357                                 );
 358                                 $htmllist = array( # Tags used by list
 359                                         'ul','ol',
 360                                 );
 361                                 $listtags = array( # Tags that can appear in a list
 362                                         'li',
 363                                 );
 364
 365                         } else {
 366                                 $htmlpairs = array();
 367                                 $htmlsingle = array();
 368                                 $htmlnest = array();
 369                                 $tabletags = array();
 370                         }
 371
 372                         $htmlsingleallowed = array_merge( $htmlsingle, $tabletags );
 373                         $htmlelements = array_merge( $htmlsingle, $htmlpairs, $htmlnest );
 374
 375                         # Convert them all to hashtables for faster lookup
 376                         $vars = array( 'htmlpairs', 'htmlsingle', 'htmlsingleonly', 'htmlnest', 'tabletags',
 377                                 'htmllist', 'listtags', 'htmlsingleallowed', 'htmlelements' );
 378                         foreach ( $vars as $var ) {
 379                                 $$var = array_flip( $$var );
 380                         }
 381                         $staticInitialised = true;
 382                 }
 383
 384                 # Remove HTML comments
 385                 $text = Sanitizer::removeHTMLcomments( $text );
 386                 $bits = explode( '<', $text );
 387                 $text = str_replace( '>', '&gt;', array_shift( $bits ) );
 388                 if(!$wgUseTidy) {
 389                         $tagstack = $tablestack = array();
 390                         foreach ( $bits as $x ) {
 391                                 $regs = array();
 392                                 if( preg_match( '!^(/?)(\\w+)([^>]*?)(/{0,1}>)([^<]*)$!', $x, $regs ) ) {
 393                                         list( /* $qbar */, $slash, $t, $params, $brace, $rest ) = $regs;
 394                                 } else {
 395                                         $slash = $t = $params = $brace = $rest = null;
 396                                 }
 397
 398                                 $badtag = 0 ;
 399                                 if ( isset( $htmlelements[$t = strtolower( $t )] ) ) {
 400                                         # Check our stack
 401                                         if ( $slash ) {
 402                                                 # Closing a tag...
 403                                                 if( isset( $htmlsingleonly[$t] ) ) {
 404                                                         $badtag = 1;
 405                                                 } elseif ( ( $ot = @array_pop( $tagstack ) ) != $t ) {
 406                                                         if ( isset( $htmlsingleallowed[$ot] ) ) {
 407                                                                 # Pop all elements with an optional close tag
 408                                                                 # and see if we find a match below them
 409                                                                 $optstack = array();
 410                                                                 array_push ($optstack, $ot);
 411                                                                 while ( ( ( $ot = @array_pop( $tagstack ) ) != $t ) &&
 412                                                                                 isset( $htmlsingleallowed[$ot] ) )
 413                                                                 {
 414                                                                         array_push ($optstack, $ot);
 415                                                                 }
 416                                                                 if ( $t != $ot ) {
 417                                                                         # No match. Push the optinal elements back again
 418                                                                         $badtag = 1;
 419                                                                         while ( $ot = @array_pop( $optstack ) ) {
 420                                                                                 array_push( $tagstack, $ot );
 421                                                                         }
 422                                                                 }
 423                                                         } else {
 424                                                                 @array_push( $tagstack, $ot );
 425                                                                 # <li> can be nested in <ul> or <ol>, skip those cases:
 426                                                                 if(!(isset( $htmllist[$ot] ) && isset( $listtags[$t] ) )) {
 427                                                                         $badtag = 1;
 428                                                                 }
 429                                                         }
 430                                                 } else {
 431                                                         if ( $t == 'table' ) {
 432                                                                 $tagstack = array_pop( $tablestack );
 433                                                         }
 434                                                 }
 435                                                 $newparams = '';
 436                                         } else {
 437                                                 # Keep track for later
 438                                                 if ( isset( $tabletags[$t] ) &&
 439                                                 ! in_array( 'table', $tagstack ) ) {
 440                                                         $badtag = 1;
 441                                                 } else if ( in_array( $t, $tagstack ) &&
 442                                                 ! isset( $htmlnest [$t ] ) ) {
 443                                                         $badtag = 1 ;
 444                                                 # Is it a self closed htmlpair ? (bug 5487)
 445                                                 } else if( $brace == '/>' &&
 446                                                 isset( $htmlpairs[$t] ) ) {
 447                                                         $badtag = 1;
 448                                                 } elseif( isset( $htmlsingleonly[$t] ) ) {
 449                                                         # Hack to force empty tag for uncloseable elements
 450                                                         $brace = '/>';
 451                                                 } else if( isset( $htmlsingle[$t] ) ) {
 452                                                         # Hack to not close $htmlsingle tags
 453                                                         $brace = NULL;
 454                                                 } else if( isset( $tabletags[$t] )
 455                                                 &&  in_array($t ,$tagstack) ) {
 456                                                         // New table tag but forgot to close the previous one
 457                                                         $text .= "</$t>";
 458                                                 } else {
 459                                                         if ( $t == 'table' ) {
 460                                                                 array_push( $tablestack, $tagstack );
 461                                                                 $tagstack = array();
 462                                                         }
 463                                                         array_push( $tagstack, $t );
 464                                                 }
 465
 466                                                 # Replace any variables or template parameters with
 467                                                 # plaintext results.
 468                                                 if( is_callable( $processCallback ) ) {
 469                                                         call_user_func_array( $processCallback, array( &$params, $args ) );
 470                                                 }
 471
 472                                                 # Strip non-approved attributes from the tag
 473                                                 $newparams = Sanitizer::fixTagAttributes( $params, $t );
 474                                         }
 475                                         if ( ! $badtag ) {
 476                                                 $rest = str_replace( '>', '&gt;', $rest );
 477                                                 $close = ( $brace == '/>' && !$slash ) ? ' /' : '';
 478                                                 $text .= "<$slash$t$newparams$close>$rest";
 479                                                 continue;
 480                                         }
 481                                 }
 482                                 $text .= '&lt;' . str_replace( '>', '&gt;', $x);
 483                         }
 484                         # Close off any remaining tags
 485                         while ( is_array( $tagstack ) && ($t = array_pop( $tagstack )) ) {
 486                                 $text .= "</$t>\n";
 487                                 if ( $t == 'table' ) { $tagstack = array_pop( $tablestack ); }
 488                         }
 489                 } else {
 490                         # this might be possible using tidy itself
 491                         foreach ( $bits as $x ) {
 492                                 preg_match( '/^(\\/?)(\\w+)([^>]*?)(\\/{0,1}>)([^<]*)$/',
 493                                 $x, $regs );
 494                                 @list( /* $qbar */, $slash, $t, $params, $brace, $rest ) = $regs;
 495                                 if ( isset( $htmlelements[$t = strtolower( $t )] ) ) {
 496                                         if( is_callable( $processCallback ) ) {
 497                                                 call_user_func_array( $processCallback, array( &$params, $args ) );
 498                                         }
 499                                         $newparams = Sanitizer::fixTagAttributes( $params, $t );
 500                                         $rest = str_replace( '>', '&gt;', $rest );
 501                                         $text .= "<$slash$t$newparams$brace$rest";
 502                                 } else {
 503                                         $text .= '&lt;' . str_replace( '>', '&gt;', $x);
 504                                 }
 505                         }
 506                 }
 507                 wfProfileOut( __METHOD__ );
 508                 return $text;
 509         }
 510
 511         /**
 512          * Remove '<!--', '-->', and everything between.
 513          * To avoid leaving blank lines, when a comment is both preceded
 514          * and followed by a newline (ignoring spaces), trim leading and
 515          * trailing spaces and one of the newlines.
 516          *
 517          * @private
 518          * @param string $text
 519          * @return string
 520          */
 521         static function removeHTMLcomments( $text ) {
 522                 wfProfileIn( __METHOD__ );
 523                 while (($start = strpos($text, '<!--')) !== false) {
 524                         $end = strpos($text, '-->', $start + 4);
 525                         if ($end === false) {
 526                                 # Unterminated comment; bail out
 527                                 break;
 528                         }
 529
 530                         $end += 3;
 531
 532                         # Trim space and newline if the comment is both
 533                         # preceded and followed by a newline
 534                         $spaceStart = max($start - 1, 0);
 535                         $spaceLen = $end - $spaceStart;
 536                         while (substr($text, $spaceStart, 1) === ' ' && $spaceStart > 0) {
 537                                 $spaceStart--;
 538                                 $spaceLen++;
 539                         }
 540                         while (substr($text, $spaceStart + $spaceLen, 1) === ' ')
 541                                 $spaceLen++;
 542                         if (substr($text, $spaceStart, 1) === "\n" and substr($text, $spaceStart + $spaceLen, 1) === "\n") {
 543                                 # Remove the comment, leading and trailing
 544                                 # spaces, and leave only one newline.
 545                                 $text = substr_replace($text, "\n", $spaceStart, $spaceLen + 1);
 546                         }
 547                         else {
 548                                 # Remove just the comment.
 549                                 $text = substr_replace($text, '', $start, $end - $start);
 550                         }
 551                 }
 552                 wfProfileOut( __METHOD__ );
 553                 return $text;
 554         }
 555
 556         /**
 557          * Take an array of attribute names and values and normalize or discard
 558          * illegal values for the given element type.
 559          *
 560          * - Discards attributes not on a whitelist for the given element
 561          * - Unsafe style attributes are discarded
 562          *
 563          * @param array $attribs
 564          * @param string $element
 565          * @return array
 566          *
 567          * @todo Check for legal values where the DTD limits things.
 568          * @todo Check for unique id attribute :P
 569          */
 570         static function validateTagAttributes( $attribs, $element ) {
 571                 $whitelist = array_flip( Sanitizer::attributeWhitelist( $element ) );
 572                 $out = array();
 573                 foreach( $attribs as $attribute => $value ) {
 574                         if( !isset( $whitelist[$attribute] ) ) {
 575                                 continue;
 576                         }
 577                         # Strip javascript "expression" from stylesheets.
 578                         # http://msdn.microsoft.com/workshop/author/dhtml/overview/recalc.asp
 579                         if( $attribute == 'style' ) {
 580                                 $value = Sanitizer::checkCss( $value );
 581                                 if( $value === false ) {
 582                                         # haxx0r
 583                                         continue;
 584                                 }
 585                         }
 586
 587                         if ( $attribute === 'id' )
 588                                 $value = Sanitizer::escapeId( $value );
 589
 590                         // If this attribute was previously set, override it.
 591                         // Output should only have one attribute of each name.
 592                         $out[$attribute] = $value;
 593                 }
 594                 return $out;
 595         }
 596
 597         /**
 598          * Pick apart some CSS and check it for forbidden or unsafe structures.
 599          * Returns a sanitized string, or false if it was just too evil.
 600          *
 601          * Currently URL references, 'expression', 'tps' are forbidden.
 602          *
 603          * @param string $value
 604          * @return mixed
 605          */
 606         static function checkCss( $value ) {
 607                 $stripped = Sanitizer::decodeCharReferences( $value );
 608
 609                 // Remove any comments; IE gets token splitting wrong
 610                 $stripped = StringUtils::delimiterReplace( '/*', '*/', ' ', $stripped );
 611
 612                 $value = $stripped;
 613
 614                 // ... and continue checks
 615                 $stripped = preg_replace( '!\\\\([0-9A-Fa-f]{1,6})[ \\n\\r\\t\\f]?!e',
 616                         'codepointToUtf8(hexdec("$1"))', $stripped );
 617                 $stripped = str_replace( '\\', '', $stripped );
 618                 if( preg_match( '/(expression|tps*:\/\/|url\\s*\().*/is',
 619                                 $stripped ) ) {
 620                         # haxx0r
 621                         return false;
 622                 }
 623
 624                 return $value;
 625         }
 626
 627         /**
 628          * Take a tag soup fragment listing an HTML element's attributes
 629          * and normalize it to well-formed XML, discarding unwanted attributes.
 630          * Output is safe for further wikitext processing, with escaping of
 631          * values that could trigger problems.
 632          *
 633          * - Normalizes attribute names to lowercase
 634          * - Discards attributes not on a whitelist for the given element
 635          * - Turns broken or invalid entities into plaintext
 636          * - Double-quotes all attribute values
 637          * - Attributes without values are given the name as attribute
 638          * - Double attributes are discarded
 639          * - Unsafe style attributes are discarded
 640          * - Prepends space if there are attributes.
 641          *
 642          * @param string $text
 643          * @param string $element
 644          * @return string
 645          */
 646         static function fixTagAttributes( $text, $element ) {
 647                 if( trim( $text ) == '' ) {
 648                         return '';
 649                 }
 650
 651                 $stripped = Sanitizer::validateTagAttributes(
 652                         Sanitizer::decodeTagAttributes( $text ), $element );
 653
 654                 $attribs = array();
 655                 foreach( $stripped as $attribute => $value ) {
 656                         $encAttribute = htmlspecialchars( $attribute );
 657                         $encValue = Sanitizer::safeEncodeAttribute( $value );
 658
 659                         $attribs[] = "$encAttribute=\"$encValue\"";
 660                 }
 661                 return count( $attribs ) ? ' ' . implode( ' ', $attribs ) : '';
 662         }
 663
 /**
  * Encode an attribute value for HTML output.
  *
  * The text is run through htmlspecialchars(), after which literal
  * newline, carriage return and tab characters are turned into numeric
  * character references.
  *
  * @param string $text
  * @return string HTML-encoded text fragment
  */
 static function encodeAttribute( $text ) {
         // Whitespace is normalized when attributes are decoded again, so
         // anything other than a plain space has to be written as a
         // character reference here or it would not survive.
         $whitespaceRefs = array(
                 "\n" => '&#10;',
                 "\r" => '&#13;',
                 "\t" => '&#9;',
         );

         return strtr( htmlspecialchars( $text ), $whitespaceRefs );
 }
 683
 /**
  * Encode an attribute value for HTML tags, with extra armoring
  * against further wiki processing.
  *
  * On top of Sanitizer::encodeAttribute(), character sequences that are
  * significant in wikitext are rewritten with character references, and
  * the colon in anything matching wfUrlProtocols() is replaced through
  * Sanitizer::armorLinksCallback().
  *
  * @param string $text
  * @return string HTML-encoded text fragment
  */
 static function safeEncodeAttribute( $text ) {
         # Templates and links may be expanded in later parsing,
         # creating invalid or dangerous output. Suppress this.
         $wikiArmor = array(
                 # The first three should never occur: they indicate invalid
                 # input that ought to have been escaped already.
                 '<'    => '&lt;',
                 '>'    => '&gt;',
                 '"'    => '&quot;',
                 '{'    => '&#123;',
                 '['    => '&#91;',
                 "''"   => '&#39;&#39;',
                 'ISBN' => '&#73;SBN',
                 'RFC'  => '&#82;FC',
                 'PMID' => '&#80;MID',
                 '|'    => '&#124;',
                 '__'   => '&#95;_',
         );
         $armored = strtr( Sanitizer::encodeAttribute( $text ), $wikiArmor );

         # Stupid hack: break up URL protocols as well
         $protocolPattern = '/(' . wfUrlProtocols() . ')/';
         return preg_replace_callback(
                 $protocolPattern,
                 array( 'Sanitizer', 'armorLinksCallback' ),
                 $armored );
 }
 716
 /**
  * Escape a value so that it can be used in an id attribute and return
  * it. The value is not validated (see first link).
  *
  * Spaces become underscores, character references are decoded, and the
  * result is URL-encoded; afterwards "%3A" is turned back into ":" and
  * every remaining "%" becomes ".".
  *
  * @link http://www.w3.org/TR/html401/types.html#type-name Valid characters
  *                                                          in the id and
  *                                                          name attributes
  * @link http://www.w3.org/TR/html401/struct/links.html#h-12.2.3 Anchors with the id attribute
  *
  * @static
  *
  * @param string $id
  * @return string
  */
 static function escapeId( $id ) {
         $underscored = strtr( $id, ' ', '_' );
         $decoded = Sanitizer::decodeCharReferences( $underscored );
         $encoded = urlencode( $decoded );

         // Order matters: restore colons before the generic '%' rewrite.
         return str_replace(
                 array( '%3A', '%' ),
                 array( ':', '.' ),
                 $encoded );
 }
 741
 /**
  * Escape a value so that it can be used as a CSS class and return it.
  *
  * A leading digit or hyphen, control characters and spaces, the listed
  * ASCII punctuation, and the byte pair C2 A0 are each replaced by an
  * underscore. Runs of underscores are then collapsed to one and
  * trailing underscores are removed.
  *
  * @todo For extra validity, input should be validated UTF-8.
  *
  * @link http://www.w3.org/TR/CSS21/syndata.html Valid characters/format
  *
  * @param string $class
  * @return string
  */
 static function escapeClass( $class ) {
         // Convert ugly stuff to underscores...
         $uglyChars = '/(^[0-9\\-])|[\\x00-\\x20!"#$%&\'()*+,.\\/:;<=>?@[\\]^`{|}~]|\\xC2\\xA0/';
         $underscored = preg_replace( $uglyChars, '_', $class );

         // ...and kill underscores in ugly places
         $collapsed = preg_replace( '/_+/', '_', $underscored );
         return rtrim( $collapsed, '_' );
 }
 760
 /**
  * Regex replace callback for armoring links against further processing.
  * Every colon in the first capture group is replaced by "&#58;".
  *
  * @param array $matches Match set; only index 1 is read
  * @return string
  * @private
  */
 private static function armorLinksCallback( $matches ) {
         $protocol = $matches[1];
         return str_replace( ':', '&#58;', $protocol );
 }
 770
 /**
  * Return an associative array of attribute names and values from
  * a partial tag string. Attribute names are forced to lowercase,
  * whitespace inside values is collapsed and trimmed, and character
  * references are decoded via Sanitizer::decodeCharReferences().
  *
  * When the same name occurs more than once, the last occurrence wins.
  *
  * @param string $text
  * @return array Map of attribute name => decoded value; empty when the
  *               input is blank or nothing matches MW_ATTRIBS_REGEX
  */
 static function decodeTagAttributes( $text ) {
         $decoded = array();

         if( trim( $text ) == '' ) {
                 return $decoded;
         }

         $matchSets = array();
         $found = preg_match_all( MW_ATTRIBS_REGEX, $text, $matchSets, PREG_SET_ORDER );
         if( !$found ) {
                 return $decoded;
         }

         foreach( $matchSets as $matchSet ) {
                 $name = strtolower( $matchSet[1] );
                 $rawValue = Sanitizer::getTagAttributeCallback( $matchSet );

                 // Normalize whitespace
                 $normalized = trim( preg_replace( '/[\t\r\n ]+/', ' ', $rawValue ) );

                 // Decode character references
                 $decoded[$name] = Sanitizer::decodeCharReferences( $normalized );
         }
         return $decoded;
 }
 808
 /**
  * Pick the appropriate attribute value from a match set from the
  * MW_ATTRIBS_REGEX matches.
  *
  * The highest-numbered capture group present in the set is returned:
  *   6 - illegal #XXXXXX color with no quotes
  *   5 - no quotes
  *   4 - single-quoted
  *   3 - double-quoted
  * If not even group 2 is present, the attribute had no value and its
  * own name (group 1) is returned.
  *
  * @param array $set
  * @return string
  * @throws MWException if group 2 is present but none of groups 3-6 is
  * @private
  */
 private static function getTagAttributeCallback( $set ) {
         for( $group = 6; $group >= 3; $group-- ) {
                 if( isset( $set[$group] ) ) {
                         return $set[$group];
                 }
         }

         if( isset( $set[2] ) ) {
                 throw new MWException( "Tag conditions not met. This should never happen and is a bug." );
         }

         # In XHTML, attributes must have a value.
         # For 'reduced' form, return explicitly the attribute name here.
         return $set[1];
 }
 838
 /**
  * Normalize whitespace and character references in an XML source-
  * encoded text for an attribute value.
  *
  * Character references are normalized first, then each CRLF pair or
  * single space, CR, LF or tab becomes one space, and finally double
  * quotes are written as "&quot;".
  *
  * See http://www.w3.org/TR/REC-xml/#AVNormalize for background,
  * but note that this does not return the value itself; it returns an
  * XML source fragment that will be slapped into output.
  *
  * @param string $text
  * @return string
  * @private
  */
 private static function normalizeAttributeValue( $text ) {
         $withCleanRefs = Sanitizer::normalizeCharReferences( $text );
         $withSpaces = preg_replace( '/\r\n|[\x20\x0d\x0a\x09]/', ' ', $withCleanRefs );
         return str_replace( '"', '&quot;', $withSpaces );
 }
 858
 /**
  * Ensure that any entities and character references are legal
  * for XML and XHTML specifically. Any stray bits will be
  * &amp;-escaped to result in a valid text fragment.
  *
  * a. any named char refs must be known in XHTML
  * b. any numeric char refs must be legal chars, not invalid or forbidden
  * c. use &#x, not &#X
  * d. fix or reject non-valid attributes
  *
  * Each match of MW_CHAR_REFS_REGEX is handed to
  * Sanitizer::normalizeCharReferencesCallback().
  *
  * @param string $text
  * @return string
  * @private
  */
 static function normalizeCharReferences( $text ) {
         $callback = array( 'Sanitizer', 'normalizeCharReferencesCallback' );
         return preg_replace_callback( MW_CHAR_REFS_REGEX, $callback, $text );
 }
 /**
  * Callback for Sanitizer::normalizeCharReferences().
  *
  * Dispatches on the first non-empty capture group: 1 is a named
  * entity, 2 a decimal reference, 3 and 4 hexadecimal references.
  * When no group matched, or the chosen handler returned null, the
  * whole match is HTML-escaped instead.
  *
  * @param array $matches Match set from MW_CHAR_REFS_REGEX
  * @return string
  */
 static function normalizeCharReferencesCallback( $matches ) {
         $fixed = null;

         if( $matches[1] != '' ) {
                 $fixed = Sanitizer::normalizeEntity( $matches[1] );
         } elseif( $matches[2] != '' ) {
                 $fixed = Sanitizer::decCharReference( $matches[2] );
         } else {
                 // Groups 3 (&#x...) and 4 (&#X...) both carry hex digits
                 foreach( array( 3, 4 ) as $hexGroup ) {
                         if( $matches[$hexGroup] != '' ) {
                                 $fixed = Sanitizer::hexCharReference( $matches[$hexGroup] );
                                 break;
                         }
                 }
         }

         if( !is_null( $fixed ) ) {
                 return $fixed;
         }
         return htmlspecialchars( $matches[0] );
 }
 900
 /**
  * If the named entity is listed in $wgHtmlEntities (the HTML 4.0 /
  * XHTML 1.0 DTD entities), return the named entity reference as is.
  * Otherwise, return HTML-escaped text of the pseudo-entity source
  * (eg &amp;foo;)
  *
  * @param string $name Entity name without the surrounding & and ;
  * @return string
  * @static
  */
 static function normalizeEntity( $name ) {
         global $wgHtmlEntities;

         $ampersand = isset( $wgHtmlEntities[$name] ) ? '&' : '&amp;';
         return $ampersand . $name . ';';
 }
 918
 /**
  * Normalize a decimal numeric character reference.
  *
  * @param string $codepoint Decimal digits taken from the reference
  * @return string|null "&#N;" form of the reference, or null when
  *                     Sanitizer::validateCodepoint() rejects the value
  */
 static function decCharReference( $codepoint ) {
         $point = intval( $codepoint );
         return Sanitizer::validateCodepoint( $point )
                 ? sprintf( '&#%d;', $point )
                 : null;
 }
 927
 /**
  * Normalize a hexadecimal numeric character reference.
  *
  * @param string $codepoint Hex digits taken from the reference
  * @return string|null "&#xN;" form with lowercase hex digits, or null
  *                     when Sanitizer::validateCodepoint() rejects the value
  */
 static function hexCharReference( $codepoint ) {
         $point = hexdec( $codepoint );
         return Sanitizer::validateCodepoint( $point )
                 ? sprintf( '&#x%x;', $point )
                 : null;
 }
 936
 /**
  * Returns true if a given Unicode codepoint is a valid character in XML.
  *
  * Accepted values are tab, line feed and carriage return, plus the
  * ranges 0x20-0xd7ff, 0xe000-0xfffd and 0x10000-0x10ffff.
  *
  * @param int $codepoint
  * @return bool
  */
 private static function validateCodepoint( $codepoint ) {
         // The only permitted control characters
         if( $codepoint == 0x09 || $codepoint == 0x0a || $codepoint == 0x0d ) {
                 return true;
         }

         $allowedRanges = array(
                 array(    0x20,   0xd7ff ),
                 array(  0xe000,   0xfffd ),
                 array( 0x10000, 0x10ffff ),
         );
         foreach( $allowedRanges as $range ) {
                 if( $codepoint >= $range[0] && $codepoint <= $range[1] ) {
                         return true;
                 }
         }
         return false;
 }
 950
 /**
  * Decode any character references, numeric or named entities,
  * in the text and return a UTF-8 string.
  *
  * Each match of MW_CHAR_REFS_REGEX is handed to
  * Sanitizer::decodeCharReferencesCallback().
  *
  * @param string $text
  * @return string
  * @public
  * @static
  */
 public static function decodeCharReferences( $text ) {
         $callback = array( 'Sanitizer', 'decodeCharReferencesCallback' );
         return preg_replace_callback( MW_CHAR_REFS_REGEX, $callback, $text );
 }
 966
 /**
  * Callback for Sanitizer::decodeCharReferences().
  *
  * Dispatches on the first non-empty capture group: 1 is a named
  * entity, 2 a decimal reference, 3 and 4 hexadecimal references.
  * When none of them matched, the matched text is returned unchanged.
  *
  * @param array $matches Match set from MW_CHAR_REFS_REGEX
  * @return string
  */
 static function decodeCharReferencesCallback( $matches ) {
         if( $matches[1] != '' ) {
                 return Sanitizer::decodeEntity( $matches[1] );
         }
         if( $matches[2] != '' ) {
                 return Sanitizer::decodeChar( intval( $matches[2] ) );
         }
         // Groups 3 (&#x...) and 4 (&#X...) both carry hex digits
         foreach( array( 3, 4 ) as $hexGroup ) {
                 if( $matches[$hexGroup] != '' ) {
                         return Sanitizer::decodeChar( hexdec( $matches[$hexGroup] ) );
                 }
         }
         # Last case should be an ampersand by itself
         return $matches[0];
 }
 984
 /**
  * Return the UTF-8 string for a codepoint if Sanitizer::validateCodepoint()
  * accepts it, otherwise UTF8_REPLACEMENT (U+FFFD REPLACEMENT CHARACTER).
  *
  * @param int $codepoint
  * @return string
  * @private
  */
 static function decodeChar( $codepoint ) {
         if( !Sanitizer::validateCodepoint( $codepoint ) ) {
                 return UTF8_REPLACEMENT;
         }
         return codepointToUtf8( $codepoint );
 }
 999
/**
 * If the named entity is listed in $wgHtmlEntities (the HTML 4.0 /
 * XHTML 1.0 DTD entities), return the UTF-8 encoding of that character.
 * Otherwise, return the pseudo-entity source (eg &foo;)
 *
 * @param string $name Entity name without the surrounding & and ;
 * @return string
 */
static function decodeEntity( $name ) {
        global $wgHtmlEntities;

        if( !isset( $wgHtmlEntities[$name] ) ) {
                return "&$name;";
        }
        return codepointToUtf8( $wgHtmlEntities[$name] );
}
1016
/**
 * Fetch the whitelist of acceptable attributes for a given
 * element name.
 *
 * The full table is built by Sanitizer::setupAttributeWhitelist() on
 * the first call and kept in a static variable afterwards.
 *
 * @param string $element
 * @return array Allowed attribute names; empty for an unlisted element
 */
static function attributeWhitelist( $element ) {
        static $whitelists;

        if( !isset( $whitelists ) ) {
                $whitelists = Sanitizer::setupAttributeWhitelist();
        }

        if( isset( $whitelists[$element] ) ) {
                return $whitelists[$element];
        }
        return array();
}
1033
/**
 * Build the table of permitted attributes per element.
 *
 * Most elements share one of a few attribute lists, which are assembled
 * first and then assigned in the table below.
 *
 * @return array Map of element name => list of allowed attribute names
 */
static function setupAttributeWhitelist() {
        # Building blocks
        $core = array( 'id', 'class', 'lang', 'dir', 'title', 'style' );
        $cellAlign = array( 'align', 'char', 'charoff', 'valign' );
        $cellOnly = array( 'abbr',
                           'axis',
                           'headers',
                           'scope',
                           'rowspan',
                           'colspan',
                           'nowrap', # deprecated
                           'width',  # deprecated
                           'height', # deprecated
                           'bgcolor' # deprecated
                           );

        # Lists shared by several elements
        $blockLevel = array_merge( $core, array( 'align' ) );
        $edit = array_merge( $core, array( 'cite', 'datetime' ) );
        $rowGroup = array_merge( $core, $cellAlign );
        $column = array_merge( $core, array( 'span', 'width' ), $cellAlign );
        $cell = array_merge( $core, $cellOnly, $cellAlign );

        # Numbers refer to sections in HTML 4.01 standard describing the element.
        # See: http://www.w3.org/TR/html4/
        $whitelist = array(
                # 7.5.4
                'div'        => $blockLevel,
                'center'     => $core, # deprecated
                'span'       => $blockLevel, # ??

                # 7.5.5
                'h1'         => $blockLevel,
                'h2'         => $blockLevel,
                'h3'         => $blockLevel,
                'h4'         => $blockLevel,
                'h5'         => $blockLevel,
                'h6'         => $blockLevel,

                # 7.5.6
                # address

                # 8.2.4
                # bdo

                # 9.2.1
                'em'         => $core,
                'strong'     => $core,
                'cite'       => $core,
                # dfn
                'code'       => $core,
                # samp
                # kbd
                'var'        => $core,
                # abbr
                # acronym

                # 9.2.2
                'blockquote' => array_merge( $core, array( 'cite' ) ),
                # q

                # 9.2.3
                'sub'        => $core,
                'sup'        => $core,

                # 9.3.1
                'p'          => $blockLevel,

                # 9.3.2
                'br'         => array( 'id', 'class', 'title', 'style', 'clear' ),

                # 9.3.4
                'pre'        => array_merge( $core, array( 'width' ) ),

                # 9.4
                'ins'        => $edit,
                'del'        => $edit,

                # 10.2
                'ul'         => array_merge( $core, array( 'type' ) ),
                'ol'         => array_merge( $core, array( 'type', 'start' ) ),
                'li'         => array_merge( $core, array( 'type', 'value' ) ),

                # 10.3
                'dl'         => $core,
                'dd'         => $core,
                'dt'         => $core,

                # 11.2.1
                'table'      => array_merge( $core, array(
                                        'summary', 'width', 'border', 'frame',
                                        'rules', 'cellspacing', 'cellpadding',
                                        'align', 'bgcolor',
                                ) ),

                # 11.2.2
                'caption'    => $blockLevel,

                # 11.2.3
                'thead'      => $rowGroup,
                'tfoot'      => $rowGroup,
                'tbody'      => $rowGroup,

                # 11.2.4
                'colgroup'   => $column,
                'col'        => $column,

                # 11.2.5
                'tr'         => array_merge( $core, array( 'bgcolor' ), $cellAlign ),

                # 11.2.6
                'td'         => $cell,
                'th'         => $cell,

                # 15.2.1
                'tt'         => $core,
                'b'          => $core,
                'i'          => $core,
                'big'        => $core,
                'small'      => $core,
                'strike'     => $core,
                's'          => $core,
                'u'          => $core,

                # 15.2.2
                'font'       => array_merge( $core, array( 'size', 'color', 'face' ) ),
                # basefont

                # 15.3
                'hr'         => array_merge( $core, array( 'noshade', 'size', 'width' ) ),

                # XHTML Ruby annotation text module, simple ruby only.
                # http://www.w3c.org/TR/ruby/
                'ruby'       => $core,
                # rbc
                # rtc
                'rb'         => $core,
                'rt'         => $core, # rbspan is not allowed
                'rp'         => $core,
        );
        return $whitelist;
}
1173
/**
 * Take a fragment of (potentially invalid) HTML and return
 * a version with any tags removed, encoded suitably for literal
 * inclusion in an attribute value.
 *
 * @param string $text HTML fragment
 * @return string
 */
static function stripAllTags( $text ) {
        # Actual <tags>
        $withoutTags = StringUtils::delimiterReplace( '<', '>', '', $text );

        # Normalize &entities and whitespace
        $normalized = Sanitizer::normalizeAttributeValue( $withoutTags );

        # Will be placed into "double-quoted" attributes,
        # make sure remaining bits are safe.
        $unsafe = array( '<', '>', '"' );
        $escaped = array( '&lt;', '&gt;', '&quot;' );
        return str_replace( $unsafe, $escaped, $normalized );
}
1198
/**
 * Hack up a private DOCTYPE with HTML's standard entity declarations.
 * PHP 4 seemed to know these if you gave it an HTML doctype, but
 * PHP 5.1 doesn't.
 *
 * One <!ENTITY> declaration is emitted for every name => codepoint
 * pair in $wgHtmlEntities.
 *
 * Use for passing XHTML fragments to PHP's XML parsing functions
 *
 * @return string
 * @static
 */
static function hackDocType() {
        global $wgHtmlEntities;

        $declarations = '';
        foreach( $wgHtmlEntities as $entity => $codepoint ) {
                $declarations .= "<!ENTITY $entity \"&#$codepoint;\">";
        }

        return "<!DOCTYPE html [\n" . $declarations . "]>\n";
}
1218
/**
 * Clean up a URL before it is used in a link.
 *
 * - Character references are decoded.
 * - Square brackets, angle brackets, double quotes, control characters,
 *   spaces and DEL are percent-encoded with urlencode().
 * - If the URL has the shape "protocol:[//host][rest]", whitespace and
 *   the code points that are ignored in IDNs are removed from the host
 *   part.
 *
 * @param string $url
 * @param bool $hostname Not read by this implementation; kept so that
 *                       existing callers keep working
 * @return string
 */
static function cleanUrl( $url, $hostname=true ) {
        # Normalize any HTML entities in input. They will be
        # re-escaped by makeExternalLink().
        $url = Sanitizer::decodeCharReferences( $url );

        # Escape any control characters introduced by the above step.
        # This must be a callback rather than the /e modifier: /e applies
        # addslashes() to the match, so a double quote came out as %5C%22
        # and a NUL byte as %5C0, and /e does not exist in PHP 7 and later.
        $url = preg_replace_callback(
                '/[\][<>"\\x00-\\x20\\x7F]/',
                array( 'Sanitizer', 'cleanUrlCallback' ),
                $url );

        # Validate hostname portion
        $matches = array();
        if( preg_match( '!^([^:]+:)(//[^/]+)?(.*)$!iD', $url, $matches ) ) {
                list( /* $whole */, $protocol, $host, $rest ) = $matches;

                // Characters that will be ignored in IDNs.
                // http://tools.ietf.org/html/3454#section-3.1
                // Strip them before further processing so blacklists and such work.
                $strip = "/
                        \\s|          # general whitespace
                        \xc2\xad|     # 00ad SOFT HYPHEN
                        \xe1\xa0\x86| # 1806 MONGOLIAN TODO SOFT HYPHEN
                        \xe2\x80\x8b| # 200b ZERO WIDTH SPACE
                        \xe2\x81\xa0| # 2060 WORD JOINER
                        \xef\xbb\xbf| # feff ZERO WIDTH NO-BREAK SPACE
                        \xcd\x8f|     # 034f COMBINING GRAPHEME JOINER
                        \xe1\xa0\x8b| # 180b MONGOLIAN FREE VARIATION SELECTOR ONE
                        \xe1\xa0\x8c| # 180c MONGOLIAN FREE VARIATION SELECTOR TWO
                        \xe1\xa0\x8d| # 180d MONGOLIAN FREE VARIATION SELECTOR THREE
                        \xe2\x80\x8c| # 200c ZERO WIDTH NON-JOINER
                        \xe2\x80\x8d| # 200d ZERO WIDTH JOINER
                        [\xef\xb8\x80-\xef\xb8\x8f] # fe00-fe00f VARIATION SELECTOR-1-16
                        /xuD";

                $host = preg_replace( $strip, '', $host );

                // @fixme: validate hostnames here

                return $protocol . $host . $rest;
        } else {
                return $url;
        }
}

/**
 * Regex replace callback for Sanitizer::cleanUrl(): percent-encode the
 * matched character.
 *
 * @param array $matches
 * @return string
 * @private
 */
private static function cleanUrlCallback( $matches ) {
        return urlencode( $matches[0] );
}
1260
1261 }
1262
1263 ?>