includes/Sanitizer.php

   1 <?php
   2 /**
   3  * XHTML sanitizer for MediaWiki
   4  *
   5  * Copyright (C) 2002-2005 Brion Vibber <brion@pobox.com> et al
   6  * http://www.mediawiki.org/
   7  *
   8  * This program is free software; you can redistribute it and/or modify
   9  * it under the terms of the GNU General Public License as published by
  10  * the Free Software Foundation; either version 2 of the License, or
  11  * (at your option) any later version.
  12  *
  13  * This program is distributed in the hope that it will be useful,
  14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  16  * GNU General Public License for more details.
  17  *
  18  * You should have received a copy of the GNU General Public License along
  19  * with this program; if not, write to the Free Software Foundation, Inc.,
  20  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  21  * http://www.gnu.org/copyleft/gpl.html
  22  *
  23  * @package MediaWiki
  24  * @subpackage Parser
  25  */
  26
  27 /**
  28  * Regular expression to match various types of character references in
  29  * Sanitizer::normalizeCharReferences and Sanitizer::decodeCharReferences
  30  */
  31 define( 'MW_CHAR_REFS_REGEX',
  32         '/&([A-Za-z0-9]+);
  33          |&\#([0-9]+);
  34          |&\#x([0-9A-Za-z]+);
  35          |&\#X([0-9A-Za-z]+);
  36          |(&)/x' );
  37
  38 /**
  39  * Regular expression to match HTML/XML attribute pairs within a tag.
  40  * Allows some... latitude.
  41  * Used in Sanitizer::fixTagAttributes and Sanitizer::decodeTagAttributes
  42  */
  43 $attrib = '[A-Za-z0-9]';
  44 $space = '[\x09\x0a\x0d\x20]';
  45 define( 'MW_ATTRIBS_REGEX',
  46         "/(?:^|$space)($attrib+)
  47           ($space*=$space*
  48                 (?:
  49                  # The attribute value: quoted or alone
  50                   \"([^<\"]*)\"
  51                  | '([^<']*)'
  52                  |  ([a-zA-Z0-9!#$%&()*,\\-.\\/:;<>?@[\\]^_`{|}~]+)
  53                  |  (\#[0-9a-fA-F]+) # Technically wrong, but lots of
  54                                                          # colors are specified like this.
  55                                                          # We'll be normalizing it.
  56                 )
  57            )?(?=$space|\$)/sx" );
  58
  59 /**
  60  * List of all named character entities defined in HTML 4.01
  61  * http://www.w3.org/TR/html4/sgml/entities.html
  62  * @private
  63  */
  64 global $wgHtmlEntities;
  65 $wgHtmlEntities = array(
  66         'Aacute'   => 193,
  67         'aacute'   => 225,
  68         'Acirc'    => 194,
  69         'acirc'    => 226,
  70         'acute'    => 180,
  71         'AElig'    => 198,
  72         'aelig'    => 230,
  73         'Agrave'   => 192,
  74         'agrave'   => 224,
  75         'alefsym'  => 8501,
  76         'Alpha'    => 913,
  77         'alpha'    => 945,
  78         'amp'      => 38,
  79         'and'      => 8743,
  80         'ang'      => 8736,
  81         'Aring'    => 197,
  82         'aring'    => 229,
  83         'asymp'    => 8776,
  84         'Atilde'   => 195,
  85         'atilde'   => 227,
  86         'Auml'     => 196,
  87         'auml'     => 228,
  88         'bdquo'    => 8222,
  89         'Beta'     => 914,
  90         'beta'     => 946,
  91         'brvbar'   => 166,
  92         'bull'     => 8226,
  93         'cap'      => 8745,
  94         'Ccedil'   => 199,
  95         'ccedil'   => 231,
  96         'cedil'    => 184,
  97         'cent'     => 162,
  98         'Chi'      => 935,
  99         'chi'      => 967,
 100         'circ'     => 710,
 101         'clubs'    => 9827,
 102         'cong'     => 8773,
 103         'copy'     => 169,
 104         'crarr'    => 8629,
 105         'cup'      => 8746,
 106         'curren'   => 164,
 107         'dagger'   => 8224,
 108         'Dagger'   => 8225,
 109         'darr'     => 8595,
 110         'dArr'     => 8659,
 111         'deg'      => 176,
 112         'Delta'    => 916,
 113         'delta'    => 948,
 114         'diams'    => 9830,
 115         'divide'   => 247,
 116         'Eacute'   => 201,
 117         'eacute'   => 233,
 118         'Ecirc'    => 202,
 119         'ecirc'    => 234,
 120         'Egrave'   => 200,
 121         'egrave'   => 232,
 122         'empty'    => 8709,
 123         'emsp'     => 8195,
 124         'ensp'     => 8194,
 125         'Epsilon'  => 917,
 126         'epsilon'  => 949,
 127         'equiv'    => 8801,
 128         'Eta'      => 919,
 129         'eta'      => 951,
 130         'ETH'      => 208,
 131         'eth'      => 240,
 132         'Euml'     => 203,
 133         'euml'     => 235,
 134         'euro'     => 8364,
 135         'exist'    => 8707,
 136         'fnof'     => 402,
 137         'forall'   => 8704,
 138         'frac12'   => 189,
 139         'frac14'   => 188,
 140         'frac34'   => 190,
 141         'frasl'    => 8260,
 142         'Gamma'    => 915,
 143         'gamma'    => 947,
 144         'ge'       => 8805,
 145         'gt'       => 62,
 146         'harr'     => 8596,
 147         'hArr'     => 8660,
 148         'hearts'   => 9829,
 149         'hellip'   => 8230,
 150         'Iacute'   => 205,
 151         'iacute'   => 237,
 152         'Icirc'    => 206,
 153         'icirc'    => 238,
 154         'iexcl'    => 161,
 155         'Igrave'   => 204,
 156         'igrave'   => 236,
 157         'image'    => 8465,
 158         'infin'    => 8734,
 159         'int'      => 8747,
 160         'Iota'     => 921,
 161         'iota'     => 953,
 162         'iquest'   => 191,
 163         'isin'     => 8712,
 164         'Iuml'     => 207,
 165         'iuml'     => 239,
 166         'Kappa'    => 922,
 167         'kappa'    => 954,
 168         'Lambda'   => 923,
 169         'lambda'   => 955,
 170         'lang'     => 9001,
 171         'laquo'    => 171,
 172         'larr'     => 8592,
 173         'lArr'     => 8656,
 174         'lceil'    => 8968,
 175         'ldquo'    => 8220,
 176         'le'       => 8804,
 177         'lfloor'   => 8970,
 178         'lowast'   => 8727,
 179         'loz'      => 9674,
 180         'lrm'      => 8206,
 181         'lsaquo'   => 8249,
 182         'lsquo'    => 8216,
 183         'lt'       => 60,
 184         'macr'     => 175,
 185         'mdash'    => 8212,
 186         'micro'    => 181,
 187         'middot'   => 183,
 188         'minus'    => 8722,
 189         'Mu'       => 924,
 190         'mu'       => 956,
 191         'nabla'    => 8711,
 192         'nbsp'     => 160,
 193         'ndash'    => 8211,
 194         'ne'       => 8800,
 195         'ni'       => 8715,
 196         'not'      => 172,
 197         'notin'    => 8713,
 198         'nsub'     => 8836,
 199         'Ntilde'   => 209,
 200         'ntilde'   => 241,
 201         'Nu'       => 925,
 202         'nu'       => 957,
 203         'Oacute'   => 211,
 204         'oacute'   => 243,
 205         'Ocirc'    => 212,
 206         'ocirc'    => 244,
 207         'OElig'    => 338,
 208         'oelig'    => 339,
 209         'Ograve'   => 210,
 210         'ograve'   => 242,
 211         'oline'    => 8254,
 212         'Omega'    => 937,
 213         'omega'    => 969,
 214         'Omicron'  => 927,
 215         'omicron'  => 959,
 216         'oplus'    => 8853,
 217         'or'       => 8744,
 218         'ordf'     => 170,
 219         'ordm'     => 186,
 220         'Oslash'   => 216,
 221         'oslash'   => 248,
 222         'Otilde'   => 213,
 223         'otilde'   => 245,
 224         'otimes'   => 8855,
 225         'Ouml'     => 214,
 226         'ouml'     => 246,
 227         'para'     => 182,
 228         'part'     => 8706,
 229         'permil'   => 8240,
 230         'perp'     => 8869,
 231         'Phi'      => 934,
 232         'phi'      => 966,
 233         'Pi'       => 928,
 234         'pi'       => 960,
 235         'piv'      => 982,
 236         'plusmn'   => 177,
 237         'pound'    => 163,
 238         'prime'    => 8242,
 239         'Prime'    => 8243,
 240         'prod'     => 8719,
 241         'prop'     => 8733,
 242         'Psi'      => 936,
 243         'psi'      => 968,
 244         'quot'     => 34,
 245         'radic'    => 8730,
 246         'rang'     => 9002,
 247         'raquo'    => 187,
 248         'rarr'     => 8594,
 249         'rArr'     => 8658,
 250         'rceil'    => 8969,
 251         'rdquo'    => 8221,
 252         'real'     => 8476,
 253         'reg'      => 174,
 254         'rfloor'   => 8971,
 255         'Rho'      => 929,
 256         'rho'      => 961,
 257         'rlm'      => 8207,
 258         'rsaquo'   => 8250,
 259         'rsquo'    => 8217,
 260         'sbquo'    => 8218,
 261         'Scaron'   => 352,
 262         'scaron'   => 353,
 263         'sdot'     => 8901,
 264         'sect'     => 167,
 265         'shy'      => 173,
 266         'Sigma'    => 931,
 267         'sigma'    => 963,
 268         'sigmaf'   => 962,
 269         'sim'      => 8764,
 270         'spades'   => 9824,
 271         'sub'      => 8834,
 272         'sube'     => 8838,
 273         'sum'      => 8721,
 274         'sup'      => 8835,
 275         'sup1'     => 185,
 276         'sup2'     => 178,
 277         'sup3'     => 179,
 278         'supe'     => 8839,
 279         'szlig'    => 223,
 280         'Tau'      => 932,
 281         'tau'      => 964,
 282         'there4'   => 8756,
 283         'Theta'    => 920,
 284         'theta'    => 952,
 285         'thetasym' => 977,
 286         'thinsp'   => 8201,
 287         'THORN'    => 222,
 288         'thorn'    => 254,
 289         'tilde'    => 732,
 290         'times'    => 215,
 291         'trade'    => 8482,
 292         'Uacute'   => 218,
 293         'uacute'   => 250,
 294         'uarr'     => 8593,
 295         'uArr'     => 8657,
 296         'Ucirc'    => 219,
 297         'ucirc'    => 251,
 298         'Ugrave'   => 217,
 299         'ugrave'   => 249,
 300         'uml'      => 168,
 301         'upsih'    => 978,
 302         'Upsilon'  => 933,
 303         'upsilon'  => 965,
 304         'Uuml'     => 220,
 305         'uuml'     => 252,
 306         'weierp'   => 8472,
 307         'Xi'       => 926,
 308         'xi'       => 958,
 309         'Yacute'   => 221,
 310         'yacute'   => 253,
 311         'yen'      => 165,
 312         'Yuml'     => 376,
 313         'yuml'     => 255,
 314         'Zeta'     => 918,
 315         'zeta'     => 950,
 316         'zwj'      => 8205,
 317         'zwnj'     => 8204 );
 318
 319 /** @package MediaWiki */
 320 class Sanitizer {
 321         /**
 322          * Cleans up HTML, removes dangerous tags and attributes, and
 323          * removes HTML comments
 324          * @private
 325          * @param string $text
 326          * @param callback $processCallback to do any variable or parameter replacements in HTML attribute values
 327          * @param array $args for the processing callback
 328          * @return string
 329          */
 330         static function removeHTMLtags( $text, $processCallback = null, $args = array() ) {
 331                 global $wgUseTidy, $wgUserHtml;
 332
 333                 static $htmlpairs, $htmlsingle, $htmlsingleonly, $htmlnest, $tabletags,
 334                         $htmllist, $listtags, $htmlsingleallowed, $htmlelements, $staticInitialised;
 335
 336                 wfProfileIn( __METHOD__ );
 337
 338                 if ( !$staticInitialised ) {
 339                         if( $wgUserHtml ) {
 340                                 $htmlpairs = array( # Tags that must be closed
 341                                         'b', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
 342                                         'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
 343                                         'strike', 'strong', 'tt', 'var', 'div', 'center',
 344                                         'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
 345                                         'ruby', 'rt' , 'rb' , 'rp', 'p', 'span', 'u'
 346                                 );
 347                                 $htmlsingle = array(
 348                                         'br', 'hr', 'li', 'dt', 'dd'
 349                                 );
 350                                 $htmlsingleonly = array( # Elements that cannot have close tags
 351                                         'br', 'hr'
 352                                 );
 353                                 $htmlnest = array( # Tags that can be nested--??
 354                                         'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
 355                                         'dl', 'font', 'big', 'small', 'sub', 'sup', 'span'
 356                                 );
 357                                 $tabletags = array( # Can only appear inside table
 358                                         'td', 'th', 'tr',
 359                                 );
 360                                 $htmllist = array( # Tags used by list
 361                                         'ul','ol',
 362                                 );
 363                                 $listtags = array( # Tags that can appear in a list
 364                                         'li',
 365                                 );
 366
 367                         } else {
 368                                 $htmlpairs = array();
 369                                 $htmlsingle = array();
 370                                 $htmlnest = array();
 371                                 $tabletags = array();
 372                         }
 373
 374                         $htmlsingleallowed = array_merge( $htmlsingle, $tabletags );
 375                         $htmlelements = array_merge( $htmlsingle, $htmlpairs, $htmlnest );
 376
 377                         # Convert them all to hashtables for faster lookup
 378                         $vars = array( 'htmlpairs', 'htmlsingle', 'htmlsingleonly', 'htmlnest', 'tabletags',
 379                                 'htmllist', 'listtags', 'htmlsingleallowed', 'htmlelements' );
 380                         foreach ( $vars as $var ) {
 381                                 $$var = array_flip( $$var );
 382                         }
 383                         $staticInitialised = true;
 384                 }
 385
 386                 # Remove HTML comments
 387                 $text = Sanitizer::removeHTMLcomments( $text );
 388                 $bits = explode( '<', $text );
 389                 $text = array_shift( $bits );
 390                 if(!$wgUseTidy) {
 391                         $tagstack = $tablestack = array();
 392                         foreach ( $bits as $x ) {
 393                                 $regs = array();
 394                                 if( preg_match( '!^(/?)(\\w+)([^>]*?)(/{0,1}>)([^<]*)$!', $x, $regs ) ) {
 395                                         list( /* $qbar */, $slash, $t, $params, $brace, $rest ) = $regs;
 396                                 } else {
 397                                         $slash = $t = $params = $brace = $rest = null;
 398                                 }
 399
 400                                 $badtag = 0 ;
 401                                 if ( isset( $htmlelements[$t = strtolower( $t )] ) ) {
 402                                         # Check our stack
 403                                         if ( $slash ) {
 404                                                 # Closing a tag...
 405                                                 if( isset( $htmlsingleonly[$t] ) ) {
 406                                                         $badtag = 1;
 407                                                 } elseif ( ( $ot = @array_pop( $tagstack ) ) != $t ) {
 408                                                         if ( isset( $htmlsingleallowed[$ot] ) ) {
 409                                                                 # Pop all elements with an optional close tag
 410                                                                 # and see if we find a match below them
 411                                                                 $optstack = array();
 412                                                                 array_push ($optstack, $ot);
 413                                                                 while ( ( ( $ot = @array_pop( $tagstack ) ) != $t ) &&
 414                                                                                 isset( $htmlsingleallowed[$ot] ) )
 415                                                                 {
 416                                                                         array_push ($optstack, $ot);
 417                                                                 }
 418                                                                 if ( $t != $ot ) {
 419                                                                         # No match. Push the optinal elements back again
 420                                                                         $badtag = 1;
 421                                                                         while ( $ot = @array_pop( $optstack ) ) {
 422                                                                                 array_push( $tagstack, $ot );
 423                                                                         }
 424                                                                 }
 425                                                         } else {
 426                                                                 @array_push( $tagstack, $ot );
 427                                                                 # <li> can be nested in <ul> or <ol>, skip those cases:
 428                                                                 if(!(isset( $htmllist[$ot] ) && isset( $listtags[$t] ) )) {
 429                                                                         $badtag = 1;
 430                                                                 }
 431                                                         }
 432                                                 } else {
 433                                                         if ( $t == 'table' ) {
 434                                                                 $tagstack = array_pop( $tablestack );
 435                                                         }
 436                                                 }
 437                                                 $newparams = '';
 438                                         } else {
 439                                                 # Keep track for later
 440                                                 if ( isset( $tabletags[$t] ) &&
 441                                                 ! in_array( 'table', $tagstack ) ) {
 442                                                         $badtag = 1;
 443                                                 } else if ( in_array( $t, $tagstack ) &&
 444                                                 ! isset( $htmlnest [$t ] ) ) {
 445                                                         $badtag = 1 ;
 446                                                 # Is it a self closed htmlpair ? (bug 5487)
 447                                                 } else if( $brace == '/>' &&
 448                                                 isset( $htmlpairs[$t] ) ) {
 449                                                         $badtag = 1;
 450                                                 } elseif( isset( $htmlsingleonly[$t] ) ) {
 451                                                         # Hack to force empty tag for uncloseable elements
 452                                                         $brace = '/>';
 453                                                 } else if( isset( $htmlsingle[$t] ) ) {
 454                                                         # Hack to not close $htmlsingle tags
 455                                                         $brace = NULL;
 456                                                 } else {
 457                                                         if ( $t == 'table' ) {
 458                                                                 array_push( $tablestack, $tagstack );
 459                                                                 $tagstack = array();
 460                                                         }
 461                                                         array_push( $tagstack, $t );
 462                                                 }
 463
 464                                                 # Replace any variables or template parameters with
 465                                                 # plaintext results.
 466                                                 if( is_callable( $processCallback ) ) {
 467                                                         call_user_func_array( $processCallback, array( &$params, $args ) );
 468                                                 }
 469
 470                                                 # Strip non-approved attributes from the tag
 471                                                 $newparams = Sanitizer::fixTagAttributes( $params, $t );
 472                                         }
 473                                         if ( ! $badtag ) {
 474                                                 $rest = str_replace( '>', '&gt;', $rest );
 475                                                 $close = ( $brace == '/>' ) ? ' /' : '';
 476                                                 $text .= "<$slash$t$newparams$close>$rest";
 477                                                 continue;
 478                                         }
 479                                 }
 480                                 $text .= '&lt;' . str_replace( '>', '&gt;', $x);
 481                         }
 482                         # Close off any remaining tags
 483                         while ( is_array( $tagstack ) && ($t = array_pop( $tagstack )) ) {
 484                                 $text .= "</$t>\n";
 485                                 if ( $t == 'table' ) { $tagstack = array_pop( $tablestack ); }
 486                         }
 487                 } else {
 488                         # this might be possible using tidy itself
 489                         foreach ( $bits as $x ) {
 490                                 preg_match( '/^(\\/?)(\\w+)([^>]*?)(\\/{0,1}>)([^<]*)$/',
 491                                 $x, $regs );
 492                                 @list( /* $qbar */, $slash, $t, $params, $brace, $rest ) = $regs;
 493                                 if ( isset( $htmlelements[$t = strtolower( $t )] ) ) {
 494                                         if( is_callable( $processCallback ) ) {
 495                                                 call_user_func_array( $processCallback, array( &$params, $args ) );
 496                                         }
 497                                         $newparams = Sanitizer::fixTagAttributes( $params, $t );
 498                                         $rest = str_replace( '>', '&gt;', $rest );
 499                                         $text .= "<$slash$t$newparams$brace$rest";
 500                                 } else {
 501                                         $text .= '&lt;' . str_replace( '>', '&gt;', $x);
 502                                 }
 503                         }
 504                 }
 505                 wfProfileOut( __METHOD__ );
 506                 return $text;
 507         }
 508
 509         /**
 510          * Remove '<!--', '-->', and everything between.
 511          * To avoid leaving blank lines, when a comment is both preceded
 512          * and followed by a newline (ignoring spaces), trim leading and
 513          * trailing spaces and one of the newlines.
 514          *
 515          * @private
 516          * @param string $text
 517          * @return string
 518          */
 519         static function removeHTMLcomments( $text ) {
 520                 wfProfileIn( __METHOD__ );
 521                 while (($start = strpos($text, '<!--')) !== false) {
 522                         $end = strpos($text, '-->', $start + 4);
 523                         if ($end === false) {
 524                                 # Unterminated comment; bail out
 525                                 break;
 526                         }
 527
 528                         $end += 3;
 529
 530                         # Trim space and newline if the comment is both
 531                         # preceded and followed by a newline
 532                         $spaceStart = max($start - 1, 0);
 533                         $spaceLen = $end - $spaceStart;
 534                         while (substr($text, $spaceStart, 1) === ' ' && $spaceStart > 0) {
 535                                 $spaceStart--;
 536                                 $spaceLen++;
 537                         }
 538                         while (substr($text, $spaceStart + $spaceLen, 1) === ' ')
 539                                 $spaceLen++;
 540                         if (substr($text, $spaceStart, 1) === "\n" and substr($text, $spaceStart + $spaceLen, 1) === "\n") {
 541                                 # Remove the comment, leading and trailing
 542                                 # spaces, and leave only one newline.
 543                                 $text = substr_replace($text, "\n", $spaceStart, $spaceLen + 1);
 544                         }
 545                         else {
 546                                 # Remove just the comment.
 547                                 $text = substr_replace($text, '', $start, $end - $start);
 548                         }
 549                 }
 550                 wfProfileOut( __METHOD__ );
 551                 return $text;
 552         }
 553
 554         /**
 555          * Take an array of attribute names and values and normalize or discard
 556          * illegal values for the given element type.
 557          *
 558          * - Discards attributes not on a whitelist for the given element
 559          * - Unsafe style attributes are discarded
 560          *
 561          * @param array $attribs
 562          * @param string $element
 563          * @return array
 564          *
 565          * @todo Check for legal values where the DTD limits things.
 566          * @todo Check for unique id attribute :P
 567          */
 568         static function validateTagAttributes( $attribs, $element ) {
 569                 $whitelist = array_flip( Sanitizer::attributeWhitelist( $element ) );
 570                 $out = array();
 571                 foreach( $attribs as $attribute => $value ) {
 572                         if( !isset( $whitelist[$attribute] ) ) {
 573                                 continue;
 574                         }
 575                         # Strip javascript "expression" from stylesheets.
 576                         # http://msdn.microsoft.com/workshop/author/dhtml/overview/recalc.asp
 577                         if( $attribute == 'style' ) {
 578                                 $value = Sanitizer::checkCss( $value );
 579                                 if( $value === false ) {
 580                                         # haxx0r
 581                                         continue;
 582                                 }
 583                         }
 584
 585                         if ( $attribute === 'id' )
 586                                 $value = Sanitizer::escapeId( $value );
 587
 588                         // If this attribute was previously set, override it.
 589                         // Output should only have one attribute of each name.
 590                         $out[$attribute] = $value;
 591                 }
 592                 return $out;
 593         }
 594
 595         /**
 596          * Pick apart some CSS and check it for forbidden or unsafe structures.
 597          * Returns a sanitized string, or false if it was just too evil.
 598          *
 599          * Currently URL references, 'expression', 'tps' are forbidden.
 600          *
 601          * @param string $value
 602          * @return mixed
 603          */
 604         static function checkCss( $value ) {
 605                 $stripped = Sanitizer::decodeCharReferences( $value );
 606
 607                 // Remove any comments; IE gets token splitting wrong
 608                 $stripped = StringUtils::delimiterReplace( '/*', '*/', ' ', $stripped );
 609
 610                 $value = $stripped;
 611
 612                 // ... and continue checks
 613                 $stripped = preg_replace( '!\\\\([0-9A-Fa-f]{1,6})[ \\n\\r\\t\\f]?!e',
 614                         'codepointToUtf8(hexdec("$1"))', $stripped );
 615                 $stripped = str_replace( '\\', '', $stripped );
 616                 if( preg_match( '/(expression|tps*:\/\/|url\\s*\().*/is',
 617                                 $stripped ) ) {
 618                         # haxx0r
 619                         return false;
 620                 }
 621
 622                 return $value;
 623         }
 624
 625         /**
 626          * Take a tag soup fragment listing an HTML element's attributes
 627          * and normalize it to well-formed XML, discarding unwanted attributes.
 628          * Output is safe for further wikitext processing, with escaping of
 629          * values that could trigger problems.
 630          *
 631          * - Normalizes attribute names to lowercase
 632          * - Discards attributes not on a whitelist for the given element
 633          * - Turns broken or invalid entities into plaintext
 634          * - Double-quotes all attribute values
 635          * - Attributes without values are given the name as attribute
 636          * - Double attributes are discarded
 637          * - Unsafe style attributes are discarded
 638          * - Prepends space if there are attributes.
 639          *
 640          * @param string $text
 641          * @param string $element
 642          * @return string
 643          */
 644         static function fixTagAttributes( $text, $element ) {
 645                 if( trim( $text ) == '' ) {
 646                         return '';
 647                 }
 648
 649                 $stripped = Sanitizer::validateTagAttributes(
 650                         Sanitizer::decodeTagAttributes( $text ), $element );
 651
 652                 $attribs = array();
 653                 foreach( $stripped as $attribute => $value ) {
 654                         $encAttribute = htmlspecialchars( $attribute );
 655                         $encValue = Sanitizer::safeEncodeAttribute( $value );
 656
 657                         $attribs[] = "$encAttribute=\"$encValue\"";
 658                 }
 659                 return count( $attribs ) ? ' ' . implode( ' ', $attribs ) : '';
 660         }
 661
 662         /**
 663          * Encode an attribute value for HTML output.
 664          * @param $text
 665          * @return HTML-encoded text fragment
 666          */
 667         static function encodeAttribute( $text ) {
 668                 $encValue = htmlspecialchars( $text );
 669
 670                 // Whitespace is normalized during attribute decoding,
 671                 // so if we've been passed non-spaces we must encode them
 672                 // ahead of time or they won't be preserved.
 673                 $encValue = strtr( $encValue, array(
 674                         "\n" => '&#10;',
 675                         "\r" => '&#13;',
 676                         "\t" => '&#9;',
 677                 ) );
 678
 679                 return $encValue;
 680         }
 681
 682         /**
 683          * Encode an attribute value for HTML tags, with extra armoring
 684          * against further wiki processing.
 685          * @param $text
 686          * @return HTML-encoded text fragment
 687          */
 688         static function safeEncodeAttribute( $text ) {
 689                 $encValue = Sanitizer::encodeAttribute( $text );
 690
 691                 # Templates and links may be expanded in later parsing,
 692                 # creating invalid or dangerous output. Suppress this.
 693                 $encValue = strtr( $encValue, array(
 694                         '<'    => '&lt;',   // This should never happen,
 695                         '>'    => '&gt;',   // we've received invalid input
 696                         '"'    => '&quot;', // which should have been escaped.
 697                         '{'    => '&#123;',
 698                         '['    => '&#91;',
 699                         "''"   => '&#39;&#39;',
 700                         'ISBN' => '&#73;SBN',
 701                         'RFC'  => '&#82;FC',
 702                         'PMID' => '&#80;MID',
 703                         '|'    => '&#124;',
 704                         '__'   => '&#95;_',
 705                 ) );
 706
 707                 # Stupid hack
 708                 $encValue = preg_replace_callback(
 709                         '/(' . wfUrlProtocols() . ')/',
 710                         array( 'Sanitizer', 'armorLinksCallback' ),
 711                         $encValue );
 712                 return $encValue;
 713         }
 714
 715         /**
 716          * Given a value escape it so that it can be used in an id attribute and
 717          * return it, this does not validate the value however (see first link)
 718          *
 719          * @link http://www.w3.org/TR/html401/types.html#type-name Valid characters
 720          *                                                          in the id and
 721          *                                                          name attributes
 722          * @link http://www.w3.org/TR/html401/struct/links.html#h-12.2.3 Anchors with the id attribute
 723          *
 724          * @bug 4461
 725          *
 726          * @static
 727          *
 728          * @param string $id
 729          * @return string
 730          */
 731         static function escapeId( $id ) {
 732                 static $replace = array(
 733                         '%3A' => ':',
 734                         '%' => '.'
 735                 );
 736
 737                 $id = urlencode( Sanitizer::decodeCharReferences( strtr( $id, ' ', '_' ) ) );
 738
 739                 return str_replace( array_keys( $replace ), array_values( $replace ), $id );
 740         }
 741
 742         /**
 743          * Given a value, escape it so that it can be used as a CSS class and
 744          * return it.
 745          *
 746          * @todo For extra validity, input should be validated UTF-8.
 747          *
 748          * @link http://www.w3.org/TR/CSS21/syndata.html Valid characters/format
 749          *
 750          * @param string $class
 751          * @return string
 752          */
 753         static function escapeClass( $class ) {
 754                 // Convert ugly stuff to underscores and kill underscores in ugly places
 755                 return rtrim(preg_replace(
 756                         array('/(^[0-9\\-])|[\\x00-\\x20!"#$%&\'()*+,.\\/:;<=>?@[\\]^`{|}~]|\\xC2\\xA0/','/_+/'),
 757                         '_',
 758                         $class ), '_');
 759         }
 760
 761         /**
 762          * Regex replace callback for armoring links against further processing.
 763          * @param array $matches
 764          * @return string
 765          * @private
 766          */
 767         private static function armorLinksCallback( $matches ) {
 768                 return str_replace( ':', '&#58;', $matches[1] );
 769         }
 770
 771         /**
 772          * Return an associative array of attribute names and values from
 773          * a partial tag string. Attribute names are forces to lowercase,
 774          * character references are decoded to UTF-8 text.
 775          *
 776          * @param string
 777          * @return array
 778          */
 779         static function decodeTagAttributes( $text ) {
 780                 $attribs = array();
 781
 782                 if( trim( $text ) == '' ) {
 783                         return $attribs;
 784                 }
 785
 786                 $pairs = array();
 787                 if( !preg_match_all(
 788                         MW_ATTRIBS_REGEX,
 789                         $text,
 790                         $pairs,
 791                         PREG_SET_ORDER ) ) {
 792                         return $attribs;
 793                 }
 794
 795                 foreach( $pairs as $set ) {
 796                         $attribute = strtolower( $set[1] );
 797                         $value = Sanitizer::getTagAttributeCallback( $set );
 798
 799                         // Normalize whitespace
 800                         $value = preg_replace( '/[\t\r\n ]+/', ' ', $value );
 801                         $value = trim( $value );
 802
 803                         // Decode character references
 804                         $attribs[$attribute] = Sanitizer::decodeCharReferences( $value );
 805                 }
 806                 return $attribs;
 807         }
 808
 809         /**
 810          * Pick the appropriate attribute value from a match set from the
 811          * MW_ATTRIBS_REGEX matches.
 812          *
 813          * @param array $set
 814          * @return string
 815          * @private
 816          */
 817         private static function getTagAttributeCallback( $set ) {
 818                 if( isset( $set[6] ) ) {
 819                         # Illegal #XXXXXX color with no quotes.
 820                         return $set[6];
 821                 } elseif( isset( $set[5] ) ) {
 822                         # No quotes.
 823                         return $set[5];
 824                 } elseif( isset( $set[4] ) ) {
 825                         # Single-quoted
 826                         return $set[4];
 827                 } elseif( isset( $set[3] ) ) {
 828                         # Double-quoted
 829                         return $set[3];
 830                 } elseif( !isset( $set[2] ) ) {
 831                         # In XHTML, attributes must have a value.
 832                         # For 'reduced' form, return explicitly the attribute name here.
 833                         return $set[1];
 834                 } else {
 835                         throw new MWException( "Tag conditions not met. This should never happen and is a bug." );
 836                 }
 837         }
 838
 839         /**
 840          * Normalize whitespace and character references in an XML source-
 841          * encoded text for an attribute value.
 842          *
 843          * See http://www.w3.org/TR/REC-xml/#AVNormalize for background,
 844          * but note that we're not returning the value, but are returning
 845          * XML source fragments that will be slapped into output.
 846          *
 847          * @param string $text
 848          * @return string
 849          * @private
 850          */
 851         private static function normalizeAttributeValue( $text ) {
 852                 return str_replace( '"', '&quot;',
 853                         preg_replace(
 854                                 '/\r\n|[\x20\x0d\x0a\x09]/',
 855                                 ' ',
 856                                 Sanitizer::normalizeCharReferences( $text ) ) );
 857         }
 858
 859         /**
 860          * Ensure that any entities and character references are legal
 861          * for XML and XHTML specifically. Any stray bits will be
 862          * &amp;-escaped to result in a valid text fragment.
 863          *
 864          * a. any named char refs must be known in XHTML
 865          * b. any numeric char refs must be legal chars, not invalid or forbidden
 866          * c. use &#x, not &#X
 867          * d. fix or reject non-valid attributes
 868          *
 869          * @param string $text
 870          * @return string
 871          * @private
 872          */
 873         static function normalizeCharReferences( $text ) {
 874                 return preg_replace_callback(
 875                         MW_CHAR_REFS_REGEX,
 876                         array( 'Sanitizer', 'normalizeCharReferencesCallback' ),
 877                         $text );
 878         }
 879         /**
 880          * @param string $matches
 881          * @return string
 882          */
 883         static function normalizeCharReferencesCallback( $matches ) {
 884                 $ret = null;
 885                 if( $matches[1] != '' ) {
 886                         $ret = Sanitizer::normalizeEntity( $matches[1] );
 887                 } elseif( $matches[2] != '' ) {
 888                         $ret = Sanitizer::decCharReference( $matches[2] );
 889                 } elseif( $matches[3] != ''  ) {
 890                         $ret = Sanitizer::hexCharReference( $matches[3] );
 891                 } elseif( $matches[4] != '' ) {
 892                         $ret = Sanitizer::hexCharReference( $matches[4] );
 893                 }
 894                 if( is_null( $ret ) ) {
 895                         return htmlspecialchars( $matches[0] );
 896                 } else {
 897                         return $ret;
 898                 }
 899         }
 900
 901         /**
 902          * If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD,
 903          * return the named entity reference as is. Otherwise, returns
 904          * HTML-escaped text of pseudo-entity source (eg &amp;foo;)
 905          *
 906          * @param string $name
 907          * @return string
 908          * @static
 909          */
 910         static function normalizeEntity( $name ) {
 911                 global $wgHtmlEntities;
 912                 if( isset( $wgHtmlEntities[$name] ) ) {
 913                         return "&$name;";
 914                 } else {
 915                         return "&amp;$name;";
 916                 }
 917         }
 918
 919         static function decCharReference( $codepoint ) {
 920                 $point = intval( $codepoint );
 921                 if( Sanitizer::validateCodepoint( $point ) ) {
 922                         return sprintf( '&#%d;', $point );
 923                 } else {
 924                         return null;
 925                 }
 926         }
 927
 928         static function hexCharReference( $codepoint ) {
 929                 $point = hexdec( $codepoint );
 930                 if( Sanitizer::validateCodepoint( $point ) ) {
 931                         return sprintf( '&#x%x;', $point );
 932                 } else {
 933                         return null;
 934                 }
 935         }
 936
 937         /**
 938          * Returns true if a given Unicode codepoint is a valid character in XML.
 939          * @param int $codepoint
 940          * @return bool
 941          */
 942         private static function validateCodepoint( $codepoint ) {
 943                 return ($codepoint ==    0x09)
 944                         || ($codepoint ==    0x0a)
 945                         || ($codepoint ==    0x0d)
 946                         || ($codepoint >=    0x20 && $codepoint <=   0xd7ff)
 947                         || ($codepoint >=  0xe000 && $codepoint <=   0xfffd)
 948                         || ($codepoint >= 0x10000 && $codepoint <= 0x10ffff);
 949         }
 950
 951         /**
 952          * Decode any character references, numeric or named entities,
 953          * in the text and return a UTF-8 string.
 954          *
 955          * @param string $text
 956          * @return string
 957          * @public
 958          * @static
 959          */
 960         public static function decodeCharReferences( $text ) {
 961                 return preg_replace_callback(
 962                         MW_CHAR_REFS_REGEX,
 963                         array( 'Sanitizer', 'decodeCharReferencesCallback' ),
 964                         $text );
 965         }
 966
 967         /**
 968          * @param string $matches
 969          * @return string
 970          */
 971         static function decodeCharReferencesCallback( $matches ) {
 972                 if( $matches[1] != '' ) {
 973                         return Sanitizer::decodeEntity( $matches[1] );
 974                 } elseif( $matches[2] != '' ) {
 975                         return  Sanitizer::decodeChar( intval( $matches[2] ) );
 976                 } elseif( $matches[3] != ''  ) {
 977                         return  Sanitizer::decodeChar( hexdec( $matches[3] ) );
 978                 } elseif( $matches[4] != '' ) {
 979                         return  Sanitizer::decodeChar( hexdec( $matches[4] ) );
 980                 }
 981                 # Last case should be an ampersand by itself
 982                 return $matches[0];
 983         }
 984
 985         /**
 986          * Return UTF-8 string for a codepoint if that is a valid
 987          * character reference, otherwise U+FFFD REPLACEMENT CHARACTER.
 988          * @param int $codepoint
 989          * @return string
 990          * @private
 991          */
 992         static function decodeChar( $codepoint ) {
 993                 if( Sanitizer::validateCodepoint( $codepoint ) ) {
 994                         return codepointToUtf8( $codepoint );
 995                 } else {
 996                         return UTF8_REPLACEMENT;
 997                 }
 998         }
 999
1000         /**
1001          * If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD,
1002          * return the UTF-8 encoding of that character. Otherwise, returns
1003          * pseudo-entity source (eg &foo;)
1004          *
1005          * @param string $name
1006          * @return string
1007          */
1008         static function decodeEntity( $name ) {
1009                 global $wgHtmlEntities;
1010                 if( isset( $wgHtmlEntities[$name] ) ) {
1011                         return codepointToUtf8( $wgHtmlEntities[$name] );
1012                 } else {
1013                         return "&$name;";
1014                 }
1015         }
1016
1017         /**
1018          * Fetch the whitelist of acceptable attributes for a given
1019          * element name.
1020          *
1021          * @param string $element
1022          * @return array
1023          */
1024         static function attributeWhitelist( $element ) {
1025                 static $list;
1026                 if( !isset( $list ) ) {
1027                         $list = Sanitizer::setupAttributeWhitelist();
1028                 }
1029                 return isset( $list[$element] )
1030                         ? $list[$element]
1031                         : array();
1032         }
1033
1034         /**
1035          * @todo Document it a bit
1036          * @return array
1037          */
1038         static function setupAttributeWhitelist() {
1039                 $common = array( 'id', 'class', 'lang', 'dir', 'title', 'style' );
1040                 $block = array_merge( $common, array( 'align' ) );
1041                 $tablealign = array( 'align', 'char', 'charoff', 'valign' );
1042                 $tablecell = array( 'abbr',
1043                                     'axis',
1044                                     'headers',
1045                                     'scope',
1046                                     'rowspan',
1047                                     'colspan',
1048                                     'nowrap', # deprecated
1049                                     'width',  # deprecated
1050                                     'height', # deprecated
1051                                     'bgcolor' # deprecated
1052                                     );
1053
1054                 # Numbers refer to sections in HTML 4.01 standard describing the element.
1055                 # See: http://www.w3.org/TR/html4/
1056                 $whitelist = array (
1057                         # 7.5.4
1058                         'div'        => $block,
1059                         'center'     => $common, # deprecated
1060                         'span'       => $block, # ??
1061
1062                         # 7.5.5
1063                         'h1'         => $block,
1064                         'h2'         => $block,
1065                         'h3'         => $block,
1066                         'h4'         => $block,
1067                         'h5'         => $block,
1068                         'h6'         => $block,
1069
1070                         # 7.5.6
1071                         # address
1072
1073                         # 8.2.4
1074                         # bdo
1075
1076                         # 9.2.1
1077                         'em'         => $common,
1078                         'strong'     => $common,
1079                         'cite'       => $common,
1080                         # dfn
1081                         'code'       => $common,
1082                         # samp
1083                         # kbd
1084                         'var'        => $common,
1085                         # abbr
1086                         # acronym
1087
1088                         # 9.2.2
1089                         'blockquote' => array_merge( $common, array( 'cite' ) ),
1090                         # q
1091
1092                         # 9.2.3
1093                         'sub'        => $common,
1094                         'sup'        => $common,
1095
1096                         # 9.3.1
1097                         'p'          => $block,
1098
1099                         # 9.3.2
1100                         'br'         => array( 'id', 'class', 'title', 'style', 'clear' ),
1101
1102                         # 9.3.4
1103                         'pre'        => array_merge( $common, array( 'width' ) ),
1104
1105                         # 9.4
1106                         'ins'        => array_merge( $common, array( 'cite', 'datetime' ) ),
1107                         'del'        => array_merge( $common, array( 'cite', 'datetime' ) ),
1108
1109                         # 10.2
1110                         'ul'         => array_merge( $common, array( 'type' ) ),
1111                         'ol'         => array_merge( $common, array( 'type', 'start' ) ),
1112                         'li'         => array_merge( $common, array( 'type', 'value' ) ),
1113
1114                         # 10.3
1115                         'dl'         => $common,
1116                         'dd'         => $common,
1117                         'dt'         => $common,
1118
1119                         # 11.2.1
1120                         'table'      => array_merge( $common,
1121                                                                 array( 'summary', 'width', 'border', 'frame',
1122                                                                                 'rules', 'cellspacing', 'cellpadding',
1123                                                                                 'align', 'bgcolor',
1124                                                                 ) ),
1125
1126                         # 11.2.2
1127                         'caption'    => array_merge( $common, array( 'align' ) ),
1128
1129                         # 11.2.3
1130                         'thead'      => array_merge( $common, $tablealign ),
1131                         'tfoot'      => array_merge( $common, $tablealign ),
1132                         'tbody'      => array_merge( $common, $tablealign ),
1133
1134                         # 11.2.4
1135                         'colgroup'   => array_merge( $common, array( 'span', 'width' ), $tablealign ),
1136                         'col'        => array_merge( $common, array( 'span', 'width' ), $tablealign ),
1137
1138                         # 11.2.5
1139                         'tr'         => array_merge( $common, array( 'bgcolor' ), $tablealign ),
1140
1141                         # 11.2.6
1142                         'td'         => array_merge( $common, $tablecell, $tablealign ),
1143                         'th'         => array_merge( $common, $tablecell, $tablealign ),
1144
1145                         # 15.2.1
1146                         'tt'         => $common,
1147                         'b'          => $common,
1148                         'i'          => $common,
1149                         'big'        => $common,
1150                         'small'      => $common,
1151                         'strike'     => $common,
1152                         's'          => $common,
1153                         'u'          => $common,
1154
1155                         # 15.2.2
1156                         'font'       => array_merge( $common, array( 'size', 'color', 'face' ) ),
1157                         # basefont
1158
1159                         # 15.3
1160                         'hr'         => array_merge( $common, array( 'noshade', 'size', 'width' ) ),
1161
1162                         # XHTML Ruby annotation text module, simple ruby only.
1163                         # http://www.w3c.org/TR/ruby/
1164                         'ruby'       => $common,
1165                         # rbc
1166                         # rtc
1167                         'rb'         => $common,
1168                         'rt'         => $common, #array_merge( $common, array( 'rbspan' ) ),
1169                         'rp'         => $common,
1170                         );
1171                 return $whitelist;
1172         }
1173
1174         /**
1175          * Take a fragment of (potentially invalid) HTML and return
1176          * a version with any tags removed, encoded suitably for literal
1177          * inclusion in an attribute value.
1178          *
1179          * @param string $text HTML fragment
1180          * @return string
1181          */
1182         static function stripAllTags( $text ) {
1183                 # Actual <tags>
1184                 $text = StringUtils::delimiterReplace( '<', '>', '', $text );
1185
1186                 # Normalize &entities and whitespace
1187                 $text = Sanitizer::normalizeAttributeValue( $text );
1188
1189                 # Will be placed into "double-quoted" attributes,
1190                 # make sure remaining bits are safe.
1191                 $text = str_replace(
1192                         array('<', '>', '"'),
1193                         array('&lt;', '&gt;', '&quot;'),
1194                         $text );
1195
1196                 return $text;
1197         }
1198
1199         /**
1200          * Hack up a private DOCTYPE with HTML's standard entity declarations.
1201          * PHP 4 seemed to know these if you gave it an HTML doctype, but
1202          * PHP 5.1 doesn't.
1203          *
1204          * Use for passing XHTML fragments to PHP's XML parsing functions
1205          *
1206          * @return string
1207          * @static
1208          */
1209         static function hackDocType() {
1210                 global $wgHtmlEntities;
1211                 $out = "<!DOCTYPE html [\n";
1212                 foreach( $wgHtmlEntities as $entity => $codepoint ) {
1213                         $out .= "<!ENTITY $entity \"&#$codepoint;\">";
1214                 }
1215                 $out .= "]>\n";
1216                 return $out;
1217         }
1218
1219         static function cleanUrl( $url, $hostname=true ) {
1220                 # Normalize any HTML entities in input. They will be
1221                 # re-escaped by makeExternalLink().
1222                 $url = Sanitizer::decodeCharReferences( $url );
1223
1224                 # Escape any control characters introduced by the above step
1225                 $url = preg_replace( '/[\][<>"\\x00-\\x20\\x7F]/e', "urlencode('\\0')", $url );
1226
1227                 # Validate hostname portion
1228                 $matches = array();
1229                 if( preg_match( '!^([^:]+:)(//[^/]+)?(.*)$!iD', $url, $matches ) ) {
1230                         list( /* $whole */, $protocol, $host, $rest ) = $matches;
1231
1232                         // Characters that will be ignored in IDNs.
1233                         // http://tools.ietf.org/html/3454#section-3.1
1234                         // Strip them before further processing so blacklists and such work.
1235                         $strip = "/
1236                                 \\s|          # general whitespace
1237                                 \xc2\xad|     # 00ad SOFT HYPHEN
1238                                 \xe1\xa0\x86| # 1806 MONGOLIAN TODO SOFT HYPHEN
1239                                 \xe2\x80\x8b| # 200b ZERO WIDTH SPACE
1240                                 \xe2\x81\xa0| # 2060 WORD JOINER
1241                                 \xef\xbb\xbf| # feff ZERO WIDTH NO-BREAK SPACE
1242                                 \xcd\x8f|     # 034f COMBINING GRAPHEME JOINER
1243                                 \xe1\xa0\x8b| # 180b MONGOLIAN FREE VARIATION SELECTOR ONE
1244                                 \xe1\xa0\x8c| # 180c MONGOLIAN FREE VARIATION SELECTOR TWO
1245                                 \xe1\xa0\x8d| # 180d MONGOLIAN FREE VARIATION SELECTOR THREE
1246                                 \xe2\x80\x8c| # 200c ZERO WIDTH NON-JOINER
1247                                 \xe2\x80\x8d| # 200d ZERO WIDTH JOINER
1248                                 [\xef\xb8\x80-\xef\xb8\x8f] # fe00-fe00f VARIATION SELECTOR-1-16
1249                                 /xuD";
1250
1251                         $host = preg_replace( $strip, '', $host );
1252
1253                         // @fixme: validate hostnames here
1254
1255                         return $protocol . $host . $rest;
1256                 } else {
1257                         return $url;
1258                 }
1259         }
1260
1261 }
1262
1263 ?>