includes/Sanitizer.php

   1 <?php
   2 /**
   3  * XHTML sanitizer for MediaWiki
   4  *
   5  * Copyright (C) 2002-2005 Brion Vibber <brion@pobox.com> et al
   6  * http://www.mediawiki.org/
   7  *
   8  * This program is free software; you can redistribute it and/or modify
   9  * it under the terms of the GNU General Public License as published by
  10  * the Free Software Foundation; either version 2 of the License, or
  11  * (at your option) any later version.
  12  *
  13  * This program is distributed in the hope that it will be useful,
  14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  16  * GNU General Public License for more details.
  17  *
  18  * You should have received a copy of the GNU General Public License along
  19  * with this program; if not, write to the Free Software Foundation, Inc.,
  20  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  21  * http://www.gnu.org/copyleft/gpl.html
  22  *
  23  * @package MediaWiki
  24  * @subpackage Parser
  25  */
  26
  27 /**
  28  * Regular expression to match various types of character references in
  29  * Sanitizer::normalizeCharReferences and Sanitizer::decodeCharReferences
  30  */
  31 define( 'MW_CHAR_REFS_REGEX',
  32         '/&([A-Za-z0-9]+);
  33          |&\#([0-9]+);
  34          |&\#x([0-9A-Za-z]+);
  35          |&\#X([0-9A-Za-z]+);
  36          |(&)/x' );
  37
  38 /**
  39  * Regular expression to match HTML/XML attribute pairs within a tag.
  40  * Allows some... latitude.
  41  * Used in Sanitizer::fixTagAttributes and Sanitizer::decodeTagAttributes
  42  */
# Pattern fragments interpolated into MW_ATTRIBS_REGEX below:
# $attrib is one character of an attribute name, $space one whitespace
# character (tab, LF, CR or space).
$attrib = '[A-Za-z0-9]';
$space = '[\x09\x0a\x0d\x20]';
# Capture groups of the compiled pattern:
#   1: attribute name
#   2: the optional "= value" part, including surrounding whitespace
#   3: value given in double quotes (quotes excluded)
#   4: value given in single quotes (quotes excluded)
#   5: unquoted value
#   6: unquoted "#hex" colour value
# An attribute must start at the beginning of the subject or after
# whitespace, and must be followed by whitespace or the end.
# NOTE(review): the character class of group 5 already contains '#', so
# group 6 looks unreachable in practice -- confirm before relying on it.
define( 'MW_ATTRIBS_REGEX',
        "/(?:^|$space)($attrib+)
          ($space*=$space*
                (?:
                 # The attribute value: quoted or alone
                  \"([^<\"]*)\"
                 | '([^<']*)'
                 |  ([a-zA-Z0-9!#$%&()*,\\-.\\/:;<>?@[\\]^_`{|}~]+)
                 |  (\#[0-9a-fA-F]+) # Technically wrong, but lots of
                                                         # colors are specified like this.
                                                         # We'll be normalizing it.
                )
           )?(?=$space|\$)/sx" );
  58
  59 /**
  60  * List of all named character entities defined in HTML 4.01
  61  * http://www.w3.org/TR/html4/sgml/entities.html
  62  * @private
  63  */
# Declared global explicitly; presumably so the table still lands in the
# global scope when this file is included from inside a function.
global $wgHtmlEntities;
# Maps each entity name (case-sensitive, without '&' and ';') to the
# decimal Unicode code point it stands for, e.g. 'amp' => 38.
$wgHtmlEntities = array(
        'Aacute'   => 193,
        'aacute'   => 225,
        'Acirc'    => 194,
        'acirc'    => 226,
        'acute'    => 180,
        'AElig'    => 198,
        'aelig'    => 230,
        'Agrave'   => 192,
        'agrave'   => 224,
        'alefsym'  => 8501,
        'Alpha'    => 913,
        'alpha'    => 945,
        'amp'      => 38,
        'and'      => 8743,
        'ang'      => 8736,
        'Aring'    => 197,
        'aring'    => 229,
        'asymp'    => 8776,
        'Atilde'   => 195,
        'atilde'   => 227,
        'Auml'     => 196,
        'auml'     => 228,
        'bdquo'    => 8222,
        'Beta'     => 914,
        'beta'     => 946,
        'brvbar'   => 166,
        'bull'     => 8226,
        'cap'      => 8745,
        'Ccedil'   => 199,
        'ccedil'   => 231,
        'cedil'    => 184,
        'cent'     => 162,
        'Chi'      => 935,
        'chi'      => 967,
        'circ'     => 710,
        'clubs'    => 9827,
        'cong'     => 8773,
        'copy'     => 169,
        'crarr'    => 8629,
        'cup'      => 8746,
        'curren'   => 164,
        'dagger'   => 8224,
        'Dagger'   => 8225,
        'darr'     => 8595,
        'dArr'     => 8659,
        'deg'      => 176,
        'Delta'    => 916,
        'delta'    => 948,
        'diams'    => 9830,
        'divide'   => 247,
        'Eacute'   => 201,
        'eacute'   => 233,
        'Ecirc'    => 202,
        'ecirc'    => 234,
        'Egrave'   => 200,
        'egrave'   => 232,
        'empty'    => 8709,
        'emsp'     => 8195,
        'ensp'     => 8194,
        'Epsilon'  => 917,
        'epsilon'  => 949,
        'equiv'    => 8801,
        'Eta'      => 919,
        'eta'      => 951,
        'ETH'      => 208,
        'eth'      => 240,
        'Euml'     => 203,
        'euml'     => 235,
        'euro'     => 8364,
        'exist'    => 8707,
        'fnof'     => 402,
        'forall'   => 8704,
        'frac12'   => 189,
        'frac14'   => 188,
        'frac34'   => 190,
        'frasl'    => 8260,
        'Gamma'    => 915,
        'gamma'    => 947,
        'ge'       => 8805,
        'gt'       => 62,
        'harr'     => 8596,
        'hArr'     => 8660,
        'hearts'   => 9829,
        'hellip'   => 8230,
        'Iacute'   => 205,
        'iacute'   => 237,
        'Icirc'    => 206,
        'icirc'    => 238,
        'iexcl'    => 161,
        'Igrave'   => 204,
        'igrave'   => 236,
        'image'    => 8465,
        'infin'    => 8734,
        'int'      => 8747,
        'Iota'     => 921,
        'iota'     => 953,
        'iquest'   => 191,
        'isin'     => 8712,
        'Iuml'     => 207,
        'iuml'     => 239,
        'Kappa'    => 922,
        'kappa'    => 954,
        'Lambda'   => 923,
        'lambda'   => 955,
        'lang'     => 9001,
        'laquo'    => 171,
        'larr'     => 8592,
        'lArr'     => 8656,
        'lceil'    => 8968,
        'ldquo'    => 8220,
        'le'       => 8804,
        'lfloor'   => 8970,
        'lowast'   => 8727,
        'loz'      => 9674,
        'lrm'      => 8206,
        'lsaquo'   => 8249,
        'lsquo'    => 8216,
        'lt'       => 60,
        'macr'     => 175,
        'mdash'    => 8212,
        'micro'    => 181,
        'middot'   => 183,
        'minus'    => 8722,
        'Mu'       => 924,
        'mu'       => 956,
        'nabla'    => 8711,
        'nbsp'     => 160,
        'ndash'    => 8211,
        'ne'       => 8800,
        'ni'       => 8715,
        'not'      => 172,
        'notin'    => 8713,
        'nsub'     => 8836,
        'Ntilde'   => 209,
        'ntilde'   => 241,
        'Nu'       => 925,
        'nu'       => 957,
        'Oacute'   => 211,
        'oacute'   => 243,
        'Ocirc'    => 212,
        'ocirc'    => 244,
        'OElig'    => 338,
        'oelig'    => 339,
        'Ograve'   => 210,
        'ograve'   => 242,
        'oline'    => 8254,
        'Omega'    => 937,
        'omega'    => 969,
        'Omicron'  => 927,
        'omicron'  => 959,
        'oplus'    => 8853,
        'or'       => 8744,
        'ordf'     => 170,
        'ordm'     => 186,
        'Oslash'   => 216,
        'oslash'   => 248,
        'Otilde'   => 213,
        'otilde'   => 245,
        'otimes'   => 8855,
        'Ouml'     => 214,
        'ouml'     => 246,
        'para'     => 182,
        'part'     => 8706,
        'permil'   => 8240,
        'perp'     => 8869,
        'Phi'      => 934,
        'phi'      => 966,
        'Pi'       => 928,
        'pi'       => 960,
        'piv'      => 982,
        'plusmn'   => 177,
        'pound'    => 163,
        'prime'    => 8242,
        'Prime'    => 8243,
        'prod'     => 8719,
        'prop'     => 8733,
        'Psi'      => 936,
        'psi'      => 968,
        'quot'     => 34,
        'radic'    => 8730,
        'rang'     => 9002,
        'raquo'    => 187,
        'rarr'     => 8594,
        'rArr'     => 8658,
        'rceil'    => 8969,
        'rdquo'    => 8221,
        'real'     => 8476,
        'reg'      => 174,
        'rfloor'   => 8971,
        'Rho'      => 929,
        'rho'      => 961,
        'rlm'      => 8207,
        'rsaquo'   => 8250,
        'rsquo'    => 8217,
        'sbquo'    => 8218,
        'Scaron'   => 352,
        'scaron'   => 353,
        'sdot'     => 8901,
        'sect'     => 167,
        'shy'      => 173,
        'Sigma'    => 931,
        'sigma'    => 963,
        'sigmaf'   => 962,
        'sim'      => 8764,
        'spades'   => 9824,
        'sub'      => 8834,
        'sube'     => 8838,
        'sum'      => 8721,
        'sup'      => 8835,
        'sup1'     => 185,
        'sup2'     => 178,
        'sup3'     => 179,
        'supe'     => 8839,
        'szlig'    => 223,
        'Tau'      => 932,
        'tau'      => 964,
        'there4'   => 8756,
        'Theta'    => 920,
        'theta'    => 952,
        'thetasym' => 977,
        'thinsp'   => 8201,
        'THORN'    => 222,
        'thorn'    => 254,
        'tilde'    => 732,
        'times'    => 215,
        'trade'    => 8482,
        'Uacute'   => 218,
        'uacute'   => 250,
        'uarr'     => 8593,
        'uArr'     => 8657,
        'Ucirc'    => 219,
        'ucirc'    => 251,
        'Ugrave'   => 217,
        'ugrave'   => 249,
        'uml'      => 168,
        'upsih'    => 978,
        'Upsilon'  => 933,
        'upsilon'  => 965,
        'Uuml'     => 220,
        'uuml'     => 252,
        'weierp'   => 8472,
        'Xi'       => 926,
        'xi'       => 958,
        'Yacute'   => 221,
        'yacute'   => 253,
        'yen'      => 165,
        'Yuml'     => 376,
        'yuml'     => 255,
        'Zeta'     => 918,
        'zeta'     => 950,
        'zwj'      => 8205,
        'zwnj'     => 8204 );
 318
 319 /** @package MediaWiki */
 320 class Sanitizer {
 321         /**
 322          * Cleans up HTML, removes dangerous tags and attributes, and
 323          * removes HTML comments
 324          * @private
 325          * @param string $text
 326          * @param callback $processCallback to do any variable or parameter replacements in HTML attribute values
 327          * @param array $args for the processing callback
 328          * @return string
 329          */
 330         static function removeHTMLtags( $text, $processCallback = null, $args = array() ) {
 331                 global $wgUseTidy, $wgUserHtml;
 332
 333                 static $htmlpairs, $htmlsingle, $htmlsingleonly, $htmlnest, $tabletags,
 334                         $htmllist, $listtags, $htmlsingleallowed, $htmlelements, $staticInitialised;
 335
 336                 wfProfileIn( __METHOD__ );
 337
 338                 if ( !$staticInitialised ) {
 339                         if( $wgUserHtml ) {
 340                                 $htmlpairs = array( # Tags that must be closed
 341                                         'b', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
 342                                         'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
 343                                         'strike', 'strong', 'tt', 'var', 'div', 'center',
 344                                         'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
 345                                         'ruby', 'rt' , 'rb' , 'rp', 'p', 'span', 'u'
 346                                 );
 347                                 $htmlsingle = array(
 348                                         'br', 'hr', 'li', 'dt', 'dd'
 349                                 );
 350                                 $htmlsingleonly = array( # Elements that cannot have close tags
 351                                         'br', 'hr'
 352                                 );
 353                                 $htmlnest = array( # Tags that can be nested--??
 354                                         'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
 355                                         'dl', 'font', 'big', 'small', 'sub', 'sup', 'span'
 356                                 );
 357                                 $tabletags = array( # Can only appear inside table
 358                                         'td', 'th', 'tr',
 359                                 );
 360                                 $htmllist = array( # Tags used by list
 361                                         'ul','ol',
 362                                 );
 363                                 $listtags = array( # Tags that can appear in a list
 364                                         'li',
 365                                 );
 366
 367                         } else {
 368                                 $htmlpairs = array();
 369                                 $htmlsingle = array();
 370                                 $htmlnest = array();
 371                                 $tabletags = array();
 372                         }
 373
 374                         $htmlsingleallowed = array_merge( $htmlsingle, $tabletags );
 375                         $htmlelements = array_merge( $htmlsingle, $htmlpairs, $htmlnest );
 376
 377                         # Convert them all to hashtables for faster lookup
 378                         $vars = array( 'htmlpairs', 'htmlsingle', 'htmlsingleonly', 'htmlnest', 'tabletags',
 379                                 'htmllist', 'listtags', 'htmlsingleallowed', 'htmlelements' );
 380                         foreach ( $vars as $var ) {
 381                                 $$var = array_flip( $$var );
 382                         }
 383                         $staticInitialised = true;
 384                 }
 385
 386                 # Remove HTML comments
 387                 $text = Sanitizer::removeHTMLcomments( $text );
 388                 $bits = explode( '<', $text );
 389                 $text = array_shift( $bits );
 390                 if(!$wgUseTidy) {
 391                         $tagstack = $tablestack = array();
 392                         foreach ( $bits as $x ) {
 393                                 $prev = error_reporting( E_ALL & ~( E_NOTICE | E_WARNING ) );
 394                                 preg_match( '!^(/?)(\\w+)([^>]*?)(/{0,1}>)([^<]*)$!', $x, $regs );
 395                                 list( $qbar, $slash, $t, $params, $brace, $rest ) = $regs;
 396                                 error_reporting( $prev );
 397
 398                                 $badtag = 0 ;
 399                                 if ( isset( $htmlelements[$t = strtolower( $t )] ) ) {
 400                                         # Check our stack
 401                                         if ( $slash ) {
 402                                                 # Closing a tag...
 403                                                 if( isset( $htmlsingleonly[$t] ) ) {
 404                                                         $badtag = 1;
 405                                                 } elseif ( ( $ot = @array_pop( $tagstack ) ) != $t ) {
 406                                                         if ( isset( $htmlsingleallowed[$ot] ) ) {
 407                                                                 # Pop all elements with an optional close tag
 408                                                                 # and see if we find a match below them
 409                                                                 $optstack = array();
 410                                                                 array_push ($optstack, $ot);
 411                                                                 while ( ( ( $ot = @array_pop( $tagstack ) ) != $t ) &&
 412                                                                                 isset( $htmlsingleallowed[$ot] ) )
 413                                                                 {
 414                                                                         array_push ($optstack, $ot);
 415                                                                 }
 416                                                                 if ( $t != $ot ) {
 417                                                                         # No match. Push the optinal elements back again
 418                                                                         $badtag = 1;
 419                                                                         while ( $ot = @array_pop( $optstack ) ) {
 420                                                                                 array_push( $tagstack, $ot );
 421                                                                         }
 422                                                                 }
 423                                                         } else {
 424                                                                 @array_push( $tagstack, $ot );
 425                                                                 # <li> can be nested in <ul> or <ol>, skip those cases:
 426                                                                 if(!(isset( $htmllist[$ot] ) && isset( $listtags[$t] ) )) {
 427                                                                         $badtag = 1;
 428                                                                 }
 429                                                         }
 430                                                 } else {
 431                                                         if ( $t == 'table' ) {
 432                                                                 $tagstack = array_pop( $tablestack );
 433                                                         }
 434                                                 }
 435                                                 $newparams = '';
 436                                         } else {
 437                                                 # Keep track for later
 438                                                 if ( isset( $tabletags[$t] ) &&
 439                                                 ! in_array( 'table', $tagstack ) ) {
 440                                                         $badtag = 1;
 441                                                 } else if ( in_array( $t, $tagstack ) &&
 442                                                 ! isset( $htmlnest [$t ] ) ) {
 443                                                         $badtag = 1 ;
 444                                                 # Is it a self closed htmlpair ? (bug 5487)
 445                                                 } else if( $brace == '/>' &&
 446                                                 isset( $htmlpairs[$t] ) ) {
 447                                                         $badtag = 1;
 448                                                 } elseif( isset( $htmlsingleonly[$t] ) ) {
 449                                                         # Hack to force empty tag for uncloseable elements
 450                                                         $brace = '/>';
 451                                                 } else if( isset( $htmlsingle[$t] ) ) {
 452                                                         # Hack to not close $htmlsingle tags
 453                                                         $brace = NULL;
 454                                                 } else {
 455                                                         if ( $t == 'table' ) {
 456                                                                 array_push( $tablestack, $tagstack );
 457                                                                 $tagstack = array();
 458                                                         }
 459                                                         array_push( $tagstack, $t );
 460                                                 }
 461
 462                                                 # Replace any variables or template parameters with
 463                                                 # plaintext results.
 464                                                 if( is_callable( $processCallback ) ) {
 465                                                         call_user_func_array( $processCallback, array( &$params, $args ) );
 466                                                 }
 467
 468                                                 # Strip non-approved attributes from the tag
 469                                                 $newparams = Sanitizer::fixTagAttributes( $params, $t );
 470                                         }
 471                                         if ( ! $badtag ) {
 472                                                 $rest = str_replace( '>', '&gt;', $rest );
 473                                                 $close = ( $brace == '/>' ) ? ' /' : '';
 474                                                 $text .= "<$slash$t$newparams$close>$rest";
 475                                                 continue;
 476                                         }
 477                                 }
 478                                 $text .= '&lt;' . str_replace( '>', '&gt;', $x);
 479                         }
 480                         # Close off any remaining tags
 481                         while ( is_array( $tagstack ) && ($t = array_pop( $tagstack )) ) {
 482                                 $text .= "</$t>\n";
 483                                 if ( $t == 'table' ) { $tagstack = array_pop( $tablestack ); }
 484                         }
 485                 } else {
 486                         # this might be possible using tidy itself
 487                         foreach ( $bits as $x ) {
 488                                 preg_match( '/^(\\/?)(\\w+)([^>]*?)(\\/{0,1}>)([^<]*)$/',
 489                                 $x, $regs );
 490                                 @list( $qbar, $slash, $t, $params, $brace, $rest ) = $regs;
 491                                 if ( isset( $htmlelements[$t = strtolower( $t )] ) ) {
 492                                         if( is_callable( $processCallback ) ) {
 493                                                 call_user_func_array( $processCallback, array( &$params, $args ) );
 494                                         }
 495                                         $newparams = Sanitizer::fixTagAttributes( $params, $t );
 496                                         $rest = str_replace( '>', '&gt;', $rest );
 497                                         $text .= "<$slash$t$newparams$brace$rest";
 498                                 } else {
 499                                         $text .= '&lt;' . str_replace( '>', '&gt;', $x);
 500                                 }
 501                         }
 502                 }
 503                 wfProfileOut( __METHOD__ );
 504                 return $text;
 505         }
 506
 507         /**
 508          * Remove '<!--', '-->', and everything between.
 509          * To avoid leaving blank lines, when a comment is both preceded
 510          * and followed by a newline (ignoring spaces), trim leading and
 511          * trailing spaces and one of the newlines.
 512          *
 513          * @private
 514          * @param string $text
 515          * @return string
 516          */
 517         static function removeHTMLcomments( $text ) {
 518                 wfProfileIn( __METHOD__ );
 519                 while (($start = strpos($text, '<!--')) !== false) {
 520                         $end = strpos($text, '-->', $start + 4);
 521                         if ($end === false) {
 522                                 # Unterminated comment; bail out
 523                                 break;
 524                         }
 525
 526                         $end += 3;
 527
 528                         # Trim space and newline if the comment is both
 529                         # preceded and followed by a newline
 530                         $spaceStart = max($start - 1, 0);
 531                         $spaceLen = $end - $spaceStart;
 532                         while (substr($text, $spaceStart, 1) === ' ' && $spaceStart > 0) {
 533                                 $spaceStart--;
 534                                 $spaceLen++;
 535                         }
 536                         while (substr($text, $spaceStart + $spaceLen, 1) === ' ')
 537                                 $spaceLen++;
 538                         if (substr($text, $spaceStart, 1) === "\n" and substr($text, $spaceStart + $spaceLen, 1) === "\n") {
 539                                 # Remove the comment, leading and trailing
 540                                 # spaces, and leave only one newline.
 541                                 $text = substr_replace($text, "\n", $spaceStart, $spaceLen + 1);
 542                         }
 543                         else {
 544                                 # Remove just the comment.
 545                                 $text = substr_replace($text, '', $start, $end - $start);
 546                         }
 547                 }
 548                 wfProfileOut( __METHOD__ );
 549                 return $text;
 550         }
 551
 552         /**
 553          * Take an array of attribute names and values and normalize or discard
 554          * illegal values for the given element type.
 555          *
 556          * - Discards attributes not on a whitelist for the given element
 557          * - Unsafe style attributes are discarded
 558          *
 559          * @param array $attribs
 560          * @param string $element
 561          * @return array
 562          *
 563          * @todo Check for legal values where the DTD limits things.
 564          * @todo Check for unique id attribute :P
 565          */
 566         static function validateTagAttributes( $attribs, $element ) {
 567                 $whitelist = array_flip( Sanitizer::attributeWhitelist( $element ) );
 568                 $out = array();
 569                 foreach( $attribs as $attribute => $value ) {
 570                         if( !isset( $whitelist[$attribute] ) ) {
 571                                 continue;
 572                         }
 573                         # Strip javascript "expression" from stylesheets.
 574                         # http://msdn.microsoft.com/workshop/author/dhtml/overview/recalc.asp
 575                         if( $attribute == 'style' ) {
 576                                 $value = Sanitizer::checkCss( $value );
 577                                 if( $value === false ) {
 578                                         # haxx0r
 579                                         continue;
 580                                 }
 581                         }
 582
 583                         if ( $attribute === 'id' )
 584                                 $value = Sanitizer::escapeId( $value );
 585
 586                         // If this attribute was previously set, override it.
 587                         // Output should only have one attribute of each name.
 588                         $out[$attribute] = $value;
 589                 }
 590                 return $out;
 591         }
 592
 593         /**
 594          * Pick apart some CSS and check it for forbidden or unsafe structures.
 595          * Returns a sanitized string, or false if it was just too evil.
 596          *
         * Currently rejected: 'expression', url( ... ) references, and anything
         * containing "tp://" or "tps://" (which covers http, https and ftp URLs).
 598          *
 599          * @param string $value
 600          * @return mixed
 601          */
 602         static function checkCss( $value ) {
 603                 $stripped = Sanitizer::decodeCharReferences( $value );
 604
 605                 // Remove any comments; IE gets token splitting wrong
 606                 $stripped = preg_replace( '!/\\*.*?\\*/!S', ' ', $stripped );
 607                 $value = $stripped;
 608
 609                 // ... and continue checks
 610                 $stripped = preg_replace( '!\\\\([0-9A-Fa-f]{1,6})[ \\n\\r\\t\\f]?!e',
 611                         'codepointToUtf8(hexdec("$1"))', $stripped );
 612                 $stripped = str_replace( '\\', '', $stripped );
 613                 if( preg_match( '/(expression|tps*:\/\/|url\\s*\().*/is',
 614                                 $stripped ) ) {
 615                         # haxx0r
 616                         return false;
 617                 }
 618
 619                 return $value;
 620         }
 621
 622         /**
 623          * Take a tag soup fragment listing an HTML element's attributes
 624          * and normalize it to well-formed XML, discarding unwanted attributes.
 625          * Output is safe for further wikitext processing, with escaping of
 626          * values that could trigger problems.
 627          *
 628          * - Normalizes attribute names to lowercase
 629          * - Discards attributes not on a whitelist for the given element
 630          * - Turns broken or invalid entities into plaintext
 631          * - Double-quotes all attribute values
 632          * - Attributes without values are given the name as attribute
 633          * - Double attributes are discarded
 634          * - Unsafe style attributes are discarded
 635          * - Prepends space if there are attributes.
 636          *
 637          * @param string $text
 638          * @param string $element
 639          * @return string
 640          */
 641         static function fixTagAttributes( $text, $element ) {
 642                 if( trim( $text ) == '' ) {
 643                         return '';
 644                 }
 645
 646                 $stripped = Sanitizer::validateTagAttributes(
 647                         Sanitizer::decodeTagAttributes( $text ), $element );
 648
 649                 $attribs = array();
 650                 foreach( $stripped as $attribute => $value ) {
 651                         $encAttribute = htmlspecialchars( $attribute );
 652                         $encValue = Sanitizer::safeEncodeAttribute( $value );
 653
 654                         $attribs[] = "$encAttribute=\"$encValue\"";
 655                 }
 656                 return count( $attribs ) ? ' ' . implode( ' ', $attribs ) : '';
 657         }
 658
 659         /**
 660          * Encode an attribute value for HTML output.
 661          * @param $text
 662          * @return HTML-encoded text fragment
 663          */
 664         static function encodeAttribute( $text ) {
 665                 $encValue = htmlspecialchars( $text );
 666
 667                 // Whitespace is normalized during attribute decoding,
 668                 // so if we've been passed non-spaces we must encode them
 669                 // ahead of time or they won't be preserved.
 670                 $encValue = strtr( $encValue, array(
 671                         "\n" => '&#10;',
 672                         "\r" => '&#13;',
 673                         "\t" => '&#9;',
 674                 ) );
 675
 676                 return $encValue;
 677         }
 678
 679         /**
 680          * Encode an attribute value for HTML tags, with extra armoring
 681          * against further wiki processing.
 682          * @param $text
 683          * @return HTML-encoded text fragment
 684          */
 685         static function safeEncodeAttribute( $text ) {
 686                 $encValue = Sanitizer::encodeAttribute( $text );
 687
 688                 # Templates and links may be expanded in later parsing,
 689                 # creating invalid or dangerous output. Suppress this.
 690                 $encValue = strtr( $encValue, array(
 691                         '<'    => '&lt;',   // This should never happen,
 692                         '>'    => '&gt;',   // we've received invalid input
 693                         '"'    => '&quot;', // which should have been escaped.
 694                         '{'    => '&#123;',
 695                         '['    => '&#91;',
 696                         "''"   => '&#39;&#39;',
 697                         'ISBN' => '&#73;SBN',
 698                         'RFC'  => '&#82;FC',
 699                         'PMID' => '&#80;MID',
 700                         '|'    => '&#124;',
 701                         '__'   => '&#95;_',
 702                 ) );
 703
 704                 # Stupid hack
 705                 $encValue = preg_replace_callback(
 706                         '/(' . wfUrlProtocols() . ')/',
 707                         array( 'Sanitizer', 'armorLinksCallback' ),
 708                         $encValue );
 709                 return $encValue;
 710         }
 711
 712         /**
 713          * Given a value escape it so that it can be used in an id attribute and
 714          * return it, this does not validate the value however (see first link)
 715          *
 716          * @link http://www.w3.org/TR/html401/types.html#type-name Valid characters
 717          *                                                          in the id and
 718          *                                                          name attributes
 719          * @link http://www.w3.org/TR/html401/struct/links.html#h-12.2.3 Anchors with the id attribute
 720          *
 721          * @bug 4461
 722          *
 723          * @static
 724          *
 725          * @param string $id
 726          * @return string
 727          */
 728         static function escapeId( $id ) {
 729                 static $replace = array(
 730                         '%3A' => ':',
 731                         '%' => '.'
 732                 );
 733
 734                 $id = urlencode( Sanitizer::decodeCharReferences( strtr( $id, ' ', '_' ) ) );
 735
 736                 return str_replace( array_keys( $replace ), array_values( $replace ), $id );
 737         }
 738
 739         /**
 740          * Regex replace callback for armoring links against further processing.
 741          * @param array $matches
 742          * @return string
 743          * @private
 744          */
 745         private static function armorLinksCallback( $matches ) {
 746                 return str_replace( ':', '&#58;', $matches[1] );
 747         }
 748
 749         /**
 750          * Return an associative array of attribute names and values from
 751          * a partial tag string. Attribute names are forces to lowercase,
 752          * character references are decoded to UTF-8 text.
 753          *
 754          * @param string
 755          * @return array
 756          */
 757         static function decodeTagAttributes( $text ) {
 758                 $attribs = array();
 759
 760                 if( trim( $text ) == '' ) {
 761                         return $attribs;
 762                 }
 763
 764                 $pairs = array();
 765                 if( !preg_match_all(
 766                         MW_ATTRIBS_REGEX,
 767                         $text,
 768                         $pairs,
 769                         PREG_SET_ORDER ) ) {
 770                         return $attribs;
 771                 }
 772
 773                 foreach( $pairs as $set ) {
 774                         $attribute = strtolower( $set[1] );
 775                         $value = Sanitizer::getTagAttributeCallback( $set );
 776
 777                         // Normalize whitespace
 778                         $value = preg_replace( '/[\t\r\n ]+/', ' ', $value );
 779                         $value = trim( $value );
 780
 781                         // Decode character references
 782                         $attribs[$attribute] = Sanitizer::decodeCharReferences( $value );
 783                 }
 784                 return $attribs;
 785         }
 786
 787         /**
 788          * Pick the appropriate attribute value from a match set from the
 789          * MW_ATTRIBS_REGEX matches.
 790          *
 791          * @param array $set
 792          * @return string
 793          * @private
 794          */
 795         private static function getTagAttributeCallback( $set ) {
 796                 if( isset( $set[6] ) ) {
 797                         # Illegal #XXXXXX color with no quotes.
 798                         return $set[6];
 799                 } elseif( isset( $set[5] ) ) {
 800                         # No quotes.
 801                         return $set[5];
 802                 } elseif( isset( $set[4] ) ) {
 803                         # Single-quoted
 804                         return $set[4];
 805                 } elseif( isset( $set[3] ) ) {
 806                         # Double-quoted
 807                         return $set[3];
 808                 } elseif( !isset( $set[2] ) ) {
 809                         # In XHTML, attributes must have a value.
 810                         # For 'reduced' form, return explicitly the attribute name here.
 811                         return $set[1];
 812                 } else {
 813                         throw new MWException( "Tag conditions not met. This should never happen and is a bug." );
 814                 }
 815         }
 816
 817         /**
 818          * Normalize whitespace and character references in an XML source-
 819          * encoded text for an attribute value.
 820          *
 821          * See http://www.w3.org/TR/REC-xml/#AVNormalize for background,
 822          * but note that we're not returning the value, but are returning
 823          * XML source fragments that will be slapped into output.
 824          *
 825          * @param string $text
 826          * @return string
 827          * @private
 828          */
 829         private static function normalizeAttributeValue( $text ) {
 830                 return str_replace( '"', '&quot;',
 831                         preg_replace(
 832                                 '/\r\n|[\x20\x0d\x0a\x09]/',
 833                                 ' ',
 834                                 Sanitizer::normalizeCharReferences( $text ) ) );
 835         }
 836
 837         /**
 838          * Ensure that any entities and character references are legal
 839          * for XML and XHTML specifically. Any stray bits will be
 840          * &amp;-escaped to result in a valid text fragment.
 841          *
 842          * a. any named char refs must be known in XHTML
 843          * b. any numeric char refs must be legal chars, not invalid or forbidden
 844          * c. use &#x, not &#X
 845          * d. fix or reject non-valid attributes
 846          *
 847          * @param string $text
 848          * @return string
 849          * @private
 850          */
 851         static function normalizeCharReferences( $text ) {
 852                 return preg_replace_callback(
 853                         MW_CHAR_REFS_REGEX,
 854                         array( 'Sanitizer', 'normalizeCharReferencesCallback' ),
 855                         $text );
 856         }
 857         /**
 858          * @param string $matches
 859          * @return string
 860          */
 861         static function normalizeCharReferencesCallback( $matches ) {
 862                 $ret = null;
 863                 if( $matches[1] != '' ) {
 864                         $ret = Sanitizer::normalizeEntity( $matches[1] );
 865                 } elseif( $matches[2] != '' ) {
 866                         $ret = Sanitizer::decCharReference( $matches[2] );
 867                 } elseif( $matches[3] != ''  ) {
 868                         $ret = Sanitizer::hexCharReference( $matches[3] );
 869                 } elseif( $matches[4] != '' ) {
 870                         $ret = Sanitizer::hexCharReference( $matches[4] );
 871                 }
 872                 if( is_null( $ret ) ) {
 873                         return htmlspecialchars( $matches[0] );
 874                 } else {
 875                         return $ret;
 876                 }
 877         }
 878
 879         /**
 880          * If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD,
 881          * return the named entity reference as is. Otherwise, returns
 882          * HTML-escaped text of pseudo-entity source (eg &amp;foo;)
 883          *
 884          * @param string $name
 885          * @return string
 886          * @static
 887          */
 888         static function normalizeEntity( $name ) {
 889                 global $wgHtmlEntities;
 890                 if( isset( $wgHtmlEntities[$name] ) ) {
 891                         return "&$name;";
 892                 } else {
 893                         return "&amp;$name;";
 894                 }
 895         }
 896
 897         static function decCharReference( $codepoint ) {
 898                 $point = intval( $codepoint );
 899                 if( Sanitizer::validateCodepoint( $point ) ) {
 900                         return sprintf( '&#%d;', $point );
 901                 } else {
 902                         return null;
 903                 }
 904         }
 905
 906         static function hexCharReference( $codepoint ) {
 907                 $point = hexdec( $codepoint );
 908                 if( Sanitizer::validateCodepoint( $point ) ) {
 909                         return sprintf( '&#x%x;', $point );
 910                 } else {
 911                         return null;
 912                 }
 913         }
 914
 915         /**
 916          * Returns true if a given Unicode codepoint is a valid character in XML.
 917          * @param int $codepoint
 918          * @return bool
 919          */
 920         private static function validateCodepoint( $codepoint ) {
 921                 return ($codepoint ==    0x09)
 922                         || ($codepoint ==    0x0a)
 923                         || ($codepoint ==    0x0d)
 924                         || ($codepoint >=    0x20 && $codepoint <=   0xd7ff)
 925                         || ($codepoint >=  0xe000 && $codepoint <=   0xfffd)
 926                         || ($codepoint >= 0x10000 && $codepoint <= 0x10ffff);
 927         }
 928
 929         /**
 930          * Decode any character references, numeric or named entities,
 931          * in the text and return a UTF-8 string.
 932          *
 933          * @param string $text
 934          * @return string
 935          * @public
 936          * @static
 937          */
 938         public static function decodeCharReferences( $text ) {
 939                 return preg_replace_callback(
 940                         MW_CHAR_REFS_REGEX,
 941                         array( 'Sanitizer', 'decodeCharReferencesCallback' ),
 942                         $text );
 943         }
 944
 945         /**
 946          * @param string $matches
 947          * @return string
 948          */
 949         static function decodeCharReferencesCallback( $matches ) {
 950                 if( $matches[1] != '' ) {
 951                         return Sanitizer::decodeEntity( $matches[1] );
 952                 } elseif( $matches[2] != '' ) {
 953                         return  Sanitizer::decodeChar( intval( $matches[2] ) );
 954                 } elseif( $matches[3] != ''  ) {
 955                         return  Sanitizer::decodeChar( hexdec( $matches[3] ) );
 956                 } elseif( $matches[4] != '' ) {
 957                         return  Sanitizer::decodeChar( hexdec( $matches[4] ) );
 958                 }
 959                 # Last case should be an ampersand by itself
 960                 return $matches[0];
 961         }
 962
 963         /**
 964          * Return UTF-8 string for a codepoint if that is a valid
 965          * character reference, otherwise U+FFFD REPLACEMENT CHARACTER.
 966          * @param int $codepoint
 967          * @return string
 968          * @private
 969          */
 970         static function decodeChar( $codepoint ) {
 971                 if( Sanitizer::validateCodepoint( $codepoint ) ) {
 972                         return codepointToUtf8( $codepoint );
 973                 } else {
 974                         return UTF8_REPLACEMENT;
 975                 }
 976         }
 977
 978         /**
 979          * If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD,
 980          * return the UTF-8 encoding of that character. Otherwise, returns
 981          * pseudo-entity source (eg &foo;)
 982          *
 983          * @param string $name
 984          * @return string
 985          */
 986         static function decodeEntity( $name ) {
 987                 global $wgHtmlEntities;
 988                 if( isset( $wgHtmlEntities[$name] ) ) {
 989                         return codepointToUtf8( $wgHtmlEntities[$name] );
 990                 } else {
 991                         return "&$name;";
 992                 }
 993         }
 994
 995         /**
 996          * Fetch the whitelist of acceptable attributes for a given
 997          * element name.
 998          *
 999          * @param string $element
1000          * @return array
1001          */
1002         static function attributeWhitelist( $element ) {
1003                 static $list;
1004                 if( !isset( $list ) ) {
1005                         $list = Sanitizer::setupAttributeWhitelist();
1006                 }
1007                 return isset( $list[$element] )
1008                         ? $list[$element]
1009                         : array();
1010         }
1011
1012         /**
1013          * @todo Document it a bit
1014          * @return array
1015          */
1016         static function setupAttributeWhitelist() {
1017                 $common = array( 'id', 'class', 'lang', 'dir', 'title', 'style' );
1018                 $block = array_merge( $common, array( 'align' ) );
1019                 $tablealign = array( 'align', 'char', 'charoff', 'valign' );
1020                 $tablecell = array( 'abbr',
1021                                     'axis',
1022                                     'headers',
1023                                     'scope',
1024                                     'rowspan',
1025                                     'colspan',
1026                                     'nowrap', # deprecated
1027                                     'width',  # deprecated
1028                                     'height', # deprecated
1029                                     'bgcolor' # deprecated
1030                                     );
1031
1032                 # Numbers refer to sections in HTML 4.01 standard describing the element.
1033                 # See: http://www.w3.org/TR/html4/
1034                 $whitelist = array (
1035                         # 7.5.4
1036                         'div'        => $block,
1037                         'center'     => $common, # deprecated
1038                         'span'       => $block, # ??
1039
1040                         # 7.5.5
1041                         'h1'         => $block,
1042                         'h2'         => $block,
1043                         'h3'         => $block,
1044                         'h4'         => $block,
1045                         'h5'         => $block,
1046                         'h6'         => $block,
1047
1048                         # 7.5.6
1049                         # address
1050
1051                         # 8.2.4
1052                         # bdo
1053
1054                         # 9.2.1
1055                         'em'         => $common,
1056                         'strong'     => $common,
1057                         'cite'       => $common,
1058                         # dfn
1059                         'code'       => $common,
1060                         # samp
1061                         # kbd
1062                         'var'        => $common,
1063                         # abbr
1064                         # acronym
1065
1066                         # 9.2.2
1067                         'blockquote' => array_merge( $common, array( 'cite' ) ),
1068                         # q
1069
1070                         # 9.2.3
1071                         'sub'        => $common,
1072                         'sup'        => $common,
1073
1074                         # 9.3.1
1075                         'p'          => $block,
1076
1077                         # 9.3.2
1078                         'br'         => array( 'id', 'class', 'title', 'style', 'clear' ),
1079
1080                         # 9.3.4
1081                         'pre'        => array_merge( $common, array( 'width' ) ),
1082
1083                         # 9.4
1084                         'ins'        => array_merge( $common, array( 'cite', 'datetime' ) ),
1085                         'del'        => array_merge( $common, array( 'cite', 'datetime' ) ),
1086
1087                         # 10.2
1088                         'ul'         => array_merge( $common, array( 'type' ) ),
1089                         'ol'         => array_merge( $common, array( 'type', 'start' ) ),
1090                         'li'         => array_merge( $common, array( 'type', 'value' ) ),
1091
1092                         # 10.3
1093                         'dl'         => $common,
1094                         'dd'         => $common,
1095                         'dt'         => $common,
1096
1097                         # 11.2.1
1098                         'table'      => array_merge( $common,
1099                                                                 array( 'summary', 'width', 'border', 'frame',
1100                                                                                 'rules', 'cellspacing', 'cellpadding',
1101                                                                                 'align', 'bgcolor',
1102                                                                 ) ),
1103
1104                         # 11.2.2
1105                         'caption'    => array_merge( $common, array( 'align' ) ),
1106
1107                         # 11.2.3
1108                         'thead'      => array_merge( $common, $tablealign ),
1109                         'tfoot'      => array_merge( $common, $tablealign ),
1110                         'tbody'      => array_merge( $common, $tablealign ),
1111
1112                         # 11.2.4
1113                         'colgroup'   => array_merge( $common, array( 'span', 'width' ), $tablealign ),
1114                         'col'        => array_merge( $common, array( 'span', 'width' ), $tablealign ),
1115
1116                         # 11.2.5
1117                         'tr'         => array_merge( $common, array( 'bgcolor' ), $tablealign ),
1118
1119                         # 11.2.6
1120                         'td'         => array_merge( $common, $tablecell, $tablealign ),
1121                         'th'         => array_merge( $common, $tablecell, $tablealign ),
1122
1123                         # 15.2.1
1124                         'tt'         => $common,
1125                         'b'          => $common,
1126                         'i'          => $common,
1127                         'big'        => $common,
1128                         'small'      => $common,
1129                         'strike'     => $common,
1130                         's'          => $common,
1131                         'u'          => $common,
1132
1133                         # 15.2.2
1134                         'font'       => array_merge( $common, array( 'size', 'color', 'face' ) ),
1135                         # basefont
1136
1137                         # 15.3
1138                         'hr'         => array_merge( $common, array( 'noshade', 'size', 'width' ) ),
1139
1140                         # XHTML Ruby annotation text module, simple ruby only.
1141                         # http://www.w3c.org/TR/ruby/
1142                         'ruby'       => $common,
1143                         # rbc
1144                         # rtc
1145                         'rb'         => $common,
1146                         'rt'         => $common, #array_merge( $common, array( 'rbspan' ) ),
1147                         'rp'         => $common,
1148                         );
1149                 return $whitelist;
1150         }
1151
1152         /**
1153          * Take a fragment of (potentially invalid) HTML and return
1154          * a version with any tags removed, encoded suitably for literal
1155          * inclusion in an attribute value.
1156          *
1157          * @param string $text HTML fragment
1158          * @return string
1159          */
1160         static function stripAllTags( $text ) {
1161                 # Actual <tags>
1162                 $text = preg_replace( '/ < .*? > /x', '', $text );
1163
1164                 # Normalize &entities and whitespace
1165                 $text = Sanitizer::normalizeAttributeValue( $text );
1166
1167                 # Will be placed into "double-quoted" attributes,
1168                 # make sure remaining bits are safe.
1169                 $text = str_replace(
1170                         array('<', '>', '"'),
1171                         array('&lt;', '&gt;', '&quot;'),
1172                         $text );
1173
1174                 return $text;
1175         }
1176
1177         /**
1178          * Hack up a private DOCTYPE with HTML's standard entity declarations.
1179          * PHP 4 seemed to know these if you gave it an HTML doctype, but
1180          * PHP 5.1 doesn't.
1181          *
1182          * Use for passing XHTML fragments to PHP's XML parsing functions
1183          *
1184          * @return string
1185          * @static
1186          */
1187         static function hackDocType() {
1188                 global $wgHtmlEntities;
1189                 $out = "<!DOCTYPE html [\n";
1190                 foreach( $wgHtmlEntities as $entity => $codepoint ) {
1191                         $out .= "<!ENTITY $entity \"&#$codepoint;\">";
1192                 }
1193                 $out .= "]>\n";
1194                 return $out;
1195         }
1196
1197         static function cleanUrl( $url, $hostname=true ) {
1198                 # Normalize any HTML entities in input. They will be
1199                 # re-escaped by makeExternalLink().
1200                 $url = Sanitizer::decodeCharReferences( $url );
1201
1202                 # Escape any control characters introduced by the above step
1203                 $url = preg_replace( '/[\][<>"\\x00-\\x20\\x7F]/e', "urlencode('\\0')", $url );
1204
1205                 # Validate hostname portion
1206                 if( preg_match( '!^([^:]+:)(//[^/]+)?(.*)$!iD', $url, $matches ) ) {
1207                         list( $whole, $protocol, $host, $rest ) = $matches;
1208
1209                         // Characters that will be ignored in IDNs.
1210                         // http://tools.ietf.org/html/3454#section-3.1
1211                         // Strip them before further processing so blacklists and such work.
1212                         $strip = "/
1213                                 \\s|          # general whitespace
1214                                 \xc2\xad|     # 00ad SOFT HYPHEN
1215                                 \xe1\xa0\x86| # 1806 MONGOLIAN TODO SOFT HYPHEN
1216                                 \xe2\x80\x8b| # 200b ZERO WIDTH SPACE
1217                                 \xe2\x81\xa0| # 2060 WORD JOINER
1218                                 \xef\xbb\xbf| # feff ZERO WIDTH NO-BREAK SPACE
1219                                 \xcd\x8f|     # 034f COMBINING GRAPHEME JOINER
1220                                 \xe1\xa0\x8b| # 180b MONGOLIAN FREE VARIATION SELECTOR ONE
1221                                 \xe1\xa0\x8c| # 180c MONGOLIAN FREE VARIATION SELECTOR TWO
1222                                 \xe1\xa0\x8d| # 180d MONGOLIAN FREE VARIATION SELECTOR THREE
1223                                 \xe2\x80\x8c| # 200c ZERO WIDTH NON-JOINER
1224                                 \xe2\x80\x8d| # 200d ZERO WIDTH JOINER
1225                                 [\xef\xb8\x80-\xef\xb8\x8f] # fe00-fe00f VARIATION SELECTOR-1-16
1226                                 /xuD";
1227
1228                         $host = preg_replace( $strip, '', $host );
1229
1230                         // @fixme: validate hostnames here
1231
1232                         return $protocol . $host . $rest;
1233                 } else {
1234                         return $url;
1235                 }
1236         }
1237
1238 }
1239
1240 ?>