includes/Sanitizer.php

   1 <?php
   2 /**
   3  * XHTML sanitizer for MediaWiki
   4  *
   5  * Copyright (C) 2002-2005 Brion Vibber <brion@pobox.com> et al
   6  * http://www.mediawiki.org/
   7  *
   8  * This program is free software; you can redistribute it and/or modify
   9  * it under the terms of the GNU General Public License as published by
  10  * the Free Software Foundation; either version 2 of the License, or
  11  * (at your option) any later version.
  12  *
  13  * This program is distributed in the hope that it will be useful,
  14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  16  * GNU General Public License for more details.
  17  *
  18  * You should have received a copy of the GNU General Public License along
  19  * with this program; if not, write to the Free Software Foundation, Inc.,
  20  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  21  * http://www.gnu.org/copyleft/gpl.html
  22  *
  23  * @package MediaWiki
  24  * @subpackage Parser
  25  */
  26
  27 /**
  28  * Regular expression to match various types of character references in
  29  * Sanitizer::normalizeCharReferences and Sanitizer::decodeCharReferences
  30  */
  31 define( 'MW_CHAR_REFS_REGEX',
  32         '/&([A-Za-z0-9]+);
  33          |&\#([0-9]+);
  34          |&\#x([0-9A-Za-z]+);
  35          |&\#X([0-9A-Za-z]+);
  36          |(&)/x' );
  37
  38 /**
  39  * Regular expression to match HTML/XML attribute pairs within a tag.
  40  * Allows some... latitude.
  41  * Used in Sanitizer::fixTagAttributes and Sanitizer::decodeTagAttributes
  42  */
  43 $attrib = '[A-Za-z0-9]';
  44 $space = '[\x09\x0a\x0d\x20]';
  45 define( 'MW_ATTRIBS_REGEX',
  46         "/(?:^|$space)($attrib+)
  47           ($space*=$space*
  48                 (?:
  49                  # The attribute value: quoted or alone
  50                   \"([^<\"]*)\"
  51                  | '([^<']*)'
  52                  |  ([a-zA-Z0-9!#$%&()*,\\-.\\/:;<>?@[\\]^_`{|}~]+)
  53                  |  (\#[0-9a-fA-F]+) # Technically wrong, but lots of
  54                                                          # colors are specified like this.
  55                                                          # We'll be normalizing it.
  56                 )
  57            )?(?=$space|\$)/sx" );
  58
  59 /**
  60  * List of all named character entities defined in HTML 4.01
  61  * http://www.w3.org/TR/html4/sgml/entities.html
  62  * @private
  63  */
  64 global $wgHtmlEntities;
  65 $wgHtmlEntities = array(
  66         'Aacute'   => 193,
  67         'aacute'   => 225,
  68         'Acirc'    => 194,
  69         'acirc'    => 226,
  70         'acute'    => 180,
  71         'AElig'    => 198,
  72         'aelig'    => 230,
  73         'Agrave'   => 192,
  74         'agrave'   => 224,
  75         'alefsym'  => 8501,
  76         'Alpha'    => 913,
  77         'alpha'    => 945,
  78         'amp'      => 38,
  79         'and'      => 8743,
  80         'ang'      => 8736,
  81         'Aring'    => 197,
  82         'aring'    => 229,
  83         'asymp'    => 8776,
  84         'Atilde'   => 195,
  85         'atilde'   => 227,
  86         'Auml'     => 196,
  87         'auml'     => 228,
  88         'bdquo'    => 8222,
  89         'Beta'     => 914,
  90         'beta'     => 946,
  91         'brvbar'   => 166,
  92         'bull'     => 8226,
  93         'cap'      => 8745,
  94         'Ccedil'   => 199,
  95         'ccedil'   => 231,
  96         'cedil'    => 184,
  97         'cent'     => 162,
  98         'Chi'      => 935,
  99         'chi'      => 967,
 100         'circ'     => 710,
 101         'clubs'    => 9827,
 102         'cong'     => 8773,
 103         'copy'     => 169,
 104         'crarr'    => 8629,
 105         'cup'      => 8746,
 106         'curren'   => 164,
 107         'dagger'   => 8224,
 108         'Dagger'   => 8225,
 109         'darr'     => 8595,
 110         'dArr'     => 8659,
 111         'deg'      => 176,
 112         'Delta'    => 916,
 113         'delta'    => 948,
 114         'diams'    => 9830,
 115         'divide'   => 247,
 116         'Eacute'   => 201,
 117         'eacute'   => 233,
 118         'Ecirc'    => 202,
 119         'ecirc'    => 234,
 120         'Egrave'   => 200,
 121         'egrave'   => 232,
 122         'empty'    => 8709,
 123         'emsp'     => 8195,
 124         'ensp'     => 8194,
 125         'Epsilon'  => 917,
 126         'epsilon'  => 949,
 127         'equiv'    => 8801,
 128         'Eta'      => 919,
 129         'eta'      => 951,
 130         'ETH'      => 208,
 131         'eth'      => 240,
 132         'Euml'     => 203,
 133         'euml'     => 235,
 134         'euro'     => 8364,
 135         'exist'    => 8707,
 136         'fnof'     => 402,
 137         'forall'   => 8704,
 138         'frac12'   => 189,
 139         'frac14'   => 188,
 140         'frac34'   => 190,
 141         'frasl'    => 8260,
 142         'Gamma'    => 915,
 143         'gamma'    => 947,
 144         'ge'       => 8805,
 145         'gt'       => 62,
 146         'harr'     => 8596,
 147         'hArr'     => 8660,
 148         'hearts'   => 9829,
 149         'hellip'   => 8230,
 150         'Iacute'   => 205,
 151         'iacute'   => 237,
 152         'Icirc'    => 206,
 153         'icirc'    => 238,
 154         'iexcl'    => 161,
 155         'Igrave'   => 204,
 156         'igrave'   => 236,
 157         'image'    => 8465,
 158         'infin'    => 8734,
 159         'int'      => 8747,
 160         'Iota'     => 921,
 161         'iota'     => 953,
 162         'iquest'   => 191,
 163         'isin'     => 8712,
 164         'Iuml'     => 207,
 165         'iuml'     => 239,
 166         'Kappa'    => 922,
 167         'kappa'    => 954,
 168         'Lambda'   => 923,
 169         'lambda'   => 955,
 170         'lang'     => 9001,
 171         'laquo'    => 171,
 172         'larr'     => 8592,
 173         'lArr'     => 8656,
 174         'lceil'    => 8968,
 175         'ldquo'    => 8220,
 176         'le'       => 8804,
 177         'lfloor'   => 8970,
 178         'lowast'   => 8727,
 179         'loz'      => 9674,
 180         'lrm'      => 8206,
 181         'lsaquo'   => 8249,
 182         'lsquo'    => 8216,
 183         'lt'       => 60,
 184         'macr'     => 175,
 185         'mdash'    => 8212,
 186         'micro'    => 181,
 187         'middot'   => 183,
 188         'minus'    => 8722,
 189         'Mu'       => 924,
 190         'mu'       => 956,
 191         'nabla'    => 8711,
 192         'nbsp'     => 160,
 193         'ndash'    => 8211,
 194         'ne'       => 8800,
 195         'ni'       => 8715,
 196         'not'      => 172,
 197         'notin'    => 8713,
 198         'nsub'     => 8836,
 199         'Ntilde'   => 209,
 200         'ntilde'   => 241,
 201         'Nu'       => 925,
 202         'nu'       => 957,
 203         'Oacute'   => 211,
 204         'oacute'   => 243,
 205         'Ocirc'    => 212,
 206         'ocirc'    => 244,
 207         'OElig'    => 338,
 208         'oelig'    => 339,
 209         'Ograve'   => 210,
 210         'ograve'   => 242,
 211         'oline'    => 8254,
 212         'Omega'    => 937,
 213         'omega'    => 969,
 214         'Omicron'  => 927,
 215         'omicron'  => 959,
 216         'oplus'    => 8853,
 217         'or'       => 8744,
 218         'ordf'     => 170,
 219         'ordm'     => 186,
 220         'Oslash'   => 216,
 221         'oslash'   => 248,
 222         'Otilde'   => 213,
 223         'otilde'   => 245,
 224         'otimes'   => 8855,
 225         'Ouml'     => 214,
 226         'ouml'     => 246,
 227         'para'     => 182,
 228         'part'     => 8706,
 229         'permil'   => 8240,
 230         'perp'     => 8869,
 231         'Phi'      => 934,
 232         'phi'      => 966,
 233         'Pi'       => 928,
 234         'pi'       => 960,
 235         'piv'      => 982,
 236         'plusmn'   => 177,
 237         'pound'    => 163,
 238         'prime'    => 8242,
 239         'Prime'    => 8243,
 240         'prod'     => 8719,
 241         'prop'     => 8733,
 242         'Psi'      => 936,
 243         'psi'      => 968,
 244         'quot'     => 34,
 245         'radic'    => 8730,
 246         'rang'     => 9002,
 247         'raquo'    => 187,
 248         'rarr'     => 8594,
 249         'rArr'     => 8658,
 250         'rceil'    => 8969,
 251         'rdquo'    => 8221,
 252         'real'     => 8476,
 253         'reg'      => 174,
 254         'rfloor'   => 8971,
 255         'Rho'      => 929,
 256         'rho'      => 961,
 257         'rlm'      => 8207,
 258         'rsaquo'   => 8250,
 259         'rsquo'    => 8217,
 260         'sbquo'    => 8218,
 261         'Scaron'   => 352,
 262         'scaron'   => 353,
 263         'sdot'     => 8901,
 264         'sect'     => 167,
 265         'shy'      => 173,
 266         'Sigma'    => 931,
 267         'sigma'    => 963,
 268         'sigmaf'   => 962,
 269         'sim'      => 8764,
 270         'spades'   => 9824,
 271         'sub'      => 8834,
 272         'sube'     => 8838,
 273         'sum'      => 8721,
 274         'sup'      => 8835,
 275         'sup1'     => 185,
 276         'sup2'     => 178,
 277         'sup3'     => 179,
 278         'supe'     => 8839,
 279         'szlig'    => 223,
 280         'Tau'      => 932,
 281         'tau'      => 964,
 282         'there4'   => 8756,
 283         'Theta'    => 920,
 284         'theta'    => 952,
 285         'thetasym' => 977,
 286         'thinsp'   => 8201,
 287         'THORN'    => 222,
 288         'thorn'    => 254,
 289         'tilde'    => 732,
 290         'times'    => 215,
 291         'trade'    => 8482,
 292         'Uacute'   => 218,
 293         'uacute'   => 250,
 294         'uarr'     => 8593,
 295         'uArr'     => 8657,
 296         'Ucirc'    => 219,
 297         'ucirc'    => 251,
 298         'Ugrave'   => 217,
 299         'ugrave'   => 249,
 300         'uml'      => 168,
 301         'upsih'    => 978,
 302         'Upsilon'  => 933,
 303         'upsilon'  => 965,
 304         'Uuml'     => 220,
 305         'uuml'     => 252,
 306         'weierp'   => 8472,
 307         'Xi'       => 926,
 308         'xi'       => 958,
 309         'Yacute'   => 221,
 310         'yacute'   => 253,
 311         'yen'      => 165,
 312         'Yuml'     => 376,
 313         'yuml'     => 255,
 314         'Zeta'     => 918,
 315         'zeta'     => 950,
 316         'zwj'      => 8205,
 317         'zwnj'     => 8204 );
 318
 319 /** @package MediaWiki */
 320 class Sanitizer {
 321         /**
 322          * Cleans up HTML, removes dangerous tags and attributes, and
 323          * removes HTML comments
 324          * @private
 325          * @param string $text
 326          * @param callback $processCallback to do any variable or parameter replacements in HTML attribute values
 327          * @param array $args for the processing callback
 328          * @return string
 329          */
 330         function removeHTMLtags( $text, $processCallback = null, $args = array() ) {
 331                 global $wgUseTidy, $wgUserHtml;
 332                 $fname = 'Parser::removeHTMLtags';
 333                 wfProfileIn( $fname );
 334
 335                 if( $wgUserHtml ) {
 336                         $htmlpairs = array( # Tags that must be closed
 337                                 'b', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
 338                                 'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
 339                                 'strike', 'strong', 'tt', 'var', 'div', 'center',
 340                                 'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
 341                                 'ruby', 'rt' , 'rb' , 'rp', 'p', 'span', 'u'
 342                         );
 343                         $htmlsingle = array(
 344                                 'br', 'hr', 'li', 'dt', 'dd'
 345                         );
 346                         $htmlsingleonly = array( # Elements that cannot have close tags
 347                                 'br', 'hr'
 348                         );
 349                         $htmlnest = array( # Tags that can be nested--??
 350                                 'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
 351                                 'dl', 'font', 'big', 'small', 'sub', 'sup', 'span'
 352                         );
 353                         $tabletags = array( # Can only appear inside table
 354                                 'td', 'th', 'tr',
 355                         );
 356                         $htmllist = array( # Tags used by list
 357                                 'ul','ol',
 358                         );
 359                         $listtags = array( # Tags that can appear in a list
 360                                 'li',
 361                         );
 362
 363                 } else {
 364                         $htmlpairs = array();
 365                         $htmlsingle = array();
 366                         $htmlnest = array();
 367                         $tabletags = array();
 368                 }
 369
 370                 $htmlsingle = array_merge( $tabletags, $htmlsingle );
 371                 $htmlelements = array_merge( $htmlsingle, $htmlpairs );
 372
 373                 # Remove HTML comments
 374                 $text = Sanitizer::removeHTMLcomments( $text );
 375                 $bits = explode( '<', $text );
 376                 $text = array_shift( $bits );
 377                 if(!$wgUseTidy) {
 378                         $tagstack = array(); $tablestack = array();
 379                         foreach ( $bits as $x ) {
 380                                 $prev = error_reporting( E_ALL & ~( E_NOTICE | E_WARNING ) );
 381                                 preg_match( '/^(\\/?)(\\w+)([^>]*?)(\\/{0,1}>)([^<]*)$/',
 382                                 $x, $regs );
 383                                 list( $qbar, $slash, $t, $params, $brace, $rest ) = $regs;
 384                                 error_reporting( $prev );
 385
 386                                 $badtag = 0 ;
 387                                 if ( in_array( $t = strtolower( $t ), $htmlelements ) ) {
 388                                         # Check our stack
 389                                         if ( $slash ) {
 390                                                 # Closing a tag...
 391                                                 if( in_array( $t, $htmlsingleonly ) ) {
 392                                                         $badtag = 1;
 393                                                 } elseif ( ( $ot = @array_pop( $tagstack ) ) != $t ) {
 394                                                         @array_push( $tagstack, $ot );
 395                                                         # <li> can be nested in <ul> or <ol>, skip those cases:
 396                                                         if(!(in_array($ot, $htmllist) && in_array($t, $listtags) )) {
 397                                                                 $badtag = 1;
 398                                                         }
 399                                                 } else {
 400                                                         if ( $t == 'table' ) {
 401                                                                 $tagstack = array_pop( $tablestack );
 402                                                         }
 403                                                         $newparams = '';
 404                                                 }
 405                                         } else {
 406                                                 # Keep track for later
 407                                                 if ( in_array( $t, $tabletags ) &&
 408                                                 ! in_array( 'table', $tagstack ) ) {
 409                                                         $badtag = 1;
 410                                                 } else if ( in_array( $t, $tagstack ) &&
 411                                                 ! in_array ( $t , $htmlnest ) ) {
 412                                                         $badtag = 1 ;
 413                                                 # Is it a self closed htmlpair ? (bug 5487)
 414                                                 } else if( $brace == '/>' &&
 415                                                 in_array($t, $htmlpairs) ) {
 416                                                         $badtag = 1;
 417                                                 } elseif( in_array( $t, $htmlsingleonly ) ) {
 418                                                         # Hack to force empty tag for uncloseable elements
 419                                                         $brace = '/>';
 420                                                 } else if( in_array( $t, $htmlsingle ) ) {
 421                                                         # Hack to not close $htmlsingle tags
 422                                                         $brace = NULL;
 423                                                 } else {
 424                                                         if ( $t == 'table' ) {
 425                                                                 array_push( $tablestack, $tagstack );
 426                                                                 $tagstack = array();
 427                                                         }
 428                                                         array_push( $tagstack, $t );
 429                                                 }
 430
 431                                                 # Replace any variables or template parameters with
 432                                                 # plaintext results.
 433                                                 if( is_callable( $processCallback ) ) {
 434                                                         call_user_func_array( $processCallback, array( &$params, $args ) );
 435                                                 }
 436
 437                                                 # Strip non-approved attributes from the tag
 438                                                 $newparams = Sanitizer::fixTagAttributes( $params, $t );
 439                                         }
 440                                         if ( ! $badtag ) {
 441                                                 $rest = str_replace( '>', '&gt;', $rest );
 442                                                 $close = ( $brace == '/>' ) ? ' /' : '';
 443                                                 $text .= "<$slash$t$newparams$close>$rest";
 444                                                 continue;
 445                                         }
 446                                 }
 447                                 $text .= '&lt;' . str_replace( '>', '&gt;', $x);
 448                         }
 449                         # Close off any remaining tags
 450                         while ( is_array( $tagstack ) && ($t = array_pop( $tagstack )) ) {
 451                                 $text .= "</$t>\n";
 452                                 if ( $t == 'table' ) { $tagstack = array_pop( $tablestack ); }
 453                         }
 454                 } else {
 455                         # this might be possible using tidy itself
 456                         foreach ( $bits as $x ) {
 457                                 preg_match( '/^(\\/?)(\\w+)([^>]*?)(\\/{0,1}>)([^<]*)$/',
 458                                 $x, $regs );
 459                                 @list( $qbar, $slash, $t, $params, $brace, $rest ) = $regs;
 460                                 if ( in_array( $t = strtolower( $t ), $htmlelements ) ) {
 461                                         if( is_callable( $processCallback ) ) {
 462                                                 call_user_func_array( $processCallback, array( &$params, $args ) );
 463                                         }
 464                                         $newparams = Sanitizer::fixTagAttributes( $params, $t );
 465                                         $rest = str_replace( '>', '&gt;', $rest );
 466                                         $text .= "<$slash$t$newparams$brace$rest";
 467                                 } else {
 468                                         $text .= '&lt;' . str_replace( '>', '&gt;', $x);
 469                                 }
 470                         }
 471                 }
 472                 wfProfileOut( $fname );
 473                 return $text;
 474         }
 475
 476         /**
 477          * Remove '<!--', '-->', and everything between.
 478          * To avoid leaving blank lines, when a comment is both preceded
 479          * and followed by a newline (ignoring spaces), trim leading and
 480          * trailing spaces and one of the newlines.
 481          *
 482          * @private
 483          * @param string $text
 484          * @return string
 485          */
 486         function removeHTMLcomments( $text ) {
 487                 $fname='Parser::removeHTMLcomments';
 488                 wfProfileIn( $fname );
 489                 while (($start = strpos($text, '<!--')) !== false) {
 490                         $end = strpos($text, '-->', $start + 4);
 491                         if ($end === false) {
 492                                 # Unterminated comment; bail out
 493                                 break;
 494                         }
 495
 496                         $end += 3;
 497
 498                         # Trim space and newline if the comment is both
 499                         # preceded and followed by a newline
 500                         $spaceStart = max($start - 1, 0);
 501                         $spaceLen = $end - $spaceStart;
 502                         while (substr($text, $spaceStart, 1) === ' ' && $spaceStart > 0) {
 503                                 $spaceStart--;
 504                                 $spaceLen++;
 505                         }
 506                         while (substr($text, $spaceStart + $spaceLen, 1) === ' ')
 507                                 $spaceLen++;
 508                         if (substr($text, $spaceStart, 1) === "\n" and substr($text, $spaceStart + $spaceLen, 1) === "\n") {
 509                                 # Remove the comment, leading and trailing
 510                                 # spaces, and leave only one newline.
 511                                 $text = substr_replace($text, "\n", $spaceStart, $spaceLen + 1);
 512                         }
 513                         else {
 514                                 # Remove just the comment.
 515                                 $text = substr_replace($text, '', $start, $end - $start);
 516                         }
 517                 }
 518                 wfProfileOut( $fname );
 519                 return $text;
 520         }
 521
 522         /**
 523          * Take a tag soup fragment listing an HTML element's attributes
 524          * and normalize it to well-formed XML, discarding unwanted attributes.
 525          *
 526          * - Normalizes attribute names to lowercase
 527          * - Discards attributes not on a whitelist for the given element
 528          * - Turns broken or invalid entities into plaintext
 529          * - Double-quotes all attribute values
 530          * - Attributes without values are given the name as attribute
 531          * - Double attributes are discarded
 532          * - Unsafe style attributes are discarded
 533          * - Prepends space if there are attributes.
 534          *
 535          * @param string $text
 536          * @param string $element
 537          * @return string
 538          *
 539          * @todo Check for legal values where the DTD limits things.
 540          * @todo Check for unique id attribute :P
 541          */
 542         function fixTagAttributes( $text, $element ) {
 543                 if( trim( $text ) == '' ) {
 544                         return '';
 545                 }
 546
 547                 # Unquoted attribute
 548                 # Since we quote this later, this can be anything distinguishable
 549                 # from the end of the attribute
 550                 $pairs = array();
 551                 if( !preg_match_all(
 552                         MW_ATTRIBS_REGEX,
 553                         $text,
 554                         $pairs,
 555                         PREG_SET_ORDER ) ) {
 556                         return '';
 557                 }
 558
 559                 $whitelist = array_flip( Sanitizer::attributeWhitelist( $element ) );
 560                 $attribs = array();
 561                 foreach( $pairs as $set ) {
 562                         $attribute = strtolower( $set[1] );
 563                         if( !isset( $whitelist[$attribute] ) ) {
 564                                 continue;
 565                         }
 566
 567                         $raw   = Sanitizer::getTagAttributeCallback( $set );
 568                         $value = Sanitizer::normalizeAttributeValue( $raw );
 569
 570                         # Strip javascript "expression" from stylesheets.
 571                         # http://msdn.microsoft.com/workshop/author/dhtml/overview/recalc.asp
 572                         if( $attribute == 'style' ) {
 573                                 $stripped = Sanitizer::decodeCharReferences( $value );
 574
 575                                 // Remove any comments; IE gets token splitting wrong
 576                                 $stripped = preg_replace( '!/\\*.*?\\*/!S', ' ', $stripped );
 577                                 $value = htmlspecialchars( $stripped );
 578
 579                                 // ... and continue checks
 580                                 $stripped = preg_replace( '!\\\\([0-9A-Fa-f]{1,6})[ \\n\\r\\t\\f]?!e',
 581                                         'codepointToUtf8(hexdec("$1"))', $stripped );
 582                                 $stripped = str_replace( '\\', '', $stripped );
 583                                 if( preg_match( '/(expression|tps*:\/\/|url\\s*\().*/is',
 584                                                 $stripped ) ) {
 585                                         # haxx0r
 586                                         continue;
 587                                 }
 588                         }
 589
 590                         if ( $attribute === 'id' )
 591                                 $value = Sanitizer::escapeId( $value );
 592
 593                         # Templates and links may be expanded in later parsing,
 594                         # creating invalid or dangerous output. Suppress this.
 595                         $value = strtr( $value, array(
 596                                 '<'    => '&lt;',   // This should never happen,
 597                                 '>'    => '&gt;',   // we've received invalid input
 598                                 '"'    => '&quot;', // which should have been escaped.
 599                                 '{'    => '&#123;',
 600                                 '['    => '&#91;',
 601                                 "''"   => '&#39;&#39;',
 602                                 'ISBN' => '&#73;SBN',
 603                                 'RFC'  => '&#82;FC',
 604                                 'PMID' => '&#80;MID',
 605                         ) );
 606
 607                         # Stupid hack
 608                         $value = preg_replace_callback(
 609                                 '/(' . wfUrlProtocols() . ')/',
 610                                 array( 'Sanitizer', 'armorLinksCallback' ),
 611                                 $value );
 612
 613                         // If this attribute was previously set, override it.
 614                         // Output should only have one attribute of each name.
 615                         $attribs[$attribute] = "$attribute=\"$value\"";
 616                 }
 617
 618                 return count( $attribs ) ? ' ' . implode( ' ', $attribs ) : '';
 619         }
 620
 621         /**
 622          * Given a value escape it so that it can be used in an id attribute and
 623          * return it, this does not validate the value however (see first link)
 624          *
 625          * @link http://www.w3.org/TR/html401/types.html#type-name Valid characters
 626          *                                                          in the id and
 627          *                                                          name attributes
 628          * @link http://www.w3.org/TR/html401/struct/links.html#h-12.2.3 Anchors with the id attribute
 629          *
 630          * @bug 4461
 631          *
 632          * @static
 633          *
 634          * @param string $id
 635          * @return string
 636          */
 637         static function escapeId( $id ) {
 638                 static $replace = array(
 639                         '%3A' => ':',
 640                         '%' => '.'
 641                 );
 642
 643                 $id = urlencode( Sanitizer::decodeCharReferences( strtr( $id, ' ', '_' ) ) );
 644
 645                 return str_replace( array_keys( $replace ), array_values( $replace ), $id );
 646         }
 647
 648         /**
 649          * Regex replace callback for armoring links against further processing.
 650          * @param array $matches
 651          * @return string
 652          * @private
 653          */
 654         private static function armorLinksCallback( $matches ) {
 655                 return str_replace( ':', '&#58;', $matches[1] );
 656         }
 657
 658         /**
 659          * Return an associative array of attribute names and values from
  660          * a partial tag string. Attribute names are forced to lowercase,
 661          * character references are decoded to UTF-8 text.
 662          *
 663          * @param string
 664          * @return array
 665          */
 666         function decodeTagAttributes( $text ) {
 667                 $attribs = array();
 668
 669                 if( trim( $text ) == '' ) {
 670                         return $attribs;
 671                 }
 672
 673                 $pairs = array();
 674                 if( !preg_match_all(
 675                         MW_ATTRIBS_REGEX,
 676                         $text,
 677                         $pairs,
 678                         PREG_SET_ORDER ) ) {
 679                         return $attribs;
 680                 }
 681
 682                 foreach( $pairs as $set ) {
 683                         $attribute = strtolower( $set[1] );
 684                         $value = Sanitizer::getTagAttributeCallback( $set );
 685                         $attribs[$attribute] = Sanitizer::decodeCharReferences( $value );
 686                 }
 687                 return $attribs;
 688         }
 689
 690         /**
 691          * Pick the appropriate attribute value from a match set from the
 692          * MW_ATTRIBS_REGEX matches.
 693          *
 694          * @param array $set
 695          * @return string
 696          * @private
 697          */
 698         function getTagAttributeCallback( $set ) {
 699                 if( isset( $set[6] ) ) {
 700                         # Illegal #XXXXXX color with no quotes.
 701                         return $set[6];
 702                 } elseif( isset( $set[5] ) ) {
 703                         # No quotes.
 704                         return $set[5];
 705                 } elseif( isset( $set[4] ) ) {
 706                         # Single-quoted
 707                         return $set[4];
 708                 } elseif( isset( $set[3] ) ) {
 709                         # Double-quoted
 710                         return $set[3];
 711                 } elseif( !isset( $set[2] ) ) {
 712                         # In XHTML, attributes must have a value.
 713                         # For 'reduced' form, return explicitly the attribute name here.
 714                         return $set[1];
 715                 } else {
 716                         wfDebugDieBacktrace( "Tag conditions not met. This should never happen and is a bug." );
 717                 }
 718         }
 719
 720         /**
 721          * Normalize whitespace and character references in an XML source-
 722          * encoded text for an attribute value.
 723          *
 724          * See http://www.w3.org/TR/REC-xml/#AVNormalize for background,
 725          * but note that we're not returning the value, but are returning
 726          * XML source fragments that will be slapped into output.
 727          *
 728          * @param string $text
 729          * @return string
 730          * @private
 731          */
 732         function normalizeAttributeValue( $text ) {
 733                 return str_replace( '"', '&quot;',
 734                         preg_replace(
 735                                 '/\r\n|[\x20\x0d\x0a\x09]/',
 736                                 ' ',
 737                                 Sanitizer::normalizeCharReferences( $text ) ) );
 738         }
 739
 740         /**
 741          * Ensure that any entities and character references are legal
 742          * for XML and XHTML specifically. Any stray bits will be
 743          * &amp;-escaped to result in a valid text fragment.
 744          *
 745          * a. any named char refs must be known in XHTML
 746          * b. any numeric char refs must be legal chars, not invalid or forbidden
 747          * c. use &#x, not &#X
 748          * d. fix or reject non-valid attributes
 749          *
 750          * @param string $text
 751          * @return string
 752          * @private
 753          * @todo FIXME called from parser.php so not that much private
 754          */
 755         function normalizeCharReferences( $text ) {
 756                 return preg_replace_callback(
 757                         MW_CHAR_REFS_REGEX,
 758                         array( 'Sanitizer', 'normalizeCharReferencesCallback' ),
 759                         $text );
 760         }
 761         /**
 762          * @param string $matches
 763          * @return string
 764          */
 765         static function normalizeCharReferencesCallback( $matches ) {
 766                 $ret = null;
 767                 if( $matches[1] != '' ) {
 768                         $ret = Sanitizer::normalizeEntity( $matches[1] );
 769                 } elseif( $matches[2] != '' ) {
 770                         $ret = Sanitizer::decCharReference( $matches[2] );
 771                 } elseif( $matches[3] != ''  ) {
 772                         $ret = Sanitizer::hexCharReference( $matches[3] );
 773                 } elseif( $matches[4] != '' ) {
 774                         $ret = Sanitizer::hexCharReference( $matches[4] );
 775                 }
 776                 if( is_null( $ret ) ) {
 777                         return htmlspecialchars( $matches[0] );
 778                 } else {
 779                         return $ret;
 780                 }
 781         }
 782
 783         /**
 784          * If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD,
 785          * return the named entity reference as is. Otherwise, returns
 786          * HTML-escaped text of pseudo-entity source (eg &amp;foo;)
 787          *
 788          * @param string $name
 789          * @return string
 790          */
 791         static function normalizeEntity( $name ) {
 792                 global $wgHtmlEntities;
 793                 if( isset( $wgHtmlEntities[$name] ) ) {
 794                         return "&$name;";
 795                 } else {
 796                         return "&amp;$name;";
 797                 }
 798         }
 799
 800         function decCharReference( $codepoint ) {
 801                 $point = intval( $codepoint );
 802                 if( Sanitizer::validateCodepoint( $point ) ) {
 803                         return sprintf( '&#%d;', $point );
 804                 } else {
 805                         return null;
 806                 }
 807         }
 808
 809         function hexCharReference( $codepoint ) {
 810                 $point = hexdec( $codepoint );
 811                 if( Sanitizer::validateCodepoint( $point ) ) {
 812                         return sprintf( '&#x%x;', $point );
 813                 } else {
 814                         return null;
 815                 }
 816         }
 817
 818         /**
 819          * Returns true if a given Unicode codepoint is a valid character in XML.
 820          * @param int $codepoint
 821          * @return bool
 822          */
 823         function validateCodepoint( $codepoint ) {
 824                 return ($codepoint ==    0x09)
 825                         || ($codepoint ==    0x0a)
 826                         || ($codepoint ==    0x0d)
 827                         || ($codepoint >=    0x20 && $codepoint <=   0xd7ff)
 828                         || ($codepoint >=  0xe000 && $codepoint <=   0xfffd)
 829                         || ($codepoint >= 0x10000 && $codepoint <= 0x10ffff);
 830         }
 831
 832         /**
 833          * Decode any character references, numeric or named entities,
 834          * in the text and return a UTF-8 string.
 835          *
 836          * @param string $text
 837          * @return string
 838          * @public
 839          * @static
 840          */
 841         public static function decodeCharReferences( $text ) {
 842                 return preg_replace_callback(
 843                         MW_CHAR_REFS_REGEX,
 844                         array( 'Sanitizer', 'decodeCharReferencesCallback' ),
 845                         $text );
 846         }
 847
 848         /**
 849          * @param string $matches
 850          * @return string
 851          */
 852         static function decodeCharReferencesCallback( $matches ) {
 853                 if( $matches[1] != '' ) {
 854                         return Sanitizer::decodeEntity( $matches[1] );
 855                 } elseif( $matches[2] != '' ) {
 856                         return  Sanitizer::decodeChar( intval( $matches[2] ) );
 857                 } elseif( $matches[3] != ''  ) {
 858                         return  Sanitizer::decodeChar( hexdec( $matches[3] ) );
 859                 } elseif( $matches[4] != '' ) {
 860                         return  Sanitizer::decodeChar( hexdec( $matches[4] ) );
 861                 }
 862                 # Last case should be an ampersand by itself
 863                 return $matches[0];
 864         }
 865
 866         /**
 867          * Return UTF-8 string for a codepoint if that is a valid
 868          * character reference, otherwise U+FFFD REPLACEMENT CHARACTER.
 869          * @param int $codepoint
 870          * @return string
 871          * @private
 872          */
 873         function decodeChar( $codepoint ) {
 874                 if( Sanitizer::validateCodepoint( $codepoint ) ) {
 875                         return codepointToUtf8( $codepoint );
 876                 } else {
 877                         return UTF8_REPLACEMENT;
 878                 }
 879         }
 880
 881         /**
 882          * If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD,
 883          * return the UTF-8 encoding of that character. Otherwise, returns
 884          * pseudo-entity source (eg &foo;)
 885          *
 886          * @param string $name
 887          * @return string
 888          */
 889         function decodeEntity( $name ) {
 890                 global $wgHtmlEntities;
 891                 if( isset( $wgHtmlEntities[$name] ) ) {
 892                         return codepointToUtf8( $wgHtmlEntities[$name] );
 893                 } else {
 894                         return "&$name;";
 895                 }
 896         }
 897
 898         /**
 899          * Fetch the whitelist of acceptable attributes for a given
 900          * element name.
 901          *
 902          * @param string $element
 903          * @return array
 904          */
 905         function attributeWhitelist( $element ) {
 906                 static $list;
 907                 if( !isset( $list ) ) {
 908                         $list = Sanitizer::setupAttributeWhitelist();
 909                 }
 910                 return isset( $list[$element] )
 911                         ? $list[$element]
 912                         : array();
 913         }
 914
 915         /**
 916          * @return array
 917          */
 918         function setupAttributeWhitelist() {
 919                 $common = array( 'id', 'class', 'lang', 'dir', 'title', 'style' );
 920                 $block = array_merge( $common, array( 'align' ) );
 921                 $tablealign = array( 'align', 'char', 'charoff', 'valign' );
 922                 $tablecell = array( 'abbr',
 923                                     'axis',
 924                                     'headers',
 925                                     'scope',
 926                                     'rowspan',
 927                                     'colspan',
 928                                     'nowrap', # deprecated
 929                                     'width',  # deprecated
 930                                     'height', # deprecated
 931                                     'bgcolor' # deprecated
 932                                     );
 933
 934                 # Numbers refer to sections in HTML 4.01 standard describing the element.
 935                 # See: http://www.w3.org/TR/html4/
 936                 $whitelist = array (
 937                         # 7.5.4
 938                         'div'        => $block,
 939                         'center'     => $common, # deprecated
 940                         'span'       => $block, # ??
 941
 942                         # 7.5.5
 943                         'h1'         => $block,
 944                         'h2'         => $block,
 945                         'h3'         => $block,
 946                         'h4'         => $block,
 947                         'h5'         => $block,
 948                         'h6'         => $block,
 949
 950                         # 7.5.6
 951                         # address
 952
 953                         # 8.2.4
 954                         # bdo
 955
 956                         # 9.2.1
 957                         'em'         => $common,
 958                         'strong'     => $common,
 959                         'cite'       => $common,
 960                         # dfn
 961                         'code'       => $common,
 962                         # samp
 963                         # kbd
 964                         'var'        => $common,
 965                         # abbr
 966                         # acronym
 967
 968                         # 9.2.2
 969                         'blockquote' => array_merge( $common, array( 'cite' ) ),
 970                         # q
 971
 972                         # 9.2.3
 973                         'sub'        => $common,
 974                         'sup'        => $common,
 975
 976                         # 9.3.1
 977                         'p'          => $block,
 978
 979                         # 9.3.2
 980                         'br'         => array( 'id', 'class', 'title', 'style', 'clear' ),
 981
 982                         # 9.3.4
 983                         'pre'        => array_merge( $common, array( 'width' ) ),
 984
 985                         # 9.4
 986                         'ins'        => array_merge( $common, array( 'cite', 'datetime' ) ),
 987                         'del'        => array_merge( $common, array( 'cite', 'datetime' ) ),
 988
 989                         # 10.2
 990                         'ul'         => array_merge( $common, array( 'type' ) ),
 991                         'ol'         => array_merge( $common, array( 'type', 'start' ) ),
 992                         'li'         => array_merge( $common, array( 'type', 'value' ) ),
 993
 994                         # 10.3
 995                         'dl'         => $common,
 996                         'dd'         => $common,
 997                         'dt'         => $common,
 998
 999                         # 11.2.1
1000                         'table'      => array_merge( $common,
1001                                                                 array( 'summary', 'width', 'border', 'frame',
1002                                                                                          'rules', 'cellspacing', 'cellpadding',
1003                                                                                          'align', 'bgcolor', 'frame', 'rules',
1004                                                                                          'border' ) ),
1005
1006                         # 11.2.2
1007                         'caption'    => array_merge( $common, array( 'align' ) ),
1008
1009                         # 11.2.3
1010                         'thead'      => array_merge( $common, $tablealign ),
1011                         'tfoot'      => array_merge( $common, $tablealign ),
1012                         'tbody'      => array_merge( $common, $tablealign ),
1013
1014                         # 11.2.4
1015                         'colgroup'   => array_merge( $common, array( 'span', 'width' ), $tablealign ),
1016                         'col'        => array_merge( $common, array( 'span', 'width' ), $tablealign ),
1017
1018                         # 11.2.5
1019                         'tr'         => array_merge( $common, array( 'bgcolor' ), $tablealign ),
1020
1021                         # 11.2.6
1022                         'td'         => array_merge( $common, $tablecell, $tablealign ),
1023                         'th'         => array_merge( $common, $tablecell, $tablealign ),
1024
1025                         # 15.2.1
1026                         'tt'         => $common,
1027                         'b'          => $common,
1028                         'i'          => $common,
1029                         'big'        => $common,
1030                         'small'      => $common,
1031                         'strike'     => $common,
1032                         's'          => $common,
1033                         'u'          => $common,
1034
1035                         # 15.2.2
1036                         'font'       => array_merge( $common, array( 'size', 'color', 'face' ) ),
1037                         # basefont
1038
1039                         # 15.3
1040                         'hr'         => array_merge( $common, array( 'noshade', 'size', 'width' ) ),
1041
1042                         # XHTML Ruby annotation text module, simple ruby only.
1043                         # http://www.w3c.org/TR/ruby/
1044                         'ruby'       => $common,
1045                         # rbc
1046                         # rtc
1047                         'rb'         => $common,
1048                         'rt'         => $common, #array_merge( $common, array( 'rbspan' ) ),
1049                         'rp'         => $common,
1050                         );
1051                 return $whitelist;
1052         }
1053
1054         /**
1055          * Take a fragment of (potentially invalid) HTML and return
1056          * a version with any tags removed, encoded suitably for literal
1057          * inclusion in an attribute value.
1058          *
1059          * @param string $text HTML fragment
1060          * @return string
1061          */
1062         function stripAllTags( $text ) {
1063                 # Actual <tags>
1064                 $text = preg_replace( '/ < .*? > /x', '', $text );
1065
1066                 # Normalize &entities and whitespace
1067                 $text = Sanitizer::normalizeAttributeValue( $text );
1068
1069                 # Will be placed into "double-quoted" attributes,
1070                 # make sure remaining bits are safe.
1071                 $text = str_replace(
1072                         array('<', '>', '"'),
1073                         array('&lt;', '&gt;', '&quot;'),
1074                         $text );
1075
1076                 return $text;
1077         }
1078
1079         /**
1080          * Hack up a private DOCTYPE with HTML's standard entity declarations.
1081          * PHP 4 seemed to know these if you gave it an HTML doctype, but
1082          * PHP 5.1 doesn't.
1083          *
1084          * Use for passing XHTML fragments to PHP's XML parsing functions
1085          *
1086          * @return string
1087          * @static
1088          */
1089         static function hackDocType() {
1090                 global $wgHtmlEntities;
1091                 $out = "<!DOCTYPE html [\n";
1092                 foreach( $wgHtmlEntities as $entity => $codepoint ) {
1093                         $out .= "<!ENTITY $entity \"&#$codepoint;\">";
1094                 }
1095                 $out .= "]>\n";
1096                 return $out;
1097         }
1098
1099 }
1100
1101 ?>