includes/Sanitizer.php

   1 <?php
   2 /**
   3  * XHTML sanitizer for MediaWiki
   4  *
   5  * Copyright (C) 2002-2005 Brion Vibber <brion@pobox.com> et al
   6  * http://www.mediawiki.org/
   7  *
   8  * This program is free software; you can redistribute it and/or modify
   9  * it under the terms of the GNU General Public License as published by
  10  * the Free Software Foundation; either version 2 of the License, or
  11  * (at your option) any later version.
  12  *
  13  * This program is distributed in the hope that it will be useful,
  14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  16  * GNU General Public License for more details.
  17  *
  18  * You should have received a copy of the GNU General Public License along
  19  * with this program; if not, write to the Free Software Foundation, Inc.,
  20  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  21  * http://www.gnu.org/copyleft/gpl.html
  22  *
  23  * @package MediaWiki
  24  * @subpackage Parser
  25  */
  26
/**
 * Regular expression to match various types of character references in
 * Sanitizer::normalizeCharReferences and Sanitizer::decodeCharReferences.
 *
 * The pattern is an alternation, so exactly one capture group is
 * non-empty for any given match:
 *  1 - the name of a named reference, e.g. "amp" in "&amp;"
 *  2 - the digits of a decimal reference, e.g. "160" in "&#160;"
 *  3 - the digits of a hex reference written with a lowercase "x"
 *  4 - the digits of a hex reference written with an uppercase "X"
 *  5 - a bare "&" that does not start any of the forms above
 *
 * Groups 3 and 4 accept any ASCII letter or digit, not only hex digits.
 * The pattern uses the /x modifier, so the line breaks and indentation
 * inside it are not part of what is matched.
 */
define( 'MW_CHAR_REFS_REGEX',
        '/&([A-Za-z0-9]+);
         |&\#([0-9]+);
         |&\#x([0-9A-Za-z]+);
         |&\#X([0-9A-Za-z]+);
         |(&)/x' );
  37
/**
 * Regular expression to match HTML/XML attribute pairs within a tag.
 * Allows some... latitude.
 * Used in Sanitizer::fixTagAttributes and Sanitizer::decodeTagAttributes
 *
 * Capture groups:
 *  1 - attribute name
 *  2 - the whole "= value" part; not matched for a bare attribute
 *  3 - value written in double quotes (quotes excluded)
 *  4 - value written in single quotes (quotes excluded)
 *  5 - unquoted value
 *  6 - unquoted value of the form "#" followed by hex digits
 *
 * An attribute must be preceded by whitespace or the start of the string
 * and followed by whitespace or the end of the string.
 */
# Character class for a single character of an attribute name
$attrib = '[A-Za-z0-9]';
# Character class for whitespace: tab, line feed, carriage return, space
$space = '[\x09\x0a\x0d\x20]';
define( 'MW_ATTRIBS_REGEX',
        "/(?:^|$space)($attrib+)
          ($space*=$space*
                (?:
                 # The attribute value: quoted or alone
                  \"([^<\"]*)\"
                 | '([^<']*)'
                 |  ([a-zA-Z0-9!#$%&()*,\\-.\\/:;<>?@[\\]^_`{|}~]+)
                 |  (\#[0-9a-fA-F]+) # Technically wrong, but lots of
                                                         # colors are specified like this.
                                                         # We'll be normalizing it.
                )
           )?(?=$space|\$)/sx" );
  58
  59 /**
  60  * List of all named character entities defined in HTML 4.01
  61  * http://www.w3.org/TR/html4/sgml/entities.html
  62  * @private
  63  */
  64 global $wgHtmlEntities;
  65 $wgHtmlEntities = array(
  66         'Aacute'   => 193,
  67         'aacute'   => 225,
  68         'Acirc'    => 194,
  69         'acirc'    => 226,
  70         'acute'    => 180,
  71         'AElig'    => 198,
  72         'aelig'    => 230,
  73         'Agrave'   => 192,
  74         'agrave'   => 224,
  75         'alefsym'  => 8501,
  76         'Alpha'    => 913,
  77         'alpha'    => 945,
  78         'amp'      => 38,
  79         'and'      => 8743,
  80         'ang'      => 8736,
  81         'Aring'    => 197,
  82         'aring'    => 229,
  83         'asymp'    => 8776,
  84         'Atilde'   => 195,
  85         'atilde'   => 227,
  86         'Auml'     => 196,
  87         'auml'     => 228,
  88         'bdquo'    => 8222,
  89         'Beta'     => 914,
  90         'beta'     => 946,
  91         'brvbar'   => 166,
  92         'bull'     => 8226,
  93         'cap'      => 8745,
  94         'Ccedil'   => 199,
  95         'ccedil'   => 231,
  96         'cedil'    => 184,
  97         'cent'     => 162,
  98         'Chi'      => 935,
  99         'chi'      => 967,
 100         'circ'     => 710,
 101         'clubs'    => 9827,
 102         'cong'     => 8773,
 103         'copy'     => 169,
 104         'crarr'    => 8629,
 105         'cup'      => 8746,
 106         'curren'   => 164,
 107         'dagger'   => 8224,
 108         'Dagger'   => 8225,
 109         'darr'     => 8595,
 110         'dArr'     => 8659,
 111         'deg'      => 176,
 112         'Delta'    => 916,
 113         'delta'    => 948,
 114         'diams'    => 9830,
 115         'divide'   => 247,
 116         'Eacute'   => 201,
 117         'eacute'   => 233,
 118         'Ecirc'    => 202,
 119         'ecirc'    => 234,
 120         'Egrave'   => 200,
 121         'egrave'   => 232,
 122         'empty'    => 8709,
 123         'emsp'     => 8195,
 124         'ensp'     => 8194,
 125         'Epsilon'  => 917,
 126         'epsilon'  => 949,
 127         'equiv'    => 8801,
 128         'Eta'      => 919,
 129         'eta'      => 951,
 130         'ETH'      => 208,
 131         'eth'      => 240,
 132         'Euml'     => 203,
 133         'euml'     => 235,
 134         'euro'     => 8364,
 135         'exist'    => 8707,
 136         'fnof'     => 402,
 137         'forall'   => 8704,
 138         'frac12'   => 189,
 139         'frac14'   => 188,
 140         'frac34'   => 190,
 141         'frasl'    => 8260,
 142         'Gamma'    => 915,
 143         'gamma'    => 947,
 144         'ge'       => 8805,
 145         'gt'       => 62,
 146         'harr'     => 8596,
 147         'hArr'     => 8660,
 148         'hearts'   => 9829,
 149         'hellip'   => 8230,
 150         'Iacute'   => 205,
 151         'iacute'   => 237,
 152         'Icirc'    => 206,
 153         'icirc'    => 238,
 154         'iexcl'    => 161,
 155         'Igrave'   => 204,
 156         'igrave'   => 236,
 157         'image'    => 8465,
 158         'infin'    => 8734,
 159         'int'      => 8747,
 160         'Iota'     => 921,
 161         'iota'     => 953,
 162         'iquest'   => 191,
 163         'isin'     => 8712,
 164         'Iuml'     => 207,
 165         'iuml'     => 239,
 166         'Kappa'    => 922,
 167         'kappa'    => 954,
 168         'Lambda'   => 923,
 169         'lambda'   => 955,
 170         'lang'     => 9001,
 171         'laquo'    => 171,
 172         'larr'     => 8592,
 173         'lArr'     => 8656,
 174         'lceil'    => 8968,
 175         'ldquo'    => 8220,
 176         'le'       => 8804,
 177         'lfloor'   => 8970,
 178         'lowast'   => 8727,
 179         'loz'      => 9674,
 180         'lrm'      => 8206,
 181         'lsaquo'   => 8249,
 182         'lsquo'    => 8216,
 183         'lt'       => 60,
 184         'macr'     => 175,
 185         'mdash'    => 8212,
 186         'micro'    => 181,
 187         'middot'   => 183,
 188         'minus'    => 8722,
 189         'Mu'       => 924,
 190         'mu'       => 956,
 191         'nabla'    => 8711,
 192         'nbsp'     => 160,
 193         'ndash'    => 8211,
 194         'ne'       => 8800,
 195         'ni'       => 8715,
 196         'not'      => 172,
 197         'notin'    => 8713,
 198         'nsub'     => 8836,
 199         'Ntilde'   => 209,
 200         'ntilde'   => 241,
 201         'Nu'       => 925,
 202         'nu'       => 957,
 203         'Oacute'   => 211,
 204         'oacute'   => 243,
 205         'Ocirc'    => 212,
 206         'ocirc'    => 244,
 207         'OElig'    => 338,
 208         'oelig'    => 339,
 209         'Ograve'   => 210,
 210         'ograve'   => 242,
 211         'oline'    => 8254,
 212         'Omega'    => 937,
 213         'omega'    => 969,
 214         'Omicron'  => 927,
 215         'omicron'  => 959,
 216         'oplus'    => 8853,
 217         'or'       => 8744,
 218         'ordf'     => 170,
 219         'ordm'     => 186,
 220         'Oslash'   => 216,
 221         'oslash'   => 248,
 222         'Otilde'   => 213,
 223         'otilde'   => 245,
 224         'otimes'   => 8855,
 225         'Ouml'     => 214,
 226         'ouml'     => 246,
 227         'para'     => 182,
 228         'part'     => 8706,
 229         'permil'   => 8240,
 230         'perp'     => 8869,
 231         'Phi'      => 934,
 232         'phi'      => 966,
 233         'Pi'       => 928,
 234         'pi'       => 960,
 235         'piv'      => 982,
 236         'plusmn'   => 177,
 237         'pound'    => 163,
 238         'prime'    => 8242,
 239         'Prime'    => 8243,
 240         'prod'     => 8719,
 241         'prop'     => 8733,
 242         'Psi'      => 936,
 243         'psi'      => 968,
 244         'quot'     => 34,
 245         'radic'    => 8730,
 246         'rang'     => 9002,
 247         'raquo'    => 187,
 248         'rarr'     => 8594,
 249         'rArr'     => 8658,
 250         'rceil'    => 8969,
 251         'rdquo'    => 8221,
 252         'real'     => 8476,
 253         'reg'      => 174,
 254         'rfloor'   => 8971,
 255         'Rho'      => 929,
 256         'rho'      => 961,
 257         'rlm'      => 8207,
 258         'rsaquo'   => 8250,
 259         'rsquo'    => 8217,
 260         'sbquo'    => 8218,
 261         'Scaron'   => 352,
 262         'scaron'   => 353,
 263         'sdot'     => 8901,
 264         'sect'     => 167,
 265         'shy'      => 173,
 266         'Sigma'    => 931,
 267         'sigma'    => 963,
 268         'sigmaf'   => 962,
 269         'sim'      => 8764,
 270         'spades'   => 9824,
 271         'sub'      => 8834,
 272         'sube'     => 8838,
 273         'sum'      => 8721,
 274         'sup'      => 8835,
 275         'sup1'     => 185,
 276         'sup2'     => 178,
 277         'sup3'     => 179,
 278         'supe'     => 8839,
 279         'szlig'    => 223,
 280         'Tau'      => 932,
 281         'tau'      => 964,
 282         'there4'   => 8756,
 283         'Theta'    => 920,
 284         'theta'    => 952,
 285         'thetasym' => 977,
 286         'thinsp'   => 8201,
 287         'THORN'    => 222,
 288         'thorn'    => 254,
 289         'tilde'    => 732,
 290         'times'    => 215,
 291         'trade'    => 8482,
 292         'Uacute'   => 218,
 293         'uacute'   => 250,
 294         'uarr'     => 8593,
 295         'uArr'     => 8657,
 296         'Ucirc'    => 219,
 297         'ucirc'    => 251,
 298         'Ugrave'   => 217,
 299         'ugrave'   => 249,
 300         'uml'      => 168,
 301         'upsih'    => 978,
 302         'Upsilon'  => 933,
 303         'upsilon'  => 965,
 304         'Uuml'     => 220,
 305         'uuml'     => 252,
 306         'weierp'   => 8472,
 307         'Xi'       => 926,
 308         'xi'       => 958,
 309         'Yacute'   => 221,
 310         'yacute'   => 253,
 311         'yen'      => 165,
 312         'Yuml'     => 376,
 313         'yuml'     => 255,
 314         'Zeta'     => 918,
 315         'zeta'     => 950,
 316         'zwj'      => 8205,
 317         'zwnj'     => 8204 );
 318
 319 /** @package MediaWiki */
 320 class Sanitizer {
 321         /**
 322          * Cleans up HTML, removes dangerous tags and attributes, and
 323          * removes HTML comments
 324          * @private
 325          * @param string $text
 326          * @param callback $processCallback to do any variable or parameter replacements in HTML attribute values
 327          * @param array $args for the processing callback
 328          * @return string
 329          */
 330         function removeHTMLtags( $text, $processCallback = null, $args = array() ) {
 331                 global $wgUseTidy, $wgUserHtml;
 332                 $fname = 'Parser::removeHTMLtags';
 333                 wfProfileIn( $fname );
 334
 335                 if( $wgUserHtml ) {
 336                         $htmlpairs = array( # Tags that must be closed
 337                                 'b', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
 338                                 'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
 339                                 'strike', 'strong', 'tt', 'var', 'div', 'center',
 340                                 'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
 341                                 'ruby', 'rt' , 'rb' , 'rp', 'p', 'span', 'u'
 342                         );
 343                         $htmlsingle = array(
 344                                 'br', 'hr', 'li', 'dt', 'dd'
 345                         );
 346                         $htmlsingleonly = array( # Elements that cannot have close tags
 347                                 'br', 'hr'
 348                         );
 349                         $htmlnest = array( # Tags that can be nested--??
 350                                 'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
 351                                 'dl', 'font', 'big', 'small', 'sub', 'sup', 'span'
 352                         );
 353                         $tabletags = array( # Can only appear inside table
 354                                 'td', 'th', 'tr',
 355                         );
 356                         $htmllist = array( # Tags used by list
 357                                 'ul','ol',
 358                         );
 359                         $listtags = array( # Tags that can appear in a list
 360                                 'li',
 361                         );
 362
 363                 } else {
 364                         $htmlpairs = array();
 365                         $htmlsingle = array();
 366                         $htmlnest = array();
 367                         $tabletags = array();
 368                 }
 369
 370                 $htmlsingleallowed = array_merge( $htmlsingle, $tabletags );
 371                 $htmlelements = array_merge( $htmlsingle, $htmlpairs, $htmlnest );
 372
 373                 # Remove HTML comments
 374                 $text = Sanitizer::removeHTMLcomments( $text );
 375                 $bits = explode( '<', $text );
 376                 $text = array_shift( $bits );
 377                 if(!$wgUseTidy) {
 378                         $tagstack = array(); $tablestack = array();
 379                         foreach ( $bits as $x ) {
 380                                 $prev = error_reporting( E_ALL & ~( E_NOTICE | E_WARNING ) );
 381                                 preg_match( '/^(\\/?)(\\w+)([^>]*?)(\\/{0,1}>)([^<]*)$/',
 382                                 $x, $regs );
 383                                 list( $qbar, $slash, $t, $params, $brace, $rest ) = $regs;
 384                                 error_reporting( $prev );
 385
 386                                 $badtag = 0 ;
 387                                 if ( in_array( $t = strtolower( $t ), $htmlelements ) ) {
 388                                         # Check our stack
 389                                         if ( $slash ) {
 390                                                 # Closing a tag...
 391                                                 if( in_array( $t, $htmlsingleonly ) ) {
 392                                                         $badtag = 1;
 393                                                 } elseif ( ( $ot = @array_pop( $tagstack ) ) != $t ) {
 394                                                         if ( in_array($ot, $htmlsingleallowed) ) {
 395                                                                 # Pop all elements with an optional close tag
 396                                                                 # and see if we find a match below them
 397                                                                 $optstack = array();
 398                                                                 array_push ($optstack, $ot);
 399                                                                 while ( ( ( $ot = @array_pop( $tagstack ) ) != $t ) &&
 400                                                                                                 in_array($ot, $htmlsingleallowed) ) {
 401                                                                         array_push ($optstack, $ot);
 402                                                                 }
 403                                                                 if ( $t != $ot ) {
 404                                                                         # No match. Push the optinal elements back again
 405                                                                         $badtag = 1;
 406                                                                         while ( $ot = @array_pop( $optstack ) ) {
 407                                                                                 array_push( $tagstack, $ot );
 408                                                                         }
 409                                                                 }
 410                                                         } else {
 411                                                                 @array_push( $tagstack, $ot );
 412                                                                 # <li> can be nested in <ul> or <ol>, skip those cases:
 413                                                                 if(!(in_array($ot, $htmllist) && in_array($t, $listtags) )) {
 414                                                                         $badtag = 1;
 415                                                                 }
 416                                                         }
 417                                                 } else {
 418                                                         if ( $t == 'table' ) {
 419                                                                 $tagstack = array_pop( $tablestack );
 420                                                         }
 421                                                 }
 422                                                 $newparams = '';
 423                                         } else {
 424                                                 # Keep track for later
 425                                                 if ( in_array( $t, $tabletags ) &&
 426                                                 ! in_array( 'table', $tagstack ) ) {
 427                                                         $badtag = 1;
 428                                                 } else if ( in_array( $t, $tagstack ) &&
 429                                                 ! in_array ( $t , $htmlnest ) ) {
 430                                                         $badtag = 1 ;
 431                                                 # Is it a self closed htmlpair ? (bug 5487)
 432                                                 } else if( $brace == '/>' &&
 433                                                 in_array($t, $htmlpairs) ) {
 434                                                         $badtag = 1;
 435                                                 } elseif( in_array( $t, $htmlsingleonly ) ) {
 436                                                         # Hack to force empty tag for uncloseable elements
 437                                                         $brace = '/>';
 438                                                 } else if( in_array( $t, $htmlsingle ) ) {
 439                                                         # Hack to not close $htmlsingle tags
 440                                                         $brace = NULL;
 441                                                 } else {
 442                                                         if ( $t == 'table' ) {
 443                                                                 array_push( $tablestack, $tagstack );
 444                                                                 $tagstack = array();
 445                                                         }
 446                                                         array_push( $tagstack, $t );
 447                                                 }
 448
 449                                                 # Replace any variables or template parameters with
 450                                                 # plaintext results.
 451                                                 if( is_callable( $processCallback ) ) {
 452                                                         call_user_func_array( $processCallback, array( &$params, $args ) );
 453                                                 }
 454
 455                                                 # Strip non-approved attributes from the tag
 456                                                 $newparams = Sanitizer::fixTagAttributes( $params, $t );
 457                                         }
 458                                         if ( ! $badtag ) {
 459                                                 $rest = str_replace( '>', '&gt;', $rest );
 460                                                 $close = ( $brace == '/>' ) ? ' /' : '';
 461                                                 $text .= "<$slash$t$newparams$close>$rest";
 462                                                 continue;
 463                                         }
 464                                 }
 465                                 $text .= '&lt;' . str_replace( '>', '&gt;', $x);
 466                         }
 467                         # Close off any remaining tags
 468                         while ( is_array( $tagstack ) && ($t = array_pop( $tagstack )) ) {
 469                                 $text .= "</$t>\n";
 470                                 if ( $t == 'table' ) { $tagstack = array_pop( $tablestack ); }
 471                         }
 472                 } else {
 473                         # this might be possible using tidy itself
 474                         foreach ( $bits as $x ) {
 475                                 preg_match( '/^(\\/?)(\\w+)([^>]*?)(\\/{0,1}>)([^<]*)$/',
 476                                 $x, $regs );
 477                                 @list( $qbar, $slash, $t, $params, $brace, $rest ) = $regs;
 478                                 if ( in_array( $t = strtolower( $t ), $htmlelements ) ) {
 479                                         if( is_callable( $processCallback ) ) {
 480                                                 call_user_func_array( $processCallback, array( &$params, $args ) );
 481                                         }
 482                                         $newparams = Sanitizer::fixTagAttributes( $params, $t );
 483                                         $rest = str_replace( '>', '&gt;', $rest );
 484                                         $text .= "<$slash$t$newparams$brace$rest";
 485                                 } else {
 486                                         $text .= '&lt;' . str_replace( '>', '&gt;', $x);
 487                                 }
 488                         }
 489                 }
 490                 wfProfileOut( $fname );
 491                 return $text;
 492         }
 493
 494         /**
 495          * Remove '<!--', '-->', and everything between.
 496          * To avoid leaving blank lines, when a comment is both preceded
 497          * and followed by a newline (ignoring spaces), trim leading and
 498          * trailing spaces and one of the newlines.
 499          *
 500          * @private
 501          * @param string $text
 502          * @return string
 503          */
 504         function removeHTMLcomments( $text ) {
 505                 $fname='Parser::removeHTMLcomments';
 506                 wfProfileIn( $fname );
 507                 while (($start = strpos($text, '<!--')) !== false) {
 508                         $end = strpos($text, '-->', $start + 4);
 509                         if ($end === false) {
 510                                 # Unterminated comment; bail out
 511                                 break;
 512                         }
 513
 514                         $end += 3;
 515
 516                         # Trim space and newline if the comment is both
 517                         # preceded and followed by a newline
 518                         $spaceStart = max($start - 1, 0);
 519                         $spaceLen = $end - $spaceStart;
 520                         while (substr($text, $spaceStart, 1) === ' ' && $spaceStart > 0) {
 521                                 $spaceStart--;
 522                                 $spaceLen++;
 523                         }
 524                         while (substr($text, $spaceStart + $spaceLen, 1) === ' ')
 525                                 $spaceLen++;
 526                         if (substr($text, $spaceStart, 1) === "\n" and substr($text, $spaceStart + $spaceLen, 1) === "\n") {
 527                                 # Remove the comment, leading and trailing
 528                                 # spaces, and leave only one newline.
 529                                 $text = substr_replace($text, "\n", $spaceStart, $spaceLen + 1);
 530                         }
 531                         else {
 532                                 # Remove just the comment.
 533                                 $text = substr_replace($text, '', $start, $end - $start);
 534                         }
 535                 }
 536                 wfProfileOut( $fname );
 537                 return $text;
 538         }
 539
 540         /**
 541          * Take an array of attribute names and values and normalize or discard
 542          * illegal values for the given element type.
 543          *
 544          * - Discards attributes not on a whitelist for the given element
 545          * - Unsafe style attributes are discarded
 546          *
 547          * @param array $attribs
 548          * @param string $element
 549          * @return array
 550          *
 551          * @todo Check for legal values where the DTD limits things.
 552          * @todo Check for unique id attribute :P
 553          */
 554         function validateTagAttributes( $attribs, $element ) {
 555                 $whitelist = array_flip( Sanitizer::attributeWhitelist( $element ) );
 556                 $out = array();
 557                 foreach( $attribs as $attribute => $value ) {
 558                         if( !isset( $whitelist[$attribute] ) ) {
 559                                 continue;
 560                         }
 561                         # Strip javascript "expression" from stylesheets.
 562                         # http://msdn.microsoft.com/workshop/author/dhtml/overview/recalc.asp
 563                         if( $attribute == 'style' ) {
 564                                 $stripped = Sanitizer::decodeCharReferences( $value );
 565
 566                                 // Remove any comments; IE gets token splitting wrong
 567                                 $stripped = preg_replace( '!/\\*.*?\\*/!S', ' ', $stripped );
 568                                 $value = $stripped;
 569
 570                                 // ... and continue checks
 571                                 $stripped = preg_replace( '!\\\\([0-9A-Fa-f]{1,6})[ \\n\\r\\t\\f]?!e',
 572                                         'codepointToUtf8(hexdec("$1"))', $stripped );
 573                                 $stripped = str_replace( '\\', '', $stripped );
 574                                 if( preg_match( '/(expression|tps*:\/\/|url\\s*\().*/is',
 575                                                 $stripped ) ) {
 576                                         # haxx0r
 577                                         continue;
 578                                 }
 579                         }
 580
 581                         if ( $attribute === 'id' )
 582                                 $value = Sanitizer::escapeId( $value );
 583
 584                         // If this attribute was previously set, override it.
 585                         // Output should only have one attribute of each name.
 586                         $out[$attribute] = $value;
 587                 }
 588                 return $out;
 589         }
 590
 591         /**
 592          * Take a tag soup fragment listing an HTML element's attributes
 593          * and normalize it to well-formed XML, discarding unwanted attributes.
 594          * Output is safe for further wikitext processing, with escaping of
 595          * values that could trigger problems.
 596          *
 597          * - Normalizes attribute names to lowercase
 598          * - Discards attributes not on a whitelist for the given element
 599          * - Turns broken or invalid entities into plaintext
 600          * - Double-quotes all attribute values
 601          * - Attributes without values are given the name as attribute
 602          * - Double attributes are discarded
 603          * - Unsafe style attributes are discarded
 604          * - Prepends space if there are attributes.
 605          *
 606          * @param string $text
 607          * @param string $element
 608          * @return string
 609          */
 610         function fixTagAttributes( $text, $element ) {
 611                 if( trim( $text ) == '' ) {
 612                         return '';
 613                 }
 614
 615                 $stripped = Sanitizer::validateTagAttributes(
 616                         Sanitizer::decodeTagAttributes( $text ), $element );
 617
 618                 $attribs = array();
 619                 foreach( $stripped as $attribute => $value ) {
 620                         $encAttribute = htmlspecialchars( $attribute );
 621
 622                         $encValue = htmlspecialchars( $value );
 623                         # Templates and links may be expanded in later parsing,
 624                         # creating invalid or dangerous output. Suppress this.
 625                         $encValue = strtr( $encValue, array(
 626                                 '<'    => '&lt;',   // This should never happen,
 627                                 '>'    => '&gt;',   // we've received invalid input
 628                                 '"'    => '&quot;', // which should have been escaped.
 629                                 '{'    => '&#123;',
 630                                 '['    => '&#91;',
 631                                 "''"   => '&#39;&#39;',
 632                                 'ISBN' => '&#73;SBN',
 633                                 'RFC'  => '&#82;FC',
 634                                 'PMID' => '&#80;MID',
 635                                 '|'    => '&#124;',
 636                                 '__'   => '&#95;_',
 637                         ) );
 638
 639                         # Stupid hack
 640                         $encValue = preg_replace_callback(
 641                                 '/(' . wfUrlProtocols() . ')/',
 642                                 array( 'Sanitizer', 'armorLinksCallback' ),
 643                                 $encValue );
 644
 645                         $attribs[] = "$encAttribute=\"$encValue\"";
 646                 }
 647                 return count( $attribs ) ? ' ' . implode( ' ', $attribs ) : '';
 648         }
 649
 650         /**
 651          * Given a value escape it so that it can be used in an id attribute and
 652          * return it, this does not validate the value however (see first link)
 653          *
 654          * @link http://www.w3.org/TR/html401/types.html#type-name Valid characters
 655          *                                                          in the id and
 656          *                                                          name attributes
 657          * @link http://www.w3.org/TR/html401/struct/links.html#h-12.2.3 Anchors with the id attribute
 658          *
 659          * @bug 4461
 660          *
 661          * @static
 662          *
 663          * @param string $id
 664          * @return string
 665          */
 666         function escapeId( $id ) {
 667                 static $replace = array(
 668                         '%3A' => ':',
 669                         '%' => '.'
 670                 );
 671
 672                 $id = urlencode( Sanitizer::decodeCharReferences( strtr( $id, ' ', '_' ) ) );
 673
 674                 return str_replace( array_keys( $replace ), array_values( $replace ), $id );
 675         }
 676
 677         /**
 678          * Regex replace callback for armoring links against further processing.
 679          * @param array $matches
 680          * @return string
 681          * @private
 682          */
 683         function armorLinksCallback( $matches ) {
 684                 return str_replace( ':', '&#58;', $matches[1] );
 685         }
 686
 687         /**
 688          * Return an associative array of attribute names and values from
 689          * a partial tag string. Attribute names are forced to lowercase,
 690          * character references are decoded to UTF-8 text.
 691          *
 692          * @param string $text
 693          * @return array
 694          */
 695         function decodeTagAttributes( $text ) {
 696                 $attribs = array();
 697
 698                 if( trim( $text ) == '' ) {
 699                         return $attribs;
 700                 }
 701
 702                 $pairs = array();
 703                 if( !preg_match_all(
 704                         MW_ATTRIBS_REGEX,
 705                         $text,
 706                         $pairs,
 707                         PREG_SET_ORDER ) ) {
 708                         return $attribs;
 709                 }
 710
 711                 foreach( $pairs as $set ) {
 712                         $attribute = strtolower( $set[1] );
 713                         $value = Sanitizer::getTagAttributeCallback( $set );
 714                         $attribs[$attribute] = Sanitizer::decodeCharReferences( $value );
 715                 }
 716                 return $attribs;
 717         }
 718
 719         /**
 720          * Pick the appropriate attribute value from a match set from the
 721          * MW_ATTRIBS_REGEX matches.
 722          *
 723          * @param array $set
 724          * @return string
 725          * @private
 726          */
 727         function getTagAttributeCallback( $set ) {
 728                 if( isset( $set[6] ) ) {
 729                         # Illegal #XXXXXX color with no quotes.
 730                         return $set[6];
 731                 } elseif( isset( $set[5] ) ) {
 732                         # No quotes.
 733                         return $set[5];
 734                 } elseif( isset( $set[4] ) ) {
 735                         # Single-quoted
 736                         return $set[4];
 737                 } elseif( isset( $set[3] ) ) {
 738                         # Double-quoted
 739                         return $set[3];
 740                 } elseif( !isset( $set[2] ) ) {
 741                         # In XHTML, attributes must have a value.
 742                         # For 'reduced' form, return explicitly the attribute name here.
 743                         return $set[1];
 744                 } else {
 745                         wfDebugDieBacktrace( "Tag conditions not met. This should never happen and is a bug." );
 746                 }
 747         }
 748
 749         /**
 750          * Normalize whitespace and character references in an XML source-
 751          * encoded text for an attribute value.
 752          *
 753          * See http://www.w3.org/TR/REC-xml/#AVNormalize for background,
 754          * but note that we're not returning the value, but are returning
 755          * XML source fragments that will be slapped into output.
 756          *
 757          * @param string $text
 758          * @return string
 759          * @private
 760          */
 761         function normalizeAttributeValue( $text ) {
 762                 return str_replace( '"', '&quot;',
 763                         preg_replace(
 764                                 '/\r\n|[\x20\x0d\x0a\x09]/',
 765                                 ' ',
 766                                 Sanitizer::normalizeCharReferences( $text ) ) );
 767         }
 768
 769         /**
 770          * Ensure that any entities and character references are legal
 771          * for XML and XHTML specifically. Any stray bits will be
 772          * &amp;-escaped to result in a valid text fragment.
 773          *
 774          * a. any named char refs must be known in XHTML
 775          * b. any numeric char refs must be legal chars, not invalid or forbidden
 776          * c. use &#x, not &#X
 777          * d. fix or reject non-valid attributes
 778          *
 779          * @param string $text
 780          * @return string
 781          * @private
 782          */
 783         function normalizeCharReferences( $text ) {
 784                 return preg_replace_callback(
 785                         MW_CHAR_REFS_REGEX,
 786                         array( 'Sanitizer', 'normalizeCharReferencesCallback' ),
 787                         $text );
 788         }
 789         /**
 790          * @param array $matches
 791          * @return string
 792          */
 793         function normalizeCharReferencesCallback( $matches ) {
 794                 $ret = null;
 795                 if( $matches[1] != '' ) {
 796                         $ret = Sanitizer::normalizeEntity( $matches[1] );
 797                 } elseif( $matches[2] != '' ) {
 798                         $ret = Sanitizer::decCharReference( $matches[2] );
 799                 } elseif( $matches[3] != ''  ) {
 800                         $ret = Sanitizer::hexCharReference( $matches[3] );
 801                 } elseif( $matches[4] != '' ) {
 802                         $ret = Sanitizer::hexCharReference( $matches[4] );
 803                 }
 804                 if( is_null( $ret ) ) {
 805                         return htmlspecialchars( $matches[0] );
 806                 } else {
 807                         return $ret;
 808                 }
 809         }
 810
 811         /**
 812          * If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD,
 813          * return the named entity reference as is. Otherwise, returns
 814          * HTML-escaped text of pseudo-entity source (eg &amp;foo;)
 815          *
 816          * @param string $name
 817          * @return string
 818          */
 819         function normalizeEntity( $name ) {
 820                 global $wgHtmlEntities;
 821                 if( isset( $wgHtmlEntities[$name] ) ) {
 822                         return "&$name;";
 823                 } else {
 824                         return "&amp;$name;";
 825                 }
 826         }
 827
 828         function decCharReference( $codepoint ) {
 829                 $point = intval( $codepoint );
 830                 if( Sanitizer::validateCodepoint( $point ) ) {
 831                         return sprintf( '&#%d;', $point );
 832                 } else {
 833                         return null;
 834                 }
 835         }
 836
 837         function hexCharReference( $codepoint ) {
 838                 $point = hexdec( $codepoint );
 839                 if( Sanitizer::validateCodepoint( $point ) ) {
 840                         return sprintf( '&#x%x;', $point );
 841                 } else {
 842                         return null;
 843                 }
 844         }
 845
 846         /**
 847          * Returns true if a given Unicode codepoint is a valid character in XML.
 848          * @param int $codepoint
 849          * @return bool
 850          */
 851         function validateCodepoint( $codepoint ) {
 852                 return ($codepoint ==    0x09)
 853                         || ($codepoint ==    0x0a)
 854                         || ($codepoint ==    0x0d)
 855                         || ($codepoint >=    0x20 && $codepoint <=   0xd7ff)
 856                         || ($codepoint >=  0xe000 && $codepoint <=   0xfffd)
 857                         || ($codepoint >= 0x10000 && $codepoint <= 0x10ffff);
 858         }
 859
 860         /**
 861          * Decode any character references, numeric or named entities,
 862          * in the text and return a UTF-8 string.
 863          *
 864          * @param string $text
 865          * @return string
 866          * @public
 867          */
 868         function decodeCharReferences( $text ) {
 869                 return preg_replace_callback(
 870                         MW_CHAR_REFS_REGEX,
 871                         array( 'Sanitizer', 'decodeCharReferencesCallback' ),
 872                         $text );
 873         }
 874
 875         /**
 876          * @param array $matches
 877          * @return string
 878          */
 879         function decodeCharReferencesCallback( $matches ) {
 880                 if( $matches[1] != '' ) {
 881                         return Sanitizer::decodeEntity( $matches[1] );
 882                 } elseif( $matches[2] != '' ) {
 883                         return  Sanitizer::decodeChar( intval( $matches[2] ) );
 884                 } elseif( $matches[3] != ''  ) {
 885                         return  Sanitizer::decodeChar( hexdec( $matches[3] ) );
 886                 } elseif( $matches[4] != '' ) {
 887                         return  Sanitizer::decodeChar( hexdec( $matches[4] ) );
 888                 }
 889                 # Last case should be an ampersand by itself
 890                 return $matches[0];
 891         }
 892
 893         /**
 894          * Return UTF-8 string for a codepoint if that is a valid
 895          * character reference, otherwise U+FFFD REPLACEMENT CHARACTER.
 896          * @param int $codepoint
 897          * @return string
 898          * @private
 899          */
 900         function decodeChar( $codepoint ) {
 901                 if( Sanitizer::validateCodepoint( $codepoint ) ) {
 902                         return codepointToUtf8( $codepoint );
 903                 } else {
 904                         return UTF8_REPLACEMENT;
 905                 }
 906         }
 907
 908         /**
 909          * If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD,
 910          * return the UTF-8 encoding of that character. Otherwise, returns
 911          * pseudo-entity source (eg &foo;)
 912          *
 913          * @param string $name
 914          * @return string
 915          */
 916         function decodeEntity( $name ) {
 917                 global $wgHtmlEntities;
 918                 if( isset( $wgHtmlEntities[$name] ) ) {
 919                         return codepointToUtf8( $wgHtmlEntities[$name] );
 920                 } else {
 921                         return "&$name;";
 922                 }
 923         }
 924
 925         /**
 926          * Fetch the whitelist of acceptable attributes for a given
 927          * element name.
 928          *
 929          * @param string $element
 930          * @return array
 931          */
 932         function attributeWhitelist( $element ) {
 933                 static $list;
 934                 if( !isset( $list ) ) {
 935                         $list = Sanitizer::setupAttributeWhitelist();
 936                 }
 937                 return isset( $list[$element] )
 938                         ? $list[$element]
 939                         : array();
 940         }
 941
 942         /**
 943          * @return array
 944          */
 945         function setupAttributeWhitelist() {
 946                 $common = array( 'id', 'class', 'lang', 'dir', 'title', 'style' );
 947                 $block = array_merge( $common, array( 'align' ) );
 948                 $tablealign = array( 'align', 'char', 'charoff', 'valign' );
 949                 $tablecell = array( 'abbr',
 950                                     'axis',
 951                                     'headers',
 952                                     'scope',
 953                                     'rowspan',
 954                                     'colspan',
 955                                     'nowrap', # deprecated
 956                                     'width',  # deprecated
 957                                     'height', # deprecated
 958                                     'bgcolor' # deprecated
 959                                     );
 960
 961                 # Numbers refer to sections in HTML 4.01 standard describing the element.
 962                 # See: http://www.w3.org/TR/html4/
 963                 $whitelist = array (
 964                         # 7.5.4
 965                         'div'        => $block,
 966                         'center'     => $common, # deprecated
 967                         'span'       => $block, # ??
 968
 969                         # 7.5.5
 970                         'h1'         => $block,
 971                         'h2'         => $block,
 972                         'h3'         => $block,
 973                         'h4'         => $block,
 974                         'h5'         => $block,
 975                         'h6'         => $block,
 976
 977                         # 7.5.6
 978                         # address
 979
 980                         # 8.2.4
 981                         # bdo
 982
 983                         # 9.2.1
 984                         'em'         => $common,
 985                         'strong'     => $common,
 986                         'cite'       => $common,
 987                         # dfn
 988                         'code'       => $common,
 989                         # samp
 990                         # kbd
 991                         'var'        => $common,
 992                         # abbr
 993                         # acronym
 994
 995                         # 9.2.2
 996                         'blockquote' => array_merge( $common, array( 'cite' ) ),
 997                         # q
 998
 999                         # 9.2.3
1000                         'sub'        => $common,
1001                         'sup'        => $common,
1002
1003                         # 9.3.1
1004                         'p'          => $block,
1005
1006                         # 9.3.2
1007                         'br'         => array( 'id', 'class', 'title', 'style', 'clear' ),
1008
1009                         # 9.3.4
1010                         'pre'        => array_merge( $common, array( 'width' ) ),
1011
1012                         # 9.4
1013                         'ins'        => array_merge( $common, array( 'cite', 'datetime' ) ),
1014                         'del'        => array_merge( $common, array( 'cite', 'datetime' ) ),
1015
1016                         # 10.2
1017                         'ul'         => array_merge( $common, array( 'type' ) ),
1018                         'ol'         => array_merge( $common, array( 'type', 'start' ) ),
1019                         'li'         => array_merge( $common, array( 'type', 'value' ) ),
1020
1021                         # 10.3
1022                         'dl'         => $common,
1023                         'dd'         => $common,
1024                         'dt'         => $common,
1025
1026                         # 11.2.1
1027                         'table'      => array_merge( $common,
1028                                                                 array( 'summary', 'width', 'border', 'frame',
1029                                                                                          'rules', 'cellspacing', 'cellpadding',
1030                                                                                          'align', 'bgcolor', 'frame', 'rules',
1031                                                                                          'border' ) ),
1032
1033                         # 11.2.2
1034                         'caption'    => array_merge( $common, array( 'align' ) ),
1035
1036                         # 11.2.3
1037                         'thead'      => array_merge( $common, $tablealign ),
1038                         'tfoot'      => array_merge( $common, $tablealign ),
1039                         'tbody'      => array_merge( $common, $tablealign ),
1040
1041                         # 11.2.4
1042                         'colgroup'   => array_merge( $common, array( 'span', 'width' ), $tablealign ),
1043                         'col'        => array_merge( $common, array( 'span', 'width' ), $tablealign ),
1044
1045                         # 11.2.5
1046                         'tr'         => array_merge( $common, array( 'bgcolor' ), $tablealign ),
1047
1048                         # 11.2.6
1049                         'td'         => array_merge( $common, $tablecell, $tablealign ),
1050                         'th'         => array_merge( $common, $tablecell, $tablealign ),
1051
1052                         # 15.2.1
1053                         'tt'         => $common,
1054                         'b'          => $common,
1055                         'i'          => $common,
1056                         'big'        => $common,
1057                         'small'      => $common,
1058                         'strike'     => $common,
1059                         's'          => $common,
1060                         'u'          => $common,
1061
1062                         # 15.2.2
1063                         'font'       => array_merge( $common, array( 'size', 'color', 'face' ) ),
1064                         # basefont
1065
1066                         # 15.3
1067                         'hr'         => array_merge( $common, array( 'noshade', 'size', 'width' ) ),
1068
1069                         # XHTML Ruby annotation text module, simple ruby only.
1070                         # http://www.w3c.org/TR/ruby/
1071                         'ruby'       => $common,
1072                         # rbc
1073                         # rtc
1074                         'rb'         => $common,
1075                         'rt'         => $common, #array_merge( $common, array( 'rbspan' ) ),
1076                         'rp'         => $common,
1077                         );
1078                 return $whitelist;
1079         }
1080
1081         /**
1082          * Take a fragment of (potentially invalid) HTML and return
1083          * a version with any tags removed, encoded suitably for literal
1084          * inclusion in an attribute value.
1085          *
1086          * @param string $text HTML fragment
1087          * @return string
1088          */
1089         function stripAllTags( $text ) {
1090                 # Actual <tags>
1091                 $text = preg_replace( '/ < .*? > /x', '', $text );
1092
1093                 # Normalize &entities and whitespace
1094                 $text = Sanitizer::normalizeAttributeValue( $text );
1095
1096                 # Will be placed into "double-quoted" attributes,
1097                 # make sure remaining bits are safe.
1098                 $text = str_replace(
1099                         array('<', '>', '"'),
1100                         array('&lt;', '&gt;', '&quot;'),
1101                         $text );
1102
1103                 return $text;
1104         }
1105
1106         /**
1107          * Hack up a private DOCTYPE with HTML's standard entity declarations.
1108          * PHP 4 seemed to know these if you gave it an HTML doctype, but
1109          * PHP 5.1 doesn't.
1110          *
1111          * Use for passing XHTML fragments to PHP's XML parsing functions
1112          *
1113          * @return string
1114          * @static
1115          */
1116         function hackDocType() {
1117                 global $wgHtmlEntities;
1118                 $out = "<!DOCTYPE html [\n";
1119                 foreach( $wgHtmlEntities as $entity => $codepoint ) {
1120                         $out .= "<!ENTITY $entity \"&#$codepoint;\">";
1121                 }
1122                 $out .= "]>\n";
1123                 return $out;
1124         }
1125
1126 }
1127
1128 ?>