includes/Sanitizer.php

   1 <?php
   2 /**
   3  * (X)HTML sanitizer for MediaWiki
   4  *
   5  * Copyright (C) 2002-2005 Brion Vibber <brion@pobox.com> et al
   6  * http://www.mediawiki.org/
   7  *
   8  * This program is free software; you can redistribute it and/or modify
   9  * it under the terms of the GNU General Public License as published by
  10  * the Free Software Foundation; either version 2 of the License, or
  11  * (at your option) any later version.
  12  *
  13  * This program is distributed in the hope that it will be useful,
  14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  16  * GNU General Public License for more details.
  17  *
  18  * You should have received a copy of the GNU General Public License along
  19  * with this program; if not, write to the Free Software Foundation, Inc.,
  20  * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
  21  * http://www.gnu.org/copyleft/gpl.html
  22  *
  23  * @package MediaWiki
  24  * @subpackage Parser
  25  */
  26
  27 /**
  28  * Regular expression to match various types of character references in
  29  * Sanitizer::normalizeCharReferences and Sanitizer::decodeCharReferences
  30  */
  31 define( 'MW_CHAR_REFS_REGEX',
  32         '/&([A-Za-z0-9]+);
  33          |&\#([0-9]+);
  34          |&\#x([0-9A-Za-z]+);
  35          |&\#X([0-9A-Za-z]+);
  36          |(&)/x' );
  37
  38 /**
  39  * Regular expression to match HTML/XML attribute pairs within a tag.
  40  * Allows some... latitude.
  41  * Used in Sanitizer::fixTagAttributes and Sanitizer::decodeTagAttributes
  42  */
# Character class for one character of an attribute name: ASCII letters and digits.
$attrib = '[A-Za-z0-9]';
# Character class for whitespace: tab, line feed, carriage return or space.
$space = '[\x09\x0a\x0d\x20]';
# Capture groups, as read by Sanitizer::getTagAttributeCallback:
#   1: attribute name
#   2: the whole "= value" part (not set when the attribute has no value)
#   3: double-quoted value
#   4: single-quoted value
#   5: unquoted value
#   6: unquoted #hex colour value
# The pattern is compiled with the x modifier, so the whitespace and the
# "#" comments inside the literal are part of the regex source, not of what
# it matches.
define( 'MW_ATTRIBS_REGEX',
        "/(?:^|$space)($attrib+)
          ($space*=$space*
                (?:
                 # The attribute value: quoted or alone
                  \"([^<\"]*)\"
                 | '([^<']*)'
                 |  ([a-zA-Z0-9!#$%&()*,\\-.\\/:;<>?@[\\]^_`{|}~]+)
                 |  (\#[0-9a-fA-F]+) # Technically wrong, but lots of
                                                         # colors are specified like this.
                                                         # We'll be normalizing it.
                )
           )?(?=$space|\$)/sx" );
  58
  59 /**
  60  * List of all named character entities defined in HTML 4.01
  61  * http://www.w3.org/TR/html4/sgml/entities.html
  62  * @access private
  63  */
  64 global $wgHtmlEntities;
  65 $wgHtmlEntities = array(
  66         'Aacute'   => 193,
  67         'aacute'   => 225,
  68         'Acirc'    => 194,
  69         'acirc'    => 226,
  70         'acute'    => 180,
  71         'AElig'    => 198,
  72         'aelig'    => 230,
  73         'Agrave'   => 192,
  74         'agrave'   => 224,
  75         'alefsym'  => 8501,
  76         'Alpha'    => 913,
  77         'alpha'    => 945,
  78         'amp'      => 38,
  79         'and'      => 8743,
  80         'ang'      => 8736,
  81         'Aring'    => 197,
  82         'aring'    => 229,
  83         'asymp'    => 8776,
  84         'Atilde'   => 195,
  85         'atilde'   => 227,
  86         'Auml'     => 196,
  87         'auml'     => 228,
  88         'bdquo'    => 8222,
  89         'Beta'     => 914,
  90         'beta'     => 946,
  91         'brvbar'   => 166,
  92         'bull'     => 8226,
  93         'cap'      => 8745,
  94         'Ccedil'   => 199,
  95         'ccedil'   => 231,
  96         'cedil'    => 184,
  97         'cent'     => 162,
  98         'Chi'      => 935,
  99         'chi'      => 967,
 100         'circ'     => 710,
 101         'clubs'    => 9827,
 102         'cong'     => 8773,
 103         'copy'     => 169,
 104         'crarr'    => 8629,
 105         'cup'      => 8746,
 106         'curren'   => 164,
 107         'dagger'   => 8224,
 108         'Dagger'   => 8225,
 109         'darr'     => 8595,
 110         'dArr'     => 8659,
 111         'deg'      => 176,
 112         'Delta'    => 916,
 113         'delta'    => 948,
 114         'diams'    => 9830,
 115         'divide'   => 247,
 116         'Eacute'   => 201,
 117         'eacute'   => 233,
 118         'Ecirc'    => 202,
 119         'ecirc'    => 234,
 120         'Egrave'   => 200,
 121         'egrave'   => 232,
 122         'empty'    => 8709,
 123         'emsp'     => 8195,
 124         'ensp'     => 8194,
 125         'Epsilon'  => 917,
 126         'epsilon'  => 949,
 127         'equiv'    => 8801,
 128         'Eta'      => 919,
 129         'eta'      => 951,
 130         'ETH'      => 208,
 131         'eth'      => 240,
 132         'Euml'     => 203,
 133         'euml'     => 235,
 134         'euro'     => 8364,
 135         'exist'    => 8707,
 136         'fnof'     => 402,
 137         'forall'   => 8704,
 138         'frac12'   => 189,
 139         'frac14'   => 188,
 140         'frac34'   => 190,
 141         'frasl'    => 8260,
 142         'Gamma'    => 915,
 143         'gamma'    => 947,
 144         'ge'       => 8805,
 145         'gt'       => 62,
 146         'harr'     => 8596,
 147         'hArr'     => 8660,
 148         'hearts'   => 9829,
 149         'hellip'   => 8230,
 150         'Iacute'   => 205,
 151         'iacute'   => 237,
 152         'Icirc'    => 206,
 153         'icirc'    => 238,
 154         'iexcl'    => 161,
 155         'Igrave'   => 204,
 156         'igrave'   => 236,
 157         'image'    => 8465,
 158         'infin'    => 8734,
 159         'int'      => 8747,
 160         'Iota'     => 921,
 161         'iota'     => 953,
 162         'iquest'   => 191,
 163         'isin'     => 8712,
 164         'Iuml'     => 207,
 165         'iuml'     => 239,
 166         'Kappa'    => 922,
 167         'kappa'    => 954,
 168         'Lambda'   => 923,
 169         'lambda'   => 955,
 170         'lang'     => 9001,
 171         'laquo'    => 171,
 172         'larr'     => 8592,
 173         'lArr'     => 8656,
 174         'lceil'    => 8968,
 175         'ldquo'    => 8220,
 176         'le'       => 8804,
 177         'lfloor'   => 8970,
 178         'lowast'   => 8727,
 179         'loz'      => 9674,
 180         'lrm'      => 8206,
 181         'lsaquo'   => 8249,
 182         'lsquo'    => 8216,
 183         'lt'       => 60,
 184         'macr'     => 175,
 185         'mdash'    => 8212,
 186         'micro'    => 181,
 187         'middot'   => 183,
 188         'minus'    => 8722,
 189         'Mu'       => 924,
 190         'mu'       => 956,
 191         'nabla'    => 8711,
 192         'nbsp'     => 160,
 193         'ndash'    => 8211,
 194         'ne'       => 8800,
 195         'ni'       => 8715,
 196         'not'      => 172,
 197         'notin'    => 8713,
 198         'nsub'     => 8836,
 199         'Ntilde'   => 209,
 200         'ntilde'   => 241,
 201         'Nu'       => 925,
 202         'nu'       => 957,
 203         'Oacute'   => 211,
 204         'oacute'   => 243,
 205         'Ocirc'    => 212,
 206         'ocirc'    => 244,
 207         'OElig'    => 338,
 208         'oelig'    => 339,
 209         'Ograve'   => 210,
 210         'ograve'   => 242,
 211         'oline'    => 8254,
 212         'Omega'    => 937,
 213         'omega'    => 969,
 214         'Omicron'  => 927,
 215         'omicron'  => 959,
 216         'oplus'    => 8853,
 217         'or'       => 8744,
 218         'ordf'     => 170,
 219         'ordm'     => 186,
 220         'Oslash'   => 216,
 221         'oslash'   => 248,
 222         'Otilde'   => 213,
 223         'otilde'   => 245,
 224         'otimes'   => 8855,
 225         'Ouml'     => 214,
 226         'ouml'     => 246,
 227         'para'     => 182,
 228         'part'     => 8706,
 229         'permil'   => 8240,
 230         'perp'     => 8869,
 231         'Phi'      => 934,
 232         'phi'      => 966,
 233         'Pi'       => 928,
 234         'pi'       => 960,
 235         'piv'      => 982,
 236         'plusmn'   => 177,
 237         'pound'    => 163,
 238         'prime'    => 8242,
 239         'Prime'    => 8243,
 240         'prod'     => 8719,
 241         'prop'     => 8733,
 242         'Psi'      => 936,
 243         'psi'      => 968,
 244         'quot'     => 34,
 245         'radic'    => 8730,
 246         'rang'     => 9002,
 247         'raquo'    => 187,
 248         'rarr'     => 8594,
 249         'rArr'     => 8658,
 250         'rceil'    => 8969,
 251         'rdquo'    => 8221,
 252         'real'     => 8476,
 253         'reg'      => 174,
 254         'rfloor'   => 8971,
 255         'Rho'      => 929,
 256         'rho'      => 961,
 257         'rlm'      => 8207,
 258         'rsaquo'   => 8250,
 259         'rsquo'    => 8217,
 260         'sbquo'    => 8218,
 261         'Scaron'   => 352,
 262         'scaron'   => 353,
 263         'sdot'     => 8901,
 264         'sect'     => 167,
 265         'shy'      => 173,
 266         'Sigma'    => 931,
 267         'sigma'    => 963,
 268         'sigmaf'   => 962,
 269         'sim'      => 8764,
 270         'spades'   => 9824,
 271         'sub'      => 8834,
 272         'sube'     => 8838,
 273         'sum'      => 8721,
 274         'sup'      => 8835,
 275         'sup1'     => 185,
 276         'sup2'     => 178,
 277         'sup3'     => 179,
 278         'supe'     => 8839,
 279         'szlig'    => 223,
 280         'Tau'      => 932,
 281         'tau'      => 964,
 282         'there4'   => 8756,
 283         'Theta'    => 920,
 284         'theta'    => 952,
 285         'thetasym' => 977,
 286         'thinsp'   => 8201,
 287         'THORN'    => 222,
 288         'thorn'    => 254,
 289         'tilde'    => 732,
 290         'times'    => 215,
 291         'trade'    => 8482,
 292         'Uacute'   => 218,
 293         'uacute'   => 250,
 294         'uarr'     => 8593,
 295         'uArr'     => 8657,
 296         'Ucirc'    => 219,
 297         'ucirc'    => 251,
 298         'Ugrave'   => 217,
 299         'ugrave'   => 249,
 300         'uml'      => 168,
 301         'upsih'    => 978,
 302         'Upsilon'  => 933,
 303         'upsilon'  => 965,
 304         'Uuml'     => 220,
 305         'uuml'     => 252,
 306         'weierp'   => 8472,
 307         'Xi'       => 926,
 308         'xi'       => 958,
 309         'Yacute'   => 221,
 310         'yacute'   => 253,
 311         'yen'      => 165,
 312         'Yuml'     => 376,
 313         'yuml'     => 255,
 314         'Zeta'     => 918,
 315         'zeta'     => 950,
 316         'zwj'      => 8205,
 317         'zwnj'     => 8204 );
 318
 319 /** @package MediaWiki */
 320 class Sanitizer {
 321         /**
 322          * Cleans up HTML, removes dangerous tags and attributes, and
 323          * removes HTML comments
 324          * @access private
 325          * @param string $text
 326          * @param callback $processCallback to do any variable or parameter replacements in HTML attribute values
 327          * @param array $args for the processing callback
 328          * @return string
 329          */
        function removeHTMLtags( $text, $processCallback = null, $args = array() ) {
                global $wgUseTidy, $wgUserHtml;
                $fname = 'Parser::removeHTMLtags';
                wfProfileIn( $fname );

                # Build the tag tables. With $wgUserHtml off every table is empty,
                # so no tag is recognised and every '<' ends up escaped below.
                if( $wgUserHtml ) {
                        $htmlpairs = array( # Tags that must be closed
                                'b', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
                                'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
                                'strike', 'strong', 'tt', 'var', 'div', 'center',
                                'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
                                'ruby', 'rt' , 'rb' , 'rp', 'p', 'span'
                        );
                        # Tags that are emitted without being tracked on the tag stack
                        $htmlsingle = array(
                                'br', 'hr', 'li', 'dt', 'dd'
                        );
                        $htmlsingleonly = array( # Elements that cannot have close tags
                                'br', 'hr'
                        );
                        $htmlnest = array( # Tags that can be nested--??
                                'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
                                'dl', 'font', 'big', 'small', 'sub', 'sup', 'span'
                        );
                        $tabletags = array( # Can only appear inside table
                                'td', 'th', 'tr'
                        );
                } else {
                        # NOTE(review): $htmlsingleonly is not initialised in this branch.
                        # It is only read after a tag matched $htmlelements, which is
                        # empty here, so the missing variable is never reached.
                        $htmlpairs = array();
                        $htmlsingle = array();
                        $htmlnest = array();
                        $tabletags = array();
                }

                # Table cell/row tags are treated as "single" tags too (not stack-tracked);
                # $htmlelements is the full list of tags that may pass through.
                $htmlsingle = array_merge( $tabletags, $htmlsingle );
                $htmlelements = array_merge( $htmlsingle, $htmlpairs );

                # Remove HTML comments
                $text = Sanitizer::removeHTMLcomments( $text );

                # Split on '<': the first piece is plain text before any tag, every
                # following piece starts right after a '<'.
                $bits = explode( '<', $text );
                $text = array_shift( $bits );
                if(!$wgUseTidy) {
                        # $tagstack holds the currently open tracked tags; $tablestack
                        # saves the outer $tagstack while inside a table.
                        $tagstack = array(); $tablestack = array();
                        foreach ( $bits as $x ) {
                                # Notices and warnings are silenced around list() because
                                # $regs is empty when the piece does not look like a tag.
                                $prev = error_reporting( E_ALL & ~( E_NOTICE | E_WARNING ) );
                                # Groups: optional '/', tag name, attribute text, '>' or '/>',
                                # and the text up to the end of the piece.
                                preg_match( '/^(\\/?)(\\w+)([^>]*?)(\\/{0,1}>)([^<]*)$/',
                                $x, $regs );
                                list( $qbar, $slash, $t, $params, $brace, $rest ) = $regs;
                                error_reporting( $prev );

                                $badtag = 0 ;
                                if ( in_array( $t = strtolower( $t ), $htmlelements ) ) {
                                        # Check our stack
                                        if ( $slash ) {
                                                # Closing a tag...
                                                if( in_array( $t, $htmlsingleonly ) ) {
                                                        # e.g. </br>: never valid
                                                        $badtag = 1;
                                                } elseif( !in_array( $t, $htmlsingle ) &&
                                                ( $ot = @array_pop( $tagstack ) ) != $t ) {
                                                        # Closing tag does not match the innermost open
                                                        # tag: put the popped entry back and reject.
                                                        @array_push( $tagstack, $ot );
                                                        $badtag = 1;
                                                } else {
                                                        if ( $t == 'table' ) {
                                                                # Leaving a table: restore the stack saved
                                                                # when it was opened.
                                                                $tagstack = array_pop( $tablestack );
                                                        }
                                                        # Closing tags are emitted without attributes
                                                        $newparams = '';
                                                }
                                        } else {
                                                # Keep track for later
                                                if ( in_array( $t, $tabletags ) &&
                                                ! in_array( 'table', $tagstack ) ) {
                                                        # Table-only tag outside of a table
                                                        $badtag = 1;
                                                } else if ( in_array( $t, $tagstack ) &&
                                                ! in_array ( $t , $htmlnest ) ) {
                                                        # Already open and not allowed to nest
                                                        $badtag = 1 ;
                                                } elseif( in_array( $t, $htmlsingleonly ) ) {
                                                        # Hack to force empty tag for uncloseable elements
                                                        $brace = '/>';
                                                } else if ( ! in_array( $t, $htmlsingle ) ) {
                                                        if ( $t == 'table' ) {
                                                                # Entering a table: save the outer stack
                                                                # and start a fresh one.
                                                                array_push( $tablestack, $tagstack );
                                                                $tagstack = array();
                                                        }
                                                        array_push( $tagstack, $t );
                                                }

                                                # Replace any variables or template parameters with
                                                # plaintext results.
                                                # $params is passed by reference so the callback can
                                                # rewrite it in place.
                                                if( is_callable( $processCallback ) ) {
                                                        call_user_func_array( $processCallback, array( &$params, $args ) );
                                                }

                                                # Strip non-approved attributes from the tag
                                                $newparams = Sanitizer::fixTagAttributes( $params, $t );
                                        }
                                        if ( ! $badtag ) {
                                                # Accepted: re-emit the tag in normalised form and
                                                # escape any stray '>' in the text that follows it.
                                                $rest = str_replace( '>', '&gt;', $rest );
                                                $close = ( $brace == '/>' ) ? ' /' : '';
                                                $text .= "<$slash$t$newparams$close>$rest";
                                                continue;
                                        }
                                }
                                # Unknown or rejected tag: output the whole piece as escaped text
                                $text .= '&lt;' . str_replace( '>', '&gt;', $x);
                        }
                        # Close off any remaining tags
                        while ( is_array( $tagstack ) && ($t = array_pop( $tagstack )) ) {
                                $text .= "</$t>\n";
                                if ( $t == 'table' ) { $tagstack = array_pop( $tablestack ); }
                        }
                } else {
                        # this might be possible using tidy itself
                        # No nesting or balance checks are done in this branch: allowed
                        # tags only get their attributes cleaned, everything else is escaped.
                        foreach ( $bits as $x ) {
                                preg_match( '/^(\\/?)(\\w+)([^>]*?)(\\/{0,1}>)([^<]*)$/',
                                $x, $regs );
                                @list( $qbar, $slash, $t, $params, $brace, $rest ) = $regs;
                                if ( in_array( $t = strtolower( $t ), $htmlelements ) ) {
                                        if( is_callable( $processCallback ) ) {
                                                call_user_func_array( $processCallback, array( &$params, $args ) );
                                        }
                                        $newparams = Sanitizer::fixTagAttributes( $params, $t );
                                        $rest = str_replace( '>', '&gt;', $rest );
                                        $text .= "<$slash$t$newparams$brace$rest";
                                } else {
                                        $text .= '&lt;' . str_replace( '>', '&gt;', $x);
                                }
                        }
                }
                wfProfileOut( $fname );
                return $text;
        }
 460
 461         /**
 462          * Remove '<!--', '-->', and everything between.
 463          * To avoid leaving blank lines, when a comment is both preceded
 464          * and followed by a newline (ignoring spaces), trim leading and
 465          * trailing spaces and one of the newlines.
 466          *
 467          * @access private
 468          * @param string $text
 469          * @return string
 470          */
 471         function removeHTMLcomments( $text ) {
 472                 $fname='Parser::removeHTMLcomments';
 473                 wfProfileIn( $fname );
 474                 while (($start = strpos($text, '<!--')) !== false) {
 475                         $end = strpos($text, '-->', $start + 4);
 476                         if ($end === false) {
 477                                 # Unterminated comment; bail out
 478                                 break;
 479                         }
 480
 481                         $end += 3;
 482
 483                         # Trim space and newline if the comment is both
 484                         # preceded and followed by a newline
 485                         $spaceStart = max($start - 1, 0);
 486                         $spaceLen = $end - $spaceStart;
 487                         while (substr($text, $spaceStart, 1) === ' ' && $spaceStart > 0) {
 488                                 $spaceStart--;
 489                                 $spaceLen++;
 490                         }
 491                         while (substr($text, $spaceStart + $spaceLen, 1) === ' ')
 492                                 $spaceLen++;
 493                         if (substr($text, $spaceStart, 1) === "\n" and substr($text, $spaceStart + $spaceLen, 1) === "\n") {
 494                                 # Remove the comment, leading and trailing
 495                                 # spaces, and leave only one newline.
 496                                 $text = substr_replace($text, "\n", $spaceStart, $spaceLen + 1);
 497                         }
 498                         else {
 499                                 # Remove just the comment.
 500                                 $text = substr_replace($text, '', $start, $end - $start);
 501                         }
 502                 }
 503                 wfProfileOut( $fname );
 504                 return $text;
 505         }
 506
 507         /**
 508          * Take a tag soup fragment listing an HTML element's attributes
 509          * and normalize it to well-formed XML, discarding unwanted attributes.
 510          *
 511          * - Normalizes attribute names to lowercase
 512          * - Discards attributes not on a whitelist for the given element
 513          * - Turns broken or invalid entities into plaintext
 514          * - Double-quotes all attribute values
 515          * - Attributes without values are given the name as attribute
  516          * - Double attributes are discarded (the last occurrence wins)
 517          * - Unsafe style attributes are discarded
 518          * - Prepends space if there are attributes.
 519          *
 520          * @param string $text
 521          * @param string $element
 522          * @return string
 523          *
 524          * @todo Check for legal values where the DTD limits things.
 525          * @todo Check for unique id attribute :P
 526          */
 527         function fixTagAttributes( $text, $element ) {
 528                 global $wgUrlProtocols;
 529                 if( trim( $text ) == '' ) {
 530                         return '';
 531                 }
 532
 533                 # Unquoted attribute
 534                 # Since we quote this later, this can be anything distinguishable
 535                 # from the end of the attribute
 536                 if( !preg_match_all(
 537                         MW_ATTRIBS_REGEX,
 538                         $text,
 539                         $pairs,
 540                         PREG_SET_ORDER ) ) {
 541                         return '';
 542                 }
 543
 544                 $whitelist = array_flip( Sanitizer::attributeWhitelist( $element ) );
 545                 $attribs = array();
 546                 foreach( $pairs as $set ) {
 547                         $attribute = strtolower( $set[1] );
 548                         if( !isset( $whitelist[$attribute] ) ) {
 549                                 continue;
 550                         }
 551
 552                         $raw   = Sanitizer::getTagAttributeCallback( $set );
 553                         $value = Sanitizer::normalizeAttributeValue( $raw );
 554
 555                         # Strip javascript "expression" from stylesheets.
 556                         # http://msdn.microsoft.com/workshop/author/dhtml/overview/recalc.asp
 557                         if( $attribute == 'style' ) {
 558                                 // Remove any comments; IE gets token splitting wrong
 559                                 $value = preg_replace( '!/\\*.*?\\*/!S', ' ', $value );
 560
 561                                 $stripped = Sanitizer::decodeCharReferences( $value );
 562                                 $stripped = preg_replace( '!\\\\([0-9A-Fa-f]{1,6})[ \\n\\r\\t\\f]?!e',
 563                                         'codepointToUtf8(hexdec("$1"))', $stripped );
 564                                 $stripped = str_replace( '\\', '', $stripped );
 565                                 if( preg_match( '/(expression|tps*:\/\/|url\\s*\().*/is',
 566                                                 $stripped ) ) {
 567                                         # haxx0r
 568                                         continue;
 569                                 }
 570                         }
 571
 572                         # Templates and links may be expanded in later parsing,
 573                         # creating invalid or dangerous output. Suppress this.
 574                         $value = strtr( $value, array(
 575                                 '{'    => '&#123;',
 576                                 '['    => '&#91;',
 577                                 "''"   => '&#39;&#39;',
 578                                 'ISBN' => '&#73;SBN',
 579                                 'RFC'  => '&#82;FC',
 580                                 'PMID' => '&#80;MID',
 581                         ) );
 582
 583                         # Stupid hack
 584                         $value = preg_replace_callback(
 585                                 '/(' . $wgUrlProtocols . ')/',
 586                                 array( 'Sanitizer', 'armorLinksCallback' ),
 587                                 $value );
 588
 589                         // If this attribute was previously set, override it.
 590                         // Output should only have one attribute of each name.
 591                         $attribs[$attribute] = "$attribute=\"$value\"";
 592                 }
 593                 if( empty( $attribs ) ) {
 594                         return '';
 595                 } else {
 596                         return ' ' . implode( ' ', $attribs );
 597                 }
 598         }
 599
 600         /**
 601          * Regex replace callback for armoring links against further processing.
 602          * @param array $matches
 603          * @return string
 604          * @access private
 605          */
 606         function armorLinksCallback( $matches ) {
 607                 return str_replace( ':', '&#58;', $matches[1] );
 608         }
 609
 610         /**
 611          * Return an associative array of attribute names and values from
  612          * a partial tag string. Attribute names are forced to lowercase,
 613          * character references are decoded to UTF-8 text.
 614          *
 615          * @param string
 616          * @return array
 617          */
 618         function decodeTagAttributes( $text ) {
 619                 $attribs = array();
 620
 621                 if( trim( $text ) == '' ) {
 622                         return $attribs;
 623                 }
 624
 625                 if( !preg_match_all(
 626                         MW_ATTRIBS_REGEX,
 627                         $text,
 628                         $pairs,
 629                         PREG_SET_ORDER ) ) {
 630                         return $attribs;
 631                 }
 632
 633                 foreach( $pairs as $set ) {
 634                         $attribute = strtolower( $set[1] );
 635                         $value = Sanitizer::getTagAttributeCallback( $set );
 636                         $attribs[$attribute] = Sanitizer::decodeCharReferences( $value );
 637                 }
 638                 return $attribs;
 639         }
 640
 641         /**
 642          * Pick the appropriate attribute value from a match set from the
 643          * MW_ATTRIBS_REGEX matches.
 644          *
 645          * @param array $set
 646          * @return string
 647          * @access private
 648          */
 649         function getTagAttributeCallback( $set ) {
 650                 if( isset( $set[6] ) ) {
 651                         # Illegal #XXXXXX color with no quotes.
 652                         return $set[6];
 653                 } elseif( isset( $set[5] ) ) {
 654                         # No quotes.
 655                         return $set[5];
 656                 } elseif( isset( $set[4] ) ) {
 657                         # Single-quoted
 658                         return $set[4];
 659                 } elseif( isset( $set[3] ) ) {
 660                         # Double-quoted
 661                         return $set[3];
 662                 } elseif( !isset( $set[2] ) ) {
 663                         # In XHTML, attributes must have a value.
 664                         # For 'reduced' form, return explicitly the attribute name here.
 665                         return $set[1];
 666                 } else {
 667                         wfDebugDieBacktrace( "Tag conditions not met. This should never happen and is a bug." );
 668                 }
 669         }
 670
 671         /**
 672          * Normalize whitespace and character references in an XML source-
 673          * encoded text for an attribute value.
 674          *
 675          * See http://www.w3.org/TR/REC-xml/#AVNormalize for background,
 676          * but note that we're not returning the value, but are returning
 677          * XML source fragments that will be slapped into output.
 678          *
 679          * @param string $text
 680          * @return string
 681          * @access private
 682          */
 683         function normalizeAttributeValue( $text ) {
 684                 return str_replace( '"', '&quot;',
 685                         preg_replace(
 686                                 '/\r\n|[\x20\x0d\x0a\x09]/',
 687                                 ' ',
 688                                 Sanitizer::normalizeCharReferences( $text ) ) );
 689         }
 690
 691         /**
 692          * Ensure that any entities and character references are legal
 693          * for XML and XHTML specifically. Any stray bits will be
 694          * &amp;-escaped to result in a valid text fragment.
 695          *
 696          * a. any named char refs must be known in XHTML
 697          * b. any numeric char refs must be legal chars, not invalid or forbidden
 698          * c. use &#x, not &#X
 699          * d. fix or reject non-valid attributes
 700          *
 701          * @param string $text
 702          * @return string
 703          * @access private
 704          */
 705         function normalizeCharReferences( $text ) {
 706                 return preg_replace_callback(
 707                         MW_CHAR_REFS_REGEX,
 708                         array( 'Sanitizer', 'normalizeCharReferencesCallback' ),
 709                         $text );
 710         }
 711         /**
  712          * @param array $matches
 713          * @return string
 714          */
 715         function normalizeCharReferencesCallback( $matches ) {
 716                 $ret = null;
 717                 if( $matches[1] != '' ) {
 718                         $ret = Sanitizer::normalizeEntity( $matches[1] );
 719                 } elseif( $matches[2] != '' ) {
 720                         $ret = Sanitizer::decCharReference( $matches[2] );
 721                 } elseif( $matches[3] != ''  ) {
 722                         $ret = Sanitizer::hexCharReference( $matches[3] );
 723                 } elseif( $matches[4] != '' ) {
 724                         $ret = Sanitizer::hexCharReference( $matches[4] );
 725                 }
 726                 if( is_null( $ret ) ) {
 727                         return htmlspecialchars( $matches[0] );
 728                 } else {
 729                         return $ret;
 730                 }
 731         }
 732
 733         /**
 734          * If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD,
 735          * return the named entity reference as is. Otherwise, returns
 736          * HTML-escaped text of pseudo-entity source (eg &amp;foo;)
 737          *
 738          * @param string $name
 739          * @return string
 740          */
 741         function normalizeEntity( $name ) {
 742                 global $wgHtmlEntities;
 743                 if( isset( $wgHtmlEntities[$name] ) ) {
 744                         return "&$name;";
 745                 } else {
 746                         return "&amp;$name;";
 747                 }
 748         }
 749
 750         function decCharReference( $codepoint ) {
 751                 $point = intval( $codepoint );
 752                 if( Sanitizer::validateCodepoint( $point ) ) {
 753                         return sprintf( '&#%d;', $point );
 754                 } else {
 755                         return null;
 756                 }
 757         }
 758
 759         function hexCharReference( $codepoint ) {
 760                 $point = hexdec( $codepoint );
 761                 if( Sanitizer::validateCodepoint( $point ) ) {
 762                         return sprintf( '&#x%x;', $point );
 763                 } else {
 764                         return null;
 765                 }
 766         }
 767
 768         /**
 769          * Returns true if a given Unicode codepoint is a valid character in XML.
 770          * @param int $codepoint
 771          * @return bool
 772          */
 773         function validateCodepoint( $codepoint ) {
 774                 return ($codepoint ==    0x09)
 775                         || ($codepoint ==    0x0a)
 776                         || ($codepoint ==    0x0d)
 777                         || ($codepoint >=    0x20 && $codepoint <=   0xd7ff)
 778                         || ($codepoint >=  0xe000 && $codepoint <=   0xfffd)
 779                         || ($codepoint >= 0x10000 && $codepoint <= 0x10ffff);
 780         }
 781
 782         /**
 783          * Decode any character references, numeric or named entities,
 784          * in the text and return a UTF-8 string.
 785          *
 786          * @param string $text
 787          * @return string
 788          * @access public
 789          */
 790         function decodeCharReferences( $text ) {
 791                 return preg_replace_callback(
 792                         MW_CHAR_REFS_REGEX,
 793                         array( 'Sanitizer', 'decodeCharReferencesCallback' ),
 794                         $text );
 795         }
 796
 797         /**
 798          * @param string $matches
 799          * @return string
 800          */
 801         function decodeCharReferencesCallback( $matches ) {
 802                 if( $matches[1] != '' ) {
 803                         return Sanitizer::decodeEntity( $matches[1] );
 804                 } elseif( $matches[2] != '' ) {
 805                         return  Sanitizer::decodeChar( intval( $matches[2] ) );
 806                 } elseif( $matches[3] != ''  ) {
 807                         return  Sanitizer::decodeChar( hexdec( $matches[3] ) );
 808                 } elseif( $matches[4] != '' ) {
 809                         return  Sanitizer::decodeChar( hexdec( $matches[4] ) );
 810                 }
 811                 # Last case should be an ampersand by itself
 812                 return $matches[0];
 813         }
 814
 815         /**
 816          * Return UTF-8 string for a codepoint if that is a valid
 817          * character reference, otherwise U+FFFD REPLACEMENT CHARACTER.
 818          * @param int $codepoint
 819          * @return string
 820          * @access private
 821          */
 822         function decodeChar( $codepoint ) {
 823                 if( Sanitizer::validateCodepoint( $codepoint ) ) {
 824                         return codepointToUtf8( $codepoint );
 825                 } else {
 826                         return UTF8_REPLACEMENT;
 827                 }
 828         }
 829
 830         /**
 831          * If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD,
 832          * return the UTF-8 encoding of that character. Otherwise, returns
 833          * pseudo-entity source (eg &foo;)
 834          *
 835          * @param string $name
 836          * @return string
 837          */
 838         function decodeEntity( $name ) {
 839                 global $wgHtmlEntities;
 840                 if( isset( $wgHtmlEntities[$name] ) ) {
 841                         return codepointToUtf8( $wgHtmlEntities[$name] );
 842                 } else {
 843                         return "&$name;";
 844                 }
 845         }
 846
 847         /**
 848          * Fetch the whitelist of acceptable attributes for a given
 849          * element name.
 850          *
 851          * @param string $element
 852          * @return array
 853          */
 854         function attributeWhitelist( $element ) {
 855                 static $list;
 856                 if( !isset( $list ) ) {
 857                         $list = Sanitizer::setupAttributeWhitelist();
 858                 }
 859                 return isset( $list[$element] )
 860                         ? $list[$element]
 861                         : array();
 862         }
 863
 864         /**
 865          * @return array
 866          */
 867         function setupAttributeWhitelist() {
 868                 $common = array( 'id', 'class', 'lang', 'dir', 'title', 'style' );
 869                 $block = array_merge( $common, array( 'align' ) );
 870                 $tablealign = array( 'align', 'char', 'charoff', 'valign' );
 871                 $tablecell = array( 'abbr',
 872                                     'axis',
 873                                     'headers',
 874                                     'scope',
 875                                     'rowspan',
 876                                     'colspan',
 877                                     'nowrap', # deprecated
 878                                     'width',  # deprecated
 879                                     'height', # deprecated
 880                                     'bgcolor' # deprecated
 881                                     );
 882
 883                 # Numbers refer to sections in HTML 4.01 standard describing the element.
 884                 # See: http://www.w3.org/TR/html4/
 885                 $whitelist = array (
 886                         # 7.5.4
 887                         'div'        => $block,
 888                         'center'     => $common, # deprecated
 889                         'span'       => $block, # ??
 890
 891                         # 7.5.5
 892                         'h1'         => $block,
 893                         'h2'         => $block,
 894                         'h3'         => $block,
 895                         'h4'         => $block,
 896                         'h5'         => $block,
 897                         'h6'         => $block,
 898
 899                         # 7.5.6
 900                         # address
 901
 902                         # 8.2.4
 903                         # bdo
 904
 905                         # 9.2.1
 906                         'em'         => $common,
 907                         'strong'     => $common,
 908                         'cite'       => $common,
 909                         # dfn
 910                         'code'       => $common,
 911                         # samp
 912                         # kbd
 913                         'var'        => $common,
 914                         # abbr
 915                         # acronym
 916
 917                         # 9.2.2
 918                         'blockquote' => array_merge( $common, array( 'cite' ) ),
 919                         # q
 920
 921                         # 9.2.3
 922                         'sub'        => $common,
 923                         'sup'        => $common,
 924
 925                         # 9.3.1
 926                         'p'          => $block,
 927
 928                         # 9.3.2
 929                         'br'         => array( 'id', 'class', 'title', 'style', 'clear' ),
 930
 931                         # 9.3.4
 932                         'pre'        => array_merge( $common, array( 'width' ) ),
 933
 934                         # 9.4
 935                         'ins'        => array_merge( $common, array( 'cite', 'datetime' ) ),
 936                         'del'        => array_merge( $common, array( 'cite', 'datetime' ) ),
 937
 938                         # 10.2
 939                         'ul'         => array_merge( $common, array( 'type' ) ),
 940                         'ol'         => array_merge( $common, array( 'type', 'start' ) ),
 941                         'li'         => array_merge( $common, array( 'type', 'value' ) ),
 942
 943                         # 10.3
 944                         'dl'         => $common,
 945                         'dd'         => $common,
 946                         'dt'         => $common,
 947
 948                         # 11.2.1
 949                         'table'      => array_merge( $common,
 950                                                                 array( 'summary', 'width', 'border', 'frame',
 951                                                                                          'rules', 'cellspacing', 'cellpadding',
 952                                                                                          'align', 'bgcolor', 'frame', 'rules',
 953                                                                                          'border' ) ),
 954
 955                         # 11.2.2
 956                         'caption'    => array_merge( $common, array( 'align' ) ),
 957
 958                         # 11.2.3
 959                         'thead'      => array_merge( $common, $tablealign ),
 960                         'tfoot'      => array_merge( $common, $tablealign ),
 961                         'tbody'      => array_merge( $common, $tablealign ),
 962
 963                         # 11.2.4
 964                         'colgroup'   => array_merge( $common, array( 'span', 'width' ), $tablealign ),
 965                         'col'        => array_merge( $common, array( 'span', 'width' ), $tablealign ),
 966
 967                         # 11.2.5
 968                         'tr'         => array_merge( $common, array( 'bgcolor' ), $tablealign ),
 969
 970                         # 11.2.6
 971                         'td'         => array_merge( $common, $tablecell, $tablealign ),
 972                         'th'         => array_merge( $common, $tablecell, $tablealign ),
 973
 974                         # 15.2.1
 975                         'tt'         => $common,
 976                         'b'          => $common,
 977                         'i'          => $common,
 978                         'big'        => $common,
 979                         'small'      => $common,
 980                         'strike'     => $common,
 981                         's'          => $common,
 982                         'u'          => $common,
 983
 984                         # 15.2.2
 985                         'font'       => array_merge( $common, array( 'size', 'color', 'face' ) ),
 986                         # basefont
 987
 988                         # 15.3
 989                         'hr'         => array_merge( $common, array( 'noshade', 'size', 'width' ) ),
 990
 991                         # XHTML Ruby annotation text module, simple ruby only.
 992                         # http://www.w3c.org/TR/ruby/
 993                         'ruby'       => $common,
 994                         # rbc
 995                         # rtc
 996                         'rb'         => $common,
 997                         'rt'         => $common, #array_merge( $common, array( 'rbspan' ) ),
 998                         'rp'         => $common,
 999                         );
1000                 return $whitelist;
1001         }
1002
1003         /**
1004          * Take a fragment of (potentially invalid) HTML and return
1005          * a version with any tags removed, encoded suitably for literal
1006          * inclusion in an attribute value.
1007          *
1008          * @param string $text HTML fragment
1009          * @return string
1010          */
1011         function stripAllTags( $text ) {
1012                 # Actual <tags>
1013                 $text = preg_replace( '/<[^>]*>/', '', $text );
1014
1015                 # Normalize &entities and whitespace
1016                 $text = Sanitizer::normalizeAttributeValue( $text );
1017
1018                 # Will be placed into "double-quoted" attributes,
1019                 # make sure remaining bits are safe.
1020                 $text = str_replace(
1021                         array('<', '>', '"'),
1022                         array('&lt;', '&gt;', '&quot;'),
1023                         $text );
1024
1025                 return $text;
1026         }
1027
1028 }
1029
1030 ?>