includes/parser/Sanitizer.php

   1 <?php
   2 /**
   3  * HTML sanitizer for %MediaWiki.
   4  *
   5  * Copyright © 2002-2005 Brion Vibber <brion@pobox.com> et al
   6  * https://www.mediawiki.org/
   7  *
   8  * This program is free software; you can redistribute it and/or modify
   9  * it under the terms of the GNU General Public License as published by
  10  * the Free Software Foundation; either version 2 of the License, or
  11  * (at your option) any later version.
  12  *
  13  * This program is distributed in the hope that it will be useful,
  14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  16  * GNU General Public License for more details.
  17  *
  18  * You should have received a copy of the GNU General Public License along
  19  * with this program; if not, write to the Free Software Foundation, Inc.,
  20  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  21  * http://www.gnu.org/copyleft/gpl.html
  22  *
  23  * @file
  24  * @ingroup Parser
  25  */
  26
  27 use MediaWiki\MediaWikiServices;
  28
  29 /**
  30  * HTML sanitizer for MediaWiki
  31  * @ingroup Parser
  32  */
  33 class Sanitizer {
  34         /**
  35          * Regular expression to match various types of character references in
  36          * Sanitizer::normalizeCharReferences and Sanitizer::decodeCharReferences
  37          */
  38         const CHAR_REFS_REGEX =
  39                 '/&([A-Za-z0-9\x80-\xff]+);
  40                  |&\#([0-9]+);
  41                  |&\#[xX]([0-9A-Fa-f]+);
  42                  |(&)/x';
  43
  44         /**
  45          * Acceptable tag name charset from HTML5 parsing spec
  46          * https://www.w3.org/TR/html5/syntax.html#tag-open-state
  47          */
  48         const ELEMENT_BITS_REGEX = '!^(/?)([A-Za-z][^\t\n\v />\0]*+)([^>]*?)(/?>)([^<]*)$!';
  49
  50         /**
  51          * Blacklist for evil uris like javascript:
  52          * WARNING: DO NOT use this in any place that actually requires blacklisting
  53          * for security reasons. There are NUMEROUS[1] ways to bypass blacklisting, the
  54          * only way to be secure from javascript: uri based xss vectors is to whitelist
  55          * things that you know are safe and deny everything else.
  56          * [1]: http://ha.ckers.org/xss.html
  57          */
  58         const EVIL_URI_PATTERN = '!(^|\s|\*/\s*)(javascript|vbscript)([^\w]|$)!i';
  59         const XMLNS_ATTRIBUTE_PATTERN = "/^xmlns:[:A-Z_a-z-.0-9]+$/";
  60
  61         /**
  62          * Tells escapeUrlForHtml() to encode the ID using the wiki's primary encoding.
  63          *
  64          * @since 1.30
  65          */
  66         const ID_PRIMARY = 0;
  67
  68         /**
  69          * Tells escapeUrlForHtml() to encode the ID using the fallback encoding, or return false
  70          * if no fallback is configured.
  71          *
  72          * @since 1.30
  73          */
  74         const ID_FALLBACK = 1;
  75
  76         /**
  77          * List of all named character entities defined in HTML 4.01
  78          * https://www.w3.org/TR/html4/sgml/entities.html
  79          * As well as &apos; which is only defined starting in XHTML1.
  80          */
  81         private static $htmlEntities = [
  82                 'Aacute'   => 193,
  83                 'aacute'   => 225,
  84                 'Acirc'    => 194,
  85                 'acirc'    => 226,
  86                 'acute'    => 180,
  87                 'AElig'    => 198,
  88                 'aelig'    => 230,
  89                 'Agrave'   => 192,
  90                 'agrave'   => 224,
  91                 'alefsym'  => 8501,
  92                 'Alpha'    => 913,
  93                 'alpha'    => 945,
  94                 'amp'      => 38,
  95                 'and'      => 8743,
  96                 'ang'      => 8736,
  97                 'apos'     => 39, // New in XHTML & HTML 5; avoid in output for compatibility with IE.
  98                 'Aring'    => 197,
  99                 'aring'    => 229,
 100                 'asymp'    => 8776,
 101                 'Atilde'   => 195,
 102                 'atilde'   => 227,
 103                 'Auml'     => 196,
 104                 'auml'     => 228,
 105                 'bdquo'    => 8222,
 106                 'Beta'     => 914,
 107                 'beta'     => 946,
 108                 'brvbar'   => 166,
 109                 'bull'     => 8226,
 110                 'cap'      => 8745,
 111                 'Ccedil'   => 199,
 112                 'ccedil'   => 231,
 113                 'cedil'    => 184,
 114                 'cent'     => 162,
 115                 'Chi'      => 935,
 116                 'chi'      => 967,
 117                 'circ'     => 710,
 118                 'clubs'    => 9827,
 119                 'cong'     => 8773,
 120                 'copy'     => 169,
 121                 'crarr'    => 8629,
 122                 'cup'      => 8746,
 123                 'curren'   => 164,
 124                 'dagger'   => 8224,
 125                 'Dagger'   => 8225,
 126                 'darr'     => 8595,
 127                 'dArr'     => 8659,
 128                 'deg'      => 176,
 129                 'Delta'    => 916,
 130                 'delta'    => 948,
 131                 'diams'    => 9830,
 132                 'divide'   => 247,
 133                 'Eacute'   => 201,
 134                 'eacute'   => 233,
 135                 'Ecirc'    => 202,
 136                 'ecirc'    => 234,
 137                 'Egrave'   => 200,
 138                 'egrave'   => 232,
 139                 'empty'    => 8709,
 140                 'emsp'     => 8195,
 141                 'ensp'     => 8194,
 142                 'Epsilon'  => 917,
 143                 'epsilon'  => 949,
 144                 'equiv'    => 8801,
 145                 'Eta'      => 919,
 146                 'eta'      => 951,
 147                 'ETH'      => 208,
 148                 'eth'      => 240,
 149                 'Euml'     => 203,
 150                 'euml'     => 235,
 151                 'euro'     => 8364,
 152                 'exist'    => 8707,
 153                 'fnof'     => 402,
 154                 'forall'   => 8704,
 155                 'frac12'   => 189,
 156                 'frac14'   => 188,
 157                 'frac34'   => 190,
 158                 'frasl'    => 8260,
 159                 'Gamma'    => 915,
 160                 'gamma'    => 947,
 161                 'ge'       => 8805,
 162                 'gt'       => 62,
 163                 'harr'     => 8596,
 164                 'hArr'     => 8660,
 165                 'hearts'   => 9829,
 166                 'hellip'   => 8230,
 167                 'Iacute'   => 205,
 168                 'iacute'   => 237,
 169                 'Icirc'    => 206,
 170                 'icirc'    => 238,
 171                 'iexcl'    => 161,
 172                 'Igrave'   => 204,
 173                 'igrave'   => 236,
 174                 'image'    => 8465,
 175                 'infin'    => 8734,
 176                 'int'      => 8747,
 177                 'Iota'     => 921,
 178                 'iota'     => 953,
 179                 'iquest'   => 191,
 180                 'isin'     => 8712,
 181                 'Iuml'     => 207,
 182                 'iuml'     => 239,
 183                 'Kappa'    => 922,
 184                 'kappa'    => 954,
 185                 'Lambda'   => 923,
 186                 'lambda'   => 955,
 187                 'lang'     => 9001,
 188                 'laquo'    => 171,
 189                 'larr'     => 8592,
 190                 'lArr'     => 8656,
 191                 'lceil'    => 8968,
 192                 'ldquo'    => 8220,
 193                 'le'       => 8804,
 194                 'lfloor'   => 8970,
 195                 'lowast'   => 8727,
 196                 'loz'      => 9674,
 197                 'lrm'      => 8206,
 198                 'lsaquo'   => 8249,
 199                 'lsquo'    => 8216,
 200                 'lt'       => 60,
 201                 'macr'     => 175,
 202                 'mdash'    => 8212,
 203                 'micro'    => 181,
 204                 'middot'   => 183,
 205                 'minus'    => 8722,
 206                 'Mu'       => 924,
 207                 'mu'       => 956,
 208                 'nabla'    => 8711,
 209                 'nbsp'     => 160,
 210                 'ndash'    => 8211,
 211                 'ne'       => 8800,
 212                 'ni'       => 8715,
 213                 'not'      => 172,
 214                 'notin'    => 8713,
 215                 'nsub'     => 8836,
 216                 'Ntilde'   => 209,
 217                 'ntilde'   => 241,
 218                 'Nu'       => 925,
 219                 'nu'       => 957,
 220                 'Oacute'   => 211,
 221                 'oacute'   => 243,
 222                 'Ocirc'    => 212,
 223                 'ocirc'    => 244,
 224                 'OElig'    => 338,
 225                 'oelig'    => 339,
 226                 'Ograve'   => 210,
 227                 'ograve'   => 242,
 228                 'oline'    => 8254,
 229                 'Omega'    => 937,
 230                 'omega'    => 969,
 231                 'Omicron'  => 927,
 232                 'omicron'  => 959,
 233                 'oplus'    => 8853,
 234                 'or'       => 8744,
 235                 'ordf'     => 170,
 236                 'ordm'     => 186,
 237                 'Oslash'   => 216,
 238                 'oslash'   => 248,
 239                 'Otilde'   => 213,
 240                 'otilde'   => 245,
 241                 'otimes'   => 8855,
 242                 'Ouml'     => 214,
 243                 'ouml'     => 246,
 244                 'para'     => 182,
 245                 'part'     => 8706,
 246                 'permil'   => 8240,
 247                 'perp'     => 8869,
 248                 'Phi'      => 934,
 249                 'phi'      => 966,
 250                 'Pi'       => 928,
 251                 'pi'       => 960,
 252                 'piv'      => 982,
 253                 'plusmn'   => 177,
 254                 'pound'    => 163,
 255                 'prime'    => 8242,
 256                 'Prime'    => 8243,
 257                 'prod'     => 8719,
 258                 'prop'     => 8733,
 259                 'Psi'      => 936,
 260                 'psi'      => 968,
 261                 'quot'     => 34,
 262                 'radic'    => 8730,
 263                 'rang'     => 9002,
 264                 'raquo'    => 187,
 265                 'rarr'     => 8594,
 266                 'rArr'     => 8658,
 267                 'rceil'    => 8969,
 268                 'rdquo'    => 8221,
 269                 'real'     => 8476,
 270                 'reg'      => 174,
 271                 'rfloor'   => 8971,
 272                 'Rho'      => 929,
 273                 'rho'      => 961,
 274                 'rlm'      => 8207,
 275                 'rsaquo'   => 8250,
 276                 'rsquo'    => 8217,
 277                 'sbquo'    => 8218,
 278                 'Scaron'   => 352,
 279                 'scaron'   => 353,
 280                 'sdot'     => 8901,
 281                 'sect'     => 167,
 282                 'shy'      => 173,
 283                 'Sigma'    => 931,
 284                 'sigma'    => 963,
 285                 'sigmaf'   => 962,
 286                 'sim'      => 8764,
 287                 'spades'   => 9824,
 288                 'sub'      => 8834,
 289                 'sube'     => 8838,
 290                 'sum'      => 8721,
 291                 'sup'      => 8835,
 292                 'sup1'     => 185,
 293                 'sup2'     => 178,
 294                 'sup3'     => 179,
 295                 'supe'     => 8839,
 296                 'szlig'    => 223,
 297                 'Tau'      => 932,
 298                 'tau'      => 964,
 299                 'there4'   => 8756,
 300                 'Theta'    => 920,
 301                 'theta'    => 952,
 302                 'thetasym' => 977,
 303                 'thinsp'   => 8201,
 304                 'THORN'    => 222,
 305                 'thorn'    => 254,
 306                 'tilde'    => 732,
 307                 'times'    => 215,
 308                 'trade'    => 8482,
 309                 'Uacute'   => 218,
 310                 'uacute'   => 250,
 311                 'uarr'     => 8593,
 312                 'uArr'     => 8657,
 313                 'Ucirc'    => 219,
 314                 'ucirc'    => 251,
 315                 'Ugrave'   => 217,
 316                 'ugrave'   => 249,
 317                 'uml'      => 168,
 318                 'upsih'    => 978,
 319                 'Upsilon'  => 933,
 320                 'upsilon'  => 965,
 321                 'Uuml'     => 220,
 322                 'uuml'     => 252,
 323                 'weierp'   => 8472,
 324                 'Xi'       => 926,
 325                 'xi'       => 958,
 326                 'Yacute'   => 221,
 327                 'yacute'   => 253,
 328                 'yen'      => 165,
 329                 'Yuml'     => 376,
 330                 'yuml'     => 255,
 331                 'Zeta'     => 918,
 332                 'zeta'     => 950,
 333                 'zwj'      => 8205,
 334                 'zwnj'     => 8204
 335         ];
 336
 337         /**
 338          * Character entity aliases accepted by MediaWiki
 339          */
 340         private static $htmlEntityAliases = [
 341                 'רלמ' => 'rlm',
 342                 'رلم' => 'rlm',
 343         ];
 344
 345         /**
 346          * Lazy-initialised attributes regex, see getAttribsRegex()
 347          */
 348         private static $attribsRegex;
 349
 350         /**
 351          * Regular expression to match HTML/XML attribute pairs within a tag.
 352          * Allows some... latitude. Based on,
 353          * https://www.w3.org/TR/html5/syntax.html#before-attribute-value-state
 354          * Used in Sanitizer::fixTagAttributes and Sanitizer::decodeTagAttributes
 355          * @return string
 356          */
 357         static function getAttribsRegex() {
 358                 if ( self::$attribsRegex === null ) {
 359                         $attribFirst = "[:_\p{L}\p{N}]";
 360                         $attrib = "[:_\.\-\p{L}\p{N}]";
 361                         $space = '[\x09\x0a\x0c\x0d\x20]';
 362                         self::$attribsRegex =
 363                                 "/(?:^|$space)({$attribFirst}{$attrib}*)
 364                                         ($space*=$space*
 365                                         (?:
 366                                                 # The attribute value: quoted or alone
 367                                                 \"([^\"]*)(?:\"|\$)
 368                                                 | '([^']*)(?:'|\$)
 369                                                 | (((?!$space|>).)*)
 370                                         )
 371                                 )?(?=$space|\$)/sxu";
 372                 }
 373                 return self::$attribsRegex;
 374         }
 375
 376         /**
 377          * Return the various lists of recognized tags
 378          * @param array $extratags For any extra tags to include
 379          * @param array $removetags For any tags (default or extra) to exclude
 380          * @return array
 381          */
 382         public static function getRecognizedTagData( $extratags = [], $removetags = [] ) {
 383                 global $wgAllowImageTag;
 384
 385                 static $htmlpairsStatic, $htmlsingle, $htmlsingleonly, $htmlnest, $tabletags,
 386                         $htmllist, $listtags, $htmlsingleallowed, $htmlelementsStatic, $staticInitialised;
 387
 388                 // Base our staticInitialised variable off of the global config state so that if the globals
 389                 // are changed (like in the screwed up test system) we will re-initialise the settings.
 390                 $globalContext = $wgAllowImageTag;
 391                 if ( !$staticInitialised || $staticInitialised != $globalContext ) {
 392                         $htmlpairsStatic = [ # Tags that must be closed
 393                                 'b', 'bdi', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
 394                                 'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
 395                                 'strike', 'strong', 'tt', 'var', 'div', 'center',
 396                                 'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
 397                                 'ruby', 'rb', 'rp', 'rt', 'rtc', 'p', 'span', 'abbr', 'dfn',
 398                                 'kbd', 'samp', 'data', 'time', 'mark'
 399                         ];
 400                         $htmlsingle = [
 401                                 'br', 'wbr', 'hr', 'li', 'dt', 'dd', 'meta', 'link'
 402                         ];
 403
 404                         # Elements that cannot have close tags. This is (not coincidentally)
 405                         # also the list of tags for which the HTML 5 parsing algorithm
 406                         # requires you to "acknowledge the token's self-closing flag", i.e.
 407                         # a self-closing tag like <br/> is not an HTML 5 parse error only
 408                         # for this list.
 409                         $htmlsingleonly = [
 410                                 'br', 'wbr', 'hr', 'meta', 'link'
 411                         ];
 412
 413                         $htmlnest = [ # Tags that can be nested--??
 414                                 'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
 415                                 'li', 'dl', 'dt', 'dd', 'font', 'big', 'small', 'sub', 'sup', 'span',
 416                                 'var', 'kbd', 'samp', 'em', 'strong', 'q', 'ruby', 'bdo'
 417                         ];
 418                         $tabletags = [ # Can only appear inside table, we will close them
 419                                 'td', 'th', 'tr',
 420                         ];
 421                         $htmllist = [ # Tags used by list
 422                                 'ul', 'ol',
 423                         ];
 424                         $listtags = [ # Tags that can appear in a list
 425                                 'li',
 426                         ];
 427
 428                         if ( $wgAllowImageTag ) {
 429                                 $htmlsingle[] = 'img';
 430                                 $htmlsingleonly[] = 'img';
 431                         }
 432
 433                         $htmlsingleallowed = array_unique( array_merge( $htmlsingle, $tabletags ) );
 434                         $htmlelementsStatic = array_unique( array_merge( $htmlsingle, $htmlpairsStatic, $htmlnest ) );
 435
 436                         # Convert them all to hashtables for faster lookup
 437                         $vars = [ 'htmlpairsStatic', 'htmlsingle', 'htmlsingleonly', 'htmlnest', 'tabletags',
 438                                 'htmllist', 'listtags', 'htmlsingleallowed', 'htmlelementsStatic' ];
 439                         foreach ( $vars as $var ) {
 440                                 $$var = array_flip( $$var );
 441                         }
 442                         $staticInitialised = $globalContext;
 443                 }
 444
 445                 # Populate $htmlpairs and $htmlelements with the $extratags and $removetags arrays
 446                 $extratags = array_flip( $extratags );
 447                 $removetags = array_flip( $removetags );
 448                 $htmlpairs = array_merge( $extratags, $htmlpairsStatic );
 449                 $htmlelements = array_diff_key( array_merge( $extratags, $htmlelementsStatic ), $removetags );
 450
 451                 return [
 452                         'htmlpairs' => $htmlpairs,
 453                         'htmlsingle' => $htmlsingle,
 454                         'htmlsingleonly' => $htmlsingleonly,
 455                         'htmlnest' => $htmlnest,
 456                         'tabletags' => $tabletags,
 457                         'htmllist' => $htmllist,
 458                         'listtags' => $listtags,
 459                         'htmlsingleallowed' => $htmlsingleallowed,
 460                         'htmlelements' => $htmlelements,
 461                 ];
 462         }
 463
 464         /**
 465          * Cleans up HTML, removes dangerous tags and attributes, and
 466          * removes HTML comments
 467          * @param string $text
 468          * @param callable|null $processCallback Callback to do any variable or parameter
 469          *   replacements in HTML attribute values
 470          * @param array|bool $args Arguments for the processing callback
 471          * @param array $extratags For any extra tags to include
 472          * @param array $removetags For any tags (default or extra) to exclude
 473          * @param callable|null $warnCallback (Deprecated) Callback allowing the
 474          *   addition of a tracking category when bad input is encountered.
 475          *   DO NOT ADD NEW PARAMETERS AFTER $warnCallback, since it will be
 476          *   removed shortly.
 477          * @return string
 478          */
 479         public static function removeHTMLtags( $text, $processCallback = null,
 480                 $args = [], $extratags = [], $removetags = [], $warnCallback = null
 481         ) {
 482                 $tagData = self::getRecognizedTagData( $extratags, $removetags );
 483                 $htmlpairs = $tagData['htmlpairs'];
 484                 $htmlsingle = $tagData['htmlsingle'];
 485                 $htmlsingleonly = $tagData['htmlsingleonly'];
 486                 $htmlnest = $tagData['htmlnest'];
 487                 $tabletags = $tagData['tabletags'];
 488                 $htmllist = $tagData['htmllist'];
 489                 $listtags = $tagData['listtags'];
 490                 $htmlsingleallowed = $tagData['htmlsingleallowed'];
 491                 $htmlelements = $tagData['htmlelements'];
 492
 493                 # Remove HTML comments
 494                 $text = self::removeHTMLcomments( $text );
 495                 $bits = explode( '<', $text );
 496                 $text = str_replace( '>', '&gt;', array_shift( $bits ) );
 497                 if ( !MWTidy::isEnabled() ) {
 498                         wfDeprecated( 'disabling tidy', '1.33' );
 499                         $tagstack = $tablestack = [];
 500                         foreach ( $bits as $x ) {
 501                                 $regs = [];
 502                                 # $slash: Does the current element start with a '/'?
 503                                 # $t: Current element name
 504                                 # $params: String between element name and >
 505                                 # $brace: Ending '>' or '/>'
 506                                 # $rest: Everything until the next element of $bits
 507                                 if ( preg_match( self::ELEMENT_BITS_REGEX, $x, $regs ) ) {
 508                                         list( /* $qbar */, $slash, $t, $params, $brace, $rest ) = $regs;
 509                                 } else {
 510                                         $slash = $t = $params = $brace = $rest = null;
 511                                 }
 512
 513                                 $badtag = false;
 514                                 $t = strtolower( $t );
 515                                 if ( isset( $htmlelements[$t] ) ) {
 516                                         # Check our stack
 517                                         if ( $slash && isset( $htmlsingleonly[$t] ) ) {
 518                                                 $badtag = true;
 519                                         } elseif ( $slash ) {
 520                                                 # Closing a tag... is it the one we just opened?
 521                                                 Wikimedia\suppressWarnings();
 522                                                 $ot = array_pop( $tagstack );
 523                                                 Wikimedia\restoreWarnings();
 524
 525                                                 if ( $ot != $t ) {
 526                                                         if ( isset( $htmlsingleallowed[$ot] ) ) {
 527                                                                 # Pop all elements with an optional close tag
 528                                                                 # and see if we find a match below them
 529                                                                 $optstack = [];
 530                                                                 array_push( $optstack, $ot );
 531                                                                 Wikimedia\suppressWarnings();
 532                                                                 $ot = array_pop( $tagstack );
 533                                                                 Wikimedia\restoreWarnings();
 534                                                                 while ( $ot != $t && isset( $htmlsingleallowed[$ot] ) ) {
 535                                                                         array_push( $optstack, $ot );
 536                                                                         Wikimedia\suppressWarnings();
 537                                                                         $ot = array_pop( $tagstack );
 538                                                                         Wikimedia\restoreWarnings();
 539                                                                 }
 540                                                                 if ( $t != $ot ) {
 541                                                                         # No match. Push the optional elements back again
 542                                                                         $badtag = true;
 543                                                                         Wikimedia\suppressWarnings();
 544                                                                         $ot = array_pop( $optstack );
 545                                                                         Wikimedia\restoreWarnings();
 546                                                                         while ( $ot ) {
 547                                                                                 array_push( $tagstack, $ot );
 548                                                                                 Wikimedia\suppressWarnings();
 549                                                                                 $ot = array_pop( $optstack );
 550                                                                                 Wikimedia\restoreWarnings();
 551                                                                         }
 552                                                                 }
 553                                                         } else {
 554                                                                 Wikimedia\suppressWarnings();
 555                                                                 array_push( $tagstack, $ot );
 556                                                                 Wikimedia\restoreWarnings();
 557
 558                                                                 # <li> can be nested in <ul> or <ol>, skip those cases:
 559                                                                 if ( !isset( $htmllist[$ot] ) || !isset( $listtags[$t] ) ) {
 560                                                                         $badtag = true;
 561                                                                 }
 562                                                         }
 563                                                 } else {
 564                                                         if ( $t == 'table' ) {
 565                                                                 $tagstack = array_pop( $tablestack );
 566                                                         }
 567                                                 }
 568                                                 $newparams = '';
 569                                         } else {
 570                                                 # Keep track for later
 571                                                 if ( isset( $tabletags[$t] ) && !in_array( 'table', $tagstack ) ) {
 572                                                         $badtag = true;
 573                                                 } elseif ( in_array( $t, $tagstack ) && !isset( $htmlnest[$t] ) ) {
 574                                                         $badtag = true;
 575                                                 #  Is it a self closed htmlpair ? (T7487)
 576                                                 } elseif ( $brace == '/>' && isset( $htmlpairs[$t] ) ) {
 577                                                         // Eventually we'll just remove the self-closing
 578                                                         // slash, in order to be consistent with HTML5
 579                                                         // semantics.
 580                                                         // $brace = '>';
 581                                                         // For now, let's just warn authors to clean up.
 582                                                         if ( is_callable( $warnCallback ) ) {
 583                                                                 call_user_func_array( $warnCallback, [ 'deprecated-self-close-category' ] );
 584                                                         }
 585                                                         $badtag = true;
 586                                                 } elseif ( isset( $htmlsingleonly[$t] ) ) {
 587                                                         # Hack to force empty tag for unclosable elements
 588                                                         $brace = '/>';
 589                                                 } elseif ( isset( $htmlsingle[$t] ) ) {
 590                                                         # Hack to not close $htmlsingle tags
 591                                                         $brace = null;
 592                                                         # Still need to push this optionally-closed tag to
 593                                                         # the tag stack so that we can match end tags
 594                                                         # instead of marking them as bad.
 595                                                         array_push( $tagstack, $t );
 596                                                 } elseif ( isset( $tabletags[$t] ) && in_array( $t, $tagstack ) ) {
 597                                                         // New table tag but forgot to close the previous one
 598                                                         $text .= "</$t>";
 599                                                 } else {
 600                                                         if ( $t == 'table' ) {
 601                                                                 array_push( $tablestack, $tagstack );
 602                                                                 $tagstack = [];
 603                                                         }
 604                                                         array_push( $tagstack, $t );
 605                                                 }
 606
 607                                                 # Replace any variables or template parameters with
 608                                                 # plaintext results.
 609                                                 if ( is_callable( $processCallback ) ) {
 610                                                         call_user_func_array( $processCallback, [ &$params, $args ] );
 611                                                 }
 612
 613                                                 if ( !self::validateTag( $params, $t ) ) {
 614                                                         $badtag = true;
 615                                                 }
 616
 617                                                 # Strip non-approved attributes from the tag
 618                                                 $newparams = self::fixTagAttributes( $params, $t );
 619                                         }
 620                                         if ( !$badtag ) {
 621                                                 $rest = str_replace( '>', '&gt;', $rest );
 622                                                 $close = ( $brace == '/>' && !$slash ) ? ' /' : '';
 623                                                 $text .= "<$slash$t$newparams$close>$rest";
 624                                                 continue;
 625                                         }
 626                                 }
 627                                 $text .= '&lt;' . str_replace( '>', '&gt;', $x );
 628                         }
 629                         # Close off any remaining tags
 630                         while ( is_array( $tagstack ) && ( $t = array_pop( $tagstack ) ) ) {
 631                                 $text .= "</$t>\n";
 632                                 if ( $t == 'table' ) {
 633                                         $tagstack = array_pop( $tablestack );
 634                                 }
 635                         }
 636                 } else {
 637                         # this might be possible using tidy itself
 638                         foreach ( $bits as $x ) {
 639                                 if ( preg_match( self::ELEMENT_BITS_REGEX, $x, $regs ) ) {
 640                                         list( /* $qbar */, $slash, $t, $params, $brace, $rest ) = $regs;
 641
 642                                         $badtag = false;
 643                                         $t = strtolower( $t );
 644                                         if ( isset( $htmlelements[$t] ) ) {
 645                                                 if ( is_callable( $processCallback ) ) {
 646                                                         call_user_func_array( $processCallback, [ &$params, $args ] );
 647                                                 }
 648
 649                                                 if ( $brace == '/>' && !( isset( $htmlsingle[$t] ) || isset( $htmlsingleonly[$t] ) ) ) {
 650                                                         // Eventually we'll just remove the self-closing
 651                                                         // slash, in order to be consistent with HTML5
 652                                                         // semantics.
 653                                                         // $brace = '>';
 654                                                         // For now, let's just warn authors to clean up.
 655                                                         if ( is_callable( $warnCallback ) ) {
 656                                                                 call_user_func_array( $warnCallback, [ 'deprecated-self-close-category' ] );
 657                                                         }
 658                                                 }
 659                                                 if ( !self::validateTag( $params, $t ) ) {
 660                                                         $badtag = true;
 661                                                 }
 662
 663                                                 $newparams = self::fixTagAttributes( $params, $t );
 664                                                 if ( !$badtag ) {
 665                                                         if ( $brace === '/>' && !isset( $htmlsingleonly[$t] ) ) {
 666                                                                 # Interpret self-closing tags as empty tags even when
 667                                                                 # HTML 5 would interpret them as start tags. Such input
 668                                                                 # is commonly seen on Wikimedia wikis with this intention.
 669                                                                 $brace = "></$t>";
 670                                                         }
 671
 672                                                         $rest = str_replace( '>', '&gt;', $rest );
 673                                                         $text .= "<$slash$t$newparams$brace$rest";
 674                                                         continue;
 675                                                 }
 676                                         }
 677                                 }
 678                                 $text .= '&lt;' . str_replace( '>', '&gt;', $x );
 679                         }
 680                 }
 681                 return $text;
 682         }
 683
 684         /**
 685          * Remove '<!--', '-->', and everything between.
 686          * To avoid leaving blank lines, when a comment is both preceded
 687          * and followed by a newline (ignoring spaces), trim leading and
 688          * trailing spaces and one of the newlines.
 689          *
 690          * @param string $text
 691          * @return string
 692          */
 693         public static function removeHTMLcomments( $text ) {
 694                 while ( ( $start = strpos( $text, '<!--' ) ) !== false ) {
 695                         $end = strpos( $text, '-->', $start + 4 );
 696                         if ( $end === false ) {
 697                                 # Unterminated comment; bail out
 698                                 break;
 699                         }
 700
 701                         $end += 3;
 702
 703                         # Trim space and newline if the comment is both
 704                         # preceded and followed by a newline
 705                         $spaceStart = max( $start - 1, 0 );
 706                         $spaceLen = $end - $spaceStart;
 707                         while ( substr( $text, $spaceStart, 1 ) === ' ' && $spaceStart > 0 ) {
 708                                 $spaceStart--;
 709                                 $spaceLen++;
 710                         }
 711                         while ( substr( $text, $spaceStart + $spaceLen, 1 ) === ' ' ) {
 712                                 $spaceLen++;
 713                         }
 714                         if ( substr( $text, $spaceStart, 1 ) === "\n"
 715                                 && substr( $text, $spaceStart + $spaceLen, 1 ) === "\n" ) {
 716                                 # Remove the comment, leading and trailing
 717                                 # spaces, and leave only one newline.
 718                                 $text = substr_replace( $text, "\n", $spaceStart, $spaceLen + 1 );
 719                         } else {
 720                                 # Remove just the comment.
 721                                 $text = substr_replace( $text, '', $start, $end - $start );
 722                         }
 723                 }
 724                 return $text;
 725         }
 726
 727         /**
 728          * Takes attribute names and values for a tag and the tag name and
 729          * validates that the tag is allowed to be present.
 730          * This DOES NOT validate the attributes, nor does it validate the
 731          * tags themselves. This method only handles the special circumstances
 732          * where we may want to allow a tag within content but ONLY when it has
 733          * specific attributes set.
 734          *
 735          * @param string $params
 736          * @param string $element
 737          * @return bool
 738          */
 739         static function validateTag( $params, $element ) {
 740                 $params = self::decodeTagAttributes( $params );
 741
 742                 if ( $element == 'meta' || $element == 'link' ) {
 743                         if ( !isset( $params['itemprop'] ) ) {
 744                                 // <meta> and <link> must have an itemprop="" otherwise they are not valid or safe in content
 745                                 return false;
 746                         }
 747                         if ( $element == 'meta' && !isset( $params['content'] ) ) {
 748                                 // <meta> must have a content="" for the itemprop
 749                                 return false;
 750                         }
 751                         if ( $element == 'link' && !isset( $params['href'] ) ) {
 752                                 // <link> must have an associated href=""
 753                                 return false;
 754                         }
 755                 }
 756
 757                 return true;
 758         }
 759
 760         /**
 761          * Take an array of attribute names and values and normalize or discard
 762          * illegal values for the given element type.
 763          *
 764          * - Discards attributes not on a whitelist for the given element
 765          * - Unsafe style attributes are discarded
 766          * - Invalid id attributes are re-encoded
 767          *
 768          * @param array $attribs
 769          * @param string $element
 770          * @return array
 771          *
 772          * @todo Check for legal values where the DTD limits things.
 773          * @todo Check for unique id attribute :P
 774          */
 775         static function validateTagAttributes( $attribs, $element ) {
 776                 return self::validateAttributes( $attribs,
 777                         self::attributeWhitelist( $element ) );
 778         }
 779
 780         /**
 781          * Take an array of attribute names and values and normalize or discard
 782          * illegal values for the given whitelist.
 783          *
 784          * - Discards attributes not on the given whitelist
 785          * - Unsafe style attributes are discarded
 786          * - Invalid id attributes are re-encoded
 787          *
 788          * @param array $attribs
 789          * @param array $whitelist List of allowed attribute names
 790          * @return array
 791          *
 792          * @todo Check for legal values where the DTD limits things.
 793          * @todo Check for unique id attribute :P
 794          */
 795         static function validateAttributes( $attribs, $whitelist ) {
 796                 $whitelist = array_flip( $whitelist );
 797                 $hrefExp = '/^(' . wfUrlProtocols() . ')[^\s]+$/';
 798
 799                 $out = [];
 800                 foreach ( $attribs as $attribute => $value ) {
 801                         # Allow XML namespace declaration to allow RDFa
 802                         if ( preg_match( self::XMLNS_ATTRIBUTE_PATTERN, $attribute ) ) {
 803                                 if ( !preg_match( self::EVIL_URI_PATTERN, $value ) ) {
 804                                         $out[$attribute] = $value;
 805                                 }
 806
 807                                 continue;
 808                         }
 809
 810                         # Allow any attribute beginning with "data-"
 811                         # However:
 812                         # * Disallow data attributes used by MediaWiki code
 813                         # * Ensure that the attribute is not namespaced by banning
 814                         #   colons.
 815                         if ( !preg_match( '/^data-[^:]*$/i', $attribute )
 816                                 && !isset( $whitelist[$attribute] )
 817                                 || self::isReservedDataAttribute( $attribute )
 818                         ) {
 819                                 continue;
 820                         }
 821
 822                         # Strip javascript "expression" from stylesheets.
 823                         # https://msdn.microsoft.com/en-us/library/ms537634.aspx
 824                         if ( $attribute == 'style' ) {
 825                                 $value = self::checkCss( $value );
 826                         }
 827
 828                         # Escape HTML id attributes
 829                         if ( $attribute === 'id' ) {
 830                                 $value = self::escapeIdForAttribute( $value, self::ID_PRIMARY );
 831                         }
 832
 833                         # Escape HTML id reference lists
 834                         if ( $attribute === 'aria-describedby'
 835                                 || $attribute === 'aria-flowto'
 836                                 || $attribute === 'aria-labelledby'
 837                                 || $attribute === 'aria-owns'
 838                         ) {
 839                                 $value = self::escapeIdReferenceList( $value );
 840                         }
 841
 842                         // RDFa and microdata properties allow URLs, URIs and/or CURIs.
 843                         // Check them for sanity.
 844                         if ( $attribute === 'rel' || $attribute === 'rev'
 845                                 # RDFa
 846                                 || $attribute === 'about' || $attribute === 'property'
 847                                 || $attribute === 'resource' || $attribute === 'datatype'
 848                                 || $attribute === 'typeof'
 849                                 # HTML5 microdata
 850                                 || $attribute === 'itemid' || $attribute === 'itemprop'
 851                                 || $attribute === 'itemref' || $attribute === 'itemscope'
 852                                 || $attribute === 'itemtype'
 853                         ) {
 854                                 // Paranoia. Allow "simple" values but suppress javascript
 855                                 if ( preg_match( self::EVIL_URI_PATTERN, $value ) ) {
 856                                         continue;
 857                                 }
 858                         }
 859
 860                         # NOTE: even though elements using href/src are not allowed directly, supply
 861                         #       validation code that can be used by tag hook handlers, etc
 862                         if ( $attribute === 'href' || $attribute === 'src' || $attribute === 'poster' ) {
 863                                 if ( !preg_match( $hrefExp, $value ) ) {
 864                                         continue; // drop any href or src attributes not using an allowed protocol.
 865                                         // NOTE: this also drops all relative URLs
 866                                 }
 867                         }
 868
 869                         // If this attribute was previously set, override it.
 870                         // Output should only have one attribute of each name.
 871                         $out[$attribute] = $value;
 872                 }
 873
 874                 # itemtype, itemid, itemref don't make sense without itemscope
 875                 if ( !array_key_exists( 'itemscope', $out ) ) {
 876                         unset( $out['itemtype'] );
 877                         unset( $out['itemid'] );
 878                         unset( $out['itemref'] );
 879                 }
 880                 # TODO: Strip itemprop if we aren't descendants of an itemscope or pointed to by an itemref.
 881
 882                 return $out;
 883         }
 884
 885         /**
 886          * Given an attribute name, checks whether it is a reserved data attribute
 887          * (such as data-mw-foo) which is unavailable to user-generated HTML so MediaWiki
 888          * core and extension code can safely use it to communicate with frontend code.
 889          * @param string $attr Attribute name.
 890          * @return bool
 891          */
 892         public static function isReservedDataAttribute( $attr ) {
 893                 // data-ooui is reserved for ooui.
 894                 // data-mw and data-parsoid are reserved for parsoid.
 895                 // data-mw-<name here> is reserved for extensions (or core) if
 896                 // they need to communicate some data to the client and want to be
 897                 // sure that it isn't coming from an untrusted user.
 898                 // We ignore the possibility of namespaces since user-generated HTML
 899                 // can't use them anymore.
 900                 return (bool)preg_match( '/^data-(ooui|mw|parsoid)/i', $attr );
 901         }
 902
 903         /**
 904          * Merge two sets of HTML attributes.  Conflicting items in the second set
 905          * will override those in the first, except for 'class' attributes which
 906          * will be combined (if they're both strings).
 907          *
 908          * @todo implement merging for other attributes such as style
 909          * @param array $a
 910          * @param array $b
 911          * @return array
 912          */
 913         static function mergeAttributes( $a, $b ) {
 914                 $out = array_merge( $a, $b );
 915                 if ( isset( $a['class'] ) && isset( $b['class'] )
 916                         && is_string( $a['class'] ) && is_string( $b['class'] )
 917                         && $a['class'] !== $b['class']
 918                 ) {
 919                         $classes = preg_split( '/\s+/', "{$a['class']} {$b['class']}",
 920                                 -1, PREG_SPLIT_NO_EMPTY );
 921                         $out['class'] = implode( ' ', array_unique( $classes ) );
 922                 }
 923                 return $out;
 924         }
 925
 926         /**
 927          * Normalize CSS into a format we can easily search for hostile input
 928          *  - decode character references
 929          *  - decode escape sequences
 930          *  - convert characters that IE6 interprets into ascii
 931          *  - remove comments, unless the entire value is one single comment
 932          * @param string $value the css string
 933          * @return string normalized css
 934          */
 935         public static function normalizeCss( $value ) {
 936                 // Decode character references like &#123;
 937                 $value = self::decodeCharReferences( $value );
 938
 939                 // Decode escape sequences and line continuation
 940                 // See the grammar in the CSS 2 spec, appendix D.
 941                 // This has to be done AFTER decoding character references.
 942                 // This means it isn't possible for this function to return
 943                 // unsanitized escape sequences. It is possible to manufacture
 944                 // input that contains character references that decode to
 945                 // escape sequences that decode to character references, but
 946                 // it's OK for the return value to contain character references
 947                 // because the caller is supposed to escape those anyway.
 948                 static $decodeRegex;
 949                 if ( !$decodeRegex ) {
 950                         $space = '[\\x20\\t\\r\\n\\f]';
 951                         $nl = '(?:\\n|\\r\\n|\\r|\\f)';
 952                         $backslash = '\\\\';
 953                         $decodeRegex = "/ $backslash
 954                                 (?:
 955                                         ($nl) |  # 1. Line continuation
 956                                         ([0-9A-Fa-f]{1,6})$space? |  # 2. character number
 957                                         (.) | # 3. backslash cancelling special meaning
 958                                         () | # 4. backslash at end of string
 959                                 )/xu";
 960                 }
 961                 $value = preg_replace_callback( $decodeRegex,
 962                         [ __CLASS__, 'cssDecodeCallback' ], $value );
 963
 964                 // Normalize Halfwidth and Fullwidth Unicode block that IE6 might treat as ascii
 965                 $value = preg_replace_callback(
 966                         '/[！-［］-ｚ]/u', // U+FF01 to U+FF5A, excluding U+FF3C (T60088)
 967                         function ( $matches ) {
 968                                 $cp = UtfNormal\Utils::utf8ToCodepoint( $matches[0] );
 969                                 if ( $cp === false ) {
 970                                         return '';
 971                                 }
 972                                 return chr( $cp - 65248 ); // ASCII range \x21-\x7A
 973                         },
 974                         $value
 975                 );
 976
 977                 // Convert more characters IE6 might treat as ascii
 978                 // U+0280, U+0274, U+207F, U+029F, U+026A, U+207D, U+208D
 979                 $value = str_replace(
 980                         [ 'ʀ', 'ɴ', 'ⁿ', 'ʟ', 'ɪ', '⁽', '₍' ],
 981                         [ 'r', 'n', 'n', 'l', 'i', '(', '(' ],
 982                         $value
 983                 );
 984
 985                 // Let the value through if it's nothing but a single comment, to
 986                 // allow other functions which may reject it to pass some error
 987                 // message through.
 988                 if ( !preg_match( '! ^ \s* /\* [^*\\/]* \*/ \s* $ !x', $value ) ) {
 989                         // Remove any comments; IE gets token splitting wrong
 990                         // This must be done AFTER decoding character references and
 991                         // escape sequences, because those steps can introduce comments
 992                         // This step cannot introduce character references or escape
 993                         // sequences, because it replaces comments with spaces rather
 994                         // than removing them completely.
 995                         $value = StringUtils::delimiterReplace( '/*', '*/', ' ', $value );
 996
 997                         // Remove anything after a comment-start token, to guard against
 998                         // incorrect client implementations.
 999                         $commentPos = strpos( $value, '/*' );
1000                         if ( $commentPos !== false ) {
1001                                 $value = substr( $value, 0, $commentPos );
1002                         }
1003                 }
1004
1005                 // S followed by repeat, iteration, or prolonged sound marks,
1006                 // which IE will treat as "ss"
1007                 $value = preg_replace(
1008                         '/s(?:
1009                                 \xE3\x80\xB1 | # U+3031
1010                                 \xE3\x82\x9D | # U+309D
1011                                 \xE3\x83\xBC | # U+30FC
1012                                 \xE3\x83\xBD | # U+30FD
1013                                 \xEF\xB9\xBC | # U+FE7C
1014                                 \xEF\xB9\xBD | # U+FE7D
1015                                 \xEF\xBD\xB0   # U+FF70
1016                         )/ix',
1017                         'ss',
1018                         $value
1019                 );
1020
1021                 return $value;
1022         }
1023
1024         /**
1025          * Pick apart some CSS and check it for forbidden or unsafe structures.
1026          * Returns a sanitized string. This sanitized string will have
1027          * character references and escape sequences decoded and comments
1028          * stripped (unless it is itself one valid comment, in which case the value
1029          * will be passed through). If the input is just too evil, only a comment
1030          * complaining about evilness will be returned.
1031          *
1032          * Currently URL references, 'expression', 'tps' are forbidden.
1033          *
1034          * NOTE: Despite the fact that character references are decoded, the
1035          * returned string may contain character references given certain
1036          * clever input strings. These character references must
1037          * be escaped before the return value is embedded in HTML.
1038          *
1039          * @param string $value
1040          * @return string
1041          */
1042         static function checkCss( $value ) {
1043                 $value = self::normalizeCss( $value );
1044
1045                 // Reject problematic keywords and control characters
1046                 if ( preg_match( '/[\000-\010\013\016-\037\177]/', $value ) ||
1047                         strpos( $value, UtfNormal\Constants::UTF8_REPLACEMENT ) !== false ) {
1048                         return '/* invalid control char */';
1049                 } elseif ( preg_match(
1050                         '! expression
1051                                 | filter\s*:
1052                                 | accelerator\s*:
1053                                 | -o-link\s*:
1054                                 | -o-link-source\s*:
1055                                 | -o-replace\s*:
1056                                 | url\s*\(
1057                                 | image\s*\(
1058                                 | image-set\s*\(
1059                                 | attr\s*\([^)]+[\s,]+url
1060                         !ix', $value ) ) {
1061                         return '/* insecure input */';
1062                 }
1063                 return $value;
1064         }
1065
1066         /**
1067          * @param array $matches
1068          * @return string
1069          */
1070         static function cssDecodeCallback( $matches ) {
1071                 if ( $matches[1] !== '' ) {
1072                         // Line continuation
1073                         return '';
1074                 } elseif ( $matches[2] !== '' ) {
1075                         $char = UtfNormal\Utils::codepointToUtf8( hexdec( $matches[2] ) );
1076                 } elseif ( $matches[3] !== '' ) {
1077                         $char = $matches[3];
1078                 } else {
1079                         $char = '\\';
1080                 }
1081                 if ( $char == "\n" || $char == '"' || $char == "'" || $char == '\\' ) {
1082                         // These characters need to be escaped in strings
1083                         // Clean up the escape sequence to avoid parsing errors by clients
1084                         return '\\' . dechex( ord( $char ) ) . ' ';
1085                 } else {
1086                         // Decode unnecessary escape
1087                         return $char;
1088                 }
1089         }
1090
1091         /**
1092          * Take a tag soup fragment listing an HTML element's attributes
1093          * and normalize it to well-formed XML, discarding unwanted attributes.
1094          * Output is safe for further wikitext processing, with escaping of
1095          * values that could trigger problems.
1096          *
1097          * - Normalizes attribute names to lowercase
1098          * - Discards attributes not on a whitelist for the given element
1099          * - Turns broken or invalid entities into plaintext
1100          * - Double-quotes all attribute values
1101          * - Attributes without values are given the name as attribute
1102          * - Double attributes are discarded
1103          * - Unsafe style attributes are discarded
1104          * - Prepends space if there are attributes.
1105          * - (Optionally) Sorts attributes by name.
1106          *
1107          * @param string $text
1108          * @param string $element
1109          * @param bool $sorted Whether to sort the attributes (default: false)
1110          * @return string
1111          */
1112         static function fixTagAttributes( $text, $element, $sorted = false ) {
1113                 if ( trim( $text ) == '' ) {
1114                         return '';
1115                 }
1116
1117                 $decoded = self::decodeTagAttributes( $text );
1118                 $stripped = self::validateTagAttributes( $decoded, $element );
1119
1120                 if ( $sorted ) {
1121                         ksort( $stripped );
1122                 }
1123
1124                 return self::safeEncodeTagAttributes( $stripped );
1125         }
1126
1127         /**
1128          * Encode an attribute value for HTML output.
1129          * @param string $text
1130          * @return string HTML-encoded text fragment
1131          */
1132         static function encodeAttribute( $text ) {
1133                 $encValue = htmlspecialchars( $text, ENT_QUOTES );
1134
1135                 // Whitespace is normalized during attribute decoding,
1136                 // so if we've been passed non-spaces we must encode them
1137                 // ahead of time or they won't be preserved.
1138                 $encValue = strtr( $encValue, [
1139                         "\n" => '&#10;',
1140                         "\r" => '&#13;',
1141                         "\t" => '&#9;',
1142                 ] );
1143
1144                 return $encValue;
1145         }
1146
1147         /**
1148          * Armor French spaces with a replacement character
1149          *
1150          * @since 1.32
1151          * @param string $text Text to armor
1152          * @param string $space Space character for the French spaces, defaults to '&#160;'
1153          * @return string Armored text
1154          */
1155         public static function armorFrenchSpaces( $text, $space = '&#160;' ) {
1156                 // Replace $ with \$ and \ with \\
1157                 $space = preg_replace( '#(?<!\\\\)(\\$|\\\\)#', '\\\\$1', $space );
1158                 $fixtags = [
1159                         # French spaces, last one Guillemet-left
1160                         # only if there is something before the space
1161                         # and a non-word character after the punctuation.
1162                         '/(\S) (?=[?:;!%»›](?!\w))/u' => "\\1$space",
1163                         # French spaces, Guillemet-right
1164                         '/([«‹]) /u' => "\\1$space",
1165                 ];
1166                 return preg_replace( array_keys( $fixtags ), array_values( $fixtags ), $text );
1167         }
1168
1169         /**
1170          * Encode an attribute value for HTML tags, with extra armoring
1171          * against further wiki processing.
1172          * @param string $text
1173          * @return string HTML-encoded text fragment
1174          */
1175         static function safeEncodeAttribute( $text ) {
1176                 $encValue = self::encodeAttribute( $text );
1177
1178                 # Templates and links may be expanded in later parsing,
1179                 # creating invalid or dangerous output. Suppress this.
1180                 $encValue = strtr( $encValue, [
1181                         '<'    => '&lt;',   // This should never happen,
1182                         '>'    => '&gt;',   // we've received invalid input
1183                         '"'    => '&quot;', // which should have been escaped.
1184                         '{'    => '&#123;',
1185                         '}'    => '&#125;', // prevent unpaired language conversion syntax
1186                         '['    => '&#91;',
1187                         ']'    => '&#93;',
1188                         "''"   => '&#39;&#39;',
1189                         'ISBN' => '&#73;SBN',
1190                         'RFC'  => '&#82;FC',
1191                         'PMID' => '&#80;MID',
1192                         '|'    => '&#124;',
1193                         '__'   => '&#95;_',
1194                 ] );
1195
1196                 # Armor against French spaces detection (T5158)
1197                 $encValue = self::armorFrenchSpaces( $encValue, '&#32;' );
1198
1199                 # Stupid hack
1200                 $encValue = preg_replace_callback(
1201                         '/((?i)' . wfUrlProtocols() . ')/',
1202                         function ( $matches ) {
1203                                 return str_replace( ':', '&#58;', $matches[1] );
1204                         },
1205                         $encValue );
1206                 return $encValue;
1207         }
1208
1209         /**
1210          * Given a value, escape it so that it can be used in an id attribute and
1211          * return it.  This will use HTML5 validation, allowing anything but ASCII
1212          * whitespace.
1213          *
1214          * To ensure we don't have to bother escaping anything, we also strip ', ".
1215          * TODO: Is this the best tactic?
1216          *
1217          * We also strip # because it upsets IE, and % because it could be
1218          * ambiguous if it's part of something that looks like a percent escape
1219          * (which don't work reliably in fragments cross-browser).
1220          *
1221          * @deprecated since 1.30, use one of this class' escapeIdFor*() functions
1222          *
1223          * @see https://www.w3.org/TR/html401/types.html#type-name Valid characters
1224          *   in the id and name attributes
1225          * @see https://www.w3.org/TR/html401/struct/links.html#h-12.2.3 Anchors with
1226          *   the id attribute
1227          * @see https://www.w3.org/TR/html5/dom.html#the-id-attribute
1228          *   HTML5 definition of id attribute
1229          *
1230          * @param string $id Id to escape
1231          * @param string|array $options String or array of strings (default is array()):
1232          *   'noninitial': This is a non-initial fragment of an id, not a full id,
1233          *       so don't pay attention if the first character isn't valid at the
1234          *       beginning of an id.
1235          * @return string
1236          */
1237         static function escapeId( $id, $options = [] ) {
1238                 $options = (array)$options;
1239
1240                 // HTML4-style escaping
1241                 static $replace = [
1242                         '%3A' => ':',
1243                         '%' => '.'
1244                 ];
1245
1246                 $id = urlencode( strtr( $id, ' ', '_' ) );
1247                 $id = strtr( $id, $replace );
1248
1249                 if ( !preg_match( '/^[a-zA-Z]/', $id ) && !in_array( 'noninitial', $options ) ) {
1250                         // Initial character must be a letter!
1251                         $id = "x$id";
1252                 }
1253                 return $id;
1254         }
1255
1256         /**
1257          * Given a section name or other user-generated or otherwise unsafe string, escapes it to be
1258          * a valid HTML id attribute.
1259          *
1260          * WARNING: unlike escapeId(), the output of this function is not guaranteed to be HTML safe,
1261          * be sure to use proper escaping.
1262          *
1263          * @param string $id String to escape
1264          * @param int $mode One of ID_* constants, specifying whether the primary or fallback encoding
1265          *     should be used.
1266          * @return string|bool Escaped ID or false if fallback encoding is requested but it's not
1267          *     configured.
1268          *
1269          * @since 1.30
1270          */
1271         public static function escapeIdForAttribute( $id, $mode = self::ID_PRIMARY ) {
1272                 global $wgFragmentMode;
1273
1274                 if ( !isset( $wgFragmentMode[$mode] ) ) {
1275                         if ( $mode === self::ID_PRIMARY ) {
1276                                 throw new UnexpectedValueException( '$wgFragmentMode is configured with no primary mode' );
1277                         }
1278                         return false;
1279                 }
1280
1281                 $internalMode = $wgFragmentMode[$mode];
1282
1283                 return self::escapeIdInternal( $id, $internalMode );
1284         }
1285
1286         /**
1287          * Given a section name or other user-generated or otherwise unsafe string, escapes it to be
1288          * a valid URL fragment.
1289          *
1290          * WARNING: unlike escapeId(), the output of this function is not guaranteed to be HTML safe,
1291          * be sure to use proper escaping.
1292          *
1293          * @param string $id String to escape
1294          * @return string Escaped ID
1295          *
1296          * @since 1.30
1297          */
1298         public static function escapeIdForLink( $id ) {
1299                 global $wgFragmentMode;
1300
1301                 if ( !isset( $wgFragmentMode[self::ID_PRIMARY] ) ) {
1302                         throw new UnexpectedValueException( '$wgFragmentMode is configured with no primary mode' );
1303                 }
1304
1305                 $mode = $wgFragmentMode[self::ID_PRIMARY];
1306
1307                 $id = self::escapeIdInternal( $id, $mode );
1308
1309                 return $id;
1310         }
1311
1312         /**
1313          * Given a section name or other user-generated or otherwise unsafe string, escapes it to be
1314          * a valid URL fragment for external interwikis.
1315          *
1316          * @param string $id String to escape
1317          * @return string Escaped ID
1318          *
1319          * @since 1.30
1320          */
1321         public static function escapeIdForExternalInterwiki( $id ) {
1322                 global $wgExternalInterwikiFragmentMode;
1323
1324                 $id = self::escapeIdInternal( $id, $wgExternalInterwikiFragmentMode );
1325
1326                 return $id;
1327         }
1328
1329         /**
1330          * Helper for escapeIdFor*() functions. Performs most of the actual escaping.
1331          *
1332          * @param string $id String to escape
1333          * @param string $mode One of modes from $wgFragmentMode
1334          * @return string
1335          */
1336         private static function escapeIdInternal( $id, $mode ) {
1337                 switch ( $mode ) {
1338                         case 'html5':
1339                                 $id = str_replace( ' ', '_', $id );
1340                                 break;
1341                         case 'legacy':
1342                                 // This corresponds to 'noninitial' mode of the old escapeId()
1343                                 static $replace = [
1344                                         '%3A' => ':',
1345                                         '%' => '.'
1346                                 ];
1347
1348                                 $id = urlencode( str_replace( ' ', '_', $id ) );
1349                                 $id = strtr( $id, $replace );
1350                                 break;
1351                         default:
1352                                 throw new InvalidArgumentException( "Invalid mode '$mode' passed to '" . __METHOD__ );
1353                 }
1354
1355                 return $id;
1356         }
1357
1358         /**
1359          * Given a string containing a space delimited list of ids, escape each id
1360          * to match ids escaped by the escapeId() function.
1361          *
1362          * @todo remove $options completely in 1.32
1363          *
1364          * @since 1.27
1365          *
1366          * @param string $referenceString Space delimited list of ids
1367          * @param string|array $options Deprecated and does nothing.
1368          * @return string
1369          */
1370         static function escapeIdReferenceList( $referenceString, $options = [] ) {
1371                 if ( $options ) {
1372                         wfDeprecated( __METHOD__ . ' with $options', '1.31' );
1373                 }
1374                 # Explode the space delimited list string into an array of tokens
1375                 $references = preg_split( '/\s+/', "{$referenceString}", -1, PREG_SPLIT_NO_EMPTY );
1376
1377                 # Escape each token as an id
1378                 foreach ( $references as &$ref ) {
1379                         $ref = self::escapeIdForAttribute( $ref );
1380                 }
1381
1382                 # Merge the array back to a space delimited list string
1383                 # If the array is empty, the result will be an empty string ('')
1384                 $referenceString = implode( ' ', $references );
1385
1386                 return $referenceString;
1387         }
1388
1389         /**
1390          * Given a value, escape it so that it can be used as a CSS class and
1391          * return it.
1392          *
1393          * @todo For extra validity, input should be validated UTF-8.
1394          *
1395          * @see https://www.w3.org/TR/CSS21/syndata.html Valid characters/format
1396          *
1397          * @param string $class
1398          * @return string
1399          */
1400         static function escapeClass( $class ) {
1401                 // Convert ugly stuff to underscores and kill underscores in ugly places
1402                 return rtrim( preg_replace(
1403                         [ '/(^[0-9\\-])|[\\x00-\\x20!"#$%&\'()*+,.\\/:;<=>?@[\\]^`{|}~]|\\xC2\\xA0/', '/_+/' ],
1404                         '_',
1405                         $class ), '_' );
1406         }
1407
1408         /**
1409          * Given HTML input, escape with htmlspecialchars but un-escape entities.
1410          * This allows (generally harmless) entities like &#160; to survive.
1411          *
1412          * @param string $html HTML to escape
1413          * @return string Escaped input
1414          */
1415         static function escapeHtmlAllowEntities( $html ) {
1416                 $html = self::decodeCharReferences( $html );
1417                 # It seems wise to escape ' as well as ", as a matter of course.  Can't
1418                 # hurt. Use ENT_SUBSTITUTE so that incorrectly truncated multibyte characters
1419                 # don't cause the entire string to disappear.
1420                 $html = htmlspecialchars( $html, ENT_QUOTES | ENT_SUBSTITUTE );
1421                 return $html;
1422         }
1423
1424         /**
1425          * Return an associative array of attribute names and values from
1426          * a partial tag string. Attribute names are forced to lowercase,
1427          * character references are decoded to UTF-8 text.
1428          *
1429          * @param string $text
1430          * @return array
1431          */
1432         public static function decodeTagAttributes( $text ) {
1433                 if ( trim( $text ) == '' ) {
1434                         return [];
1435                 }
1436
1437                 $attribs = [];
1438                 $pairs = [];
1439                 if ( !preg_match_all(
1440                         self::getAttribsRegex(),
1441                         $text,
1442                         $pairs,
1443                         PREG_SET_ORDER ) ) {
1444                         return $attribs;
1445                 }
1446
1447                 foreach ( $pairs as $set ) {
1448                         $attribute = strtolower( $set[1] );
1449                         $value = self::getTagAttributeCallback( $set );
1450
1451                         // Normalize whitespace
1452                         $value = preg_replace( '/[\t\r\n ]+/', ' ', $value );
1453                         $value = trim( $value );
1454
1455                         // Decode character references
1456                         $attribs[$attribute] = self::decodeCharReferences( $value );
1457                 }
1458                 return $attribs;
1459         }
1460
1461         /**
1462          * Build a partial tag string from an associative array of attribute
1463          * names and values as returned by decodeTagAttributes.
1464          *
1465          * @param array $assoc_array
1466          * @return string
1467          */
1468         public static function safeEncodeTagAttributes( $assoc_array ) {
1469                 $attribs = [];
1470                 foreach ( $assoc_array as $attribute => $value ) {
1471                         $encAttribute = htmlspecialchars( $attribute );
1472                         $encValue = self::safeEncodeAttribute( $value );
1473
1474                         $attribs[] = "$encAttribute=\"$encValue\"";
1475                 }
1476                 return count( $attribs ) ? ' ' . implode( ' ', $attribs ) : '';
1477         }
1478
1479         /**
1480          * Pick the appropriate attribute value from a match set from the
1481          * attribs regex matches.
1482          *
1483          * @param array $set
1484          * @throws MWException When tag conditions are not met.
1485          * @return string
1486          */
1487         private static function getTagAttributeCallback( $set ) {
1488                 if ( isset( $set[5] ) ) {
1489                         # No quotes.
1490                         return $set[5];
1491                 } elseif ( isset( $set[4] ) ) {
1492                         # Single-quoted
1493                         return $set[4];
1494                 } elseif ( isset( $set[3] ) ) {
1495                         # Double-quoted
1496                         return $set[3];
1497                 } elseif ( !isset( $set[2] ) ) {
1498                         # In XHTML, attributes must have a value so return an empty string.
1499                         # See "Empty attribute syntax",
1500                         # https://www.w3.org/TR/html5/syntax.html#syntax-attribute-name
1501                         return "";
1502                 } else {
1503                         throw new MWException( "Tag conditions not met. This should never happen and is a bug." );
1504                 }
1505         }
1506
1507         /**
1508          * @param string $text
1509          * @return string
1510          */
1511         private static function normalizeWhitespace( $text ) {
1512                 return trim( preg_replace(
1513                         '/(?:\r\n|[\x20\x0d\x0a\x09])+/',
1514                         ' ',
1515                         $text ) );
1516         }
1517
1518         /**
1519          * Normalizes whitespace in a section name, such as might be returned
1520          * by Parser::stripSectionName(), for use in the id's that are used for
1521          * section links.
1522          *
1523          * @param string $section
1524          * @return string
1525          */
1526         static function normalizeSectionNameWhitespace( $section ) {
1527                 return trim( preg_replace( '/[ _]+/', ' ', $section ) );
1528         }
1529
1530         /**
1531          * Ensure that any entities and character references are legal
1532          * for XML and XHTML specifically. Any stray bits will be
1533          * &amp;-escaped to result in a valid text fragment.
1534          *
1535          * a. named char refs can only be &lt; &gt; &amp; &quot;, others are
1536          *   numericized (this way we're well-formed even without a DTD)
1537          * b. any numeric char refs must be legal chars, not invalid or forbidden
1538          * c. use lower cased "&#x", not "&#X"
1539          * d. fix or reject non-valid attributes
1540          *
1541          * @param string $text
1542          * @return string
1543          * @private
1544          */
1545         static function normalizeCharReferences( $text ) {
1546                 return preg_replace_callback(
1547                         self::CHAR_REFS_REGEX,
1548                         [ self::class, 'normalizeCharReferencesCallback' ],
1549                         $text );
1550         }
1551
1552         /**
1553          * @param string $matches
1554          * @return string
1555          */
1556         static function normalizeCharReferencesCallback( $matches ) {
1557                 $ret = null;
1558                 if ( $matches[1] != '' ) {
1559                         $ret = self::normalizeEntity( $matches[1] );
1560                 } elseif ( $matches[2] != '' ) {
1561                         $ret = self::decCharReference( $matches[2] );
1562                 } elseif ( $matches[3] != '' ) {
1563                         $ret = self::hexCharReference( $matches[3] );
1564                 }
1565                 if ( is_null( $ret ) ) {
1566                         return htmlspecialchars( $matches[0] );
1567                 } else {
1568                         return $ret;
1569                 }
1570         }
1571
1572         /**
1573          * If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD,
1574          * return the equivalent numeric entity reference (except for the core &lt;
1575          * &gt; &amp; &quot;). If the entity is a MediaWiki-specific alias, returns
1576          * the HTML equivalent. Otherwise, returns HTML-escaped text of
1577          * pseudo-entity source (eg &amp;foo;)
1578          *
1579          * @param string $name
1580          * @return string
1581          */
1582         static function normalizeEntity( $name ) {
1583                 if ( isset( self::$htmlEntityAliases[$name] ) ) {
1584                         return '&' . self::$htmlEntityAliases[$name] . ';';
1585                 } elseif ( in_array( $name, [ 'lt', 'gt', 'amp', 'quot' ] ) ) {
1586                         return "&$name;";
1587                 } elseif ( isset( self::$htmlEntities[$name] ) ) {
1588                         return '&#' . self::$htmlEntities[$name] . ';';
1589                 } else {
1590                         return "&amp;$name;";
1591                 }
1592         }
1593
1594         /**
1595          * @param int $codepoint
1596          * @return null|string
1597          */
1598         static function decCharReference( $codepoint ) {
1599                 $point = intval( $codepoint );
1600                 if ( self::validateCodepoint( $point ) ) {
1601                         return sprintf( '&#%d;', $point );
1602                 } else {
1603                         return null;
1604                 }
1605         }
1606
1607         /**
1608          * @param int $codepoint
1609          * @return null|string
1610          */
1611         static function hexCharReference( $codepoint ) {
1612                 $point = hexdec( $codepoint );
1613                 if ( self::validateCodepoint( $point ) ) {
1614                         return sprintf( '&#x%x;', $point );
1615                 } else {
1616                         return null;
1617                 }
1618         }
1619
1620         /**
1621          * Returns true if a given Unicode codepoint is a valid character in
1622          * both HTML5 and XML.
1623          * @param int $codepoint
1624          * @return bool
1625          */
1626         private static function validateCodepoint( $codepoint ) {
1627                 # U+000C is valid in HTML5 but not allowed in XML.
1628                 # U+000D is valid in XML but not allowed in HTML5.
1629                 # U+007F - U+009F are disallowed in HTML5 (control characters).
1630                 return $codepoint == 0x09
1631                         || $codepoint == 0x0a
1632                         || ( $codepoint >= 0x20 && $codepoint <= 0x7e )
1633                         || ( $codepoint >= 0xa0 && $codepoint <= 0xd7ff )
1634                         || ( $codepoint >= 0xe000 && $codepoint <= 0xfffd )
1635                         || ( $codepoint >= 0x10000 && $codepoint <= 0x10ffff );
1636         }
1637
1638         /**
1639          * Decode any character references, numeric or named entities,
1640          * in the text and return a UTF-8 string.
1641          *
1642          * @param string $text
1643          * @return string
1644          */
1645         public static function decodeCharReferences( $text ) {
1646                 return preg_replace_callback(
1647                         self::CHAR_REFS_REGEX,
1648                         [ self::class, 'decodeCharReferencesCallback' ],
1649                         $text );
1650         }
1651
1652         /**
1653          * Decode any character references, numeric or named entities,
1654          * in the next and normalize the resulting string. (T16952)
1655          *
1656          * This is useful for page titles, not for text to be displayed,
1657          * MediaWiki allows HTML entities to escape normalization as a feature.
1658          *
1659          * @param string $text Already normalized, containing entities
1660          * @return string Still normalized, without entities
1661          */
1662         public static function decodeCharReferencesAndNormalize( $text ) {
1663                 $text = preg_replace_callback(
1664                         self::CHAR_REFS_REGEX,
1665                         [ self::class, 'decodeCharReferencesCallback' ],
1666                         $text,
1667                         -1, //limit
1668                         $count
1669                 );
1670
1671                 if ( $count ) {
1672                         return MediaWikiServices::getInstance()->getContentLanguage()->normalize( $text );
1673                 } else {
1674                         return $text;
1675                 }
1676         }
1677
1678         /**
1679          * @param string $matches
1680          * @return string
1681          */
1682         static function decodeCharReferencesCallback( $matches ) {
1683                 if ( $matches[1] != '' ) {
1684                         return self::decodeEntity( $matches[1] );
1685                 } elseif ( $matches[2] != '' ) {
1686                         return self::decodeChar( intval( $matches[2] ) );
1687                 } elseif ( $matches[3] != '' ) {
1688                         return self::decodeChar( hexdec( $matches[3] ) );
1689                 }
1690                 # Last case should be an ampersand by itself
1691                 return $matches[0];
1692         }
1693
1694         /**
1695          * Return UTF-8 string for a codepoint if that is a valid
1696          * character reference, otherwise U+FFFD REPLACEMENT CHARACTER.
1697          * @param int $codepoint
1698          * @return string
1699          * @private
1700          */
1701         static function decodeChar( $codepoint ) {
1702                 if ( self::validateCodepoint( $codepoint ) ) {
1703                         return UtfNormal\Utils::codepointToUtf8( $codepoint );
1704                 } else {
1705                         return UtfNormal\Constants::UTF8_REPLACEMENT;
1706                 }
1707         }
1708
1709         /**
1710          * If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD,
1711          * return the UTF-8 encoding of that character. Otherwise, returns
1712          * pseudo-entity source (eg "&foo;")
1713          *
1714          * @param string $name
1715          * @return string
1716          */
1717         static function decodeEntity( $name ) {
1718                 if ( isset( self::$htmlEntityAliases[$name] ) ) {
1719                         $name = self::$htmlEntityAliases[$name];
1720                 }
1721                 if ( isset( self::$htmlEntities[$name] ) ) {
1722                         return UtfNormal\Utils::codepointToUtf8( self::$htmlEntities[$name] );
1723                 } else {
1724                         return "&$name;";
1725                 }
1726         }
1727
1728         /**
1729          * Fetch the whitelist of acceptable attributes for a given element name.
1730          *
1731          * @param string $element
1732          * @return array
1733          */
1734         static function attributeWhitelist( $element ) {
1735                 $list = self::setupAttributeWhitelist();
1736                 return $list[$element] ?? [];
1737         }
1738
1739         /**
1740          * Foreach array key (an allowed HTML element), return an array
1741          * of allowed attributes
1742          * @return array
1743          */
1744         static function setupAttributeWhitelist() {
1745                 static $whitelist;
1746
1747                 if ( $whitelist !== null ) {
1748                         return $whitelist;
1749                 }
1750
1751                 $common = [
1752                         # HTML
1753                         'id',
1754                         'class',
1755                         'style',
1756                         'lang',
1757                         'dir',
1758                         'title',
1759
1760                         # WAI-ARIA
1761                         'aria-describedby',
1762                         'aria-flowto',
1763                         'aria-label',
1764                         'aria-labelledby',
1765                         'aria-owns',
1766                         'role',
1767
1768                         # RDFa
1769                         # These attributes are specified in section 9 of
1770                         # https://www.w3.org/TR/2008/REC-rdfa-syntax-20081014
1771                         'about',
1772                         'property',
1773                         'resource',
1774                         'datatype',
1775                         'typeof',
1776
1777                         # Microdata. These are specified by
1778                         # https://html.spec.whatwg.org/multipage/microdata.html#the-microdata-model
1779                         'itemid',
1780                         'itemprop',
1781                         'itemref',
1782                         'itemscope',
1783                         'itemtype',
1784                 ];
1785
1786                 $block = array_merge( $common, [ 'align' ] );
1787                 $tablealign = [ 'align', 'valign' ];
1788                 $tablecell = [
1789                         'abbr',
1790                         'axis',
1791                         'headers',
1792                         'scope',
1793                         'rowspan',
1794                         'colspan',
1795                         'nowrap', # deprecated
1796                         'width', # deprecated
1797                         'height', # deprecated
1798                         'bgcolor', # deprecated
1799                 ];
1800
1801                 # Numbers refer to sections in HTML 4.01 standard describing the element.
1802                 # See: https://www.w3.org/TR/html4/
1803                 $whitelist = [
1804                         # 7.5.4
1805                         'div'        => $block,
1806                         'center'     => $common, # deprecated
1807                         'span'       => $common,
1808
1809                         # 7.5.5
1810                         'h1'         => $block,
1811                         'h2'         => $block,
1812                         'h3'         => $block,
1813                         'h4'         => $block,
1814                         'h5'         => $block,
1815                         'h6'         => $block,
1816
1817                         # 7.5.6
1818                         # address
1819
1820                         # 8.2.4
1821                         'bdo'        => $common,
1822
1823                         # 9.2.1
1824                         'em'         => $common,
1825                         'strong'     => $common,
1826                         'cite'       => $common,
1827                         'dfn'        => $common,
1828                         'code'       => $common,
1829                         'samp'       => $common,
1830                         'kbd'        => $common,
1831                         'var'        => $common,
1832                         'abbr'       => $common,
1833                         # acronym
1834
1835                         # 9.2.2
1836                         'blockquote' => array_merge( $common, [ 'cite' ] ),
1837                         'q'          => array_merge( $common, [ 'cite' ] ),
1838
1839                         # 9.2.3
1840                         'sub'        => $common,
1841                         'sup'        => $common,
1842
1843                         # 9.3.1
1844                         'p'          => $block,
1845
1846                         # 9.3.2
1847                         'br'         => array_merge( $common, [ 'clear' ] ),
1848
1849                         # https://www.w3.org/TR/html5/text-level-semantics.html#the-wbr-element
1850                         'wbr'        => $common,
1851
1852                         # 9.3.4
1853                         'pre'        => array_merge( $common, [ 'width' ] ),
1854
1855                         # 9.4
1856                         'ins'        => array_merge( $common, [ 'cite', 'datetime' ] ),
1857                         'del'        => array_merge( $common, [ 'cite', 'datetime' ] ),
1858
1859                         # 10.2
1860                         'ul'         => array_merge( $common, [ 'type' ] ),
1861                         'ol'         => array_merge( $common, [ 'type', 'start', 'reversed' ] ),
1862                         'li'         => array_merge( $common, [ 'type', 'value' ] ),
1863
1864                         # 10.3
1865                         'dl'         => $common,
1866                         'dd'         => $common,
1867                         'dt'         => $common,
1868
1869                         # 11.2.1
1870                         'table'      => array_merge( $common,
1871                                                                 [ 'summary', 'width', 'border', 'frame',
1872                                                                                 'rules', 'cellspacing', 'cellpadding',
1873                                                                                 'align', 'bgcolor',
1874                                                                 ] ),
1875
1876                         # 11.2.2
1877                         'caption'    => $block,
1878
1879                         # 11.2.3
1880                         'thead'      => $common,
1881                         'tfoot'      => $common,
1882                         'tbody'      => $common,
1883
1884                         # 11.2.4
1885                         'colgroup'   => array_merge( $common, [ 'span' ] ),
1886                         'col'        => array_merge( $common, [ 'span' ] ),
1887
1888                         # 11.2.5
1889                         'tr'         => array_merge( $common, [ 'bgcolor' ], $tablealign ),
1890
1891                         # 11.2.6
1892                         'td'         => array_merge( $common, $tablecell, $tablealign ),
1893                         'th'         => array_merge( $common, $tablecell, $tablealign ),
1894
1895                         # 12.2
1896                         # NOTE: <a> is not allowed directly, but the attrib
1897                         # whitelist is used from the Parser object
1898                         'a'          => array_merge( $common, [ 'href', 'rel', 'rev' ] ), # rel/rev esp. for RDFa
1899
1900                         # 13.2
1901                         # Not usually allowed, but may be used for extension-style hooks
1902                         # such as <math> when it is rasterized, or if $wgAllowImageTag is
1903                         # true
1904                         'img'        => array_merge( $common, [ 'alt', 'src', 'width', 'height', 'srcset' ] ),
1905
1906                         'video'      => array_merge( $common, [ 'poster', 'controls', 'preload', 'width', 'height' ] ),
1907                         'source'     => array_merge( $common, [ 'type', 'src' ] ),
1908                         'track'      => array_merge( $common, [ 'type', 'src', 'srclang', 'kind', 'label' ] ),
1909
1910                         # 15.2.1
1911                         'tt'         => $common,
1912                         'b'          => $common,
1913                         'i'          => $common,
1914                         'big'        => $common,
1915                         'small'      => $common,
1916                         'strike'     => $common,
1917                         's'          => $common,
1918                         'u'          => $common,
1919
1920                         # 15.2.2
1921                         'font'       => array_merge( $common, [ 'size', 'color', 'face' ] ),
1922                         # basefont
1923
1924                         # 15.3
1925                         'hr'         => array_merge( $common, [ 'width' ] ),
1926
1927                         # HTML Ruby annotation text module, simple ruby only.
1928                         # https://www.w3.org/TR/html5/text-level-semantics.html#the-ruby-element
1929                         'ruby'       => $common,
1930                         # rbc
1931                         'rb'         => $common,
1932                         'rp'         => $common,
1933                         'rt'         => $common, # array_merge( $common, array( 'rbspan' ) ),
1934                         'rtc'        => $common,
1935
1936                         # MathML root element, where used for extensions
1937                         # 'title' may not be 100% valid here; it's XHTML
1938                         # https://www.w3.org/TR/REC-MathML/
1939                         'math'       => [ 'class', 'style', 'id', 'title' ],
1940
1941                         // HTML 5 section 4.5
1942                         'figure'     => $common,
1943                         'figcaption' => $common,
1944
1945                         # HTML 5 section 4.6
1946                         'bdi' => $common,
1947
1948                         # HTML5 elements, defined by:
1949                         # https://html.spec.whatwg.org/multipage/semantics.html#the-data-element
1950                         'data' => array_merge( $common, [ 'value' ] ),
1951                         'time' => array_merge( $common, [ 'datetime' ] ),
1952                         'mark' => $common,
1953
1954                         // meta and link are only permitted by removeHTMLtags when Microdata
1955                         // is enabled so we don't bother adding a conditional to hide these
1956                         // Also meta and link are only valid in WikiText as Microdata elements
1957                         // (ie: validateTag rejects tags missing the attributes needed for Microdata)
1958                         // So we don't bother including $common attributes that have no purpose.
1959                         'meta' => [ 'itemprop', 'content' ],
1960                         'link' => [ 'itemprop', 'href', 'title' ],
1961                 ];
1962
1963                 return $whitelist;
1964         }
1965
1966         /**
1967          * Take a fragment of (potentially invalid) HTML and return
1968          * a version with any tags removed, encoded as plain text.
1969          *
1970          * Warning: this return value must be further escaped for literal
1971          * inclusion in HTML output as of 1.10!
1972          *
1973          * @param string $html HTML fragment
1974          * @return string
1975          */
1976         static function stripAllTags( $html ) {
1977                 // Use RemexHtml to tokenize $html and extract the text
1978                 $handler = new RemexStripTagHandler;
1979                 $tokenizer = new RemexHtml\Tokenizer\Tokenizer( $handler, $html, [
1980                         'ignoreErrors' => true,
1981                         // don't ignore char refs, we want them to be decoded
1982                         'ignoreNulls' => true,
1983                         'skipPreprocess' => true,
1984                 ] );
1985                 $tokenizer->execute();
1986                 $text = $handler->getResult();
1987
1988                 $text = self::normalizeWhitespace( $text );
1989                 return $text;
1990         }
1991
1992         /**
1993          * Hack up a private DOCTYPE with HTML's standard entity declarations.
1994          * PHP 4 seemed to know these if you gave it an HTML doctype, but
1995          * PHP 5.1 doesn't.
1996          *
1997          * Use for passing XHTML fragments to PHP's XML parsing functions
1998          *
1999          * @return string
2000          */
2001         static function hackDocType() {
2002                 $out = "<!DOCTYPE html [\n";
2003                 foreach ( self::$htmlEntities as $entity => $codepoint ) {
2004                         $out .= "<!ENTITY $entity \"&#$codepoint;\">";
2005                 }
2006                 $out .= "]>\n";
2007                 return $out;
2008         }
2009
2010         /**
2011          * @param string $url
2012          * @return mixed|string
2013          */
2014         static function cleanUrl( $url ) {
2015                 # Normalize any HTML entities in input. They will be
2016                 # re-escaped by makeExternalLink().
2017                 $url = self::decodeCharReferences( $url );
2018
2019                 # Escape any control characters introduced by the above step
2020                 $url = preg_replace_callback( '/[\][<>"\\x00-\\x20\\x7F\|]/',
2021                         [ __CLASS__, 'cleanUrlCallback' ], $url );
2022
2023                 # Validate hostname portion
2024                 $matches = [];
2025                 if ( preg_match( '!^([^:]+:)(//[^/]+)?(.*)$!iD', $url, $matches ) ) {
2026                         list( /* $whole */, $protocol, $host, $rest ) = $matches;
2027
2028                         // Characters that will be ignored in IDNs.
2029                         // https://tools.ietf.org/html/rfc3454#section-3.1
2030                         // Strip them before further processing so blacklists and such work.
2031                         $strip = "/
2032                                 \\s|          # general whitespace
2033                                 \xc2\xad|     # 00ad SOFT HYPHEN
2034                                 \xe1\xa0\x86| # 1806 MONGOLIAN TODO SOFT HYPHEN
2035                                 \xe2\x80\x8b| # 200b ZERO WIDTH SPACE
2036                                 \xe2\x81\xa0| # 2060 WORD JOINER
2037                                 \xef\xbb\xbf| # feff ZERO WIDTH NO-BREAK SPACE
2038                                 \xcd\x8f|     # 034f COMBINING GRAPHEME JOINER
2039                                 \xe1\xa0\x8b| # 180b MONGOLIAN FREE VARIATION SELECTOR ONE
2040                                 \xe1\xa0\x8c| # 180c MONGOLIAN FREE VARIATION SELECTOR TWO
2041                                 \xe1\xa0\x8d| # 180d MONGOLIAN FREE VARIATION SELECTOR THREE
2042                                 \xe2\x80\x8c| # 200c ZERO WIDTH NON-JOINER
2043                                 \xe2\x80\x8d| # 200d ZERO WIDTH JOINER
2044                                 [\xef\xb8\x80-\xef\xb8\x8f] # fe00-fe0f VARIATION SELECTOR-1-16
2045                                 /xuD";
2046
2047                         $host = preg_replace( $strip, '', $host );
2048
2049                         // IPv6 host names are bracketed with [].  Url-decode these.
2050                         if ( substr_compare( "//%5B", $host, 0, 5 ) === 0 &&
2051                                 preg_match( '!^//%5B([0-9A-Fa-f:.]+)%5D((:\d+)?)$!', $host, $matches )
2052                         ) {
2053                                 $host = '//[' . $matches[1] . ']' . $matches[2];
2054                         }
2055
2056                         // @todo FIXME: Validate hostnames here
2057
2058                         return $protocol . $host . $rest;
2059                 } else {
2060                         return $url;
2061                 }
2062         }
2063
2064         /**
2065          * @param array $matches
2066          * @return string
2067          */
2068         static function cleanUrlCallback( $matches ) {
2069                 return urlencode( $matches[0] );
2070         }
2071
2072         /**
2073          * Does a string look like an e-mail address?
2074          *
2075          * This validates an email address using an HTML5 specification found at:
2076          * http://www.whatwg.org/html/states-of-the-type-attribute.html#valid-e-mail-address
2077          * Which as of 2011-01-24 says:
2078          *
2079          *   A valid e-mail address is a string that matches the ABNF production
2080          *   1*( atext / "." ) "@" ldh-str *( "." ldh-str ) where atext is defined
2081          *   in RFC 5322 section 3.2.3, and ldh-str is defined in RFC 1034 section
2082          *   3.5.
2083          *
2084          * This function is an implementation of the specification as requested in
2085          * T24449.
2086          *
2087          * Client-side forms will use the same standard validation rules via JS or
2088          * HTML 5 validation; additional restrictions can be enforced server-side
2089          * by extensions via the 'isValidEmailAddr' hook.
2090          *
2091          * Note that this validation doesn't 100% match RFC 2822, but is believed
2092          * to be liberal enough for wide use. Some invalid addresses will still
2093          * pass validation here.
2094          *
2095          * @since 1.18
2096          *
2097          * @param string $addr E-mail address
2098          * @return bool
2099          */
2100         public static function validateEmail( $addr ) {
2101                 $result = null;
2102                 if ( !Hooks::run( 'isValidEmailAddr', [ $addr, &$result ] ) ) {
2103                         return $result;
2104                 }
2105
2106                 // Please note strings below are enclosed in brackets [], this make the
2107                 // hyphen "-" a range indicator. Hence it is double backslashed below.
2108                 // See T28948
2109                 $rfc5322_atext = "a-z0-9!#$%&'*+\\-\/=?^_`{|}~";
2110                 $rfc1034_ldh_str = "a-z0-9\\-";
2111
2112                 $html5_email_regexp = "/
2113                 ^                      # start of string
2114                 [$rfc5322_atext\\.]+    # user part which is liberal :p
2115                 @                      # 'apostrophe'
2116                 [$rfc1034_ldh_str]+       # First domain part
2117                 (\\.[$rfc1034_ldh_str]+)*  # Following part prefixed with a dot
2118                 $                      # End of string
2119                 /ix"; // case Insensitive, eXtended
2120
2121                 return (bool)preg_match( $html5_email_regexp, $addr );
2122         }
2123 }