et al
 * https://www.mediawiki.org/
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 * http://www.gnu.org/copyleft/gpl.html
 *
 * @file
 * @ingroup Parser
 */

use MediaWiki\MediaWikiServices;

/**
 * HTML sanitizer for MediaWiki
 * @ingroup Parser
 */
class Sanitizer {
	/**
	 * Regular expression to match various types of character references in
	 * Sanitizer::normalizeCharReferences and Sanitizer::decodeCharReferences
	 *
	 * Capture groups, one per alternative:
	 *  1. the name of a named reference (text between "&" and ";")
	 *  2. the digits of a decimal numeric reference
	 *  3. the digits of a hexadecimal numeric reference
	 *  4. a bare ampersand that starts none of the above
	 *
	 * The pattern uses the "x" modifier, so the spaces between the
	 * alternatives are not significant.
	 */
	const CHAR_REFS_REGEX = '/&([A-Za-z0-9\x80-\xff]+); |&\#([0-9]+); |&\#[xX]([0-9A-Fa-f]+); |(&)/x';

	/**
	 * Acceptable tag name charset from HTML5 parsing spec
	 * https://www.w3.org/TR/html5/syntax.html#tag-open-state
	 *
	 * Capture groups, in order: an optional leading slash, the tag name,
	 * the text between the tag name and the closing bracket, the closing
	 * ">" or "/>", and everything after it that contains no "<".
	 */
	const ELEMENT_BITS_REGEX = '!^(/?)([A-Za-z][^\t\n\v />\0]*+)([^>]*?)(/?>)([^<]*)$!';

	/**
	 * Blacklist for evil uris like javascript:
	 * WARNING: DO NOT use this in any place that actually requires blacklisting
	 * for security reasons. There are NUMEROUS[1] ways to bypass blacklisting, the
	 * only way to be secure from javascript: uri based xss vectors is to whitelist
	 * things that you know are safe and deny everything else.
	 * [1]: http://ha.ckers.org/xss.html
	 */
	const EVIL_URI_PATTERN = '!(^|\s|\*/\s*)(javascript|vbscript)([^\w]|$)!i';

	/**
	 * Regular expression matching a whole attribute name of the form
	 * "xmlns:" followed by one or more of the characters : A-Z _ a-z - . 0-9
	 */
	const XMLNS_ATTRIBUTE_PATTERN = "/^xmlns:[:A-Z_a-z-.0-9]+$/";

	/**
	 * Tells escapeUrlForHtml() to encode the ID using the wiki's primary encoding.
	 *
	 * @since 1.30
	 */
	const ID_PRIMARY = 0;

	/**
	 * Tells escapeUrlForHtml() to encode the ID using the fallback encoding, or return false
	 * if no fallback is configured.
	 *
	 * @since 1.30
	 */
	const ID_FALLBACK = 1;

	/**
	 * List of all named character entities defined in HTML 4.01
	 * https://www.w3.org/TR/html4/sgml/entities.html
	 * As well as &apos; which is only defined starting in XHTML1.
	 *
	 * Maps each entity name (case-sensitive, without the surrounding "&" and
	 * ";") to the decimal code point of the character it stands for.
	 */
	private static $htmlEntities = [
		'Aacute' => 193, 'aacute' => 225, 'Acirc' => 194, 'acirc' => 226,
		'acute' => 180, 'AElig' => 198, 'aelig' => 230, 'Agrave' => 192,
		'agrave' => 224, 'alefsym' => 8501, 'Alpha' => 913, 'alpha' => 945,
		'amp' => 38, 'and' => 8743, 'ang' => 8736,
		'apos' => 39, // New in XHTML & HTML 5; avoid in output for compatibility with IE.
		'Aring' => 197, 'aring' => 229, 'asymp' => 8776, 'Atilde' => 195,
		'atilde' => 227, 'Auml' => 196, 'auml' => 228, 'bdquo' => 8222,
		'Beta' => 914, 'beta' => 946, 'brvbar' => 166, 'bull' => 8226,
		'cap' => 8745, 'Ccedil' => 199, 'ccedil' => 231, 'cedil' => 184,
		'cent' => 162, 'Chi' => 935, 'chi' => 967, 'circ' => 710,
		'clubs' => 9827, 'cong' => 8773, 'copy' => 169, 'crarr' => 8629,
		'cup' => 8746, 'curren' => 164, 'dagger' => 8224, 'Dagger' => 8225,
		'darr' => 8595, 'dArr' => 8659, 'deg' => 176, 'Delta' => 916,
		'delta' => 948, 'diams' => 9830, 'divide' => 247, 'Eacute' => 201,
		'eacute' => 233, 'Ecirc' => 202, 'ecirc' => 234, 'Egrave' => 200,
		'egrave' => 232, 'empty' => 8709, 'emsp' => 8195, 'ensp' => 8194,
		'Epsilon' => 917, 'epsilon' => 949, 'equiv' => 8801, 'Eta' => 919,
		'eta' => 951, 'ETH' => 208, 'eth' => 240, 'Euml' => 203,
		'euml' => 235, 'euro' => 8364, 'exist' => 8707, 'fnof' => 402,
		'forall' => 8704, 'frac12' => 189, 'frac14' => 188, 'frac34' => 190,
		'frasl' => 8260, 'Gamma' => 915, 'gamma' => 947, 'ge' => 8805,
		'gt' => 62, 'harr' => 8596, 'hArr' => 8660, 'hearts' => 9829,
		'hellip' => 8230, 'Iacute' => 205, 'iacute' => 237, 'Icirc' => 206,
		'icirc' => 238, 'iexcl' => 161, 'Igrave' => 204, 'igrave' => 236,
		'image' => 8465, 'infin' => 8734,
		'int' => 8747, 'Iota' => 921, 'iota' => 953, 'iquest' => 191,
		'isin' => 8712, 'Iuml' => 207, 'iuml' => 239, 'Kappa' => 922,
		'kappa' => 954, 'Lambda' => 923, 'lambda' => 955, 'lang' => 9001,
		'laquo' => 171, 'larr' => 8592, 'lArr' => 8656, 'lceil' => 8968,
		'ldquo' => 8220, 'le' => 8804, 'lfloor' => 8970, 'lowast' => 8727,
		'loz' => 9674, 'lrm' => 8206, 'lsaquo' => 8249, 'lsquo' => 8216,
		'lt' => 60, 'macr' => 175, 'mdash' => 8212, 'micro' => 181,
		'middot' => 183, 'minus' => 8722, 'Mu' => 924, 'mu' => 956,
		'nabla' => 8711, 'nbsp' => 160, 'ndash' => 8211, 'ne' => 8800,
		'ni' => 8715, 'not' => 172, 'notin' => 8713, 'nsub' => 8836,
		'Ntilde' => 209, 'ntilde' => 241, 'Nu' => 925, 'nu' => 957,
		'Oacute' => 211, 'oacute' => 243, 'Ocirc' => 212, 'ocirc' => 244,
		'OElig' => 338, 'oelig' => 339, 'Ograve' => 210, 'ograve' => 242,
		'oline' => 8254, 'Omega' => 937, 'omega' => 969, 'Omicron' => 927,
		'omicron' => 959, 'oplus' => 8853, 'or' => 8744, 'ordf' => 170,
		'ordm' => 186, 'Oslash' => 216, 'oslash' => 248, 'Otilde' => 213,
		'otilde' => 245, 'otimes' => 8855, 'Ouml' => 214, 'ouml' => 246,
		'para' => 182, 'part' => 8706, 'permil' => 8240, 'perp' => 8869,
		'Phi' => 934, 'phi' => 966, 'Pi' => 928, 'pi' => 960,
		'piv' => 982, 'plusmn' => 177, 'pound' => 163, 'prime' => 8242,
		'Prime' => 8243, 'prod' => 8719, 'prop' => 8733, 'Psi' => 936,
		'psi' => 968, 'quot' => 34, 'radic' => 8730, 'rang' => 9002,
		'raquo' => 187, 'rarr' => 8594, 'rArr' => 8658, 'rceil' => 8969,
		'rdquo' => 8221, 'real' => 8476, 'reg' => 174, 'rfloor' => 8971,
		'Rho' => 929, 'rho' => 961, 'rlm' => 8207, 'rsaquo' => 8250,
		'rsquo' => 8217, 'sbquo' => 8218, 'Scaron' => 352, 'scaron' => 353,
		'sdot' => 8901, 'sect' => 167, 'shy' => 173, 'Sigma' => 931,
		'sigma' => 963, 'sigmaf' => 962, 'sim' => 8764, 'spades' => 9824,
		'sub' => 8834, 'sube' => 8838, 'sum' => 8721, 'sup' => 8835,
		'sup1' => 185, 'sup2' => 178, 'sup3' => 179, 'supe' => 8839,
		'szlig' => 223, 'Tau' => 932, 'tau' => 964, 'there4' => 8756,
		'Theta' => 920, 'theta' => 952,
'thetasym' => 977, 'thinsp' => 8201, 'THORN' => 222, 'thorn' => 254,
		'tilde' => 732, 'times' => 215, 'trade' => 8482, 'Uacute' => 218,
		'uacute' => 250, 'uarr' => 8593, 'uArr' => 8657, 'Ucirc' => 219,
		'ucirc' => 251, 'Ugrave' => 217, 'ugrave' => 249, 'uml' => 168,
		'upsih' => 978, 'Upsilon' => 933, 'upsilon' => 965, 'Uuml' => 220,
		'uuml' => 252, 'weierp' => 8472, 'Xi' => 926, 'xi' => 958,
		'Yacute' => 221, 'yacute' => 253, 'yen' => 165, 'Yuml' => 376,
		'yuml' => 255, 'Zeta' => 918, 'zeta' => 950, 'zwj' => 8205,
		'zwnj' => 8204
	];

	/**
	 * Character entity aliases accepted by MediaWiki
	 */
	private static $htmlEntityAliases = [
		'רלמ' => 'rlm',
		'رلم' => 'rlm',
	];

	/**
	 * Lazy-initialised attributes regex, see getAttribsRegex()
	 */
	private static $attribsRegex;

	/**
	 * Regular expression to match HTML/XML attribute pairs within a tag.
	 * Based on https://www.w3.org/TR/html5/syntax.html#before-attribute-name-state
	 * Used in Sanitizer::decodeTagAttributes
	 *
	 * The pattern is built on first use and cached in self::$attribsRegex.
	 * Capture groups:
	 *  1. the attribute name
	 *  2. the optional "= value" part, including surrounding whitespace
	 *  3. the value when it is double-quoted (quotes excluded)
	 *  4. the value when it is single-quoted (quotes excluded)
	 *  5. the value when it is unquoted
	 *
	 * @return string
	 */
	static function getAttribsRegex() {
		if ( self::$attribsRegex === null ) {
			$spaceChars = '\x09\x0a\x0c\x0d\x20';
			$space = "[{$spaceChars}]";
			$attrib = "[^{$spaceChars}\/>=]";
			$attribFirst = "(?:{$attrib}|=)";

			// The pattern is assembled piecewise and carries no in-pattern
			// "#" comment: under the "x" modifier such a comment extends to
			// the next newline, so on a single line it would swallow the
			// value alternatives and leave the groups unbalanced.
			self::$attribsRegex =
				"/({$attribFirst}{$attrib}*)" .
				"({$space}*={$space}*" .
					"(?:" .
						// The attribute value: quoted or alone
						"\"([^\"]*)(?:\"|\$)" .
						"|'([^']*)(?:'|\$)" .
						"|(((?!{$space}|>).)*)" .
					")" .
				")?/sxu";
		}
		return self::$attribsRegex;
	}

	/**
	 * Lazy-initialised attribute name regex, see getAttribNameRegex()
	 */
	private static $attribNameRegex;

	/**
	 * Used in Sanitizer::decodeTagAttributes to filter attributes.
* @return string Pattern that matches a string consisting solely of one attribute name
	 */
	static function getAttribNameRegex() {
		// Built once and cached in self::$attribNameRegex.
		if ( self::$attribNameRegex === null ) {
			// First character: colon, underscore, or any Unicode letter or number.
			$attribFirst = "[:_\p{L}\p{N}]";
			// Later characters may additionally be a dot or a hyphen.
			$attrib = "[:_\.\-\p{L}\p{N}]";
			self::$attribNameRegex = "/^({$attribFirst}{$attrib}*)$/sxu";
		}
		return self::$attribNameRegex;
	}

	/**
	 * Return the various lists of recognized tags
	 *
	 * Every list in the result is a hashtable keyed by tag name, so callers
	 * test membership with isset(). The built-in lists are computed once and
	 * cached in function-static variables; they are rebuilt only when the
	 * truthiness of $wgAllowImageTag differs from the one they were built for.
	 *
	 * @param array $extratags For any extra tags to include
	 * @param array $removetags For any tags (default or extra) to exclude
	 * @return array Map with the keys 'htmlpairs', 'htmlsingle', 'htmlsingleonly',
	 *   'htmlnest', 'tabletags', 'htmllist', 'listtags', 'htmlsingleallowed'
	 *   and 'htmlelements'
	 */
	public static function getRecognizedTagData( $extratags = [], $removetags = [] ) {
		global $wgAllowImageTag;

		static $htmlpairsStatic, $htmlsingle, $htmlsingleonly, $htmlnest, $tabletags,
			$htmllist, $listtags, $htmlsingleallowed, $htmlelementsStatic, $staticInitialised;

		// Base our staticInitialised variable off of the global config state so that if the globals
		// are changed (like in the screwed up test system) we will re-initialise the settings.
		// The marker starts out as null and afterwards holds a strict boolean, so a
		// disabled image tag (false) is still recognised as "already initialised"
		// instead of forcing a rebuild on every call.
		$globalContext = (bool)$wgAllowImageTag;
		if ( $staticInitialised !== $globalContext ) {
			$htmlpairsStatic = [ # Tags that must be closed
				'b', 'bdi', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
				'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
				'strike', 'strong', 'tt', 'var', 'div', 'center',
				'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
				'ruby', 'rb', 'rp', 'rt', 'rtc', 'p', 'span', 'abbr', 'dfn',
				'kbd', 'samp', 'data', 'time', 'mark'
			];
			$htmlsingle = [
				'br', 'wbr', 'hr', 'li', 'dt', 'dd', 'meta', 'link'
			];

			# Elements that cannot have close tags. This is (not coincidentally)
			# also the list of tags for which the HTML 5 parsing algorithm
			# requires you to "acknowledge the token's self-closing flag", i.e.
			# a self-closing tag like <br/> is not an HTML 5 parse error only
			# for this list.
			$htmlsingleonly = [
				'br', 'wbr', 'hr', 'meta', 'link'
			];

			$htmlnest = [ # Tags that can be nested--??
				'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
				'li', 'dl', 'dt', 'dd', 'font', 'big', 'small', 'sub', 'sup', 'span',
				'var', 'kbd', 'samp', 'em', 'strong', 'q', 'ruby', 'bdo'
			];
			$tabletags = [ # Can only appear inside table, we will close them
				'td', 'th', 'tr',
			];
			$htmllist = [ # Tags used by list
				'ul', 'ol',
			];
			$listtags = [ # Tags that can appear in a list
				'li',
			];

			if ( $wgAllowImageTag ) {
				$htmlsingle[] = 'img';
				$htmlsingleonly[] = 'img';
			}

			$htmlsingleallowed = array_unique( array_merge( $htmlsingle, $tabletags ) );
			$htmlelementsStatic = array_unique( array_merge( $htmlsingle, $htmlpairsStatic, $htmlnest ) );

			# Convert them all to hashtables for faster lookup
			$vars = [ 'htmlpairsStatic', 'htmlsingle', 'htmlsingleonly', 'htmlnest', 'tabletags',
				'htmllist', 'listtags', 'htmlsingleallowed', 'htmlelementsStatic' ];
			foreach ( $vars as $var ) {
				// Variable-variable access: each name refers to one of the statics above.
				$$var = array_flip( $$var );
			}
			$staticInitialised = $globalContext;
		}

		# Populate $htmlpairs and $htmlelements with the $extratags and $removetags arrays
		$extratags = array_flip( $extratags );
		$removetags = array_flip( $removetags );
		$htmlpairs = array_merge( $extratags, $htmlpairsStatic );
		// Only the element list honours $removetags; the pair list does not.
		$htmlelements = array_diff_key( array_merge( $extratags, $htmlelementsStatic ), $removetags );

		return [
			'htmlpairs' => $htmlpairs,
			'htmlsingle' => $htmlsingle,
			'htmlsingleonly' => $htmlsingleonly,
			'htmlnest' => $htmlnest,
			'tabletags' => $tabletags,
			'htmllist' => $htmllist,
			'listtags' => $listtags,
			'htmlsingleallowed' => $htmlsingleallowed,
			'htmlelements' => $htmlelements,
		];
	}

	/**
	 * Cleans up HTML, removes dangerous tags and attributes, and
	 * removes HTML comments
	 * @param string $text
	 * @param callable|null $processCallback Callback to do any variable or parameter
	 *   replacements in HTML attribute values
	 * @param array|bool $args Arguments for the processing callback
	 * @param array
$extratags For any extra tags to include * @param array $removetags For any tags (default or extra) to exclude * @param callable|null $warnCallback (Deprecated) Callback allowing the * addition of a tracking category when bad input is encountered. * DO NOT ADD NEW PARAMETERS AFTER $warnCallback, since it will be * removed shortly. * @return string */ public static function removeHTMLtags( $text, $processCallback = null, $args = [], $extratags = [], $removetags = [], $warnCallback = null ) { $tagData = self::getRecognizedTagData( $extratags, $removetags ); $htmlpairs = $tagData['htmlpairs']; $htmlsingle = $tagData['htmlsingle']; $htmlsingleonly = $tagData['htmlsingleonly']; $htmlnest = $tagData['htmlnest']; $tabletags = $tagData['tabletags']; $htmllist = $tagData['htmllist']; $listtags = $tagData['listtags']; $htmlsingleallowed = $tagData['htmlsingleallowed']; $htmlelements = $tagData['htmlelements']; # Remove HTML comments $text = self::removeHTMLcomments( $text ); $bits = explode( '<', $text ); $text = str_replace( '>', '>', array_shift( $bits ) ); if ( !MWTidy::isEnabled() ) { wfDeprecated( 'disabling tidy', '1.33' ); $tagstack = $tablestack = []; foreach ( $bits as $x ) { $regs = []; # $slash: Does the current element start with a '/'? # $t: Current element name # $params: String between element name and > # $brace: Ending '>' or '/>' # $rest: Everything until the next element of $bits if ( preg_match( self::ELEMENT_BITS_REGEX, $x, $regs ) ) { list( /* $qbar */, $slash, $t, $params, $brace, $rest ) = $regs; } else { $slash = $t = $params = $brace = $rest = null; } $badtag = false; $t = strtolower( $t ); if ( isset( $htmlelements[$t] ) ) { # Check our stack if ( $slash && isset( $htmlsingleonly[$t] ) ) { $badtag = true; } elseif ( $slash ) { # Closing a tag... is it the one we just opened? 
Wikimedia\suppressWarnings(); $ot = array_pop( $tagstack ); Wikimedia\restoreWarnings(); if ( $ot != $t ) { if ( isset( $htmlsingleallowed[$ot] ) ) { # Pop all elements with an optional close tag # and see if we find a match below them $optstack = []; array_push( $optstack, $ot ); Wikimedia\suppressWarnings(); $ot = array_pop( $tagstack ); Wikimedia\restoreWarnings(); while ( $ot != $t && isset( $htmlsingleallowed[$ot] ) ) { array_push( $optstack, $ot ); Wikimedia\suppressWarnings(); $ot = array_pop( $tagstack ); Wikimedia\restoreWarnings(); } if ( $t != $ot ) { # No match. Push the optional elements back again $badtag = true; Wikimedia\suppressWarnings(); $ot = array_pop( $optstack ); Wikimedia\restoreWarnings(); while ( $ot ) { array_push( $tagstack, $ot ); Wikimedia\suppressWarnings(); $ot = array_pop( $optstack ); Wikimedia\restoreWarnings(); } } } else { Wikimedia\suppressWarnings(); array_push( $tagstack, $ot ); Wikimedia\restoreWarnings(); #
  • can be nested in