X-Git-Url: https://git.heureux-cyclage.org/?a=blobdiff_plain;f=includes%2FSanitizer.php;h=c1c8daf37affbb40f27444e115068ab392dee7c9;hb=56af4f4d3849aa29bdb3883ef090ded39ab32e39;hp=51c751cef324c42ff9c6a4da1882d1c04fdf987f;hpb=c771fc9c96aacb44b86ade5ecca68334c5d8213f;p=lhc%2Fweb%2Fwiklou.git diff --git a/includes/Sanitizer.php b/includes/Sanitizer.php index 51c751cef3..c1c8daf37a 100644 --- a/includes/Sanitizer.php +++ b/includes/Sanitizer.php @@ -28,7 +28,7 @@ * Sanitizer::normalizeCharReferences and Sanitizer::decodeCharReferences */ define( 'MW_CHAR_REFS_REGEX', - '/&([A-Za-z0-9]+); + '/&([A-Za-z0-9\x80-\xff]+); |&\#([0-9]+); |&\#x([0-9A-Za-z]+); |&\#X([0-9A-Za-z]+); @@ -315,7 +315,24 @@ $wgHtmlEntities = array( 'zwj' => 8205, 'zwnj' => 8204 ); +/** + * Character entity aliases accepted by MediaWiki + */ +global $wgHtmlEntityAliases; +$wgHtmlEntityAliases = array( + 'רלמ' => 'rlm', + 'رلم' => 'rlm', +); + + +/** + * XHTML sanitizer for MediaWiki + * @addtogroup Parser + */ class Sanitizer { + const NONE = 0; + const INITIAL_NONLETTER = 1; + /** * Cleans up HTML, removes dangerous tags and attributes, and * removes HTML comments @@ -325,8 +342,8 @@ class Sanitizer { * @param array $args for the processing callback * @return string */ - static function removeHTMLtags( $text, $processCallback = null, $args = array() ) { - global $wgUseTidy, $wgUserHtml; + static function removeHTMLtags( $text, $processCallback = null, $args = array(), $extratags = array() ) { + global $wgUseTidy; static $htmlpairs, $htmlsingle, $htmlsingleonly, $htmlnest, $tabletags, $htmllist, $listtags, $htmlsingleallowed, $htmlelements, $staticInitialised; @@ -334,40 +351,33 @@ class Sanitizer { wfProfileIn( __METHOD__ ); if ( !$staticInitialised ) { - if( $wgUserHtml ) { - $htmlpairs = array( # Tags that must be closed - 'b', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1', - 'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's', - 'strike', 'strong', 'tt', 'var', 'div', 'center', - 'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre', - 'ruby', 'rt' , 'rb' , 'rp', 'p', 'span', 'u' - ); - $htmlsingle = array( - 'br', 'hr', 'li', 'dt', 'dd' - ); - $htmlsingleonly = array( # Elements that cannot have close tags - 'br', 'hr' - ); - $htmlnest = array( # Tags that can be nested--?? - 'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul', - 'dl', 'font', 'big', 'small', 'sub', 'sup', 'span' - ); - $tabletags = array( # Can only appear inside table, we will close them - 'td', 'th', 'tr', - ); - $htmllist = array( # Tags used by list - 'ul','ol', - ); - $listtags = array( # Tags that can appear in a list - 'li', - ); - - } else { - $htmlpairs = array(); - $htmlsingle = array(); - $htmlnest = array(); - $tabletags = array(); - } + + $htmlpairs = array_merge( $extratags, array( # Tags that must be closed + 'b', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1', + 'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's', + 'strike', 'strong', 'tt', 'var', 'div', 'center', + 'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre', + 'ruby', 'rt' , 'rb' , 'rp', 'p', 'span', 'u' + ) ); + $htmlsingle = array( + 'br', 'hr', 'li', 'dt', 'dd' + ); + $htmlsingleonly = array( # Elements that cannot have close tags + 'br', 'hr' + ); + $htmlnest = array( # Tags that can be nested--?? + 'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul', + 'dl', 'font', 'big', 'small', 'sub', 'sup', 'span' + ); + $tabletags = array( # Can only appear inside table, we will close them + 'td', 'th', 'tr', + ); + $htmllist = array( # Tags used by list + 'ul','ol', + ); + $listtags = array( # Tags that can appear in a list + 'li', + ); $htmlsingleallowed = array_merge( $htmlsingle, $tabletags ); $htmlelements = array_merge( $htmlsingle, $htmlpairs, $htmlnest ); @@ -559,6 +569,7 @@ class Sanitizer { * * - Discards attributes not on a whitelist for the given element * - Unsafe style attributes are discarded + * - Invalid id attributes are reencoded * * @param array $attribs * @param string $element @@ -568,7 +579,27 @@ class Sanitizer { * @todo Check for unique id attribute :P */ static function validateTagAttributes( $attribs, $element ) { - $whitelist = array_flip( Sanitizer::attributeWhitelist( $element ) ); + return Sanitizer::validateAttributes( $attribs, + Sanitizer::attributeWhitelist( $element ) ); + } + + /** + * Take an array of attribute names and values and normalize or discard + * illegal values for the given whitelist. + * + * - Discards attributes not the given whitelist + * - Unsafe style attributes are discarded + * - Invalid id attributes are reencoded + * + * @param array $attribs + * @param array $whitelist list of allowed attribute names + * @return array + * + * @todo Check for legal values where the DTD limits things. + * @todo Check for unique id attribute :P + */ + static function validateAttributes( $attribs, $whitelist ) { + $whitelist = array_flip( $whitelist ); $out = array(); foreach( $attribs as $attribute => $value ) { if( !isset( $whitelist[$attribute] ) ) { @@ -594,6 +625,33 @@ class Sanitizer { return $out; } + /** + * Merge two sets of HTML attributes. + * Conflicting items in the second set will override those + * in the first, except for 'class' attributes which will be + * combined. + * + * @todo implement merging for other attributes such as style + * @param array $a + * @param array $b + * @return array + */ + static function mergeAttributes( $a, $b ) { + $out = array_merge( $a, $b ); + if( isset( $a['class'] ) + && isset( $b['class'] ) + && $a['class'] !== $b['class'] ) { + + $out['class'] = implode( ' ', + array_unique( + preg_split( '/\s+/', + $a['class'] . ' ' . $b['class'], + -1, + PREG_SPLIT_NO_EMPTY ) ) ); + } + return $out; + } + /** * Pick apart some CSS and check it for forbidden or unsafe structures. * Returns a sanitized string, or false if it was just too evil. @@ -615,7 +673,7 @@ class Sanitizer { $stripped = preg_replace( '!\\\\([0-9A-Fa-f]{1,6})[ \\n\\r\\t\\f]?!e', 'codepointToUtf8(hexdec("$1"))', $stripped ); $stripped = str_replace( '\\', '', $stripped ); - if( preg_match( '/(expression|tps*:\/\/|url\\s*\().*/is', + if( preg_match( '/(?:expression|tps*:\/\/|url\\s*\().*/is', $stripped ) ) { # haxx0r return false; @@ -667,7 +725,7 @@ class Sanitizer { * @return HTML-encoded text fragment */ static function encodeAttribute( $text ) { - $encValue = htmlspecialchars( $text ); + $encValue = htmlspecialchars( $text, ENT_QUOTES ); // Whitespace is normalized during attribute decoding, // so if we've been passed non-spaces we must encode them @@ -718,27 +776,34 @@ class Sanitizer { * Given a value escape it so that it can be used in an id attribute and * return it, this does not validate the value however (see first link) * - * @link http://www.w3.org/TR/html401/types.html#type-name Valid characters + * @see http://www.w3.org/TR/html401/types.html#type-name Valid characters * in the id and * name attributes - * @link http://www.w3.org/TR/html401/struct/links.html#h-12.2.3 Anchors with the id attribute - * - * @bug 4461 + * @see http://www.w3.org/TR/html401/struct/links.html#h-12.2.3 Anchors with the id attribute * - * @static - * - * @param string $id + * @param string $id Id to validate + * @param int $flags Currently only two values: Sanitizer::INITIAL_NONLETTER + * (default) permits initial non-letter characters, + * such as if you're adding a prefix to them. + * Sanitizer::NONE will prepend an 'x' if the id + * would otherwise start with a nonletter. * @return string */ - static function escapeId( $id ) { + static function escapeId( $id, $flags = Sanitizer::INITIAL_NONLETTER ) { static $replace = array( '%3A' => ':', '%' => '.' ); $id = urlencode( Sanitizer::decodeCharReferences( strtr( $id, ' ', '_' ) ) ); - - return str_replace( array_keys( $replace ), array_values( $replace ), $id ); + $id = str_replace( array_keys( $replace ), array_values( $replace ), $id ); + + if( ~$flags & Sanitizer::INITIAL_NONLETTER + && !preg_match( '/[a-zA-Z]/', $id[0] ) ) { + // Initial character must be a letter! + $id = "x$id"; + } + return $id; } /** @@ -747,7 +812,7 @@ class Sanitizer { * * @todo For extra validity, input should be validated UTF-8. * - * @link http://www.w3.org/TR/CSS21/syndata.html Valid characters/format + * @see http://www.w3.org/TR/CSS21/syndata.html Valid characters/format * * @param string $class * @return string @@ -852,11 +917,16 @@ class Sanitizer { */ private static function normalizeAttributeValue( $text ) { return str_replace( '"', '"', - preg_replace( - '/\r\n|[\x20\x0d\x0a\x09]/', - ' ', + self::normalizeWhitespace( Sanitizer::normalizeCharReferences( $text ) ) ); } + + private static function normalizeWhitespace( $text ) { + return preg_replace( + '/\r\n|[\x20\x0d\x0a\x09]/', + ' ', + $text ); + } /** * Ensure that any entities and character references are legal @@ -902,16 +972,19 @@ class Sanitizer { /** * If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD, - * return the named entity reference as is. Otherwise, returns - * HTML-escaped text of pseudo-entity source (eg &foo;) + * return the named entity reference as is. If the entity is a + * MediaWiki-specific alias, returns the HTML equivalent. Otherwise, + * returns HTML-escaped text of pseudo-entity source (eg &foo;) * * @param string $name * @return string * @static */ static function normalizeEntity( $name ) { - global $wgHtmlEntities; - if( isset( $wgHtmlEntities[$name] ) ) { + global $wgHtmlEntities, $wgHtmlEntityAliases; + if ( isset( $wgHtmlEntityAliases[$name] ) ) { + return "&{$wgHtmlEntityAliases[$name]};"; + } elseif( isset( $wgHtmlEntities[$name] ) ) { return "&$name;"; } else { return "&$name;"; @@ -1008,7 +1081,10 @@ class Sanitizer { * @return string */ static function decodeEntity( $name ) { - global $wgHtmlEntities; + global $wgHtmlEntities, $wgHtmlEntityAliases; + if ( isset( $wgHtmlEntityAliases[$name] ) ) { + $name = $wgHtmlEntityAliases[$name]; + } if( isset( $wgHtmlEntities[$name] ) ) { return codepointToUtf8( $wgHtmlEntities[$name] ); } else { @@ -1143,6 +1219,11 @@ class Sanitizer { # 11.2.6 'td' => array_merge( $common, $tablecell, $tablealign ), 'th' => array_merge( $common, $tablecell, $tablealign ), + + # 13.2 + # Not usually allowed, but may be used for extension-style hooks + # such as when it is rasterized + 'img' => array_merge( $common, array( 'alt' ) ), # 15.2.1 'tt' => $common, @@ -1169,14 +1250,21 @@ class Sanitizer { 'rb' => $common, 'rt' => $common, #array_merge( $common, array( 'rbspan' ) ), 'rp' => $common, + + # MathML root element, where used for extensions + # 'title' may not be 100% valid here; it's XHTML + # http://www.w3.org/TR/REC-MathML/ + 'math' => array( 'class', 'style', 'id', 'title' ), ); return $whitelist; } /** * Take a fragment of (potentially invalid) HTML and return - * a version with any tags removed, encoded suitably for literal - * inclusion in an attribute value. + * a version with any tags removed, encoded as plain text. + * + * Warning: this return value must be further escaped for literal + * inclusion in HTML output as of 1.10! * * @param string $text HTML fragment * @return string @@ -1186,14 +1274,8 @@ class Sanitizer { $text = StringUtils::delimiterReplace( '<', '>', '', $text ); # Normalize &entities and whitespace - $text = Sanitizer::normalizeAttributeValue( $text ); - - # Will be placed into "double-quoted" attributes, - # make sure remaining bits are safe. - $text = str_replace( - array('<', '>', '"'), - array('<', '>', '"'), - $text ); + $text = self::decodeCharReferences( $text ); + $text = self::normalizeWhitespace( $text ); return $text; } @@ -1262,4 +1344,4 @@ class Sanitizer { } -?> +