X-Git-Url: https://git.heureux-cyclage.org/?a=blobdiff_plain;f=includes%2FSanitizer.php;h=00837fdadf5a105b1290dfbf2706367606ede627;hb=0ccd98bdb63c97862d0d7096992d471f4db8a19d;hp=8f45cfd11988e6abb3f1824b3c06ad685312f2bc;hpb=f8879bfd2bef6962fe02be812df2cd657ac73e17;p=lhc%2Fweb%2Fwiklou.git

diff --git a/includes/Sanitizer.php b/includes/Sanitizer.php
index 8f45cfd119..00837fdadf 100644
--- a/includes/Sanitizer.php
+++ b/includes/Sanitizer.php
@@ -389,6 +389,12 @@ class Sanitizer {
 				'li',
 			);
 
+			global $wgAllowImageTag;
+			if ( $wgAllowImageTag ) {
+				$htmlsingle[] = 'img';
+				$htmlsingleonly[] = 'img';
+			}
+
 			$htmlsingleallowed = array_unique( array_merge( $htmlsingle, $tabletags ) );
 			$htmlelementsStatic = array_unique( array_merge( $htmlsingle, $htmlpairsStatic, $htmlnest ) );
 
@@ -644,10 +650,6 @@ class Sanitizer {
 			# http://msdn.microsoft.com/workshop/author/dhtml/overview/recalc.asp
 			if( $attribute == 'style' ) {
 				$value = Sanitizer::checkCss( $value );
-				if( $value === false ) {
-					# haxx0r
-					continue;
-				}
 			}
 
 			if ( $attribute === 'id' ) {
@@ -738,26 +740,118 @@ class Sanitizer {
 	 * @return Mixed
 	 */
 	static function checkCss( $value ) {
-		$stripped = Sanitizer::decodeCharReferences( $value );
+		$value = Sanitizer::decodeCharReferences( $value );
 
 		// Remove any comments; IE gets token splitting wrong
-		$stripped = StringUtils::delimiterReplace( '/*', '*/', ' ', $stripped );
-
-		$value = $stripped;
-
-		// ... and continue checks
-		$stripped = preg_replace( '!\\\\([0-9A-Fa-f]{1,6})[ \\n\\r\\t\\f]?!e',
-			'codepointToUtf8(hexdec("$1"))', $stripped );
-		$stripped = str_replace( '\\', '', $stripped );
-		if( preg_match( '/(?:expression|tps*:\/\/|url\\s*\().*/is',
-				$stripped ) ) {
-			# haxx0r
-			return false;
+		$value = StringUtils::delimiterReplace( '/*', '*/', ' ', $value );
+
+		// Decode escape sequences and line continuation
+		// See the grammar in the CSS 2 spec, appendix D.
+		static $decodeRegex;
+		if ( !$decodeRegex ) {
+			$space = '[\\x20\\t\\r\\n\\f]';
+			$nl = '(?:\\n|\\r\\n|\\r|\\f)';
+			$backslash = '\\\\';
+			$decodeRegex = "/ $backslash
+				(?:
+					($nl) | # 1. Line continuation
+					([0-9A-Fa-f]{1,6})$space? | # 2. character number
+					(.) | # 3. backslash cancelling special meaning
+					() | # 4. backslash at end of string
+				)/xu";
+		}
+		$value = preg_replace_callback( $decodeRegex,
+			array( __CLASS__, 'cssDecodeCallback' ), $value );
+
+		// Reject problematic keywords and control characters
+		if ( preg_match( '/[\000-\010\016-\037\177]/', $value ) ) {
+			return '/* invalid control char */';
+		} elseif ( preg_match( '! expression | filter\s*: | accelerator\s*: | url\s*\( !ix', $value ) ) {
+			return '/* insecure input */';
 		}
-
 		return $value;
 	}
 
+	/**
+	 * Callback for preg_replace_callback() in checkCss(): decode one CSS
+	 * backslash escape matched by $decodeRegex.
+	 *
+	 * @param $matches Array: regex match; groups 1-3 are line continuation,
+	 *   hex character number and escaped literal character respectively
+	 * @return String: the decoded character, or a normalized hex escape for
+	 *   characters that must stay escaped inside CSS strings
+	 */
+	static function cssDecodeCallback( $matches ) {
+		if ( $matches[1] !== '' ) {
+			// Line continuation
+			return '';
+		} elseif ( $matches[2] !== '' ) {
+			$char = codepointToUtf8( hexdec( $matches[2] ) );
+		} elseif ( $matches[3] !== '' ) {
+			$char = $matches[3];
+		} else {
+			$char = '\\';
+		}
+		if ( $char == "\n" || $char == '"' || $char == "'" || $char == '\\' ) {
+			// These characters need to be escaped in strings
+			// Clean up the escape sequence to avoid parsing errors by clients
+			return '\\' . dechex( ord( $char ) ) . ' ';
+		} else {
+			// Decode unnecessary escape
+			return $char;
+		}
+	}
+
+	/**
+	 * Take an associative array of attribute name/value pairs
+	 * and generate a css style representing all the style-related
+	 * attributes. If there is already a style attribute in the array,
+	 * it is also included in the value returned.
+	 *
+	 * @param $attributes Array: associative array of attribute name/value pairs
+	 * @return String: CSS declarations joined with '; ', or '' if there are none
+	 */
+	static function styleFromAttributes( $attributes ) {
+		$styles = array();
+
+		foreach ( $attributes as $attribute => $value ) {
+			if ( $attribute == 'bgcolor' ) {
+				$styles[] = "background-color: $value";
+			} else if ( $attribute == 'border' ) {
+				$styles[] = "border-width: $value";
+			} else if ( $attribute == 'align' ) {
+				$styles[] = "text-align: $value";
+			} else if ( $attribute == 'valign' ) {
+				$styles[] = "vertical-align: $value";
+			} else if ( $attribute == 'width' ) {
+				// HTML allows a bare pixel count; CSS lengths need an explicit unit
+				if ( preg_match( '/^\d+$/', $value ) ) {
+					$value .= 'px';
+				}
+
+				$styles[] = "width: $value";
+			} else if ( $attribute == 'height' ) {
+				// Same unit fix-up as for width
+				if ( preg_match( '/^\d+$/', $value ) ) {
+					$value .= 'px';
+				}
+
+				$styles[] = "height: $value";
+			} else if ( $attribute == 'nowrap' ) {
+				if ( $value ) {
+					$styles[] = "white-space: nowrap";
+				}
+			}
+		}
+
+		if ( isset( $attributes[ 'style' ] ) ) {
+			$styles[] = $attributes[ 'style' ];
+		}
+
+		if ( !$styles ) return '';
+		else return implode( '; ', $styles );
+	}
+
 	/**
 	 * Take a tag soup fragment listing an HTML element's attributes
 	 * and normalize it to well-formed XML, discarding unwanted attributes.
@@ -775,24 +869,66 @@ class Sanitizer {
 	 *
 	 * @param $text String
 	 * @param $element String
+	 * @param $defaults Array (optional) associative array of default attributes to splice in.
+	 * 	class and style attributes are combined. Otherwise, values from
+	 * 	$attributes take precedence over values from $defaults.
 	 * @return String
 	 */
-	static function fixTagAttributes( $text, $element ) {
+	static function fixTagAttributes( $text, $element, $defaults = null ) {
 		if( trim( $text ) == '' ) {
 			return '';
 		}
 
-		$stripped = Sanitizer::validateTagAttributes(
-			Sanitizer::decodeTagAttributes( $text ), $element );
+		$decoded = Sanitizer::decodeTagAttributes( $text );
+		$stripped = Sanitizer::validateTagAttributes( $decoded, $element );
+		$attribs = Sanitizer::collapseTagAttributes( $stripped, $defaults );
 
-		$attribs = array();
-		foreach( $stripped as $attribute => $value ) {
+		return $attribs;
+	}
+
+	/**
+	 * Take an associative array of attribute name/value pairs
+	 * and collapse it to well-formed XML.
+	 * Does not filter attributes.
+	 * Output is safe for further wikitext processing, with escaping of
+	 * values that could trigger problems.
+	 *
+	 * - Double-quotes all attribute values
+	 * - Prepends space if there are attributes.
+	 *
+	 * @param $attributes Array is an associative array of attribute name/value pairs.
+	 * 	Assumed to be sanitized already.
+	 * @param $defaults Array (optional) associative array of default attributes to splice in.
+	 * 	class and style attributes are combined. Otherwise, values from
+	 * 	$attributes take precedence over values from $defaults.
+	 * @return String
+	 */
+	static function collapseTagAttributes( $attributes, $defaults = null ) {
+		if ( $defaults ) {
+			foreach( $defaults as $attribute => $value ) {
+				if ( isset( $attributes[ $attribute ] ) ) {
+					if ( $attribute == 'class' ) {
+						$value .= ' '. $attributes[ $attribute ];
+					} else if ( $attribute == 'style' ) {
+						$value .= '; ' . $attributes[ $attribute ];
+					} else {
+						continue;
+					}
+				}
+
+				$attributes[ $attribute ] = $value;
+			}
+		}
+
+		$chunks = array();
+
+		foreach( $attributes as $attribute => $value ) {
 			$encAttribute = htmlspecialchars( $attribute );
 			$encValue = Sanitizer::safeEncodeAttribute( $value );
 
-			$attribs[] = "$encAttribute=\"$encValue\"";
+			$chunks[] = "$encAttribute=\"$encValue\"";
 		}
-		return count( $attribs ) ? ' ' . implode( ' ', $attribs ) : '';
+		return count( $chunks ) ? ' ' . implode( ' ', $chunks ) : '';
 	}
 
 	/**
@@ -857,6 +993,7 @@ class Sanitizer {
 	 *
 	 * To ensure we don't have to bother escaping anything, we also strip ', ",
 	 * & even if $wgExperimentalIds is true. TODO: Is this the best tactic?
+	 * We also strip # because it upsets IE6.
 	 *
 	 * @see http://www.w3.org/TR/html401/types.html#type-name Valid characters
 	 * in the id and
@@ -877,11 +1014,12 @@ class Sanitizer {
 	 * @return String
 	 */
 	static function escapeId( $id, $options = array() ) {
-		global $wgExperimentalHtmlIds;
+		global $wgHtml5, $wgExperimentalHtmlIds;
 		$options = (array)$options;
 
-		if ( $wgExperimentalHtmlIds && !in_array( 'legacy', $options ) ) {
-			$id = preg_replace( '/[ \t\n\r\f_\'"&]+/', '_', $id );
+		if ( $wgHtml5 && $wgExperimentalHtmlIds && !in_array( 'legacy', $options ) ) {
+			$id = Sanitizer::decodeCharReferences( $id );
+			$id = preg_replace( '/[ \t\n\r\f_\'"&#]+/', '_', $id );
 			$id = trim( $id, '_' );
 			if ( $id === '' ) {
 				# Must have been all whitespace to start with.
@@ -929,7 +1067,7 @@ class Sanitizer {
 
 	/**
 	 * Given HTML input, escape with htmlspecialchars but un-escape entites.
-	 * This allows (generally harmless) entities like &nbsp; to survive.
+	 * This allows (generally harmless) entities like &#160; to survive.
 	 *
 	 * @param $html String to escape
 	 * @return String: escaped input
@@ -1150,6 +1288,30 @@ class Sanitizer {
 			$text );
 	}
 
+	/**
+	 * Decode any character references, numeric or named entities,
+	 * in the text and normalize the resulting string. (bug 14952)
+	 *
+	 * This is useful for page titles, not for text to be displayed,
+	 * MediaWiki allows HTML entities to escape normalization as a feature.
+	 *
+	 * @param $text String (already normalized, containing entities)
+	 * @return String (still normalized, without entities)
+	 */
+	public static function decodeCharReferencesAndNormalize( $text ) {
+		global $wgContLang;
+		$text = preg_replace_callback(
+			MW_CHAR_REFS_REGEX,
+			array( 'Sanitizer', 'decodeCharReferencesCallback' ),
+			$text, /* limit */ -1, $count );
+
+		if ( $count ) {
+			return $wgContLang->normalize( $text );
+		} else {
+			return $text;
+		}
+	}
+
 	/**
 	 * @param $matches String
 	 * @return String
@@ -1227,7 +1389,7 @@ class Sanitizer {
 	static function setupAttributeWhitelist() {
 		global $wgAllowRdfaAttributes, $wgHtml5, $wgAllowMicrodataAttributes;
 
-		$common = array( 'id', 'class', 'lang', 'dir', 'title', 'style', 'xml:lang' );
+		$common = array( 'id', 'class', 'lang', 'dir', 'title', 'style' );
 
 		if ( $wgAllowRdfaAttributes ) {
 			#RDFa attributes as specified in section 9 of http://www.w3.org/TR/2008/REC-rdfa-syntax-20081014
@@ -1353,8 +1515,9 @@ class Sanitizer {
 
 			# 13.2
 			# Not usually allowed, but may be used for extension-style hooks
-			# such as <math> when it is rasterized
-			'img' => array_merge( $common, array( 'alt' ) ),
+			# such as <math> when it is rasterized, or if $wgAllowImageTag is
+			# true
+			'img' => array_merge( $common, array( 'alt', 'src', 'width', 'height' ) ),
 
 			# 15.2.1
 			'tt' => $common,
@@ -1436,7 +1599,7 @@ class Sanitizer {
 		$url = Sanitizer::decodeCharReferences( $url );
 
 		# Escape any control characters introduced by the above step
-		$url = preg_replace( '/[\][<>"\\x00-\\x20\\x7F]/e', "urlencode('\\0')", $url );
+		$url = preg_replace( '/[\][<>"\\x00-\\x20\\x7F\|]/e', "urlencode('\\0')", $url );
 
 		# Validate hostname portion
 		$matches = array();