X-Git-Url: https://git.heureux-cyclage.org/?a=blobdiff_plain;f=includes%2FSanitizer.php;h=00837fdadf5a105b1290dfbf2706367606ede627;hb=0ccd98bdb63c97862d0d7096992d471f4db8a19d;hp=8f45cfd11988e6abb3f1824b3c06ad685312f2bc;hpb=f8879bfd2bef6962fe02be812df2cd657ac73e17;p=lhc%2Fweb%2Fwiklou.git

diff --git a/includes/Sanitizer.php b/includes/Sanitizer.php
index 8f45cfd119..00837fdadf 100644
--- a/includes/Sanitizer.php
+++ b/includes/Sanitizer.php
@@ -389,6 +389,12 @@ class Sanitizer {
 				'li',
 			);
 
+			global $wgAllowImageTag;
+			if ( $wgAllowImageTag ) {
+				$htmlsingle[] = 'img';
+				$htmlsingleonly[] = 'img';
+			}
+
 			$htmlsingleallowed = array_unique( array_merge( $htmlsingle, $tabletags ) );
 			$htmlelementsStatic = array_unique( array_merge( $htmlsingle, $htmlpairsStatic, $htmlnest ) );
 
@@ -644,10 +650,6 @@ class Sanitizer {
 			# http://msdn.microsoft.com/workshop/author/dhtml/overview/recalc.asp
 			if( $attribute == 'style' ) {
 				$value = Sanitizer::checkCss( $value );
-				if( $value === false ) {
-					# haxx0r
-					continue;
-				}
 			}
 
 			if ( $attribute === 'id' ) {
@@ -738,26 +740,104 @@ class Sanitizer {
 	 * @return Mixed
 	 */
 	static function checkCss( $value ) {
-		$stripped = Sanitizer::decodeCharReferences( $value );
+		$value = Sanitizer::decodeCharReferences( $value );
 
 		// Remove any comments; IE gets token splitting wrong
-		$stripped = StringUtils::delimiterReplace( '/*', '*/', ' ', $stripped );
-
-		$value = $stripped;
-
-		// ... and continue checks
-		$stripped = preg_replace( '!\\\\([0-9A-Fa-f]{1,6})[ \\n\\r\\t\\f]?!e',
-			'codepointToUtf8(hexdec("$1"))', $stripped );
-		$stripped = str_replace( '\\', '', $stripped );
-		if( preg_match( '/(?:expression|tps*:\/\/|url\\s*\().*/is',
-				$stripped ) ) {
-			# haxx0r
-			return false;
+		$value = StringUtils::delimiterReplace( '/*', '*/', ' ', $value );
+
+		// Decode escape sequences and line continuation
+		// See the grammar in the CSS 2 spec, appendix D.
+		static $decodeRegex, $reencodeTable;
+		if ( !$decodeRegex ) {
+			$space = '[\\x20\\t\\r\\n\\f]';
+			$nl = '(?:\\n|\\r\\n|\\r|\\f)';
+			$backslash = '\\\\';
+			$decodeRegex = "/ $backslash 
+				(?:
+					($nl) |  # 1. Line continuation
+					([0-9A-Fa-f]{1,6})$space? |  # 2. character number
+					(.) | # 3. backslash cancelling special meaning
+					() | # 4. backslash at end of string
+				)/xu";
+		}
+		$value = preg_replace_callback( $decodeRegex,
+			array( __CLASS__, 'cssDecodeCallback' ), $value );
+
+		// Reject problematic keywords and control characters
+		if ( preg_match( '/[\000-\010\016-\037\177]/', $value ) ) {
+			return '/* invalid control char */';
+		} elseif ( preg_match( '! expression | filter\s*: | accelerator\s*: | url\s*\( !ix', $value ) ) {
+			return '/* insecure input */';
 		}
-
 		return $value;
 	}
 
+	/** preg_replace_callback() handler used by checkCss() on one CSS escape. */
+	static function cssDecodeCallback( $matches ) {
+		if ( $matches[1] !== '' ) {
+			// Line continuation: backslash plus newline vanish entirely
+			return '';
+		}
+		if ( $matches[2] !== '' ) {
+			$decoded = codepointToUtf8( hexdec( $matches[2] ) ); // hex escape
+		} elseif ( $matches[3] !== '' ) {
+			$decoded = $matches[3]; // backslash before an ordinary character
+		} else {
+			$decoded = '\\'; // lone backslash at the end of the string
+		}
+		// Newline, quotes and backslash must stay escaped inside CSS strings;
+		// re-emit them as a tidy hex escape so clients do not misparse them.
+		if ( $decoded == "\n" || $decoded == '"' || $decoded == "'" || $decoded == '\\' ) {
+			return '\\' . dechex( ord( $decoded ) ) . ' ';
+		}
+		return $decoded; // every other escape was unnecessary
+	}
+
+	/**
+	 * Build a CSS declaration list from presentational HTML attributes.
+	 *
+	 * Maps bgcolor, border, align, valign, width, height and nowrap to their
+	 * CSS equivalents. If there is already a style attribute in the array,
+	 * it is appended last. All-digit width/height values get a 'px' suffix.
+	 *
+	 * @param $attributes Array associative array of attribute name/value pairs
+	 * @return String declarations joined with '; ', or '' if there are none
+	 */
+	static function styleFromAttributes( $attributes ) {
+		$styles = array();
+
+		foreach ( $attributes as $attribute => $value ) {
+			if ( $attribute == 'bgcolor' ) {
+				$styles[] = "background-color: $value";
+			} else if ( $attribute == 'border' ) {
+				$styles[] = "border-width: $value";
+			} else if ( $attribute == 'align' ) {
+				$styles[] = "text-align: $value";
+			} else if ( $attribute == 'valign' ) {
+				$styles[] = "vertical-align: $value";
+			} else if ( $attribute == 'width' || $attribute == 'height' ) {
+				// preg_match() returns 0 (not false) when nothing matches, so
+				// test its truthiness. CSS lengths need a unit; a bare number
+				// in HTML means pixels. Values like '50%' are left untouched.
+				if ( preg_match( '/^\d+\z/', $value ) ) {
+					$value .= 'px';
+				}
+				$styles[] = "$attribute: $value";
+			} else if ( $attribute == 'nowrap' ) {
+				if ( $value ) {
+					$styles[] = "white-space: nowrap";
+				}
+			}
+		}
+
+		if ( isset( $attributes[ 'style' ] ) ) {
+			$styles[] = $attributes[ 'style' ];
+		}
+
+		// implode() of an empty array yields '', so no special case is needed.
+		return implode( '; ', $styles );
+	}
+
 	/**
 	 * Take a tag soup fragment listing an HTML element's attributes
 	 * and normalize it to well-formed XML, discarding unwanted attributes.
@@ -775,24 +855,66 @@ class Sanitizer {
 	 *
 	 * @param $text String
 	 * @param $element String
+	 * @param $defaults Array (optional) associative array of default attributes to splice in. 
+	 *			class and style attributes are combined. Otherwise, values from
+	 *			$attributes take precedence over values from $defaults.
 	 * @return String
 	 */
-	static function fixTagAttributes( $text, $element ) {
+	static function fixTagAttributes( $text, $element, $defaults = null ) {
 		if( trim( $text ) == '' ) {
 			return '';
 		}
 
-		$stripped = Sanitizer::validateTagAttributes(
-			Sanitizer::decodeTagAttributes( $text ), $element );
+		$decoded = Sanitizer::decodeTagAttributes( $text );
+		$stripped = Sanitizer::validateTagAttributes( $decoded, $element );
+		$attribs = Sanitizer::collapseTagAttributes( $stripped, $defaults );
 
-		$attribs = array();
-		foreach( $stripped as $attribute => $value ) {
+		return $attribs;
+	}
+
+	/**
+	 * Take an associative array of attribute name/value pairs
+	 * and collapse it to well-formed XML.
+	 * Does not filter attributes.
+	 * Output is safe for further wikitext processing, with escaping of
+	 * values that could trigger problems.
+	 *
+	 * - Double-quotes all attribute values
+	 * - Prepends space if there are attributes.
+	 *
+	 * @param $attributes Array is an associative array of attribute name/value pairs. 
+	 * 			Assumed to be sanitized already.
+	 * @param $defaults Array (optional) associative array of default attributes to splice in. 
+	 *			class and style attributes are combined. Otherwise, values from
+	 *			$attributes take precedence over values from $defaults.
+	 * @return String
+	 */
+	static function collapseTagAttributes( $attributes, $defaults = null ) {
+		if ( $defaults ) {
+			foreach( $defaults as $attribute => $value ) {
+				if ( isset( $attributes[ $attribute ] ) ) {
+					if ( $attribute == 'class' ) {
+						$value .= ' '. $attributes[ $attribute ];
+					} else if ( $attribute == 'style' ) {
+						$value .= '; ' . $attributes[ $attribute ];
+					} else {
+						continue;
+					}
+				}
+
+				$attributes[ $attribute ] = $value;
+			}
+		}
+
+		$chunks = array();
+
+		foreach( $attributes as $attribute => $value ) {
 			$encAttribute = htmlspecialchars( $attribute );
 			$encValue = Sanitizer::safeEncodeAttribute( $value );
 
-			$attribs[] = "$encAttribute=\"$encValue\"";
+			$chunks[] = "$encAttribute=\"$encValue\"";
 		}
-		return count( $attribs ) ? ' ' . implode( ' ', $attribs ) : '';
+		return count( $chunks ) ? ' ' . implode( ' ', $chunks ) : '';
 	}
 
 	/**
@@ -857,6 +979,7 @@ class Sanitizer {
 	 *
 	 * To ensure we don't have to bother escaping anything, we also strip ', ",
 	 * & even if $wgExperimentalIds is true.  TODO: Is this the best tactic?
+	 * We also strip # because it upsets IE6.
 	 *
 	 * @see http://www.w3.org/TR/html401/types.html#type-name Valid characters
 	 *                                                          in the id and
@@ -877,11 +1000,12 @@ class Sanitizer {
 	 * @return String
 	 */
 	static function escapeId( $id, $options = array() ) {
-		global $wgExperimentalHtmlIds;
+		global $wgHtml5, $wgExperimentalHtmlIds;
 		$options = (array)$options;
 
-		if ( $wgExperimentalHtmlIds && !in_array( 'legacy', $options ) ) {
-			$id = preg_replace( '/[ \t\n\r\f_\'"&]+/', '_', $id );
+		if ( $wgHtml5 && $wgExperimentalHtmlIds && !in_array( 'legacy', $options ) ) {
+			$id = Sanitizer::decodeCharReferences( $id );
+			$id = preg_replace( '/[ \t\n\r\f_\'"&#]+/', '_', $id );
 			$id = trim( $id, '_' );
 			if ( $id === '' ) {
 				# Must have been all whitespace to start with.
@@ -929,7 +1053,7 @@ class Sanitizer {
 
 	/**
 	 * Given HTML input, escape with htmlspecialchars but un-escape entites.
-	 * This allows (generally harmless) entities like &nbsp; to survive.
+	 * This allows (generally harmless) entities like &#160; to survive.
 	 *
 	 * @param $html String to escape
 	 * @return String: escaped input
@@ -1150,6 +1274,30 @@ class Sanitizer {
 			$text );
 	}
 
+	/**
+	 * Decode every character reference (numeric or named entity) found in
+	 * the text, then re-normalize the string if anything was decoded. (bug 14952)
+	 *
+	 * This is useful for page titles, not for text to be displayed:
+	 * MediaWiki allows HTML entities to escape normalization as a feature.
+	 *
+	 * @param $text String (already normalized, containing entities)
+	 * @return String (still normalized, without entities)
+	 */
+	public static function decodeCharReferencesAndNormalize( $text ) {
+		global $wgContLang;
+		$decoded = preg_replace_callback(
+			MW_CHAR_REFS_REGEX,
+			array( 'Sanitizer', 'decodeCharReferencesCallback' ),
+			$text, /* limit */ -1, $replacements );
+
+		// Input without entities is documented as normalized; skip the work.
+		if ( !$replacements ) {
+			return $decoded;
+		}
+		return $wgContLang->normalize( $decoded );
+	}
+
 	/**
 	 * @param $matches String
 	 * @return String
@@ -1227,7 +1375,7 @@ class Sanitizer {
 	static function setupAttributeWhitelist() {
 		global $wgAllowRdfaAttributes, $wgHtml5, $wgAllowMicrodataAttributes;
 
-		$common = array( 'id', 'class', 'lang', 'dir', 'title', 'style', 'xml:lang' );
+		$common = array( 'id', 'class', 'lang', 'dir', 'title', 'style' );
 
 		if ( $wgAllowRdfaAttributes ) {
 			#RDFa attributes as specified in section 9 of http://www.w3.org/TR/2008/REC-rdfa-syntax-20081014
@@ -1353,8 +1501,9 @@ class Sanitizer {
 
 			# 13.2
 			# Not usually allowed, but may be used for extension-style hooks
-			# such as <math> when it is rasterized
-			'img'        => array_merge( $common, array( 'alt' ) ),
+			# such as <math> when it is rasterized, or if $wgAllowImageTag is
+			# true
+			'img'        => array_merge( $common, array( 'alt', 'src', 'width', 'height' ) ),
 
 			# 15.2.1
 			'tt'         => $common,
@@ -1436,7 +1585,7 @@ class Sanitizer {
 		$url = Sanitizer::decodeCharReferences( $url );
 
 		# Escape any control characters introduced by the above step
-		$url = preg_replace( '/[\][<>"\\x00-\\x20\\x7F]/e', "urlencode('\\0')", $url );
+		$url = preg_replace( '/[\][<>"\\x00-\\x20\\x7F\|]/e', "urlencode('\\0')", $url );
 
 		# Validate hostname portion
 		$matches = array();