X-Git-Url: https://git.heureux-cyclage.org/?a=blobdiff_plain;f=includes%2FSanitizer.php;h=c1c8daf37affbb40f27444e115068ab392dee7c9;hb=56af4f4d3849aa29bdb3883ef090ded39ab32e39;hp=51c751cef324c42ff9c6a4da1882d1c04fdf987f;hpb=c771fc9c96aacb44b86ade5ecca68334c5d8213f;p=lhc%2Fweb%2Fwiklou.git

diff --git a/includes/Sanitizer.php b/includes/Sanitizer.php
index 51c751cef3..c1c8daf37a 100644
--- a/includes/Sanitizer.php
+++ b/includes/Sanitizer.php
@@ -28,7 +28,7 @@
  * Sanitizer::normalizeCharReferences and Sanitizer::decodeCharReferences
  */
 define( 'MW_CHAR_REFS_REGEX',
-	'/&([A-Za-z0-9]+);
+	'/&([A-Za-z0-9\x80-\xff]+);
 	 |&\#([0-9]+);
 	 |&\#x([0-9A-Za-z]+);
 	 |&\#X([0-9A-Za-z]+);
@@ -315,7 +315,24 @@ $wgHtmlEntities = array(
 	'zwj'      => 8205,
 	'zwnj'     => 8204 );
 
+/**
+ * Character entity aliases accepted by MediaWiki
+ */
+global $wgHtmlEntityAliases;
+$wgHtmlEntityAliases = array(
+	'רלמ' => 'rlm', # Hebrew-script alias (resh, lamed, mem)
+	'رلم' => 'rlm', # Arabic-script alias (ra, lam, mim)
+);
+
+
+/**
+ * XHTML sanitizer for MediaWiki
+ * @addtogroup Parser
+ */
 class Sanitizer {
+	const NONE = 0;
+	const INITIAL_NONLETTER = 1;
+
 	/**
 	 * Cleans up HTML, removes dangerous tags and attributes, and
 	 * removes HTML comments
@@ -325,8 +342,8 @@ class Sanitizer {
 	 * @param array $args for the processing callback
 	 * @return string
 	 */
-	static function removeHTMLtags( $text, $processCallback = null, $args = array() ) {
-		global $wgUseTidy, $wgUserHtml;
+	static function removeHTMLtags( $text, $processCallback = null, $args = array(), $extratags = array() ) {
+		global $wgUseTidy;
 
 		static $htmlpairs, $htmlsingle, $htmlsingleonly, $htmlnest, $tabletags,
 			$htmllist, $listtags, $htmlsingleallowed, $htmlelements, $staticInitialised;
@@ -334,40 +351,33 @@ class Sanitizer {
 		wfProfileIn( __METHOD__ );
 
 		if ( !$staticInitialised ) {
-			if( $wgUserHtml ) {
-				$htmlpairs = array( # Tags that must be closed
-					'b', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
-					'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
-					'strike', 'strong', 'tt', 'var', 'div', 'center',
-					'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
-					'ruby', 'rt' , 'rb' , 'rp', 'p', 'span', 'u'
-				);
-				$htmlsingle = array(
-					'br', 'hr', 'li', 'dt', 'dd'
-				);
-				$htmlsingleonly = array( # Elements that cannot have close tags
-					'br', 'hr'
-				);
-				$htmlnest = array( # Tags that can be nested--??
-					'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
-					'dl', 'font', 'big', 'small', 'sub', 'sup', 'span'
-				);
-				$tabletags = array( # Can only appear inside table, we will close them
-					'td', 'th', 'tr',
-				);
-				$htmllist = array( # Tags used by list
-					'ul','ol',
-				);
-				$listtags = array( # Tags that can appear in a list
-					'li',
-				);
-
-			} else {
-				$htmlpairs = array();
-				$htmlsingle = array();
-				$htmlnest = array();
-				$tabletags = array();
-			}
+
+			$htmlpairs = array_merge( $extratags, array( # Tags that must be closed
+				'b', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
+				'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
+				'strike', 'strong', 'tt', 'var', 'div', 'center',
+				'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
+				'ruby', 'rt' , 'rb' , 'rp', 'p', 'span', 'u'
+			) );
+			$htmlsingle = array(
+				'br', 'hr', 'li', 'dt', 'dd'
+			);
+			$htmlsingleonly = array( # Elements that cannot have close tags
+				'br', 'hr'
+			);
+			$htmlnest = array( # Tags that can be nested--??
+				'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
+				'dl', 'font', 'big', 'small', 'sub', 'sup', 'span'
+			);
+			$tabletags = array( # Can only appear inside table, we will close them
+				'td', 'th', 'tr',
+			);
+			$htmllist = array( # Tags used by list
+				'ul','ol',
+			);
+			$listtags = array( # Tags that can appear in a list
+				'li',
+			);
 
 			$htmlsingleallowed = array_merge( $htmlsingle, $tabletags );
 			$htmlelements = array_merge( $htmlsingle, $htmlpairs, $htmlnest );
@@ -559,6 +569,7 @@ class Sanitizer {
 	 *
 	 * - Discards attributes not on a whitelist for the given element
 	 * - Unsafe style attributes are discarded
+	 * - Invalid id attributes are reencoded
 	 *
 	 * @param array $attribs
 	 * @param string $element
@@ -568,7 +579,27 @@ class Sanitizer {
 	 * @todo Check for unique id attribute :P
 	 */
 	static function validateTagAttributes( $attribs, $element ) {
-		$whitelist = array_flip( Sanitizer::attributeWhitelist( $element ) );
+		$allowed = Sanitizer::attributeWhitelist( $element );
+		return Sanitizer::validateAttributes( $attribs, $allowed );
+	}
+	
+	/**
+	 * Take an array of attribute names and values and normalize or discard
+	 * illegal values for the given whitelist.
+	 *
+	 * - Discards attributes not on the given whitelist
+	 * - Unsafe style attributes are discarded
+	 * - Invalid id attributes are reencoded
+	 *
+	 * @param array $attribs
+	 * @param array $whitelist list of allowed attribute names
+	 * @return array
+	 *
+	 * @todo Check for legal values where the DTD limits things.
+	 * @todo Check for unique id attribute :P
+	 */
+	static function validateAttributes( $attribs, $whitelist ) {
+		$whitelist = array_flip( $whitelist );
 		$out = array();
 		foreach( $attribs as $attribute => $value ) {
 			if( !isset( $whitelist[$attribute] ) ) {
@@ -594,6 +625,33 @@ class Sanitizer {
 		return $out;
 	}
 	
+	/**
+	 * Merge two sets of HTML attributes.
+	 * Conflicting items in the second set will override those
+	 * in the first, except for 'class' attributes which will be
+	 * combined.
+	 *
+	 * @todo implement merging for other attributes such as style
+	 * @param array $a
+	 * @param array $b
+	 * @return array
+	 */
+	static function mergeAttributes( $a, $b ) {
+		$merged = array_merge( $a, $b );
+		if( !isset( $a['class'] ) || !isset( $b['class'] ) ) {
+			// Only one side (or neither) has a class: plain override
+			return $merged;
+		}
+		if( $a['class'] === $b['class'] ) {
+			return $merged;
+		}
+		// Join both class lists, drop empties, keep first occurrences
+		$combined = $a['class'] . ' ' . $b['class'];
+		$names = preg_split( '/\s+/', $combined, -1, PREG_SPLIT_NO_EMPTY );
+		$merged['class'] = implode( ' ', array_unique( $names ) );
+		return $merged;
+	}
+	
 	/**
 	 * Pick apart some CSS and check it for forbidden or unsafe structures.
 	 * Returns a sanitized string, or false if it was just too evil.
@@ -615,7 +673,7 @@ class Sanitizer {
 		$stripped = preg_replace( '!\\\\([0-9A-Fa-f]{1,6})[ \\n\\r\\t\\f]?!e',
 			'codepointToUtf8(hexdec("$1"))', $stripped );
 		$stripped = str_replace( '\\', '', $stripped );
-		if( preg_match( '/(expression|tps*:\/\/|url\\s*\().*/is',
+		if( preg_match( '/(?:expression|tps*:\/\/|url\\s*\().*/is',
 				$stripped ) ) {
 			# haxx0r
 			return false;
@@ -667,7 +725,7 @@ class Sanitizer {
 	 * @return HTML-encoded text fragment
 	 */
 	static function encodeAttribute( $text ) {
-		$encValue = htmlspecialchars( $text );
+		$encValue = htmlspecialchars( $text, ENT_QUOTES );
 
 		// Whitespace is normalized during attribute decoding,
 		// so if we've been passed non-spaces we must encode them
@@ -718,27 +776,34 @@ class Sanitizer {
 	 * Given a value escape it so that it can be used in an id attribute and
 	 * return it, this does not validate the value however (see first link)
 	 *
-	 * @link http://www.w3.org/TR/html401/types.html#type-name Valid characters
+	 * @see http://www.w3.org/TR/html401/types.html#type-name Valid characters
 	 *                                                          in the id and
 	 *                                                          name attributes
-	 * @link http://www.w3.org/TR/html401/struct/links.html#h-12.2.3 Anchors with the id attribute
-	 *
-	 * @bug 4461
+	 * @see http://www.w3.org/TR/html401/struct/links.html#h-12.2.3 Anchors with the id attribute
 	 *
-	 * @static
-	 *
-	 * @param string $id
+	 * @param string $id    Id to escape
+	 * @param int    $flags Currently only two values: Sanitizer::INITIAL_NONLETTER
+	 *                      (default) permits initial non-letter characters,
+	 *                      such as if you're adding a prefix to them.
+	 *                      Sanitizer::NONE will prepend an 'x' if the id
+	 *                      would otherwise start with a nonletter.
 	 * @return string
 	 */
-	static function escapeId( $id ) {
+	static function escapeId( $id, $flags = Sanitizer::INITIAL_NONLETTER ) {
 		static $replace = array(
 			'%3A' => ':',
 			'%' => '.'
 		);
 
 		$id = urlencode( Sanitizer::decodeCharReferences( strtr( $id, ' ', '_' ) ) );
-
-		return str_replace( array_keys( $replace ), array_values( $replace ), $id );
+		$id = str_replace( array_keys( $replace ), array_values( $replace ), $id );
+		
+		if( ~$flags & Sanitizer::INITIAL_NONLETTER
+		&& !preg_match( '/^[a-zA-Z]/', $id ) ) {
+			// Must start with a letter; anchored match also copes with ''
+			$id = "x$id";
+		}
+		return $id;
 	}
 
 	/**
@@ -747,7 +812,7 @@ class Sanitizer {
 	 *
 	 * @todo For extra validity, input should be validated UTF-8.
 	 *
-	 * @link http://www.w3.org/TR/CSS21/syndata.html Valid characters/format
+	 * @see http://www.w3.org/TR/CSS21/syndata.html Valid characters/format
 	 *
 	 * @param string $class
 	 * @return string
@@ -852,11 +917,16 @@ class Sanitizer {
 	 */
 	private static function normalizeAttributeValue( $text ) {
 		return str_replace( '"', '&quot;',
-			preg_replace(
-				'/\r\n|[\x20\x0d\x0a\x09]/',
-				' ',
+			self::normalizeWhitespace(
 				Sanitizer::normalizeCharReferences( $text ) ) );
 	}
+	
+	private static function normalizeWhitespace( $text ) {
+		// A CRLF pair, or any lone space/CR/LF/tab, becomes one space
+		$pattern = '/\r\n|[\x20\x0d\x0a\x09]/';
+		$flattened = preg_replace( $pattern, ' ', $text );
+		return $flattened;
+	}
 
 	/**
 	 * Ensure that any entities and character references are legal
@@ -902,16 +972,19 @@ class Sanitizer {
 
 	/**
 	 * If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD,
-	 * return the named entity reference as is. Otherwise, returns
-	 * HTML-escaped text of pseudo-entity source (eg &amp;foo;)
+	 * return the named entity reference as is. If the entity is a 
+	 * MediaWiki-specific alias, returns the HTML equivalent. Otherwise, 
+	 * returns HTML-escaped text of pseudo-entity source (eg &amp;foo;)
 	 *
 	 * @param string $name
 	 * @return string
 	 * @static
 	 */
 	static function normalizeEntity( $name ) {
-		global $wgHtmlEntities;
-		if( isset( $wgHtmlEntities[$name] ) ) {
+		global $wgHtmlEntities, $wgHtmlEntityAliases;
+		if ( isset( $wgHtmlEntityAliases[$name] ) ) {
+			return "&{$wgHtmlEntityAliases[$name]};";
+		} elseif( isset( $wgHtmlEntities[$name] ) ) {
 			return "&$name;";
 		} else {
 			return "&amp;$name;";
@@ -1008,7 +1081,10 @@ class Sanitizer {
 	 * @return string
 	 */
 	static function decodeEntity( $name ) {
-		global $wgHtmlEntities;
+		global $wgHtmlEntities, $wgHtmlEntityAliases;
+		if ( isset( $wgHtmlEntityAliases[$name] ) ) {
+			$name = $wgHtmlEntityAliases[$name];
+		}
 		if( isset( $wgHtmlEntities[$name] ) ) {
 			return codepointToUtf8( $wgHtmlEntities[$name] );
 		} else {
@@ -1143,6 +1219,11 @@ class Sanitizer {
 			# 11.2.6
 			'td'         => array_merge( $common, $tablecell, $tablealign ),
 			'th'         => array_merge( $common, $tablecell, $tablealign ),
+			
+			# 13.2
+			# Not usually allowed, but may be used for extension-style hooks
+			# such as <math> when it is rasterized
+			'img'        => array_merge( $common, array( 'alt' ) ),
 
 			# 15.2.1
 			'tt'         => $common,
@@ -1169,14 +1250,21 @@ class Sanitizer {
 			'rb'         => $common,
 			'rt'         => $common, #array_merge( $common, array( 'rbspan' ) ),
 			'rp'         => $common,
+			
+			# MathML root element, where used for extensions
+			# 'title' may not be 100% valid here; it's XHTML
+			# http://www.w3.org/TR/REC-MathML/
+			'math'       => array( 'class', 'style', 'id', 'title' ),
 			);
 		return $whitelist;
 	}
 
 	/**
 	 * Take a fragment of (potentially invalid) HTML and return
-	 * a version with any tags removed, encoded suitably for literal
-	 * inclusion in an attribute value.
+	 * a version with any tags removed, encoded as plain text.
+	 *
+	 * Warning: this return value must be further escaped for literal
+	 * inclusion in HTML output as of 1.10!
 	 *
 	 * @param string $text HTML fragment
 	 * @return string
@@ -1186,14 +1274,8 @@ class Sanitizer {
 		$text = StringUtils::delimiterReplace( '<', '>', '', $text );
 
 		# Normalize &entities and whitespace
-		$text = Sanitizer::normalizeAttributeValue( $text );
-
-		# Will be placed into "double-quoted" attributes,
-		# make sure remaining bits are safe.
-		$text = str_replace(
-			array('<', '>', '"'),
-			array('&lt;', '&gt;', '&quot;'),
-			$text );
+		$text = self::decodeCharReferences( $text );
+		$text = self::normalizeWhitespace( $text );
 
 		return $text;
 	}
@@ -1262,4 +1344,4 @@ class Sanitizer {
 
 }
 
-?>
+