X-Git-Url: https://git.heureux-cyclage.org/?a=blobdiff_plain;f=includes%2FSanitizer.php;h=7fdd1df5f04c515104376ecbaaeb7a01a68ca39d;hb=1e2199b76bc6641bf0d3827903b45ab35db98bd1;hp=8249f969f1810e060a317006312d9ac43394da80;hpb=ec7276ea08af439384440b5b3e35c315c16e929e;p=lhc%2Fweb%2Fwiklou.git

diff --git a/includes/Sanitizer.php b/includes/Sanitizer.php
index 8249f969f1..7fdd1df5f0 100644
--- a/includes/Sanitizer.php
+++ b/includes/Sanitizer.php
@@ -2,7 +2,7 @@
 /**
  * XHTML sanitizer for MediaWiki
  *
- * Copyright (C) 2002-2005 Brion Vibber <brion@pobox.com> et al
+ * Copyright © 2002-2005 Brion Vibber <brion@pobox.com> et al
  * http://www.mediawiki.org/
  *
  * This program is free software; you can redistribute it and/or modify
@@ -40,10 +40,11 @@ define( 'MW_CHAR_REFS_REGEX',
  * Allows some... latitude.
  * Used in Sanitizer::fixTagAttributes and Sanitizer::decodeTagAttributes
  */
-$attrib = '[A-Za-z0-9]';
+$attrib_first = '[:A-Z_a-z]';
+$attrib = '[:A-Z_a-z-.0-9]';
 $space = '[\x09\x0a\x0d\x20]';
 define( 'MW_ATTRIBS_REGEX',
-	"/(?:^|$space)($attrib+)
+	"/(?:^|$space)({$attrib_first}{$attrib}*)
 	  ($space*=$space*
 		(?:
 		 # The attribute value: quoted or alone
@@ -56,6 +57,16 @@ define( 'MW_ATTRIBS_REGEX',
 		)
 	   )?(?=$space|\$)/sx" );
 
+/**
+ * Regular expression to match URIs that could trigger script execution
+ */
+define( 'MW_EVIL_URI_PATTERN', '!(^|\s|\*/\s*)(javascript|vbscript)([^\w]|$)!i' );
+
+/**
+ * Regular expression to match namespace attributes
+ */
+define( 'MW_XMLNS_ATTRIBUTE_PATTRN', "/^xmlns:$attrib+$/" );
+
 /**
  * List of all named character entities defined in HTML 4.01
  * http://www.w3.org/TR/html4/sgml/entities.html
@@ -335,11 +346,11 @@ class Sanitizer {
 	 * Cleans up HTML, removes dangerous tags and attributes, and
 	 * removes HTML comments
 	 * @private
-	 * @param string $text
-	 * @param callback $processCallback to do any variable or parameter replacements in HTML attribute values
-	 * @param array $args for the processing callback
-	 * @param array $extratags for any extra tags to include
-	 * @param array $removetags for any tags (default or extra) to exclude
+	 * @param $text String
+	 * @param $processCallback Callback to do any variable or parameter replacements in HTML attribute values
+	 * @param $args Array for the processing callback
+	 * @param $extratags Array for any extra tags to include
+	 * @param $removetags Array for any tags (default or extra) to exclude
 	 * @return string
 	 */
 	static function removeHTMLtags( $text, $processCallback = null, $args = array(), $extratags = array(), $removetags = array() ) {
@@ -357,7 +368,7 @@ class Sanitizer {
 				'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
 				'strike', 'strong', 'tt', 'var', 'div', 'center',
 				'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
-				'ruby', 'rt' , 'rb' , 'rp', 'p', 'span', 'u'
+				'ruby', 'rt' , 'rb' , 'rp', 'p', 'span', 'u', 'abbr', 'dfn'
 			);
 			$htmlsingle = array(
 				'br', 'hr', 'li', 'dt', 'dd'
@@ -379,6 +390,12 @@ class Sanitizer {
 				'li',
 			);
 
+			global $wgAllowImageTag;
+			if ( $wgAllowImageTag ) {
+				$htmlsingle[] = 'img';
+				$htmlsingleonly[] = 'img';
+			}
+
 			$htmlsingleallowed = array_unique( array_merge( $htmlsingle, $tabletags ) );
 			$htmlelementsStatic = array_unique( array_merge( $htmlsingle, $htmlpairsStatic, $htmlnest ) );
 
@@ -394,43 +411,49 @@ class Sanitizer {
 		$extratags = array_flip( $extratags );
 		$removetags = array_flip( $removetags );
 		$htmlpairs = array_merge( $extratags, $htmlpairsStatic );
-		$htmlelements = array_diff( array_unique( array_merge( $extratags, $htmlelementsStatic ) ), $removetags );
+		$htmlelements = array_diff_key( array_merge( $extratags, $htmlelementsStatic ) , $removetags );
 
 		# Remove HTML comments
 		$text = Sanitizer::removeHTMLcomments( $text );
 		$bits = explode( '<', $text );
 		$text = str_replace( '>', '&gt;', array_shift( $bits ) );
-		if(!$wgUseTidy) {
+		if ( !$wgUseTidy ) {
 			$tagstack = $tablestack = array();
 			foreach ( $bits as $x ) {
 				$regs = array();
+				# $slash: Does the current element start with a '/'?
+				# $t: Current element name
+				# $params: String between element name and >
+				# $brace: Ending '>' or '/>'
+				# $rest: Everything until the next element of $bits
 				if( preg_match( '!^(/?)(\\w+)([^>]*?)(/{0,1}>)([^<]*)$!', $x, $regs ) ) {
 					list( /* $qbar */, $slash, $t, $params, $brace, $rest ) = $regs;
 				} else {
 					$slash = $t = $params = $brace = $rest = null;
 				}
 
-				$badtag = 0 ;
+				$badtag = false;
 				if ( isset( $htmlelements[$t = strtolower( $t )] ) ) {
 					# Check our stack
-					if ( $slash ) {
-						# Closing a tag...
-						if( isset( $htmlsingleonly[$t] ) ) {
-							$badtag = 1;
-						} elseif ( ( $ot = @array_pop( $tagstack ) ) != $t ) {
+					if ( $slash && isset( $htmlsingleonly[$t] ) ) {
+						$badtag = true;
+					} elseif ( $slash ) {
+						# Closing a tag... is it the one we just opened?
+						$ot = @array_pop( $tagstack );
+						if ( $ot != $t ) {
 							if ( isset( $htmlsingleallowed[$ot] ) ) {
 								# Pop all elements with an optional close tag
 								# and see if we find a match below them
 								$optstack = array();
-								array_push ($optstack, $ot);
-								while ( ( ( $ot = @array_pop( $tagstack ) ) != $t ) &&
-										isset( $htmlsingleallowed[$ot] ) )
-								{
-									array_push ($optstack, $ot);
+								array_push( $optstack, $ot );
+								$ot = @array_pop( $tagstack );
+								while ( $ot != $t && isset( $htmlsingleallowed[$ot] ) ) {
+									array_push( $optstack, $ot );
+									$ot = @array_pop( $tagstack );
 								}
 								if ( $t != $ot ) {
-									# No match. Push the optinal elements back again
-									$badtag = 1;
+									# No match. Push the optional elements back again
+									$badtag = true;
 									while ( $ot = @array_pop( $optstack ) ) {
 										array_push( $tagstack, $ot );
 									}
@@ -438,8 +461,8 @@ class Sanitizer {
 							} else {
 								@array_push( $tagstack, $ot );
 								# <li> can be nested in <ul> or <ol>, skip those cases:
-								if(!(isset( $htmllist[$ot] ) && isset( $listtags[$t] ) )) {
-									$badtag = 1;
+								if ( !isset( $htmllist[$ot] ) || !isset( $listtags[$t] ) ) {
+									$badtag = true;
 								}
 							}
 						} else {
@@ -451,23 +474,23 @@ class Sanitizer {
 					} else {
 						# Keep track for later
 						if ( isset( $tabletags[$t] ) &&
-						! in_array( 'table', $tagstack ) ) {
-							$badtag = 1;
-						} else if ( in_array( $t, $tagstack ) &&
-						! isset( $htmlnest [$t ] ) ) {
-							$badtag = 1 ;
+						!in_array( 'table', $tagstack ) ) {
+							$badtag = true;
+						} elseif ( in_array( $t, $tagstack ) &&
+						!isset( $htmlnest [$t ] ) ) {
+							$badtag = true;
 						# Is it a self closed htmlpair ? (bug 5487)
-						} else if( $brace == '/>' &&
+						} elseif ( $brace == '/>' &&
 						isset( $htmlpairs[$t] ) ) {
-							$badtag = 1;
-						} elseif( isset( $htmlsingleonly[$t] ) ) {
+							$badtag = true;
+						} elseif ( isset( $htmlsingleonly[$t] ) ) {
 							# Hack to force empty tag for uncloseable elements
 							$brace = '/>';
-						} else if( isset( $htmlsingle[$t] ) ) {
+						} elseif ( isset( $htmlsingle[$t] ) ) {
 							# Hack to not close $htmlsingle tags
-							$brace = NULL;
-						} else if( isset( $tabletags[$t] )
-						&&  in_array($t ,$tagstack) ) {
+							$brace = null;
+						} elseif ( isset( $tabletags[$t] )
+						&& in_array( $t, $tagstack ) ) {
 							// New table tag but forgot to close the previous one
 							$text .= "</$t>";
 						} else {
@@ -487,7 +510,7 @@ class Sanitizer {
 						# Strip non-approved attributes from the tag
 						$newparams = Sanitizer::fixTagAttributes( $params, $t );
 					}
-					if ( ! $badtag ) {
+					if ( !$badtag ) {
 						$rest = str_replace( '>', '&gt;', $rest );
 						$close = ( $brace == '/>' && !$slash ) ? ' /' : '';
 						$text .= "<$slash$t$newparams$close>$rest";
@@ -530,7 +553,7 @@ class Sanitizer {
 	 * trailing spaces and one of the newlines.
 	 *
 	 * @private
-	 * @param string $text
+	 * @param $text String
 	 * @return string
 	 */
 	static function removeHTMLcomments( $text ) {
@@ -576,9 +599,9 @@ class Sanitizer {
 	 * - Unsafe style attributes are discarded
 	 * - Invalid id attributes are reencoded
 	 *
-	 * @param array $attribs
-	 * @param string $element
-	 * @return array
+	 * @param $attribs Array
+	 * @param $element String
+	 * @return Array
 	 *
 	 * @todo Check for legal values where the DTD limits things.
 	 * @todo Check for unique id attribute :P
@@ -596,40 +619,94 @@ class Sanitizer {
 	 * - Unsafe style attributes are discarded
 	 * - Invalid id attributes are reencoded
 	 *
-	 * @param array $attribs
-	 * @param array $whitelist list of allowed attribute names
-	 * @return array
+	 * @param $attribs Array
+	 * @param $whitelist Array: list of allowed attribute names
+	 * @return Array
 	 *
 	 * @todo Check for legal values where the DTD limits things.
 	 * @todo Check for unique id attribute :P
 	 */
 	static function validateAttributes( $attribs, $whitelist ) {
+		global $wgAllowRdfaAttributes, $wgAllowMicrodataAttributes, $wgHtml5;
+
 		$whitelist = array_flip( $whitelist );
+		$hrefExp = '/^(' . wfUrlProtocols() . ')[^\s]+$/';
+
 		$out = array();
 		foreach( $attribs as $attribute => $value ) {
-			if( !isset( $whitelist[$attribute] ) ) {
+			#allow XML namespace declaration if RDFa is enabled
+			if ( $wgAllowRdfaAttributes && preg_match( MW_XMLNS_ATTRIBUTE_PATTRN, $attribute ) ) {
+				if ( !preg_match( MW_EVIL_URI_PATTERN, $value ) ) {
+					$out[$attribute] = $value;
+				}
+
+				continue;
+			}
+
+			# Allow any attribute beginning with "data-", if in HTML5 mode
+			if ( !($wgHtml5 && preg_match( '/^data-/i', $attribute )) && !isset( $whitelist[$attribute] ) ) {
 				continue;
 			}
+
 			# Strip javascript "expression" from stylesheets.
 			# http://msdn.microsoft.com/workshop/author/dhtml/overview/recalc.asp
 			if( $attribute == 'style' ) {
 				$value = Sanitizer::checkCss( $value );
-				if( $value === false ) {
-					# haxx0r
-					continue;
-				}
 			}
 
 			if ( $attribute === 'id' ) {
-				global $wgEnforceHtmlIds;
-				$value = Sanitizer::escapeId( $value,
-					$wgEnforceHtmlIds ? 'noninitial' : 'xml' );
+				$value = Sanitizer::escapeId( $value, 'noninitial' );
+			}
+
+			//RDFa and microdata properties allow URLs, URIs and/or CURIEs. Check them for sanity
+			if ( $attribute === 'rel' || $attribute === 'rev' || 
+				$attribute === 'about' || $attribute === 'property' || $attribute === 'resource' || #RDFa
+				$attribute === 'datatype' || $attribute === 'typeof' ||                             #RDFa
+				$attribute === 'itemid' || $attribute === 'itemprop' || $attribute === 'itemref' || #HTML5 microdata
+				$attribute === 'itemscope' || $attribute === 'itemtype' ) {                         #HTML5 microdata
+
+				//Paranoia. Allow "simple" values but suppress javascript
+				if ( preg_match( MW_EVIL_URI_PATTERN, $value ) ) {
+					continue; 
+				}
+			}
+
+			# NOTE: even though elements using href/src are not allowed directly, supply
+			#       validation code that can be used by tag hook handlers, etc
+			if ( $attribute === 'href' || $attribute === 'src' ) {
+				if ( !preg_match( $hrefExp, $value ) ) {
+					continue; //drop any href or src attributes not using an allowed protocol.
+						  //NOTE: this also drops all relative URLs
+				}
 			}
 
 			// If this attribute was previously set, override it.
 			// Output should only have one attribute of each name.
 			$out[$attribute] = $value;
 		}
+
+		if ( $wgAllowMicrodataAttributes ) {
+			# There are some complicated validity constraints we need to
+			# enforce here.  First of all, we don't want to allow non-standard
+			# itemtypes.
+			$allowedTypes = array(
+				'http://microformats.org/profile/hcard',
+				'http://microformats.org/profile/hcalendar#vevent',
+				'http://n.whatwg.org/work',
+			);
+			if ( isset( $out['itemtype'] ) && !in_array( $out['itemtype'],
+			$allowedTypes ) ) {
+				# Kill everything
+				unset( $out['itemscope'] );
+			}
+			# itemtype, itemid, itemref don't make sense without itemscope
+			if ( !array_key_exists( 'itemscope', $out ) ) {
+				unset( $out['itemtype'] );
+				unset( $out['itemid'] );
+				unset( $out['itemref'] );
+			}
+			# TODO: Strip itemprop if we aren't descendants of an itemscope.
+		}
 		return $out;
 	}
 
@@ -639,8 +716,8 @@ class Sanitizer {
 	 * will be combined (if they're both strings).
 	 *
 	 * @todo implement merging for other attributes such as style
-	 * @param array $a
-	 * @param array $b
+	 * @param $a Array
+	 * @param $b Array
 	 * @return array
 	 */
 	static function mergeAttributes( $a, $b ) {
@@ -661,30 +738,108 @@ class Sanitizer {
 	 *
 	 * Currently URL references, 'expression', 'tps' are forbidden.
 	 *
-	 * @param string $value
-	 * @return mixed
+	 * @param $value String
+	 * @return Mixed
 	 */
 	static function checkCss( $value ) {
-		$stripped = Sanitizer::decodeCharReferences( $value );
+		$value = Sanitizer::decodeCharReferences( $value );
 
 		// Remove any comments; IE gets token splitting wrong
-		$stripped = StringUtils::delimiterReplace( '/*', '*/', ' ', $stripped );
-
-		$value = $stripped;
-
-		// ... and continue checks
-		$stripped = preg_replace( '!\\\\([0-9A-Fa-f]{1,6})[ \\n\\r\\t\\f]?!e',
-			'codepointToUtf8(hexdec("$1"))', $stripped );
-		$stripped = str_replace( '\\', '', $stripped );
-		if( preg_match( '/(?:expression|tps*:\/\/|url\\s*\().*/is',
-				$stripped ) ) {
-			# haxx0r
-			return false;
+		$value = StringUtils::delimiterReplace( '/*', '*/', ' ', $value );
+
+		// Decode escape sequences and line continuation
+		// See the grammar in the CSS 2 spec, appendix D.
+		static $decodeRegex;
+		if ( !$decodeRegex ) {
+			$space = '[\\x20\\t\\r\\n\\f]';
+			$nl = '(?:\\n|\\r\\n|\\r|\\f)';
+			$backslash = '\\\\';
+			$decodeRegex = "/ $backslash 
+				(?:
+					($nl) |  # 1. Line continuation
+					([0-9A-Fa-f]{1,6})$space? |  # 2. character number
+					(.) | # 3. backslash cancelling special meaning
+					() | # 4. backslash at end of string
+				)/xu";
+		}
+		$value = preg_replace_callback( $decodeRegex,
+			array( __CLASS__, 'cssDecodeCallback' ), $value );
+
+		// Reject problematic keywords and control characters
+		if ( preg_match( '/[\000-\010\016-\037\177]/', $value ) ) {
+			return '/* invalid control char */';
+		} elseif ( preg_match( '! expression | filter\s*: | accelerator\s*: | url\s*\( !ix', $value ) ) {
+			return '/* insecure input */';
 		}
-
 		return $value;
 	}
 
+	/**
+	 * Callback for the escape-decoding preg_replace_callback() in checkCss().
+	 *
+	 * @param $matches Array: [1] line continuation, [2] hex code point,
+	 *   [3] escaped literal character; all empty for a trailing backslash
+	 * @return String: the decoded text, or a normalized hex escape
+	 */
+	static function cssDecodeCallback( $matches ) {
+		if ( $matches[1] !== '' ) {
+			return ''; // line continuation: swallow it entirely
+		}
+		if ( $matches[2] !== '' ) {
+			$decoded = codepointToUtf8( hexdec( $matches[2] ) );
+		} else {
+			$decoded = ( $matches[3] !== '' ) ? $matches[3] : '\\';
+		}
+		// Newline, quotes and backslash must stay escaped inside CSS strings
+		$mustEscape = in_array( $decoded, array( "\n", '"', "'", '\\' ) );
+		return $mustEscape ? '\\' . dechex( ord( $decoded ) ) . ' ' : $decoded;
+	}
+
+	/**
+	 * Take an associative array of attribute name/value pairs and build
+	 * a CSS declaration list from the presentational attributes in it
+	 * (bgcolor, border, align, valign, width, height, nowrap). If there
+	 * is already a style attribute in the array, it is appended last.
+	 *
+	 * @param $attributes Array: attribute name => value
+	 * @return String: declarations joined with '; ', or '' if none
+	 */
+	static function styleFromAttributes( $attributes ) {
+		$styles = array();
+
+		foreach ( $attributes as $attribute => $value ) {
+			if ( $attribute == 'bgcolor' ) {
+				$styles[] = "background-color: $value";
+			} elseif ( $attribute == 'border' ) {
+				$styles[] = "border-width: $value";
+			} elseif ( $attribute == 'align' ) {
+				$styles[] = "text-align: $value";
+			} elseif ( $attribute == 'valign' ) {
+				$styles[] = "vertical-align: $value";
+			} elseif ( $attribute == 'width' || $attribute == 'height' ) {
+				# A bare number is a pixel length in HTML but needs a unit
+				# in CSS. preg_match() returns 0 (not false) on no match,
+				# so test the match result itself.
+				if ( preg_match( '/^\d+$/D', $value ) ) {
+					$value .= 'px';
+				}
+
+				$styles[] = "$attribute: $value";
+			} elseif ( $attribute == 'nowrap' ) {
+				if ( $value ) {
+					$styles[] = "white-space: nowrap";
+				}
+			}
+		}
+
+		if ( isset( $attributes[ 'style' ] ) ) {
+			$styles[] = $attributes[ 'style' ];
+		}
+
+		# implode() of an empty array is already ''
+		return implode( '; ', $styles );
+	}
+
 	/**
 	 * Take a tag soup fragment listing an HTML element's attributes
 	 * and normalize it to well-formed XML, discarding unwanted attributes.
@@ -700,31 +855,73 @@ class Sanitizer {
 	 * - Unsafe style attributes are discarded
 	 * - Prepends space if there are attributes.
 	 *
-	 * @param string $text
-	 * @param string $element
-	 * @return string
+	 * @param $text String
+	 * @param $element String
+	 * @param $defaults Array (optional) associative array of default attributes to splice in. 
+	 *			class and style attributes are combined. Otherwise, values from
+	 *			$attributes take precedence over values from $defaults.
+	 * @return String
 	 */
-	static function fixTagAttributes( $text, $element ) {
+	static function fixTagAttributes( $text, $element, $defaults = null ) {
 		if( trim( $text ) == '' ) {
 			return '';
 		}
 
-		$stripped = Sanitizer::validateTagAttributes(
-			Sanitizer::decodeTagAttributes( $text ), $element );
+		$decoded = Sanitizer::decodeTagAttributes( $text );
+		$stripped = Sanitizer::validateTagAttributes( $decoded, $element );
+		$attribs = Sanitizer::collapseTagAttributes( $stripped, $defaults );
 
-		$attribs = array();
-		foreach( $stripped as $attribute => $value ) {
+		return $attribs;
+	}
+
+	/**
+	 * Take an associative array of attribute name/value pairs
+	 * and collapses it to well-formed XML.
+	 * Does not filter attributes.
+	 * Output is safe for further wikitext processing, with escaping of
+	 * values that could trigger problems.
+	 *
+	 * - Double-quotes all attribute values
+	 * - Prepends space if there are attributes.
+	 *
+	 * @param $attributes Array is an associative array of attribute name/value pairs. 
+	 * 			Assumed to be sanitized already.
+	 * @param $defaults Array (optional) associative array of default attributes to splice in. 
+	 *			class and style attributes are combined. Otherwise, values from
+	 *			$attributes take precedence over values from $defaults.
+	 * @return String
+	 */
+	static function collapseTagAttributes( $attributes, $defaults = null ) {
+		if ( $defaults ) {
+			foreach( $defaults as $attribute => $value ) {
+				if ( isset( $attributes[ $attribute ] ) ) {
+					if ( $attribute == 'class' ) {
+						$value .= ' '. $attributes[ $attribute ];
+					} else if ( $attribute == 'style' ) {
+						$value .= '; ' . $attributes[ $attribute ];
+					} else {
+						continue;
+					}
+				}
+
+				$attributes[ $attribute ] = $value;
+			}
+		}
+
+		$chunks = array();
+
+		foreach( $attributes as $attribute => $value ) {
 			$encAttribute = htmlspecialchars( $attribute );
 			$encValue = Sanitizer::safeEncodeAttribute( $value );
 
-			$attribs[] = "$encAttribute=\"$encValue\"";
+			$chunks[] = "$encAttribute=\"$encValue\"";
 		}
-		return count( $attribs ) ? ' ' . implode( ' ', $attribs ) : '';
+		return count( $chunks ) ? ' ' . implode( ' ', $chunks ) : '';
 	}
 
 	/**
 	 * Encode an attribute value for HTML output.
-	 * @param $text
+	 * @param $text String
 	 * @return HTML-encoded text fragment
 	 */
 	static function encodeAttribute( $text ) {
@@ -745,7 +942,7 @@ class Sanitizer {
 	/**
 	 * Encode an attribute value for HTML tags, with extra armoring
 	 * against further wiki processing.
-	 * @param $text
+	 * @param $text String
 	 * @return HTML-encoded text fragment
 	 */
 	static function safeEncodeAttribute( $text ) {
@@ -776,63 +973,66 @@ class Sanitizer {
 	}
 
 	/**
-	 * Given a value escape it so that it can be used in an id attribute and
-	 * return it, this does not validate the value however (see first link)
+	 * Given a value, escape it so that it can be used in an id attribute and
+	 * return it.  This will use HTML5 validation if $wgExperimentalHtmlIds is
+	 * true, allowing anything but ASCII whitespace.  Otherwise it will use
+	 * HTML 4 rules, which means a narrow subset of ASCII, with bad characters
+	 * escaped with lots of dots.
+	 *
+	 * To ensure we don't have to bother escaping anything, we also strip ', ",
+	 * & even if $wgExperimentalHtmlIds is true.  TODO: Is this the best tactic?
+	 * We also strip # because it upsets IE, and % because it could be
+	 * ambiguous if it's part of something that looks like a percent escape
+	 * (which don't work reliably in fragments cross-browser).
 	 *
 	 * @see http://www.w3.org/TR/html401/types.html#type-name Valid characters
 	 *                                                          in the id and
 	 *                                                          name attributes
 	 * @see http://www.w3.org/TR/html401/struct/links.html#h-12.2.3 Anchors with the id attribute
+	 * @see http://www.whatwg.org/specs/web-apps/current-work/multipage/elements.html#the-id-attribute
+	 *   HTML5 definition of id attribute
 	 *
-	 * @param string $id      Id to validate
-	 * @param mixed  $options String or array of strings (default is array()):
+	 * @param $id String: id to escape
+	 * @param $options Mixed: string or array of strings (default is array()):
 	 *   'noninitial': This is a non-initial fragment of an id, not a full id,
 	 *       so don't pay attention if the first character isn't valid at the
-	 *       beginning of an id.
-	 *   'xml': Don't restrict the id to be HTML4-compatible.  This option
-	 *       allows any alphabetic character to be used, per the XML standard.
-	 *       Therefore, it also completely changes the type of escaping: instead
-	 *       of weird dot-encoding, runs of invalid characters (mostly
-	 *       whitespace) are just compressed into a single underscore.
-	 * @return string
+	 *       beginning of an id.  Only matters if $wgExperimentalHtmlIds is
+	 *       false.
+	 *   'legacy': Behave the way the old HTML 4-based ID escaping worked even
+	 *       if $wgExperimentalHtmlIds is used, so we can generate extra
+	 *       anchors and links won't break.
+	 * @return String
 	 */
 	static function escapeId( $id, $options = array() ) {
+		global $wgHtml5, $wgExperimentalHtmlIds;
 		$options = (array)$options;
 
-		if ( !in_array( 'xml', $options ) ) {
-			# HTML4-style escaping
-			static $replace = array(
-				'%3A' => ':',
-				'%' => '.'
-			);
-
-			$id = urlencode( Sanitizer::decodeCharReferences( strtr( $id, ' ', '_' ) ) );
-			$id = str_replace( array_keys( $replace ), array_values( $replace ), $id );
-
-			if ( !preg_match( '/^[a-zA-Z]/', $id )
-			&& !in_array( 'noninitial', $options ) )  {
-				// Initial character must be a letter!
-				$id = "x$id";
+		if ( $wgHtml5 && $wgExperimentalHtmlIds && !in_array( 'legacy', $options ) ) {
+			$id = Sanitizer::decodeCharReferences( $id );
+			$id = preg_replace( '/[ \t\n\r\f_\'"&#%]+/', '_', $id );
+			$id = trim( $id, '_' );
+			if ( $id === '' ) {
+				# Must have been all whitespace to start with.
+				return '_';
+			} else {
+				return $id;
 			}
-			return $id;
 		}
 
-		# XML-style escaping.  For the patterns used, see the XML 1.0 standard,
-		# 5th edition, NameStartChar and NameChar: <http://www.w3.org/TR/REC-xml/>
-		$nameStartChar = ':a-zA-Z_\xC0-\xD6\xD8-\xF6\xF8-\x{2FF}\x{370}-\x{37D}'
-			. '\x{37F}-\x{1FFF}\x{200C}-\x{200D}\x{2070}-\x{218F}\x{2C00}-\x{2FEF}'
-			. '\x{3001}-\x{D7FF}\x{F900}-\x{FDCF}\x{FDF0}-\x{FFFD}\x{10000}-\x{EFFFF}';
-		$nameChar = $nameStartChar . '.\-0-9\xB7\x{0300}-\x{036F}'
-			. '\x{203F}-\x{2040}';
-		# Replace _ as well so we don't get multiple consecutive underscores
-		$id = preg_replace( "/([^$nameChar]|_)+/u", '_', $id );
-		$id = trim( $id, '_' );
-
-		if ( !preg_match( "/^[$nameStartChar]/u", $id )
-		&& !in_array( 'noninitial', $options ) ) {
-			$id = "_$id";
-		}
+		# HTML4-style escaping
+		static $replace = array(
+			'%3A' => ':',
+			'%' => '.'
+		);
+
+		$id = urlencode( Sanitizer::decodeCharReferences( strtr( $id, ' ', '_' ) ) );
+		$id = str_replace( array_keys( $replace ), array_values( $replace ), $id );
 
+		if ( !preg_match( '/^[a-zA-Z]/', $id )
+		&& !in_array( 'noninitial', $options ) )  {
+			// Initial character must be a letter!
+			$id = "x$id";
+		}
 		return $id;
 	}
 
@@ -844,8 +1044,8 @@ class Sanitizer {
 	 *
 	 * @see http://www.w3.org/TR/CSS21/syndata.html Valid characters/format
 	 *
-	 * @param string $class
-	 * @return string
+	 * @param $class String
+	 * @return String
 	 */
 	static function escapeClass( $class ) {
 		// Convert ugly stuff to underscores and kill underscores in ugly places
@@ -857,25 +1057,23 @@ class Sanitizer {
 
 	/**
 	 * Given HTML input, escape with htmlspecialchars but un-escape entites.
-	 * This allows (generally harmless) entities like &nbsp; to survive.
+	 * This allows (generally harmless) entities like &#160; to survive.
 	 *
-	 * @param  string $html String to escape
-	 * @return string Escaped input
+	 * @param $html String to escape
+	 * @return String: escaped input
 	 */
 	static function escapeHtmlAllowEntities( $html ) {
+		$html = Sanitizer::decodeCharReferences( $html );
 		# It seems wise to escape ' as well as ", as a matter of course.  Can't
 		# hurt.
 		$html = htmlspecialchars( $html, ENT_QUOTES );
-		$html = str_replace( '&amp;', '&', $html );
-		$html = Sanitizer::normalizeCharReferences( $html );
 		return $html;
 	}
 
 	/**
 	 * Regex replace callback for armoring links against further processing.
-	 * @param array $matches
+	 * @param $matches Array
 	 * @return string
-	 * @private
 	 */
 	private static function armorLinksCallback( $matches ) {
 		return str_replace( ':', '&#58;', $matches[1] );
@@ -886,16 +1084,15 @@ class Sanitizer {
 	 * a partial tag string. Attribute names are forces to lowercase,
 	 * character references are decoded to UTF-8 text.
 	 *
-	 * @param string
-	 * @return array
+	 * @param $text String
+	 * @return Array
 	 */
 	public static function decodeTagAttributes( $text ) {
-		$attribs = array();
-
 		if( trim( $text ) == '' ) {
-			return $attribs;
+			return array();
 		}
 
+		$attribs = array();
 		$pairs = array();
 		if( !preg_match_all(
 			MW_ATTRIBS_REGEX,
@@ -923,9 +1120,8 @@ class Sanitizer {
 	 * Pick the appropriate attribute value from a match set from the
 	 * MW_ATTRIBS_REGEX matches.
 	 *
-	 * @param array $set
-	 * @return string
-	 * @private
+	 * @param $set Array
+	 * @return String
 	 */
 	private static function getTagAttributeCallback( $set ) {
 		if( isset( $set[6] ) ) {
@@ -957,9 +1153,8 @@ class Sanitizer {
 	 * but note that we're not returning the value, but are returning
 	 * XML source fragments that will be slapped into output.
 	 *
-	 * @param string $text
-	 * @return string
-	 * @private
+	 * @param $text String
+	 * @return String
 	 */
 	private static function normalizeAttributeValue( $text ) {
 		return str_replace( '"', '&quot;',
@@ -974,6 +1169,18 @@ class Sanitizer {
 			$text );
 	}
 
+	/**
+	 * Collapse every run of spaces and/or underscores in a section name
+	 * (such as might be returned by Parser::stripSectionName()) into one
+	 * space and trim the result, for use in the id's of section links.
+	 * @param $section String
+	 * @return String
+	 */
+	static function normalizeSectionNameWhitespace( $section ) {
+		$collapsed = preg_replace( '/[ _]+/', ' ', $section );
+		return trim( $collapsed );
+	}
+
 	/**
 	 * Ensure that any entities and character references are legal
 	 * for XML and XHTML specifically. Any stray bits will be
@@ -984,8 +1191,8 @@ class Sanitizer {
 	 * c. use &#x, not &#X
 	 * d. fix or reject non-valid attributes
 	 *
-	 * @param string $text
-	 * @return string
+	 * @param $text String
+	 * @return String
 	 * @private
 	 */
 	static function normalizeCharReferences( $text ) {
@@ -995,8 +1202,8 @@ class Sanitizer {
 			$text );
 	}
 	/**
-	 * @param string $matches
-	 * @return string
+	 * @param $matches String
+	 * @return String
 	 */
 	static function normalizeCharReferencesCallback( $matches ) {
 		$ret = null;
@@ -1022,9 +1229,8 @@ class Sanitizer {
 	 * MediaWiki-specific alias, returns the HTML equivalent. Otherwise,
 	 * returns HTML-escaped text of pseudo-entity source (eg &amp;foo;)
 	 *
-	 * @param string $name
-	 * @return string
-	 * @static
+	 * @param $name String
+	 * @return String
 	 */
 	static function normalizeEntity( $name ) {
 		global $wgHtmlEntities, $wgHtmlEntityAliases;
@@ -1057,8 +1263,8 @@ class Sanitizer {
 
 	/**
 	 * Returns true if a given Unicode codepoint is a valid character in XML.
-	 * @param int $codepoint
-	 * @return bool
+	 * @param $codepoint Integer
+	 * @return Boolean
 	 */
 	private static function validateCodepoint( $codepoint ) {
 		return ($codepoint ==    0x09)
@@ -1073,10 +1279,8 @@ class Sanitizer {
 	 * Decode any character references, numeric or named entities,
 	 * in the text and return a UTF-8 string.
 	 *
-	 * @param string $text
-	 * @return string
-	 * @public
-	 * @static
+	 * @param $text String
+	 * @return String
 	 */
 	public static function decodeCharReferences( $text ) {
 		return preg_replace_callback(
@@ -1086,8 +1290,32 @@ class Sanitizer {
 	}
 
 	/**
-	 * @param string $matches
-	 * @return string
+	 * Decode any character references, numeric or named entities,
+	 * in the text and normalize the resulting string. (bug 14952)
+	 *
+	 * This is useful for page titles, not for text to be displayed,
+	 * MediaWiki allows HTML entities to escape normalization as a feature.
+	 *
+	 * @param $text String (already normalized, containing entities)
+	 * @return String (still normalized, without entities)
+	 */
+	public static function decodeCharReferencesAndNormalize( $text ) {
+		global $wgContLang;
+		$decoded = preg_replace_callback(
+			MW_CHAR_REFS_REGEX,
+			array( 'Sanitizer', 'decodeCharReferencesCallback' ),
+			$text, /* limit */ -1, $replacements );
+
+		// No reference was decoded: hand the result back without normalizing
+		if ( !$replacements ) {
+			return $decoded;
+		}
+		return $wgContLang->normalize( $decoded );
+	}
+
+	/**
+	 * @param $matches String
+	 * @return String
 	 */
 	static function decodeCharReferencesCallback( $matches ) {
 		if( $matches[1] != '' ) {
@@ -1106,8 +1334,8 @@ class Sanitizer {
 	/**
 	 * Return UTF-8 string for a codepoint if that is a valid
 	 * character reference, otherwise U+FFFD REPLACEMENT CHARACTER.
-	 * @param int $codepoint
-	 * @return string
+	 * @param $codepoint Integer
+	 * @return String
 	 * @private
 	 */
 	static function decodeChar( $codepoint ) {
@@ -1123,8 +1351,8 @@ class Sanitizer {
 	 * return the UTF-8 encoding of that character. Otherwise, returns
 	 * pseudo-entity source (eg &foo;)
 	 *
-	 * @param string $name
-	 * @return string
+	 * @param $name String
+	 * @return String
 	 */
 	static function decodeEntity( $name ) {
 		global $wgHtmlEntities, $wgHtmlEntityAliases;
@@ -1139,11 +1367,10 @@ class Sanitizer {
 	}
 
 	/**
-	 * Fetch the whitelist of acceptable attributes for a given
-	 * element name.
+	 * Fetch the whitelist of acceptable attributes for a given element name.
 	 *
-	 * @param string $element
-	 * @return array
+	 * @param $element String
+	 * @return Array
 	 */
 	static function attributeWhitelist( $element ) {
 		static $list;
@@ -1158,10 +1385,27 @@ class Sanitizer {
 	/**
 	 * Foreach array key (an allowed HTML element), return an array
 	 * of allowed attributes
-	 * @return array
+	 * @return Array
 	 */
 	static function setupAttributeWhitelist() {
+		global $wgAllowRdfaAttributes, $wgHtml5, $wgAllowMicrodataAttributes;
+
 		$common = array( 'id', 'class', 'lang', 'dir', 'title', 'style' );
+
+		if ( $wgAllowRdfaAttributes ) {
+			#RDFa attributes as specified in section 9 of http://www.w3.org/TR/2008/REC-rdfa-syntax-20081014
+			$common = array_merge( $common, array(
+			    'about', 'property', 'resource', 'datatype', 'typeof', 
+			) );
+		}
+
+		if ( $wgHtml5 && $wgAllowMicrodataAttributes ) {
+			# add HTML5 microdata attributes as specified by http://www.whatwg.org/specs/web-apps/current-work/multipage/microdata.html#the-microdata-model
+			$common = array_merge( $common, array(
+			    'itemid', 'itemprop', 'itemref', 'itemscope', 'itemtype'
+			) );
+		}
+
 		$block = array_merge( $common, array( 'align' ) );
 		$tablealign = array( 'align', 'char', 'charoff', 'valign' );
 		$tablecell = array( 'abbr',
@@ -1202,12 +1446,12 @@ class Sanitizer {
 			'em'         => $common,
 			'strong'     => $common,
 			'cite'       => $common,
-			# dfn
+			'dfn'        => $common,
 			'code'       => $common,
 			# samp
 			# kbd
 			'var'        => $common,
-			# abbr
+			'abbr'       => $common,
 			# acronym
 
 			# 9.2.2
@@ -1267,10 +1511,14 @@ class Sanitizer {
 			'td'         => array_merge( $common, $tablecell, $tablealign ),
 			'th'         => array_merge( $common, $tablecell, $tablealign ),
 
+			# 12.2 # NOTE: <a> is not allowed directly, but the attrib whitelist is used from the Parser object
+			'a'          => array_merge( $common, array( 'href', 'rel', 'rev' ) ), # rel/rev esp. for RDFa 
+
 			# 13.2
 			# Not usually allowed, but may be used for extension-style hooks
-			# such as <math> when it is rasterized
-			'img'        => array_merge( $common, array( 'alt' ) ),
+			# such as <math> when it is rasterized, or if $wgAllowImageTag is
+			# true
+			'img'        => array_merge( $common, array( 'alt', 'src', 'width', 'height' ) ),
 
 			# 15.2.1
 			'tt'         => $common,
@@ -1313,8 +1561,8 @@ class Sanitizer {
 	 * Warning: this return value must be further escaped for literal
 	 * inclusion in HTML output as of 1.10!
 	 *
-	 * @param string $text HTML fragment
-	 * @return string
+	 * @param $text String: HTML fragment
+	 * @return String
 	 */
 	static function stripAllTags( $text ) {
 		# Actual <tags>
@@ -1334,8 +1582,7 @@ class Sanitizer {
 	 *
 	 * Use for passing XHTML fragments to PHP's XML parsing functions
 	 *
-	 * @return string
-	 * @static
+	 * @return String
 	 */
 	static function hackDocType() {
 		global $wgHtmlEntities;
@@ -1353,7 +1600,7 @@ class Sanitizer {
 		$url = Sanitizer::decodeCharReferences( $url );
 
 		# Escape any control characters introduced by the above step
-		$url = preg_replace( '/[\][<>"\\x00-\\x20\\x7F]/e', "urlencode('\\0')", $url );
+		$url = preg_replace( '/[\][<>"\\x00-\\x20\\x7F\|]/e', "urlencode('\\0')", $url );
 
 		# Validate hostname portion
 		$matches = array();
@@ -1381,7 +1628,7 @@ class Sanitizer {
 
 			$host = preg_replace( $strip, '', $host );
 
-			// @fixme: validate hostnames here
+			// @todo Fixme: validate hostnames here
 
 			return $protocol . $host . $rest;
 		} else {