X-Git-Url: https://git.heureux-cyclage.org/?a=blobdiff_plain;f=includes%2FSanitizer.php;h=8272ea4e72368064514d4f39353429caf204a03b;hb=a1d6bc1cb7a4cc83b225a9b9afb8ad268710038e;hp=ea36f255135d5d587f4ce43b357d4b6997bdaa71;hpb=979508b9bf0dbdde9141d6135c418b7d94933803;p=lhc%2Fweb%2Fwiklou.git

diff --git a/includes/Sanitizer.php b/includes/Sanitizer.php
index ea36f25513..8272ea4e72 100644
--- a/includes/Sanitizer.php
+++ b/includes/Sanitizer.php
@@ -25,323 +25,333 @@
  */
 
 /**
- * Regular expression to match various types of character references in
- * Sanitizer::normalizeCharReferences and Sanitizer::decodeCharReferences
- */
-define( 'MW_CHAR_REFS_REGEX',
-	'/&([A-Za-z0-9\x80-\xff]+);
-	 |&\#([0-9]+);
-	 |&\#x([0-9A-Za-z]+);
-	 |&\#X([0-9A-Za-z]+);
-	 |(&)/x' );
-
-/**
- * Regular expression to match HTML/XML attribute pairs within a tag.
- * Allows some... latitude.
- * Used in Sanitizer::fixTagAttributes and Sanitizer::decodeTagAttributes
+ * XHTML sanitizer for MediaWiki
+ * @ingroup Parser
  */
-$attrib_first = '[:A-Z_a-z]';
-$attrib = '[:A-Z_a-z-.0-9]';
-$space = '[\x09\x0a\x0d\x20]';
-define( 'MW_ATTRIBS_REGEX',
-	"/(?:^|$space)({$attrib_first}{$attrib}*)
-	  ($space*=$space*
-		(?:
-		 # The attribute value: quoted or alone
-		  \"([^<\"]*)\"
-		 | '([^<']*)'
-		 |  ([a-zA-Z0-9!#$%&()*,\\-.\\/:;<>?@[\\]^_`{|}~]+)
-		 |  (\#[0-9a-fA-F]+) # Technically wrong, but lots of
-							 # colors are specified like this.
-							 # We'll be normalizing it.
-		)
-	   )?(?=$space|\$)/sx" );
+class Sanitizer {
+	/**
+	 * Regular expression to match various types of character references in
+	 * Sanitizer::normalizeCharReferences and Sanitizer::decodeCharReferences
+	 */
+	const CHAR_REFS_REGEX = 
+		'/&([A-Za-z0-9\x80-\xff]+);
+		 |&\#([0-9]+);
+		 |&\#[xX]([0-9A-Fa-f]+);
+		 |(&)/x';
 
-/**
- * Regular expression to match URIs that could trigger script execution
- */
-define( 'MW_EVIL_URI_PATTERN', '!(^|\s|\*/\s*)(javascript|vbscript)([^\w]|$)!i' );
+	/**
+	 * Blacklist for evil URIs like javascript:
+	 * WARNING: DO NOT use this in any place that actually requires blacklisting
+	 * for security reasons. There are NUMEROUS[1] ways to bypass blacklisting; the
+	 * only way to be secure from javascript: URI-based XSS vectors is to whitelist
+	 * things that you know are safe and deny everything else.
+	 * [1]: http://ha.ckers.org/xss.html
+	 */
+	const EVIL_URI_PATTERN = '!(^|\s|\*/\s*)(javascript|vbscript)([^\w]|$)!i';
+	const XMLNS_ATTRIBUTE_PATTERN = "/^xmlns:[:A-Z_a-z-.0-9]+$/";
 
-/**
- * Regular expression to match namespace attributes
- */
-define( 'MW_XMLNS_ATTRIBUTE_PATTRN', "/^xmlns:$attrib+$/" );
+	/**
+	 * List of all named character entities defined in HTML 4.01
+	 * (http://www.w3.org/TR/html4/sgml/entities.html),
+	 * plus &apos;, which is only defined starting in XHTML 1.
+	 * @private
+	 */
+	static $htmlEntities = array(
+		'Aacute'   => 193,
+		'aacute'   => 225,
+		'Acirc'    => 194,
+		'acirc'    => 226,
+		'acute'    => 180,
+		'AElig'    => 198,
+		'aelig'    => 230,
+		'Agrave'   => 192,
+		'agrave'   => 224,
+		'alefsym'  => 8501,
+		'Alpha'    => 913,
+		'alpha'    => 945,
+		'amp'      => 38,
+		'and'      => 8743,
+		'ang'      => 8736,
+		'apos'     => 39, // New in XHTML & HTML 5; avoid in output for compatibility with IE.
+		'Aring'    => 197,
+		'aring'    => 229,
+		'asymp'    => 8776,
+		'Atilde'   => 195,
+		'atilde'   => 227,
+		'Auml'     => 196,
+		'auml'     => 228,
+		'bdquo'    => 8222,
+		'Beta'     => 914,
+		'beta'     => 946,
+		'brvbar'   => 166,
+		'bull'     => 8226,
+		'cap'      => 8745,
+		'Ccedil'   => 199,
+		'ccedil'   => 231,
+		'cedil'    => 184,
+		'cent'     => 162,
+		'Chi'      => 935,
+		'chi'      => 967,
+		'circ'     => 710,
+		'clubs'    => 9827,
+		'cong'     => 8773,
+		'copy'     => 169,
+		'crarr'    => 8629,
+		'cup'      => 8746,
+		'curren'   => 164,
+		'dagger'   => 8224,
+		'Dagger'   => 8225,
+		'darr'     => 8595,
+		'dArr'     => 8659,
+		'deg'      => 176,
+		'Delta'    => 916,
+		'delta'    => 948,
+		'diams'    => 9830,
+		'divide'   => 247,
+		'Eacute'   => 201,
+		'eacute'   => 233,
+		'Ecirc'    => 202,
+		'ecirc'    => 234,
+		'Egrave'   => 200,
+		'egrave'   => 232,
+		'empty'    => 8709,
+		'emsp'     => 8195,
+		'ensp'     => 8194,
+		'Epsilon'  => 917,
+		'epsilon'  => 949,
+		'equiv'    => 8801,
+		'Eta'      => 919,
+		'eta'      => 951,
+		'ETH'      => 208,
+		'eth'      => 240,
+		'Euml'     => 203,
+		'euml'     => 235,
+		'euro'     => 8364,
+		'exist'    => 8707,
+		'fnof'     => 402,
+		'forall'   => 8704,
+		'frac12'   => 189,
+		'frac14'   => 188,
+		'frac34'   => 190,
+		'frasl'    => 8260,
+		'Gamma'    => 915,
+		'gamma'    => 947,
+		'ge'       => 8805,
+		'gt'       => 62,
+		'harr'     => 8596,
+		'hArr'     => 8660,
+		'hearts'   => 9829,
+		'hellip'   => 8230,
+		'Iacute'   => 205,
+		'iacute'   => 237,
+		'Icirc'    => 206,
+		'icirc'    => 238,
+		'iexcl'    => 161,
+		'Igrave'   => 204,
+		'igrave'   => 236,
+		'image'    => 8465,
+		'infin'    => 8734,
+		'int'      => 8747,
+		'Iota'     => 921,
+		'iota'     => 953,
+		'iquest'   => 191,
+		'isin'     => 8712,
+		'Iuml'     => 207,
+		'iuml'     => 239,
+		'Kappa'    => 922,
+		'kappa'    => 954,
+		'Lambda'   => 923,
+		'lambda'   => 955,
+		'lang'     => 9001,
+		'laquo'    => 171,
+		'larr'     => 8592,
+		'lArr'     => 8656,
+		'lceil'    => 8968,
+		'ldquo'    => 8220,
+		'le'       => 8804,
+		'lfloor'   => 8970,
+		'lowast'   => 8727,
+		'loz'      => 9674,
+		'lrm'      => 8206,
+		'lsaquo'   => 8249,
+		'lsquo'    => 8216,
+		'lt'       => 60,
+		'macr'     => 175,
+		'mdash'    => 8212,
+		'micro'    => 181,
+		'middot'   => 183,
+		'minus'    => 8722,
+		'Mu'       => 924,
+		'mu'       => 956,
+		'nabla'    => 8711,
+		'nbsp'     => 160,
+		'ndash'    => 8211,
+		'ne'       => 8800,
+		'ni'       => 8715,
+		'not'      => 172,
+		'notin'    => 8713,
+		'nsub'     => 8836,
+		'Ntilde'   => 209,
+		'ntilde'   => 241,
+		'Nu'       => 925,
+		'nu'       => 957,
+		'Oacute'   => 211,
+		'oacute'   => 243,
+		'Ocirc'    => 212,
+		'ocirc'    => 244,
+		'OElig'    => 338,
+		'oelig'    => 339,
+		'Ograve'   => 210,
+		'ograve'   => 242,
+		'oline'    => 8254,
+		'Omega'    => 937,
+		'omega'    => 969,
+		'Omicron'  => 927,
+		'omicron'  => 959,
+		'oplus'    => 8853,
+		'or'       => 8744,
+		'ordf'     => 170,
+		'ordm'     => 186,
+		'Oslash'   => 216,
+		'oslash'   => 248,
+		'Otilde'   => 213,
+		'otilde'   => 245,
+		'otimes'   => 8855,
+		'Ouml'     => 214,
+		'ouml'     => 246,
+		'para'     => 182,
+		'part'     => 8706,
+		'permil'   => 8240,
+		'perp'     => 8869,
+		'Phi'      => 934,
+		'phi'      => 966,
+		'Pi'       => 928,
+		'pi'       => 960,
+		'piv'      => 982,
+		'plusmn'   => 177,
+		'pound'    => 163,
+		'prime'    => 8242,
+		'Prime'    => 8243,
+		'prod'     => 8719,
+		'prop'     => 8733,
+		'Psi'      => 936,
+		'psi'      => 968,
+		'quot'     => 34,
+		'radic'    => 8730,
+		'rang'     => 9002,
+		'raquo'    => 187,
+		'rarr'     => 8594,
+		'rArr'     => 8658,
+		'rceil'    => 8969,
+		'rdquo'    => 8221,
+		'real'     => 8476,
+		'reg'      => 174,
+		'rfloor'   => 8971,
+		'Rho'      => 929,
+		'rho'      => 961,
+		'rlm'      => 8207,
+		'rsaquo'   => 8250,
+		'rsquo'    => 8217,
+		'sbquo'    => 8218,
+		'Scaron'   => 352,
+		'scaron'   => 353,
+		'sdot'     => 8901,
+		'sect'     => 167,
+		'shy'      => 173,
+		'Sigma'    => 931,
+		'sigma'    => 963,
+		'sigmaf'   => 962,
+		'sim'      => 8764,
+		'spades'   => 9824,
+		'sub'      => 8834,
+		'sube'     => 8838,
+		'sum'      => 8721,
+		'sup'      => 8835,
+		'sup1'     => 185,
+		'sup2'     => 178,
+		'sup3'     => 179,
+		'supe'     => 8839,
+		'szlig'    => 223,
+		'Tau'      => 932,
+		'tau'      => 964,
+		'there4'   => 8756,
+		'Theta'    => 920,
+		'theta'    => 952,
+		'thetasym' => 977,
+		'thinsp'   => 8201,
+		'THORN'    => 222,
+		'thorn'    => 254,
+		'tilde'    => 732,
+		'times'    => 215,
+		'trade'    => 8482,
+		'Uacute'   => 218,
+		'uacute'   => 250,
+		'uarr'     => 8593,
+		'uArr'     => 8657,
+		'Ucirc'    => 219,
+		'ucirc'    => 251,
+		'Ugrave'   => 217,
+		'ugrave'   => 249,
+		'uml'      => 168,
+		'upsih'    => 978,
+		'Upsilon'  => 933,
+		'upsilon'  => 965,
+		'Uuml'     => 220,
+		'uuml'     => 252,
+		'weierp'   => 8472,
+		'Xi'       => 926,
+		'xi'       => 958,
+		'Yacute'   => 221,
+		'yacute'   => 253,
+		'yen'      => 165,
+		'Yuml'     => 376,
+		'yuml'     => 255,
+		'Zeta'     => 918,
+		'zeta'     => 950,
+		'zwj'      => 8205,
+		'zwnj'     => 8204
+	);
 
-/**
- * List of all named character entities defined in HTML 4.01
- * http://www.w3.org/TR/html4/sgml/entities.html
- * @private
- */
-global $wgHtmlEntities;
-$wgHtmlEntities = array(
-	'Aacute'   => 193,
-	'aacute'   => 225,
-	'Acirc'    => 194,
-	'acirc'    => 226,
-	'acute'    => 180,
-	'AElig'    => 198,
-	'aelig'    => 230,
-	'Agrave'   => 192,
-	'agrave'   => 224,
-	'alefsym'  => 8501,
-	'Alpha'    => 913,
-	'alpha'    => 945,
-	'amp'      => 38,
-	'and'      => 8743,
-	'ang'      => 8736,
-	'Aring'    => 197,
-	'aring'    => 229,
-	'asymp'    => 8776,
-	'Atilde'   => 195,
-	'atilde'   => 227,
-	'Auml'     => 196,
-	'auml'     => 228,
-	'bdquo'    => 8222,
-	'Beta'     => 914,
-	'beta'     => 946,
-	'brvbar'   => 166,
-	'bull'     => 8226,
-	'cap'      => 8745,
-	'Ccedil'   => 199,
-	'ccedil'   => 231,
-	'cedil'    => 184,
-	'cent'     => 162,
-	'Chi'      => 935,
-	'chi'      => 967,
-	'circ'     => 710,
-	'clubs'    => 9827,
-	'cong'     => 8773,
-	'copy'     => 169,
-	'crarr'    => 8629,
-	'cup'      => 8746,
-	'curren'   => 164,
-	'dagger'   => 8224,
-	'Dagger'   => 8225,
-	'darr'     => 8595,
-	'dArr'     => 8659,
-	'deg'      => 176,
-	'Delta'    => 916,
-	'delta'    => 948,
-	'diams'    => 9830,
-	'divide'   => 247,
-	'Eacute'   => 201,
-	'eacute'   => 233,
-	'Ecirc'    => 202,
-	'ecirc'    => 234,
-	'Egrave'   => 200,
-	'egrave'   => 232,
-	'empty'    => 8709,
-	'emsp'     => 8195,
-	'ensp'     => 8194,
-	'Epsilon'  => 917,
-	'epsilon'  => 949,
-	'equiv'    => 8801,
-	'Eta'      => 919,
-	'eta'      => 951,
-	'ETH'      => 208,
-	'eth'      => 240,
-	'Euml'     => 203,
-	'euml'     => 235,
-	'euro'     => 8364,
-	'exist'    => 8707,
-	'fnof'     => 402,
-	'forall'   => 8704,
-	'frac12'   => 189,
-	'frac14'   => 188,
-	'frac34'   => 190,
-	'frasl'    => 8260,
-	'Gamma'    => 915,
-	'gamma'    => 947,
-	'ge'       => 8805,
-	'gt'       => 62,
-	'harr'     => 8596,
-	'hArr'     => 8660,
-	'hearts'   => 9829,
-	'hellip'   => 8230,
-	'Iacute'   => 205,
-	'iacute'   => 237,
-	'Icirc'    => 206,
-	'icirc'    => 238,
-	'iexcl'    => 161,
-	'Igrave'   => 204,
-	'igrave'   => 236,
-	'image'    => 8465,
-	'infin'    => 8734,
-	'int'      => 8747,
-	'Iota'     => 921,
-	'iota'     => 953,
-	'iquest'   => 191,
-	'isin'     => 8712,
-	'Iuml'     => 207,
-	'iuml'     => 239,
-	'Kappa'    => 922,
-	'kappa'    => 954,
-	'Lambda'   => 923,
-	'lambda'   => 955,
-	'lang'     => 9001,
-	'laquo'    => 171,
-	'larr'     => 8592,
-	'lArr'     => 8656,
-	'lceil'    => 8968,
-	'ldquo'    => 8220,
-	'le'       => 8804,
-	'lfloor'   => 8970,
-	'lowast'   => 8727,
-	'loz'      => 9674,
-	'lrm'      => 8206,
-	'lsaquo'   => 8249,
-	'lsquo'    => 8216,
-	'lt'       => 60,
-	'macr'     => 175,
-	'mdash'    => 8212,
-	'micro'    => 181,
-	'middot'   => 183,
-	'minus'    => 8722,
-	'Mu'       => 924,
-	'mu'       => 956,
-	'nabla'    => 8711,
-	'nbsp'     => 160,
-	'ndash'    => 8211,
-	'ne'       => 8800,
-	'ni'       => 8715,
-	'not'      => 172,
-	'notin'    => 8713,
-	'nsub'     => 8836,
-	'Ntilde'   => 209,
-	'ntilde'   => 241,
-	'Nu'       => 925,
-	'nu'       => 957,
-	'Oacute'   => 211,
-	'oacute'   => 243,
-	'Ocirc'    => 212,
-	'ocirc'    => 244,
-	'OElig'    => 338,
-	'oelig'    => 339,
-	'Ograve'   => 210,
-	'ograve'   => 242,
-	'oline'    => 8254,
-	'Omega'    => 937,
-	'omega'    => 969,
-	'Omicron'  => 927,
-	'omicron'  => 959,
-	'oplus'    => 8853,
-	'or'       => 8744,
-	'ordf'     => 170,
-	'ordm'     => 186,
-	'Oslash'   => 216,
-	'oslash'   => 248,
-	'Otilde'   => 213,
-	'otilde'   => 245,
-	'otimes'   => 8855,
-	'Ouml'     => 214,
-	'ouml'     => 246,
-	'para'     => 182,
-	'part'     => 8706,
-	'permil'   => 8240,
-	'perp'     => 8869,
-	'Phi'      => 934,
-	'phi'      => 966,
-	'Pi'       => 928,
-	'pi'       => 960,
-	'piv'      => 982,
-	'plusmn'   => 177,
-	'pound'    => 163,
-	'prime'    => 8242,
-	'Prime'    => 8243,
-	'prod'     => 8719,
-	'prop'     => 8733,
-	'Psi'      => 936,
-	'psi'      => 968,
-	'quot'     => 34,
-	'radic'    => 8730,
-	'rang'     => 9002,
-	'raquo'    => 187,
-	'rarr'     => 8594,
-	'rArr'     => 8658,
-	'rceil'    => 8969,
-	'rdquo'    => 8221,
-	'real'     => 8476,
-	'reg'      => 174,
-	'rfloor'   => 8971,
-	'Rho'      => 929,
-	'rho'      => 961,
-	'rlm'      => 8207,
-	'rsaquo'   => 8250,
-	'rsquo'    => 8217,
-	'sbquo'    => 8218,
-	'Scaron'   => 352,
-	'scaron'   => 353,
-	'sdot'     => 8901,
-	'sect'     => 167,
-	'shy'      => 173,
-	'Sigma'    => 931,
-	'sigma'    => 963,
-	'sigmaf'   => 962,
-	'sim'      => 8764,
-	'spades'   => 9824,
-	'sub'      => 8834,
-	'sube'     => 8838,
-	'sum'      => 8721,
-	'sup'      => 8835,
-	'sup1'     => 185,
-	'sup2'     => 178,
-	'sup3'     => 179,
-	'supe'     => 8839,
-	'szlig'    => 223,
-	'Tau'      => 932,
-	'tau'      => 964,
-	'there4'   => 8756,
-	'Theta'    => 920,
-	'theta'    => 952,
-	'thetasym' => 977,
-	'thinsp'   => 8201,
-	'THORN'    => 222,
-	'thorn'    => 254,
-	'tilde'    => 732,
-	'times'    => 215,
-	'trade'    => 8482,
-	'Uacute'   => 218,
-	'uacute'   => 250,
-	'uarr'     => 8593,
-	'uArr'     => 8657,
-	'Ucirc'    => 219,
-	'ucirc'    => 251,
-	'Ugrave'   => 217,
-	'ugrave'   => 249,
-	'uml'      => 168,
-	'upsih'    => 978,
-	'Upsilon'  => 933,
-	'upsilon'  => 965,
-	'Uuml'     => 220,
-	'uuml'     => 252,
-	'weierp'   => 8472,
-	'Xi'       => 926,
-	'xi'       => 958,
-	'Yacute'   => 221,
-	'yacute'   => 253,
-	'yen'      => 165,
-	'Yuml'     => 376,
-	'yuml'     => 255,
-	'Zeta'     => 918,
-	'zeta'     => 950,
-	'zwj'      => 8205,
-	'zwnj'     => 8204 );
+	/**
+	 * Character entity aliases accepted by MediaWiki
+	 */
+	static $htmlEntityAliases = array(
+		'רלמ' => 'rlm',
+		'رلم' => 'rlm',
+	);
 
-/**
- * Character entity aliases accepted by MediaWiki
- */
-global $wgHtmlEntityAliases;
-$wgHtmlEntityAliases = array(
-	'רלמ' => 'rlm',
-	'رلم' => 'rlm',
-);
+	/**
+	 * Lazy-initialised attributes regex, see getAttribsRegex()
+	 */
+	static $attribsRegex;
 
+	/**
+	 * Regular expression to match HTML/XML attribute pairs within a tag.
+	 * Allows some... latitude.
+	 * Used in Sanitizer::fixTagAttributes and Sanitizer::decodeTagAttributes
+	 */
+	static function getAttribsRegex() {
+		if ( self::$attribsRegex === null ) {
+			$attribFirst = '[:A-Z_a-z0-9]';
+			$attrib = '[:A-Z_a-z-.0-9]';
+			$space = '[\x09\x0a\x0d\x20]';
+			self::$attribsRegex = 
+				"/(?:^|$space)({$attribFirst}{$attrib}*)
+				  ($space*=$space*
+					(?:
+					 # The attribute value: quoted or alone
+					  \"([^<\"]*)\"
+					 | '([^<']*)'
+					 |  ([a-zA-Z0-9!#$%&()*,\\-.\\/:;<>?@[\\]^_`{|}~]+)
+					 |  (\#[0-9a-fA-F]+) # Technically wrong, but lots of
+										 # colors are specified like this.
+										 # We'll be normalizing it.
+					)
+				)?(?=$space|\$)/sx";
+		}
+		return self::$attribsRegex;
+	}
 
-/**
- * XHTML sanitizer for MediaWiki
- * @ingroup Parser
- */
-class Sanitizer {
 	/**
 	 * Cleans up HTML, removes dangerous tags and attributes, and
 	 * removes HTML comments
@@ -368,7 +378,7 @@ class Sanitizer {
 				'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
 				'strike', 'strong', 'tt', 'var', 'div', 'center',
 				'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
-				'ruby', 'rt' , 'rb' , 'rp', 'p', 'span', 'u', 'abbr', 'dfn',
+				'ruby', 'rt' , 'rb' , 'rp', 'p', 'span', 'abbr', 'dfn',
 				'kbd', 'samp'
 			);
 			$htmlsingle = array(
@@ -592,6 +602,102 @@ class Sanitizer {
 		return $text;
 	}
 
+	/**
+	 * Take an array of attribute names and values and fix some deprecated values
+	 * for the given element type.
+	 * This does not validate properties, so you should ensure that you call
+	 * validateTagAttributes AFTER this to ensure that the resulting style rule
+	 * this may add is safe.
+	 * 
+	 * - Converts most presentational attributes like align into inline css
+	 *
+	 * @param $attribs Array
+	 * @param $element String
+	 * @return Array
+	 */
+	static function fixDeprecatedAttributes( $attribs, $element ) {
+		global $wgHtml5, $wgCleanupPresentationalAttributes;
+		
+		// presentational attributes were removed from html5, we can leave them
+		// in when html5 is turned off
+		if ( !$wgHtml5 || !$wgCleanupPresentationalAttributes ) {
+			return $attribs;
+		}
+		
+		$table = array( 'table' );
+		$cells = array( 'td', 'th' );
+		$colls = array( 'col', 'colgroup' );
+		$tblocks = array( 'tbody', 'tfoot', 'thead' );
+		$h = array( 'h1', 'h2', 'h3', 'h4', 'h5', 'h6' );
+		
+		$presentationalAttribs = array(
+			'align' => array( 'text-align', array_merge( array( 'caption', 'hr', 'div', 'p', 'tr' ), $table, $cells, $colls, $tblocks, $h ) ),
+			'clear' => array( 'clear', array( 'br' ) ),
+			'height' => array( 'height', $cells ),
+			'nowrap' => array( 'white-space', $cells ),
+			'size' => array( 'height', array( 'hr' ) ),
+			'type' => array( 'list-style-type', array( 'li', 'ol', 'ul' ) ),
+			'valign' => array( 'vertical-align', array_merge( $cells, $colls, $tblocks ) ),
+			'width' => array( 'width', array_merge( array( 'hr', 'pre' ), $table, $cells, $colls ) ),
+		);
+		
+		// Ensure that any upper case or mixed case attributes are converted to lowercase
+		foreach ( $attribs as $attribute => $value ) {
+			if ( $attribute !== strtolower( $attribute ) && array_key_exists( strtolower( $attribute ), $presentationalAttribs ) ) {
+				$attribs[strtolower( $attribute )] = $value;
+				unset( $attribs[$attribute] );
+			}
+		}
+		
+		$style = "";
+		foreach ( $presentationalAttribs as $attribute => $info ) {
+			list( $property, $elements ) = $info;
+			
+			// Skip if this attribute is not relevant to this element
+			if ( !in_array( $element, $elements ) ) {
+				continue;
+			}
+			
+			// Skip if the attribute is not used
+			if ( !array_key_exists( $attribute, $attribs ) ) {
+				continue;
+			}
+			
+			$value = $attribs[$attribute];
+			
+			// For nowrap the value should be nowrap instead of whatever text is in the value
+			if ( $attribute === 'nowrap' ) {
+				$value = 'nowrap';
+			}
+			
+			// clear="all" is clear: both; in css
+			if ( $attribute === 'clear' && strtolower( $value ) === 'all' ) {
+				$value = 'both';
+			}
+			
+			// Size based properties should have px applied to them if they have no unit
+			if ( in_array( $attribute, array( 'height', 'width', 'size' ) ) ) {
+				if ( preg_match( '/^[\d.]+$/', $value ) ) {
+					$value = "{$value}px";
+				}
+			}
+			
+			$style .= " $property: $value;";
+			
+			unset( $attribs[$attribute] );
+		}
+		
+		if ( $style ) {
+			// Prepend our style rules so that they can be overridden by user css
+			if ( isset($attribs['style']) ) {
+				$style .= " " . $attribs['style'];
+			}
+			$attribs['style'] = trim($style);
+		}
+		
+		return $attribs;
+	}
+
 	/**
 	 * Take an array of attribute names and values and normalize or discard
 	 * illegal values for the given element type.
@@ -636,8 +742,8 @@ class Sanitizer {
 		$out = array();
 		foreach( $attribs as $attribute => $value ) {
 			#allow XML namespace declaration if RDFa is enabled
-			if ( $wgAllowRdfaAttributes && preg_match( MW_XMLNS_ATTRIBUTE_PATTRN, $attribute ) ) {
-				if ( !preg_match( MW_EVIL_URI_PATTERN, $value ) ) {
+			if ( $wgAllowRdfaAttributes && preg_match( self::XMLNS_ATTRIBUTE_PATTERN, $attribute ) ) {
+				if ( !preg_match( self::EVIL_URI_PATTERN, $value ) ) {
 					$out[$attribute] = $value;
 				}
 
@@ -667,7 +773,7 @@ class Sanitizer {
 				$attribute === 'itemscope' || $attribute === 'itemtype' ) {                         #HTML5 microdata
 
 				//Paranoia. Allow "simple" values but suppress javascript
-				if ( preg_match( MW_EVIL_URI_PATTERN, $value ) ) {
+				if ( preg_match( self::EVIL_URI_PATTERN, $value ) ) {
 					continue; 
 				}
 			}
@@ -687,19 +793,6 @@ class Sanitizer {
 		}
 
 		if ( $wgAllowMicrodataAttributes ) {
-			# There are some complicated validity constraints we need to
-			# enforce here.  First of all, we don't want to allow non-standard
-			# itemtypes.
-			$allowedTypes = array(
-				'http://microformats.org/profile/hcard',
-				'http://microformats.org/profile/hcalendar#vevent',
-				'http://n.whatwg.org/work',
-			);
-			if ( isset( $out['itemtype'] ) && !in_array( $out['itemtype'],
-			$allowedTypes ) ) {
-				# Kill everything
-				unset( $out['itemscope'] );
-			}
 			# itemtype, itemid, itemref don't make sense without itemscope
 			if ( !array_key_exists( 'itemscope', $out ) ) {
 				unset( $out['itemtype'] );
@@ -735,21 +828,34 @@ class Sanitizer {
 
 	/**
 	 * Pick apart some CSS and check it for forbidden or unsafe structures.
-	 * Returns a sanitized string, or false if it was just too evil.
+	 * Returns a sanitized string. This sanitized string will have
+	 * character references and escape sequences decoded, and comments
+	 * stripped. If the input is just too evil, only a comment complaining
+	 * about evilness will be returned.
 	 *
 	 * Currently URL references, 'expression', 'tps' are forbidden.
 	 *
+	 * NOTE: Despite the fact that character references are decoded, the
+	 * returned string may contain character references given certain
+	 * clever input strings. These character references must
+	 * be escaped before the return value is embedded in HTML.
+	 * 
 	 * @param $value String
-	 * @return Mixed
+	 * @return String
 	 */
 	static function checkCss( $value ) {
+		// Decode character references like &#123;
 		$value = Sanitizer::decodeCharReferences( $value );
 
-		// Remove any comments; IE gets token splitting wrong
-		$value = StringUtils::delimiterReplace( '/*', '*/', ' ', $value );
-
 		// Decode escape sequences and line continuation
 		// See the grammar in the CSS 2 spec, appendix D.
+		// This has to be done AFTER decoding character references.
+		// This means it isn't possible for this function to return
+		// unsanitized escape sequences. It is possible to manufacture
+		// input that contains character references that decode to
+		// escape sequences that decode to character references, but
+		// it's OK for the return value to contain character references
+		// because the caller is supposed to escape those anyway.
 		static $decodeRegex;
 		if ( !$decodeRegex ) {
 			$space = '[\\x20\\t\\r\\n\\f]';
@@ -765,6 +871,21 @@ class Sanitizer {
 		}
 		$value = preg_replace_callback( $decodeRegex,
 			array( __CLASS__, 'cssDecodeCallback' ), $value );
+		
+		// Remove any comments; IE gets token splitting wrong.
+		// This must be done AFTER decoding character references and
+		// escape sequences, because those steps can introduce comments.
+		// This step cannot introduce character references or escape
+		// sequences, because it replaces comments with spaces rather
+		// than removing them completely.
+		$value = StringUtils::delimiterReplace( '/*', '*/', ' ', $value );
+
+		// Remove anything after a comment-start token, to guard against
+		// incorrect client implementations.
+		$commentPos = strpos( $value, '/*' );
+		if ( $commentPos !== false ) {
+			$value = substr( $value, 0, $commentPos );
+		}
 
 		// Reject problematic keywords and control characters
 		if ( preg_match( '/[\000-\010\016-\037\177]/', $value ) ) {
@@ -775,6 +896,10 @@ class Sanitizer {
 		return $value;
 	}
 
+	/**
+	 * @param $matches array
+	 * @return String
+	 */
 	static function cssDecodeCallback( $matches ) {
 		if ( $matches[1] !== '' ) {
 			// Line continuation
@@ -796,51 +921,6 @@ class Sanitizer {
 		}
 	}
 
-	/** 
-	* Take an associative array of attribute name/value pairs
-	* and generate a css style representing all the style-related
-	* attributes. If there already a style attribute in the array,
-	* it is also included in the value returned.
-	*/
-	static function styleFromAttributes( $attributes ) {
-		$styles = array();
-
-		foreach ( $attributes as $attribute => $value ) {
-			if ( $attribute == 'bgcolor' ) {
-				$styles[] = "background-color: $value";
-			} else if ( $attribute == 'border' ) {
-				$styles[] = "border-width: $value";
-			} else if ( $attribute == 'align' ) {
-				$styles[] = "text-align: $value";
-			} else if ( $attribute == 'valign' ) {
-				$styles[] = "vertical-align: $value";
-			} else if ( $attribute == 'width' ) {
-				if ( preg_match( '/\d+/', $value ) === false ) {
-				      $value .= 'px';
-				}
-
-				$styles[] = "width: $value";
-			} else if ( $attribute == 'height' ) {
-				if ( preg_match( '/\d+/', $value ) === false ) {
-				      $value .= 'px';
-				}
-
-				$styles[] = "height: $value";
-			} else if ( $attribute == 'nowrap' ) {
-				if ( $value ) {
-					$styles[] = "white-space: nowrap";
-				}
-			}
-		}
-
-		if ( isset( $attributes[ 'style' ] ) ) {
-			$styles[] = $attributes[ 'style' ];
-		} 
-
-		if ( !$styles ) return '';
-		else return implode( '; ', $styles );
-	}
-
 	/**
 	 * Take a tag soup fragment listing an HTML element's attributes
 	 * and normalize it to well-formed XML, discarding unwanted attributes.
@@ -858,66 +938,25 @@ class Sanitizer {
 	 *
 	 * @param $text String
 	 * @param $element String
-	 * @param $defaults Array (optional) associative array of default attributes to splice in. 
-	 *			class and style attributes are combined. Otherwise, values from
-	 *			$attributes take precedence over values from $defaults.
 	 * @return String
 	 */
-	static function fixTagAttributes( $text, $element, $defaults = null ) {
+	static function fixTagAttributes( $text, $element ) {
 		if( trim( $text ) == '' ) {
 			return '';
 		}
 
 		$decoded = Sanitizer::decodeTagAttributes( $text );
+		$decoded = Sanitizer::fixDeprecatedAttributes( $decoded, $element );
 		$stripped = Sanitizer::validateTagAttributes( $decoded, $element );
-		$attribs = Sanitizer::collapseTagAttributes( $stripped, $defaults );
 
-		return $attribs;
-	}
-
-	/**
-	 * Take an associative array or attribute name/value pairs
-	 * and collapses it to well-formed XML.
-	 * Does not filter attributes.
-	 * Output is safe for further wikitext processing, with escaping of
-	 * values that could trigger problems.
-	 *
-	 * - Double-quotes all attribute values
-	 * - Prepends space if there are attributes.
-	 *
-	 * @param $attributes Array is an associative array of attribute name/value pairs. 
-	 * 			Assumed to be sanitized already.
-	 * @param $defaults Array (optional) associative array of default attributes to splice in. 
-	 *			class and style attributes are combined. Otherwise, values from
-	 *			$attributes take precedence over values from $defaults.
-	 * @return String
-	 */
-	static function collapseTagAttributes( $attributes, $defaults = null ) {
-		if ( $defaults ) {
-			foreach( $defaults as $attribute => $value ) {
-				if ( isset( $attributes[ $attribute ] ) ) {
-					if ( $attribute == 'class' ) {
-						$value .= ' '. $attributes[ $attribute ];
-					} else if ( $attribute == 'style' ) {
-						$value .= '; ' . $attributes[ $attribute ];
-					} else {
-						continue;
-					}
-				}
-
-				$attributes[ $attribute ] = $value;
-			}
-		}
-
-		$chunks = array();
-
-		foreach( $attributes as $attribute => $value ) {
+		$attribs = array();
+		foreach( $stripped as $attribute => $value ) {
 			$encAttribute = htmlspecialchars( $attribute );
 			$encValue = Sanitizer::safeEncodeAttribute( $value );
 
-			$chunks[] = "$encAttribute=\"$encValue\"";
+			$attribs[] = "$encAttribute=\"$encValue\"";
 		}
-		return count( $chunks ) ? ' ' . implode( ' ', $chunks ) : '';
+		return count( $attribs ) ? ' ' . implode( ' ', $attribs ) : '';
 	}
 
 	/**
@@ -1096,7 +1135,7 @@ class Sanitizer {
 		$attribs = array();
 		$pairs = array();
 		if( !preg_match_all(
-			MW_ATTRIBS_REGEX,
+			self::getAttribsRegex(),
 			$text,
 			$pairs,
 			PREG_SET_ORDER ) ) {
@@ -1119,7 +1158,7 @@ class Sanitizer {
 
 	/**
 	 * Pick the appropriate attribute value from a match set from the
-	 * MW_ATTRIBS_REGEX matches.
+	 * attribs regex matches.
 	 *
 	 * @param $set Array
 	 * @return String
@@ -1163,6 +1202,10 @@ class Sanitizer {
 				Sanitizer::normalizeCharReferences( $text ) ) );
 	}
 
+	/**
+	 * @param $text string
+	 * @return mixed
+	 */
 	private static function normalizeWhitespace( $text ) {
 		return preg_replace(
 			'/\r\n|[\x20\x0d\x0a\x09]/',
@@ -1187,7 +1230,8 @@ class Sanitizer {
 	 * for XML and XHTML specifically. Any stray bits will be
 	 * &amp;-escaped to result in a valid text fragment.
 	 *
-	 * a. any named char refs must be known in XHTML
+	 * a. named char refs can only be &lt; &gt; &amp; &quot;, others are
+	 *   numericized (this way we're well-formed even without a DTD)
 	 * b. any numeric char refs must be legal chars, not invalid or forbidden
 	 * c. use &#x, not &#X
 	 * d. fix or reject non-valid attributes
@@ -1198,7 +1242,7 @@ class Sanitizer {
 	 */
 	static function normalizeCharReferences( $text ) {
 		return preg_replace_callback(
-			MW_CHAR_REFS_REGEX,
+			self::CHAR_REFS_REGEX,
 			array( 'Sanitizer', 'normalizeCharReferencesCallback' ),
 			$text );
 	}
@@ -1214,8 +1258,6 @@ class Sanitizer {
 			$ret = Sanitizer::decCharReference( $matches[2] );
 		} elseif( $matches[3] != ''  ) {
 			$ret = Sanitizer::hexCharReference( $matches[3] );
-		} elseif( $matches[4] != '' ) {
-			$ret = Sanitizer::hexCharReference( $matches[4] );
 		}
 		if( is_null( $ret ) ) {
 			return htmlspecialchars( $matches[0] );
@@ -1226,24 +1268,31 @@ class Sanitizer {
 
 	/**
 	 * If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD,
-	 * return the named entity reference as is. If the entity is a
-	 * MediaWiki-specific alias, returns the HTML equivalent. Otherwise,
-	 * returns HTML-escaped text of pseudo-entity source (eg &amp;foo;)
+	 * return the equivalent numeric entity reference (except for the core &lt;
+	 * &gt; &amp; &quot;). If the entity is a MediaWiki-specific alias, returns
+	 * the HTML equivalent. Otherwise, returns HTML-escaped text of
+	 * pseudo-entity source (eg &amp;foo;)
 	 *
 	 * @param $name String
 	 * @return String
 	 */
 	static function normalizeEntity( $name ) {
-		global $wgHtmlEntities, $wgHtmlEntityAliases;
-		if ( isset( $wgHtmlEntityAliases[$name] ) ) {
-			return "&{$wgHtmlEntityAliases[$name]};";
-		} elseif( isset( $wgHtmlEntities[$name] ) ) {
+		if ( isset( self::$htmlEntityAliases[$name] ) ) {
+			return '&' . self::$htmlEntityAliases[$name] . ';';
+		} elseif ( in_array( $name,
+		array( 'lt', 'gt', 'amp', 'quot' ) ) ) {
 			return "&$name;";
+		} elseif ( isset( self::$htmlEntities[$name] ) ) {
+			return '&#' . self::$htmlEntities[$name] . ';';
 		} else {
 			return "&amp;$name;";
 		}
 	}
 
+	/**
+	 * @param $codepoint
+	 * @return null|string
+	 */
 	static function decCharReference( $codepoint ) {
 		$point = intval( $codepoint );
 		if( Sanitizer::validateCodepoint( $point ) ) {
@@ -1253,6 +1302,10 @@ class Sanitizer {
 		}
 	}
 
+	/**
+	 * @param $codepoint
+	 * @return null|string
+	 */
 	static function hexCharReference( $codepoint ) {
 		$point = hexdec( $codepoint );
 		if( Sanitizer::validateCodepoint( $point ) ) {
@@ -1285,7 +1338,7 @@ class Sanitizer {
 	 */
 	public static function decodeCharReferences( $text ) {
 		return preg_replace_callback(
-			MW_CHAR_REFS_REGEX,
+			self::CHAR_REFS_REGEX,
 			array( 'Sanitizer', 'decodeCharReferencesCallback' ),
 			$text );
 	}
@@ -1303,7 +1356,7 @@ class Sanitizer {
 	public static function decodeCharReferencesAndNormalize( $text ) {
 		global $wgContLang;
 		$text = preg_replace_callback(
-			MW_CHAR_REFS_REGEX,
+			self::CHAR_REFS_REGEX,
 			array( 'Sanitizer', 'decodeCharReferencesCallback' ),
 			$text, /* limit */ -1, $count );
 
@@ -1325,8 +1378,6 @@ class Sanitizer {
 			return  Sanitizer::decodeChar( intval( $matches[2] ) );
 		} elseif( $matches[3] != ''  ) {
 			return  Sanitizer::decodeChar( hexdec( $matches[3] ) );
-		} elseif( $matches[4] != '' ) {
-			return  Sanitizer::decodeChar( hexdec( $matches[4] ) );
 		}
 		# Last case should be an ampersand by itself
 		return $matches[0];
@@ -1352,16 +1403,15 @@ class Sanitizer {
 	 * return the UTF-8 encoding of that character. Otherwise, returns
 	 * pseudo-entity source (eg &foo;)
 	 *
-	 * @param $name Strings
+	 * @param $name String
 	 * @return String
 	 */
 	static function decodeEntity( $name ) {
-		global $wgHtmlEntities, $wgHtmlEntityAliases;
-		if ( isset( $wgHtmlEntityAliases[$name] ) ) {
-			$name = $wgHtmlEntityAliases[$name];
+		if ( isset( self::$htmlEntityAliases[$name] ) ) {
+			$name = self::$htmlEntityAliases[$name];
 		}
-		if( isset( $wgHtmlEntities[$name] ) ) {
-			return codepointToUtf8( $wgHtmlEntities[$name] );
+		if( isset( self::$htmlEntities[$name] ) ) {
+			return codepointToUtf8( self::$htmlEntities[$name] );
 		} else {
 			return "&$name;";
 		}
@@ -1449,8 +1499,8 @@ class Sanitizer {
 			'cite'       => $common,
 			'dfn'        => $common,
 			'code'       => $common,
-			# samp
-			# kbd
+			'samp'       => $common,
+			'kbd'        => $common,
 			'var'        => $common,
 			'abbr'       => $common,
 			# acronym
@@ -1586,22 +1636,26 @@ class Sanitizer {
 	 * @return String
 	 */
 	static function hackDocType() {
-		global $wgHtmlEntities;
 		$out = "<!DOCTYPE html [\n";
-		foreach( $wgHtmlEntities as $entity => $codepoint ) {
+		foreach( self::$htmlEntities as $entity => $codepoint ) { // one <!ENTITY> declaration per table entry
 			$out .= "<!ENTITY $entity \"&#$codepoint;\">";
 		}
 		$out .= "]>\n";
 		return $out;
 	}
 
+	/**
+	 * @param $url string
+	 * @return mixed|string
+	 */
 	static function cleanUrl( $url ) {
 		# Normalize any HTML entities in input. They will be
 		# re-escaped by makeExternalLink().
 		$url = Sanitizer::decodeCharReferences( $url );
 
 		# Escape any control characters introduced by the above step
-		$url = preg_replace( '/[\][<>"\\x00-\\x20\\x7F\|]/e', "urlencode('\\0')", $url );
+		$url = preg_replace_callback( '/[\][<>"\\x00-\\x20\\x7F\|]/', 
+			array( __CLASS__, 'cleanUrlCallback' ), $url );
 
 		# Validate hostname portion
 		$matches = array();
@@ -1629,7 +1683,7 @@ class Sanitizer {
 
 			$host = preg_replace( $strip, '', $host );
 
-			// @todo Fixme: validate hostnames here
+			// @todo FIXME: Validate hostnames here
 
 			return $protocol . $host . $rest;
 		} else {
@@ -1637,4 +1691,63 @@ class Sanitizer {
 		}
 	}
 
+	/**
+	 * @param $matches array
+	 * @return string
+	 */
+	static function cleanUrlCallback( $matches ) {
+		return urlencode( $matches[0] ); // $matches[0] is the whole match handed over by preg_replace_callback()
+	}
+
+	/**
+	 * Does a string look like an e-mail address?
+	 *
+	 * This validates an email address using an HTML5 specification found at:
+	 * http://www.whatwg.org/specs/web-apps/current-work/multipage/states-of-the-type-attribute.html#valid-e-mail-address
+	 * Which as of 2011-01-24 says:
+	 *
+	 *   A valid e-mail address is a string that matches the ABNF production
+	 *   1*( atext / "." ) "@" ldh-str *( "." ldh-str ) where atext is defined
+	 *   in RFC 5322 section 3.2.3, and ldh-str is defined in RFC 1034 section
+	 *   3.5.
+	 *
+	 * This function is an implementation of the specification as requested in
+	 * bug 22449.
+	 *
+	 * Client-side forms will use the same standard validation rules via JS or
+	 * HTML 5 validation; additional restrictions can be enforced server-side
+	 * by extensions via the 'isValidEmailAddr' hook.
+	 *
+	 * Note that this validation doesn't 100% match RFC 2822, but is believed
+	 * to be liberal enough for wide use. Some invalid addresses will still
+	 * pass validation here.
+	 *
+	 * @since 1.18
+	 *
+	 * @param $addr String E-mail address
+	 * @return Bool
+	 */
+	public static function validateEmail( $addr ) {
+		$result = null; // passed by reference to the 'isValidEmailAddr' hook handlers
+		if( !wfRunHooks( 'isValidEmailAddr', array( $addr, &$result ) ) ) {
+			return $result; // hook run returned false: use the hook-supplied result instead of the regex check
+		}
+
+		// Please note the strings below are interpolated inside brackets [], which
+		// makes the hyphen "-" a range indicator. Hence it is backslash-escaped below.
+		// See bug 26948
+		$rfc5322_atext   = "a-z0-9!#$%&'*+\\-\/=?^_`{|}~" ;
+		$rfc1034_ldh_str = "a-z0-9\\-" ;
+
+		$HTML5_email_regexp = "/
+		^                      # start of string
+		[$rfc5322_atext\\.]+    # user part which is liberal :p
+		@                      # at sign
+		[$rfc1034_ldh_str]+       # First domain part
+		(\\.[$rfc1034_ldh_str]+)*  # Following part prefixed with a dot
+		$                      # End of string
+		/ixD" ; // case Insensitive, eXtended, Dollar-end-only (a trailing newline must not validate)
+
+		return (bool) preg_match( $HTML5_email_regexp, $addr );
+	}
 }