<?php
-
/**
* (X)HTML sanitizer for MediaWiki
*
* @subpackage Parser
*/
+/**
+ * Regular expression to match various types of character references in
+ * Sanitizer::normalizeCharReferences and Sanitizer::decodeCharReferences
+ *
+ * The pattern is compiled with the /x modifier, which is why it can be
+ * laid out over several lines and why the literal '#' is written as '\#'.
+ *
+ * Capture groups, in order:
+ *  1 - name of a named reference (ampersand, name, semicolon)
+ *  2 - digits of a decimal numeric reference
+ *  3 - digits of a hexadecimal reference introduced by a lowercase 'x'
+ *  4 - digits of a hexadecimal reference introduced by an uppercase 'X'
+ *  5 - an ampersand that does not start any of the forms above
+ *
+ * Note that groups 3 and 4 accept any ASCII letter or digit, not only
+ * hexadecimal digits.
+ */
+define( 'MW_CHAR_REFS_REGEX',
+ '/&([A-Za-z0-9]+);
+ |&\#([0-9]+);
+ |&\#x([0-9A-Za-z]+);
+ |&\#X([0-9A-Za-z]+);
+ |(&)/x' );
+
+/**
+ * Regular expression to match HTML/XML attribute pairs within a tag.
+ * Allows some... latitude.
+ * Used in Sanitizer::fixTagAttributes and Sanitizer::decodeTagAttributes
+ *
+ * Capture groups, in order:
+ *  1 - attribute name
+ *  2 - the whole '= value' part, when the attribute has one
+ *  3 - value written inside double quotes
+ *  4 - value written inside single quotes
+ *  5 - value written without quotes
+ *  6 - unquoted '#' followed by hexadecimal digits (colour shorthand)
+ *
+ * Sanitizer::getTagAttributeCallback selects the value from groups 3-6,
+ * or falls back to the name in group 1 when there is no value.
+ */
+$attrib = '[A-Za-z0-9]'; # one character of an attribute name
+$space = '[\x09\x0a\x0d\x20]'; # tab, line feed, carriage return or space
+define( 'MW_ATTRIBS_REGEX',
+ "/(?:^|$space)($attrib+)
+ ($space*=$space*
+ (?:
+ # The attribute value: quoted or alone
+ \"([^<\"]*)\"
+ | '([^<']*)'
+ | ([a-zA-Z0-9!#$%&()*,\\-.\\/:;<>?@[\\]^_`{|}~]+)
+ | (\#[0-9a-fA-F]+) # Technically wrong, but lots of
+ # colors are specified like this.
+ # We'll be normalizing it.
+ )
+ )?(?=$space|\$)/sx" );
+
+/**
+ * List of all named character entities defined in HTML 4.01
+ * http://www.w3.org/TR/html4/sgml/entities.html
+ *
+ * Keys are the entity names without the leading '&' and trailing ';'.
+ * Names are case-sensitive ('Aacute' and 'aacute' are separate keys).
+ * Values are the corresponding code points as decimal integers;
+ * Sanitizer::decodeEntity passes them to codepointToUtf8().
+ * @access private
+ */
+global $wgHtmlEntities;
+$wgHtmlEntities = array(
+ 'Aacute' => 193,
+ 'aacute' => 225,
+ 'Acirc' => 194,
+ 'acirc' => 226,
+ 'acute' => 180,
+ 'AElig' => 198,
+ 'aelig' => 230,
+ 'Agrave' => 192,
+ 'agrave' => 224,
+ 'alefsym' => 8501,
+ 'Alpha' => 913,
+ 'alpha' => 945,
+ 'amp' => 38,
+ 'and' => 8743,
+ 'ang' => 8736,
+ 'Aring' => 197,
+ 'aring' => 229,
+ 'asymp' => 8776,
+ 'Atilde' => 195,
+ 'atilde' => 227,
+ 'Auml' => 196,
+ 'auml' => 228,
+ 'bdquo' => 8222,
+ 'Beta' => 914,
+ 'beta' => 946,
+ 'brvbar' => 166,
+ 'bull' => 8226,
+ 'cap' => 8745,
+ 'Ccedil' => 199,
+ 'ccedil' => 231,
+ 'cedil' => 184,
+ 'cent' => 162,
+ 'Chi' => 935,
+ 'chi' => 967,
+ 'circ' => 710,
+ 'clubs' => 9827,
+ 'cong' => 8773,
+ 'copy' => 169,
+ 'crarr' => 8629,
+ 'cup' => 8746,
+ 'curren' => 164,
+ 'dagger' => 8224,
+ 'Dagger' => 8225,
+ 'darr' => 8595,
+ 'dArr' => 8659,
+ 'deg' => 176,
+ 'Delta' => 916,
+ 'delta' => 948,
+ 'diams' => 9830,
+ 'divide' => 247,
+ 'Eacute' => 201,
+ 'eacute' => 233,
+ 'Ecirc' => 202,
+ 'ecirc' => 234,
+ 'Egrave' => 200,
+ 'egrave' => 232,
+ 'empty' => 8709,
+ 'emsp' => 8195,
+ 'ensp' => 8194,
+ 'Epsilon' => 917,
+ 'epsilon' => 949,
+ 'equiv' => 8801,
+ 'Eta' => 919,
+ 'eta' => 951,
+ 'ETH' => 208,
+ 'eth' => 240,
+ 'Euml' => 203,
+ 'euml' => 235,
+ 'euro' => 8364,
+ 'exist' => 8707,
+ 'fnof' => 402,
+ 'forall' => 8704,
+ 'frac12' => 189,
+ 'frac14' => 188,
+ 'frac34' => 190,
+ 'frasl' => 8260,
+ 'Gamma' => 915,
+ 'gamma' => 947,
+ 'ge' => 8805,
+ 'gt' => 62,
+ 'harr' => 8596,
+ 'hArr' => 8660,
+ 'hearts' => 9829,
+ 'hellip' => 8230,
+ 'Iacute' => 205,
+ 'iacute' => 237,
+ 'Icirc' => 206,
+ 'icirc' => 238,
+ 'iexcl' => 161,
+ 'Igrave' => 204,
+ 'igrave' => 236,
+ 'image' => 8465,
+ 'infin' => 8734,
+ 'int' => 8747,
+ 'Iota' => 921,
+ 'iota' => 953,
+ 'iquest' => 191,
+ 'isin' => 8712,
+ 'Iuml' => 207,
+ 'iuml' => 239,
+ 'Kappa' => 922,
+ 'kappa' => 954,
+ 'Lambda' => 923,
+ 'lambda' => 955,
+ 'lang' => 9001,
+ 'laquo' => 171,
+ 'larr' => 8592,
+ 'lArr' => 8656,
+ 'lceil' => 8968,
+ 'ldquo' => 8220,
+ 'le' => 8804,
+ 'lfloor' => 8970,
+ 'lowast' => 8727,
+ 'loz' => 9674,
+ 'lrm' => 8206,
+ 'lsaquo' => 8249,
+ 'lsquo' => 8216,
+ 'lt' => 60,
+ 'macr' => 175,
+ 'mdash' => 8212,
+ 'micro' => 181,
+ 'middot' => 183,
+ 'minus' => 8722,
+ 'Mu' => 924,
+ 'mu' => 956,
+ 'nabla' => 8711,
+ 'nbsp' => 160,
+ 'ndash' => 8211,
+ 'ne' => 8800,
+ 'ni' => 8715,
+ 'not' => 172,
+ 'notin' => 8713,
+ 'nsub' => 8836,
+ 'Ntilde' => 209,
+ 'ntilde' => 241,
+ 'Nu' => 925,
+ 'nu' => 957,
+ 'Oacute' => 211,
+ 'oacute' => 243,
+ 'Ocirc' => 212,
+ 'ocirc' => 244,
+ 'OElig' => 338,
+ 'oelig' => 339,
+ 'Ograve' => 210,
+ 'ograve' => 242,
+ 'oline' => 8254,
+ 'Omega' => 937,
+ 'omega' => 969,
+ 'Omicron' => 927,
+ 'omicron' => 959,
+ 'oplus' => 8853,
+ 'or' => 8744,
+ 'ordf' => 170,
+ 'ordm' => 186,
+ 'Oslash' => 216,
+ 'oslash' => 248,
+ 'Otilde' => 213,
+ 'otilde' => 245,
+ 'otimes' => 8855,
+ 'Ouml' => 214,
+ 'ouml' => 246,
+ 'para' => 182,
+ 'part' => 8706,
+ 'permil' => 8240,
+ 'perp' => 8869,
+ 'Phi' => 934,
+ 'phi' => 966,
+ 'Pi' => 928,
+ 'pi' => 960,
+ 'piv' => 982,
+ 'plusmn' => 177,
+ 'pound' => 163,
+ 'prime' => 8242,
+ 'Prime' => 8243,
+ 'prod' => 8719,
+ 'prop' => 8733,
+ 'Psi' => 936,
+ 'psi' => 968,
+ 'quot' => 34,
+ 'radic' => 8730,
+ 'rang' => 9002,
+ 'raquo' => 187,
+ 'rarr' => 8594,
+ 'rArr' => 8658,
+ 'rceil' => 8969,
+ 'rdquo' => 8221,
+ 'real' => 8476,
+ 'reg' => 174,
+ 'rfloor' => 8971,
+ 'Rho' => 929,
+ 'rho' => 961,
+ 'rlm' => 8207,
+ 'rsaquo' => 8250,
+ 'rsquo' => 8217,
+ 'sbquo' => 8218,
+ 'Scaron' => 352,
+ 'scaron' => 353,
+ 'sdot' => 8901,
+ 'sect' => 167,
+ 'shy' => 173,
+ 'Sigma' => 931,
+ 'sigma' => 963,
+ 'sigmaf' => 962,
+ 'sim' => 8764,
+ 'spades' => 9824,
+ 'sub' => 8834,
+ 'sube' => 8838,
+ 'sum' => 8721,
+ 'sup' => 8835,
+ 'sup1' => 185,
+ 'sup2' => 178,
+ 'sup3' => 179,
+ 'supe' => 8839,
+ 'szlig' => 223,
+ 'Tau' => 932,
+ 'tau' => 964,
+ 'there4' => 8756,
+ 'Theta' => 920,
+ 'theta' => 952,
+ 'thetasym' => 977,
+ 'thinsp' => 8201,
+ 'THORN' => 222,
+ 'thorn' => 254,
+ 'tilde' => 732,
+ 'times' => 215,
+ 'trade' => 8482,
+ 'Uacute' => 218,
+ 'uacute' => 250,
+ 'uarr' => 8593,
+ 'uArr' => 8657,
+ 'Ucirc' => 219,
+ 'ucirc' => 251,
+ 'Ugrave' => 217,
+ 'ugrave' => 249,
+ 'uml' => 168,
+ 'upsih' => 978,
+ 'Upsilon' => 933,
+ 'upsilon' => 965,
+ 'Uuml' => 220,
+ 'uuml' => 252,
+ 'weierp' => 8472,
+ 'Xi' => 926,
+ 'xi' => 958,
+ 'Yacute' => 221,
+ 'yacute' => 253,
+ 'yen' => 165,
+ 'Yuml' => 376,
+ 'yuml' => 255,
+ 'Zeta' => 918,
+ 'zeta' => 950,
+ 'zwj' => 8205,
+ 'zwnj' => 8204 );
+
+/** @package MediaWiki */
class Sanitizer {
/**
* Cleans up HTML, removes dangerous tags and attributes, and
* removes HTML comments
* @access private
* @param string $text
+ * @param callback $processCallback to do any variable or parameter replacements in HTML attribute values
+ * @param array $args for the processing callback
* @return string
*/
- function removeHTMLtags( $text ) {
+ function removeHTMLtags( $text, $processCallback = null, $args = array() ) {
global $wgUseTidy, $wgUserHtml;
$fname = 'Parser::removeHTMLtags';
wfProfileIn( $fname );
$htmlsingle = array(
'br', 'hr', 'li', 'dt', 'dd'
);
+ $htmlsingleonly = array( # Elements that cannot have close tags
+ 'br', 'hr'
+ );
$htmlnest = array( # Tags that can be nested--??
'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
'dl', 'font', 'big', 'small', 'sub', 'sup', 'span'
$tagstack = array(); $tablestack = array();
foreach ( $bits as $x ) {
$prev = error_reporting( E_ALL & ~( E_NOTICE | E_WARNING ) );
- preg_match( '/^(\\/?)(\\w+)([^>]*)(\\/{0,1}>)([^<]*)$/',
+ preg_match( '/^(\\/?)(\\w+)([^>]*?)(\\/{0,1}>)([^<]*)$/',
$x, $regs );
list( $qbar, $slash, $t, $params, $brace, $rest ) = $regs;
error_reporting( $prev );
# Check our stack
if ( $slash ) {
# Closing a tag...
- if ( ! in_array( $t, $htmlsingle ) &&
+ if( in_array( $t, $htmlsingleonly ) ) {
+ $badtag = 1;
+ } elseif( !in_array( $t, $htmlsingle ) &&
( $ot = @array_pop( $tagstack ) ) != $t ) {
@array_push( $tagstack, $ot );
$badtag = 1;
} else if ( in_array( $t, $tagstack ) &&
! in_array ( $t , $htmlnest ) ) {
$badtag = 1 ;
+ } elseif( in_array( $t, $htmlsingleonly ) ) {
+ # Hack to force empty tag for uncloseable elements
+ $brace = '/>';
} else if ( ! in_array( $t, $htmlsingle ) ) {
if ( $t == 'table' ) {
array_push( $tablestack, $tagstack );
}
array_push( $tagstack, $t );
}
+
+ # Replace any variables or template parameters with
+ # plaintext results.
+ if( is_callable( $processCallback ) ) {
+ call_user_func_array( $processCallback, array( &$params, $args ) );
+ }
+
# Strip non-approved attributes from the tag
$newparams = Sanitizer::fixTagAttributes( $params, $t );
}
if ( ! $badtag ) {
$rest = str_replace( '>', '>', $rest );
- $text .= "<$slash$t$newparams$brace$rest";
+ $close = ( $brace == '/>' ) ? ' /' : '';
+ $text .= "<$slash$t$newparams$close>$rest";
continue;
}
}
} else {
# this might be possible using tidy itself
foreach ( $bits as $x ) {
- preg_match( '/^(\\/?)(\\w+)([^>]*)(\\/{0,1}>)([^<]*)$/',
+ preg_match( '/^(\\/?)(\\w+)([^>]*?)(\\/{0,1}>)([^<]*)$/',
$x, $regs );
@list( $qbar, $slash, $t, $params, $brace, $rest ) = $regs;
if ( in_array( $t = strtolower( $t ), $htmlelements ) ) {
+ if( is_callable( $processCallback ) ) {
+ call_user_func_array( $processCallback, array( &$params, $args ) );
+ }
$newparams = Sanitizer::fixTagAttributes( $params, $t );
$rest = str_replace( '>', '>', $rest );
$text .= "<$slash$t$newparams$brace$rest";
# Unquoted attribute
# Since we quote this later, this can be anything distinguishable
# from the end of the attribute
- $attrib = '[A-Za-z0-9]';
- $space = '[\x09\x0a\x0d\x20]';
if( !preg_match_all(
- "/(?:^|$space)($attrib+)
- ($space*=$space*
- (?:
- # The attribute value: quoted or alone
- \"([^<\"]*)\"
- | '([^<']*)'
- | ([a-zA-Z0-9!#$%&()*,\\-.\\/:;<>?@[\\]^_`{|}~]+)
- | (\#[0-9a-fA-F]+) # Technically wrong, but lots of
- # colors are specified like this.
- # We'll be normalizing it.
- )
- )?(?=$space|\$)/sx",
+ MW_ATTRIBS_REGEX,
$text,
$pairs,
PREG_SET_ORDER ) ) {
if( !isset( $whitelist[$attribute] ) ) {
continue;
}
- if( !isset( $set[2] ) ) {
- # In XHTML, attributes must have a value.
- $value = $set[1];
- } elseif( $set[3] != '' ) {
- # Double-quoted
- $value = Sanitizer::normalizeAttributeValue( $set[3] );
- } elseif( $set[4] != '' ) {
- # Single-quoted
- $value = str_replace( '"', '"',
- Sanitizer::normalizeAttributeValue( $set[4] ) );
- } elseif( $set[5] != '' ) {
- # No quotes.
- $value = Sanitizer::normalizeAttributeValue( $set[5] );
- } elseif( $set[6] != '' ) {
- # Illegal #XXXXXX color with no quotes.
- $value = Sanitizer::normalizeAttributeValue( $set[6] );
- } else {
- wfDebugDieBacktrace( "Tag conditions not met. Something's very odd." );
- }
+
+ $raw = Sanitizer::getTagAttributeCallback( $set );
+ $value = Sanitizer::normalizeAttributeValue( $raw );
# Strip javascript "expression" from stylesheets.
# http://msdn.microsoft.com/workshop/author/dhtml/overview/recalc.asp
if( $attribute == 'style' && preg_match(
'/(expression|tps*:\/\/|url\\s*\().*/is',
- wfMungeToUtf8( $value ) ) ) {
+ Sanitizer::decodeCharReferences( $value ) ) ) {
# haxx0r
continue;
}
- if( !isset( $attribs[$attribute] ) ) {
- $attribs[$attribute] = "$attribute=\"$value\"";
- }
+ # Templates and links may be expanded in later parsing,
+ # creating invalid or dangerous output. Suppress this.
+ $value = strtr( $value, array(
+ '{' => '{',
+ '[' => '[',
+ "''" => '''',
+ 'ISBN' => 'ISBN',
+ 'RFC' => 'RFC',
+ 'PMID' => 'PMID',
+ ) );
+ $value = preg_replace(
+ '/(' . URL_PROTOCOLS . '):/',
+ '\\1:', $value );
+
+ // If this attribute was previously set, override it.
+ // Output should only have one attribute of each name.
+ $attribs[$attribute] = "$attribute=\"$value\"";
}
if( empty( $attribs ) ) {
return '';
}
}
+ /**
+  * Parse the attribute portion of a tag into an associative array.
+  *
+  * Attribute names are forced to lowercase and the character references
+  * in each value are decoded to UTF-8 text. When a name occurs more than
+  * once, the last occurrence wins. Blank input, or input in which
+  * MW_ATTRIBS_REGEX finds no attribute, yields an empty array.
+  *
+  * @param string $text partial tag source holding the attributes
+  * @return array attribute name => decoded value
+  */
+ function decodeTagAttributes( $text ) {
+     $decoded = array();
+
+     $hasContent = ( trim( $text ) != '' );
+     if( $hasContent &&
+         preg_match_all( MW_ATTRIBS_REGEX, $text, $matches, PREG_SET_ORDER ) ) {
+         foreach( $matches as $match ) {
+             $name = strtolower( $match[1] );
+             $raw = Sanitizer::getTagAttributeCallback( $match );
+             $decoded[$name] = Sanitizer::decodeCharReferences( $raw );
+         }
+     }
+
+     return $decoded;
+ }
+
+ /**
+  * Select the attribute value out of one MW_ATTRIBS_REGEX match set.
+  *
+  * The value groups are probed from the highest index down and the first
+  * one that is set is returned unchanged: 6 (illegal unquoted #XXXXXX
+  * colour), 5 (no quotes), 4 (single-quoted), 3 (double-quoted). When
+  * group 2 is not set either, the attribute was written without a value
+  * and its name (group 1) is returned instead.
+  *
+  * @param array $set one entry of a PREG_SET_ORDER match list
+  * @return string
+  * @access private
+  */
+ function getTagAttributeCallback( $set ) {
+     foreach( array( 6, 5, 4, 3 ) as $group ) {
+         if( isset( $set[$group] ) ) {
+             return $set[$group];
+         }
+     }
+
+     if( isset( $set[2] ) ) {
+         # A '= value' part without any of the value groups 3-6.
+         wfDebugDieBacktrace( "Tag conditions not met. This should never happen and is a bug." );
+     } else {
+         # In XHTML, attributes must have a value.
+         # For 'reduced' form, return explicitly the attribute name here.
+         return $set[1];
+     }
+ }
+
/**
* Normalize whitespace and character references in an XML source-
* encoded text for an attribute value.
+ *
+ * Character references are normalized first, then each CR/LF pair and
+ * each single space, CR, LF or tab is replaced by one space. Finally
+ * every literal double quote is escaped as &quot; so the result can be
+ * placed inside a double-quoted attribute without ending it early.
+ *
+ * @param string $text
+ * @return string
* @access private
*/
function normalizeAttributeValue( $text ) {
- return preg_replace(
- '/\r\n|[\x20\x0d\x0a\x09]/',
- ' ',
- Sanitizer::normalizeCharReferences( $text ) );
+ return str_replace( '"', '&quot;',
+ preg_replace(
+ '/\r\n|[\x20\x0d\x0a\x09]/',
+ ' ',
+ Sanitizer::normalizeCharReferences( $text ) ) );
}
/**
*/
function normalizeCharReferences( $text ) {
return preg_replace_callback(
- '/&([A-Za-z0-9]+);
- |&\#([0-9]+);
- |&\#x([0-9A-Za-z]+);
- |&\#X([0-9A-Za-z]+);
- |(&)/x',
+ MW_CHAR_REFS_REGEX, # pattern shared with Sanitizer::decodeCharReferences
array( 'Sanitizer', 'normalizeCharReferencesCallback' ),
$text );
}
* @return string
*/
function normalizeEntity( $name ) {
- # List of all named character entities defined in HTML 4.01
- # http://www.w3.org/TR/html4/sgml/entities.html
- static $htmlEntities = array(
- 'aacute' => true,
- 'Aacute' => true,
- 'acirc' => true,
- 'Acirc' => true,
- 'acute' => true,
- 'aelig' => true,
- 'AElig' => true,
- 'agrave' => true,
- 'Agrave' => true,
- 'alefsym' => true,
- 'alpha' => true,
- 'Alpha' => true,
- 'amp' => true,
- 'and' => true,
- 'ang' => true,
- 'apos' => true,
- 'aring' => true,
- 'Aring' => true,
- 'asymp' => true,
- 'atilde' => true,
- 'Atilde' => true,
- 'auml' => true,
- 'Auml' => true,
- 'bdquo' => true,
- 'beta' => true,
- 'Beta' => true,
- 'brvbar' => true,
- 'bull' => true,
- 'cap' => true,
- 'ccedil' => true,
- 'Ccedil' => true,
- 'cedil' => true,
- 'cent' => true,
- 'chi' => true,
- 'Chi' => true,
- 'circ' => true,
- 'clubs' => true,
- 'cong' => true,
- 'copy' => true,
- 'crarr' => true,
- 'cup' => true,
- 'curren' => true,
- 'dagger' => true,
- 'Dagger' => true,
- 'darr' => true,
- 'dArr' => true,
- 'deg' => true,
- 'delta' => true,
- 'Delta' => true,
- 'diams' => true,
- 'divide' => true,
- 'eacute' => true,
- 'Eacute' => true,
- 'ecirc' => true,
- 'Ecirc' => true,
- 'egrave' => true,
- 'Egrave' => true,
- 'empty' => true,
- 'emsp' => true,
- 'ensp' => true,
- 'epsilon' => true,
- 'Epsilon' => true,
- 'equiv' => true,
- 'eta' => true,
- 'Eta' => true,
- 'eth' => true,
- 'ETH' => true,
- 'euml' => true,
- 'Euml' => true,
- 'euro' => true,
- 'exist' => true,
- 'fnof' => true,
- 'forall' => true,
- 'frac12' => true,
- 'frac14' => true,
- 'frac34' => true,
- 'frasl' => true,
- 'gamma' => true,
- 'Gamma' => true,
- 'ge' => true,
- 'gt' => true,
- 'harr' => true,
- 'hArr' => true,
- 'hearts' => true,
- 'hellip' => true,
- 'iacute' => true,
- 'Iacute' => true,
- 'icirc' => true,
- 'Icirc' => true,
- 'iexcl' => true,
- 'igrave' => true,
- 'Igrave' => true,
- 'image' => true,
- 'infin' => true,
- 'int' => true,
- 'iota' => true,
- 'Iota' => true,
- 'iquest' => true,
- 'isin' => true,
- 'iuml' => true,
- 'Iuml' => true,
- 'kappa' => true,
- 'Kappa' => true,
- 'lambda' => true,
- 'Lambda' => true,
- 'lang' => true,
- 'laquo' => true,
- 'larr' => true,
- 'lArr' => true,
- 'lceil' => true,
- 'ldquo' => true,
- 'le' => true,
- 'lfloor' => true,
- 'lowast' => true,
- 'loz' => true,
- 'lrm' => true,
- 'lsaquo' => true,
- 'lsquo' => true,
- 'lt' => true,
- 'macr' => true,
- 'mdash' => true,
- 'micro' => true,
- 'middot' => true,
- 'minus' => true,
- 'mu' => true,
- 'Mu' => true,
- 'nabla' => true,
- 'nbsp' => true,
- 'ndash' => true,
- 'ne' => true,
- 'ni' => true,
- 'not' => true,
- 'notin' => true,
- 'nsub' => true,
- 'ntilde' => true,
- 'Ntilde' => true,
- 'nu' => true,
- 'Nu' => true,
- 'oacute' => true,
- 'Oacute' => true,
- 'ocirc' => true,
- 'Ocirc' => true,
- 'oelig' => true,
- 'OElig' => true,
- 'ograve' => true,
- 'Ograve' => true,
- 'oline' => true,
- 'omega' => true,
- 'Omega' => true,
- 'omicron' => true,
- 'Omicron' => true,
- 'oplus' => true,
- 'or' => true,
- 'ordf' => true,
- 'ordm' => true,
- 'oslash' => true,
- 'Oslash' => true,
- 'otilde' => true,
- 'Otilde' => true,
- 'otimes' => true,
- 'ouml' => true,
- 'Ouml' => true,
- 'para' => true,
- 'part' => true,
- 'permil' => true,
- 'perp' => true,
- 'phi' => true,
- 'Phi' => true,
- 'pi' => true,
- 'Pi' => true,
- 'piv' => true,
- 'plusmn' => true,
- 'pound' => true,
- 'prime' => true,
- 'Prime' => true,
- 'prod' => true,
- 'prop' => true,
- 'psi' => true,
- 'Psi' => true,
- 'quot' => true,
- 'radic' => true,
- 'rang' => true,
- 'raquo' => true,
- 'rarr' => true,
- 'rArr' => true,
- 'rceil' => true,
- 'rdquo' => true,
- 'real' => true,
- 'reg' => true,
- 'rfloor' => true,
- 'rho' => true,
- 'Rho' => true,
- 'rlm' => true,
- 'rsaquo' => true,
- 'rsquo' => true,
- 'sbquo' => true,
- 'scaron' => true,
- 'Scaron' => true,
- 'sdot' => true,
- 'sect' => true,
- 'shy' => true,
- 'sigma' => true,
- 'Sigma' => true,
- 'sigmaf' => true,
- 'sim' => true,
- 'spades' => true,
- 'sub' => true,
- 'sube' => true,
- 'sum' => true,
- 'sup' => true,
- 'sup1' => true,
- 'sup2' => true,
- 'sup3' => true,
- 'supe' => true,
- 'szlig' => true,
- 'tau' => true,
- 'Tau' => true,
- 'there4' => true,
- 'theta' => true,
- 'Theta' => true,
- 'thetasym' => true,
- 'thinsp' => true,
- 'thorn' => true,
- 'THORN' => true,
- 'tilde' => true,
- 'times' => true,
- 'trade' => true,
- 'uacute' => true,
- 'Uacute' => true,
- 'uarr' => true,
- 'uArr' => true,
- 'ucirc' => true,
- 'Ucirc' => true,
- 'ugrave' => true,
- 'Ugrave' => true,
- 'uml' => true,
- 'upsih' => true,
- 'upsilon' => true,
- 'Upsilon' => true,
- 'uuml' => true,
- 'Uuml' => true,
- 'weierp' => true,
- 'xi' => true,
- 'Xi' => true,
- 'yacute' => true,
- 'Yacute' => true,
- 'yen' => true,
- 'yuml' => true,
- 'Yuml' => true,
- 'zeta' => true,
- 'Zeta' => true,
- 'zwj' => true,
- 'zwnj' => true );
- if( isset( $htmlEntities[$name] ) ) {
+ global $wgHtmlEntities;
+ if( isset( $wgHtmlEntities[$name] ) ) {
return "&$name;";
} else {
return "&$name;";
|| ($codepoint >= 0x10000 && $codepoint <= 0x10ffff);
}
+ /**
+  * Replace every character reference in the text, numeric or named,
+  * with the string returned by decodeCharReferencesCallback, giving a
+  * UTF-8 string.
+  *
+  * @param string $text
+  * @return string
+  * @access public
+  */
+ function decodeCharReferences( $text ) {
+     $decoder = array( 'Sanitizer', 'decodeCharReferencesCallback' );
+     return preg_replace_callback( MW_CHAR_REFS_REGEX, $decoder, $text );
+ }
+
+ /**
+  * preg_replace_callback handler used by decodeCharReferences.
+  *
+  * @param array $matches match set produced by MW_CHAR_REFS_REGEX
+  * @return string replacement text for the matched reference
+  */
+ function decodeCharReferencesCallback( $matches ) {
+     # Named entity
+     if( $matches[1] != '' ) {
+         return Sanitizer::decodeEntity( $matches[1] );
+     }
+     # Decimal numeric reference
+     if( $matches[2] != '' ) {
+         return Sanitizer::decodeChar( intval( $matches[2] ) );
+     }
+     # Hexadecimal reference, lowercase 'x' form first, then uppercase 'X'
+     foreach( array( 3, 4 ) as $group ) {
+         if( $matches[$group] != '' ) {
+             return Sanitizer::decodeChar( hexdec( $matches[$group] ) );
+         }
+     }
+     # Last case should be an ampersand by itself
+     return $matches[0];
+ }
+
+ /**
+  * Convert a code point to its UTF-8 string when
+  * Sanitizer::validateCodepoint accepts it; otherwise return
+  * UTF8_REPLACEMENT (U+FFFD REPLACEMENT CHARACTER).
+  *
+  * @param int $codepoint
+  * @return string
+  * @access private
+  */
+ function decodeChar( $codepoint ) {
+     return Sanitizer::validateCodepoint( $codepoint )
+         ? codepointToUtf8( $codepoint )
+         : UTF8_REPLACEMENT;
+ }
+
+ /**
+  * Look the entity name up in $wgHtmlEntities (the named entities of
+  * the HTML 4.0/XHTML 1.0 DTD) and return the UTF-8 encoding of its
+  * character. A name that is not listed is returned as pseudo-entity
+  * source (eg &foo;).
+  *
+  * @param string $name entity name without '&' and ';'
+  * @return string
+  */
+ function decodeEntity( $name ) {
+     global $wgHtmlEntities;
+     if( !isset( $wgHtmlEntities[$name] ) ) {
+         return "&$name;";
+     }
+     return codepointToUtf8( $wgHtmlEntities[$name] );
+ }
+
/**
* Fetch the whitelist of acceptable attributes for a given
* element name.
* @return array
*/
function attributeWhitelist( $element ) {
- $list = Sanitizer::setupAttributeWhitelist();
+ static $list;
+ if( !isset( $list ) ) {
+ $list = Sanitizer::setupAttributeWhitelist();
+ }
return isset( $list[$element] )
? $list[$element]
: array();
'rowspan',
'colspan',
'nowrap', # deprecated
- 'width', # deprecated
- 'height' # deprecated
+ 'width', # deprecated
+ 'height', # deprecated
+ 'bgcolor' # deprecated
);
# Numbers refer to sections in HTML 4.01 standard describing the element.