* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
* http://www.gnu.org/copyleft/gpl.html
*
- * @package MediaWiki
- * @subpackage Parser
+ * @file
+ * @ingroup Parser
*/
/**
* Sanitizer::normalizeCharReferences and Sanitizer::decodeCharReferences
*/
define( 'MW_CHAR_REFS_REGEX',
- '/&([A-Za-z0-9]+);
+ '/&([A-Za-z0-9\x80-\xff]+);
|&\#([0-9]+);
|&\#x([0-9A-Za-z]+);
|&\#X([0-9A-Za-z]+);
'zwj' => 8205,
'zwnj' => 8204 );
-/** @package MediaWiki */
+/**
+ * Character entity aliases accepted by MediaWiki
+ */
+global $wgHtmlEntityAliases;
+$wgHtmlEntityAliases = array(
+ 'רלמ' => 'rlm',
+ 'رلم' => 'rlm',
+);
+
+
+/**
+ * XHTML sanitizer for MediaWiki
+ * @ingroup Parser
+ */
class Sanitizer {
/**
* Cleans up HTML, removes dangerous tags and attributes, and
* removes HTML comments
* @private
- * @param string $text
- * @param callback $processCallback to do any variable or parameter replacements in HTML attribute values
- * @param array $args for the processing callback
+ * @param $text String
+ * @param $processCallback Callback to do any variable or parameter replacements in HTML attribute values
+ * @param $args Array for the processing callback
+ * @param $extratags Array for any extra tags to include
+ * @param $removetags Array for any tags (default or extra) to exclude
* @return string
*/
- static function removeHTMLtags( $text, $processCallback = null, $args = array() ) {
- global $wgUseTidy, $wgUserHtml;
+ static function removeHTMLtags( $text, $processCallback = null, $args = array(), $extratags = array(), $removetags = array() ) {
+ global $wgUseTidy;
+
+ static $htmlpairsStatic, $htmlsingle, $htmlsingleonly, $htmlnest, $tabletags,
+ $htmllist, $listtags, $htmlsingleallowed, $htmlelementsStatic, $staticInitialised;
- static $htmlpairs, $htmlsingle, $htmlsingleonly, $htmlnest, $tabletags,
- $htmllist, $listtags, $htmlsingleallowed, $htmlelements, $staticInitialised;
-
wfProfileIn( __METHOD__ );
-
+
if ( !$staticInitialised ) {
- if( $wgUserHtml ) {
- $htmlpairs = array( # Tags that must be closed
- 'b', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
- 'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
- 'strike', 'strong', 'tt', 'var', 'div', 'center',
- 'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
- 'ruby', 'rt' , 'rb' , 'rp', 'p', 'span', 'u'
- );
- $htmlsingle = array(
- 'br', 'hr', 'li', 'dt', 'dd'
- );
- $htmlsingleonly = array( # Elements that cannot have close tags
- 'br', 'hr'
- );
- $htmlnest = array( # Tags that can be nested--??
- 'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
- 'dl', 'font', 'big', 'small', 'sub', 'sup', 'span'
- );
- $tabletags = array( # Can only appear inside table
- 'td', 'th', 'tr',
- );
- $htmllist = array( # Tags used by list
- 'ul','ol',
- );
- $listtags = array( # Tags that can appear in a list
- 'li',
- );
-
- } else {
- $htmlpairs = array();
- $htmlsingle = array();
- $htmlnest = array();
- $tabletags = array();
- }
- $htmlsingleallowed = array_merge( $htmlsingle, $tabletags );
- $htmlelements = array_merge( $htmlsingle, $htmlpairs, $htmlnest );
+ $htmlpairsStatic = array( # Tags that must be closed
+ 'b', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
+ 'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
+ 'strike', 'strong', 'tt', 'var', 'div', 'center',
+ 'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
+ 'ruby', 'rt' , 'rb' , 'rp', 'p', 'span', 'u'
+ );
+ $htmlsingle = array(
+ 'br', 'hr', 'li', 'dt', 'dd'
+ );
+ $htmlsingleonly = array( # Elements that cannot have close tags
+ 'br', 'hr'
+ );
+ $htmlnest = array( # Tags that can be nested--??
+ 'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
+ 'dl', 'font', 'big', 'small', 'sub', 'sup', 'span'
+ );
+ $tabletags = array( # Can only appear inside table, we will close them
+ 'td', 'th', 'tr',
+ );
+ $htmllist = array( # Tags used by list
+ 'ul','ol',
+ );
+ $listtags = array( # Tags that can appear in a list
+ 'li',
+ );
+
+ $htmlsingleallowed = array_unique( array_merge( $htmlsingle, $tabletags ) );
+ $htmlelementsStatic = array_unique( array_merge( $htmlsingle, $htmlpairsStatic, $htmlnest ) );
# Convert them all to hashtables for faster lookup
- $vars = array( 'htmlpairs', 'htmlsingle', 'htmlsingleonly', 'htmlnest', 'tabletags',
- 'htmllist', 'listtags', 'htmlsingleallowed', 'htmlelements' );
+ $vars = array( 'htmlpairsStatic', 'htmlsingle', 'htmlsingleonly', 'htmlnest', 'tabletags',
+ 'htmllist', 'listtags', 'htmlsingleallowed', 'htmlelementsStatic' );
foreach ( $vars as $var ) {
$$var = array_flip( $$var );
}
$staticInitialised = true;
}
+ # Populate $htmlpairs and $htmlelements with the $extratags and $removetags arrays
+ $extratags = array_flip( $extratags );
+ $removetags = array_flip( $removetags );
+ $htmlpairs = array_merge( $extratags, $htmlpairsStatic );
+ $htmlelements = array_diff_key( array_merge( $extratags, $htmlelementsStatic ) , $removetags );
# Remove HTML comments
$text = Sanitizer::removeHTMLcomments( $text );
$bits = explode( '<', $text );
- $text = array_shift( $bits );
+ $text = str_replace( '>', '>', array_shift( $bits ) );
if(!$wgUseTidy) {
$tagstack = $tablestack = array();
foreach ( $bits as $x ) {
- $prev = error_reporting( E_ALL & ~( E_NOTICE | E_WARNING ) );
$regs = array();
- preg_match( '!^(/?)(\\w+)([^>]*?)(/{0,1}>)([^<]*)$!', $x, $regs );
- list( /* $qbar */, $slash, $t, $params, $brace, $rest ) = $regs;
- error_reporting( $prev );
+ if( preg_match( '!^(/?)(\\w+)([^>]*?)(/{0,1}>)([^<]*)$!', $x, $regs ) ) {
+ list( /* $qbar */, $slash, $t, $params, $brace, $rest ) = $regs;
+ } else {
+ $slash = $t = $params = $brace = $rest = null;
+ }
$badtag = 0 ;
if ( isset( $htmlelements[$t = strtolower( $t )] ) ) {
$optstack = array();
array_push ($optstack, $ot);
while ( ( ( $ot = @array_pop( $tagstack ) ) != $t ) &&
- isset( $htmlsingleallowed[$ot] ) )
+ isset( $htmlsingleallowed[$ot] ) )
{
array_push ($optstack, $ot);
}
} else if( isset( $htmlsingle[$t] ) ) {
# Hack to not close $htmlsingle tags
$brace = NULL;
+ } else if( isset( $tabletags[$t] )
+ && in_array($t ,$tagstack) ) {
+ // New table tag but forgot to close the previous one
+ $text .= "</$t>";
} else {
if ( $t == 'table' ) {
array_push( $tablestack, $tagstack );
}
if ( ! $badtag ) {
$rest = str_replace( '>', '>', $rest );
- $close = ( $brace == '/>' ) ? ' /' : '';
+ $close = ( $brace == '/>' && !$slash ) ? ' /' : '';
$text .= "<$slash$t$newparams$close>$rest";
continue;
}
* trailing spaces and one of the newlines.
*
* @private
- * @param string $text
+ * @param $text String
* @return string
*/
static function removeHTMLcomments( $text ) {
*
* - Discards attributes not on a whitelist for the given element
* - Unsafe style attributes are discarded
+ * - Invalid id attributes are reencoded
*
- * @param array $attribs
- * @param string $element
- * @return array
+ * @param $attribs Array
+ * @param $element String
+ * @return Array
*
* @todo Check for legal values where the DTD limits things.
* @todo Check for unique id attribute :P
*/
static function validateTagAttributes( $attribs, $element ) {
- $whitelist = array_flip( Sanitizer::attributeWhitelist( $element ) );
+ return Sanitizer::validateAttributes( $attribs,
+ Sanitizer::attributeWhitelist( $element ) );
+ }
+
+ /**
+ * Take an array of attribute names and values and normalize or discard
+ * illegal values for the given whitelist.
+ *
+ * - Discards attributes not on the given whitelist
+ * - Unsafe style attributes are discarded
+ * - Invalid id attributes are reencoded
+ *
+ * @param $attribs Array
+ * @param $whitelist Array: list of allowed attribute names
+ * @return Array
+ *
+ * @todo Check for legal values where the DTD limits things.
+ * @todo Check for unique id attribute :P
+ */
+ static function validateAttributes( $attribs, $whitelist ) {
+ $whitelist = array_flip( $whitelist );
$out = array();
foreach( $attribs as $attribute => $value ) {
if( !isset( $whitelist[$attribute] ) ) {
}
}
- if ( $attribute === 'id' )
- $value = Sanitizer::escapeId( $value );
+ if ( $attribute === 'id' ) {
+ global $wgEnforceHtmlIds;
+ $value = Sanitizer::escapeId( $value,
+ $wgEnforceHtmlIds ? 'noninitial' : 'xml' );
+ }
// If this attribute was previously set, override it.
// Output should only have one attribute of each name.
}
return $out;
}
-
+
+ /**
+  * Combine two HTML attribute arrays into one.
+  *
+  * Entries from $b win over entries of the same name in $a. The one
+  * exception is 'class': when both arrays carry a string 'class' and the
+  * two strings differ, the class names of both are joined, duplicates
+  * removed, and the result is separated by single spaces.
+  *
+  * @todo implement merging for other attributes such as style
+  * @param $a Array
+  * @param $b Array
+  * @return array
+  */
+ static function mergeAttributes( $a, $b ) {
+     $merged = array_merge( $a, $b );
+
+     // Nothing to combine unless both sides supply a class.
+     if( !isset( $a['class'] ) || !isset( $b['class'] ) ) {
+         return $merged;
+     }
+
+     $first = $a['class'];
+     $second = $b['class'];
+     // Non-string or identical values are left as array_merge produced them.
+     if( !is_string( $first ) || !is_string( $second ) || $first === $second ) {
+         return $merged;
+     }
+
+     $names = preg_split( '/\s+/', $first . ' ' . $second, -1, PREG_SPLIT_NO_EMPTY );
+     $merged['class'] = implode( ' ', array_unique( $names ) );
+     return $merged;
+ }
+
/**
* Pick apart some CSS and check it for forbidden or unsafe structures.
* Returns a sanitized string, or false if it was just too evil.
*
* Currently URL references, 'expression', 'tps' are forbidden.
*
- * @param string $value
- * @return mixed
+ * @param $value String
+ * @return Mixed
*/
static function checkCss( $value ) {
$stripped = Sanitizer::decodeCharReferences( $value );
// Remove any comments; IE gets token splitting wrong
$stripped = StringUtils::delimiterReplace( '/*', '*/', ' ', $stripped );
-
+
$value = $stripped;
// ... and continue checks
$stripped = preg_replace( '!\\\\([0-9A-Fa-f]{1,6})[ \\n\\r\\t\\f]?!e',
'codepointToUtf8(hexdec("$1"))', $stripped );
$stripped = str_replace( '\\', '', $stripped );
- if( preg_match( '/(expression|tps*:\/\/|url\\s*\().*/is',
+ if( preg_match( '/(?:expression|tps*:\/\/|url\\s*\().*/is',
$stripped ) ) {
# haxx0r
return false;
}
-
+
return $value;
}
* - Unsafe style attributes are discarded
* - Prepends space if there are attributes.
*
- * @param string $text
- * @param string $element
- * @return string
+ * @param $text String
+ * @param $element String
+ * @return String
*/
static function fixTagAttributes( $text, $element ) {
if( trim( $text ) == '' ) {
return '';
}
-
+
$stripped = Sanitizer::validateTagAttributes(
Sanitizer::decodeTagAttributes( $text ), $element );
-
+
$attribs = array();
foreach( $stripped as $attribute => $value ) {
$encAttribute = htmlspecialchars( $attribute );
$encValue = Sanitizer::safeEncodeAttribute( $value );
-
+
$attribs[] = "$encAttribute=\"$encValue\"";
}
return count( $attribs ) ? ' ' . implode( ' ', $attribs ) : '';
/**
* Encode an attribute value for HTML output.
- * @param $text
+ * @param $text String
* @return HTML-encoded text fragment
*/
static function encodeAttribute( $text ) {
- $encValue = htmlspecialchars( $text );
-
+ $encValue = htmlspecialchars( $text, ENT_QUOTES );
+
// Whitespace is normalized during attribute decoding,
// so if we've been passed non-spaces we must encode them
// ahead of time or they won't be preserved.
"\r" => ' ',
"\t" => '	',
) );
-
+
return $encValue;
}
-
+
/**
* Encode an attribute value for HTML tags, with extra armoring
* against further wiki processing.
- * @param $text
+ * @param $text String
* @return HTML-encoded text fragment
*/
static function safeEncodeAttribute( $text ) {
$encValue = Sanitizer::encodeAttribute( $text );
-
+
# Templates and links may be expanded in later parsing,
# creating invalid or dangerous output. Suppress this.
$encValue = strtr( $encValue, array(
* Given a value escape it so that it can be used in an id attribute and
* return it, this does not validate the value however (see first link)
*
- * @link http://www.w3.org/TR/html401/types.html#type-name Valid characters
+ * @see http://www.w3.org/TR/html401/types.html#type-name Valid characters
* in the id and
* name attributes
- * @link http://www.w3.org/TR/html401/struct/links.html#h-12.2.3 Anchors with the id attribute
- *
- * @bug 4461
+ * @see http://www.w3.org/TR/html401/struct/links.html#h-12.2.3 Anchors with the id attribute
*
- * @static
- *
- * @param string $id
- * @return string
+ * @param $id String: id to validate
+ * @param $options Mixed: string or array of strings (default is array()):
+ * 'noninitial': This is a non-initial fragment of an id, not a full id,
+ * so don't pay attention if the first character isn't valid at the
+ * beginning of an id.
+ * 'xml': Don't restrict the id to be HTML4-compatible. This option
+ * allows any alphabetic character to be used, per the XML standard.
+ * Therefore, it also completely changes the type of escaping: instead
+ * of weird dot-encoding, runs of invalid characters (mostly
+ * whitespace) are just compressed into a single underscore.
+ * @return String
*/
- static function escapeId( $id ) {
- static $replace = array(
- '%3A' => ':',
- '%' => '.'
- );
+ static function escapeId( $id, $options = array() ) {
+ $options = (array)$options;
+
+ if ( !in_array( 'xml', $options ) ) {
+ # HTML4-style escaping
+ static $replace = array(
+ '%3A' => ':',
+ '%' => '.'
+ );
- $id = urlencode( Sanitizer::decodeCharReferences( strtr( $id, ' ', '_' ) ) );
+ $id = urlencode( Sanitizer::decodeCharReferences( strtr( $id, ' ', '_' ) ) );
+ $id = str_replace( array_keys( $replace ), array_values( $replace ), $id );
- return str_replace( array_keys( $replace ), array_values( $replace ), $id );
+ if ( !preg_match( '/^[a-zA-Z]/', $id )
+ && !in_array( 'noninitial', $options ) ) {
+ // Initial character must be a letter!
+ $id = "x$id";
+ }
+ return $id;
+ }
+
+ # XML-style escaping. For the patterns used, see the XML 1.0 standard,
+ # 5th edition, NameStartChar and NameChar: <http://www.w3.org/TR/REC-xml/>
+ $nameStartChar = ':a-zA-Z_\xC0-\xD6\xD8-\xF6\xF8-\x{2FF}\x{370}-\x{37D}'
+ . '\x{37F}-\x{1FFF}\x{200C}-\x{200D}\x{2070}-\x{218F}\x{2C00}-\x{2FEF}'
+ . '\x{3001}-\x{D7FF}\x{F900}-\x{FDCF}\x{FDF0}-\x{FFFD}\x{10000}-\x{EFFFF}';
+ $nameChar = $nameStartChar . '.\-0-9\xB7\x{0300}-\x{036F}'
+ . '\x{203F}-\x{2040}';
+ # Replace _ as well so we don't get multiple consecutive underscores
+ $id = preg_replace( "/([^$nameChar]|_)+/u", '_', $id );
+ $id = trim( $id, '_' );
+
+ if ( !preg_match( "/^[$nameStartChar]/u", $id )
+ && !in_array( 'noninitial', $options ) ) {
+ $id = "_$id";
+ }
+
+ return $id;
}
/**
* Given a value, escape it so that it can be used as a CSS class and
* return it.
*
- * TODO: For extra validity, input should be validated UTF-8.
+ * @todo For extra validity, input should be validated UTF-8.
*
- * @link http://www.w3.org/TR/CSS21/syndata.html Valid characters/format
+ * @see http://www.w3.org/TR/CSS21/syndata.html Valid characters/format
*
- * @param string $class
- * @return string
+ * @param $class String
+ * @return String
*/
static function escapeClass( $class ) {
// Convert ugly stuff to underscores and kill underscores in ugly places
$class ), '_');
}
+ /**
+  * Given HTML input, escape with htmlspecialchars but un-escape entities.
+  * This allows (generally harmless) entities like &nbsp; to survive.
+  *
+  * @param $html String to escape
+  * @return String: escaped input
+  */
+ static function escapeHtmlAllowEntities( $html ) {
+     # It seems wise to escape ' as well as ", as a matter of course. Can't
+     # hurt.
+     $html = htmlspecialchars( $html, ENT_QUOTES );
+     # htmlspecialchars() has turned every & into &amp;. Turn those back
+     # into a bare & so that entity references in the input are intact again.
+     $html = str_replace( '&amp;', '&', $html );
+     # Let normalizeCharReferences make sure what remains is legal.
+     $html = Sanitizer::normalizeCharReferences( $html );
+     return $html;
+ }
+
/**
* Regex replace callback for armoring links against further processing.
- * @param array $matches
+ * @param $matches Array
* @return string
- * @private
*/
private static function armorLinksCallback( $matches ) {
return str_replace( ':', ':', $matches[1] );
* a partial tag string. Attribute names are forced to lowercase,
* character references are decoded to UTF-8 text.
*
- * @param string
- * @return array
+ * @param $text String
+ * @return Array
*/
- static function decodeTagAttributes( $text ) {
+ public static function decodeTagAttributes( $text ) {
$attribs = array();
if( trim( $text ) == '' ) {
foreach( $pairs as $set ) {
$attribute = strtolower( $set[1] );
$value = Sanitizer::getTagAttributeCallback( $set );
-
+
// Normalize whitespace
$value = preg_replace( '/[\t\r\n ]+/', ' ', $value );
$value = trim( $value );
-
+
// Decode character references
$attribs[$attribute] = Sanitizer::decodeCharReferences( $value );
}
* Pick the appropriate attribute value from a match set from the
* MW_ATTRIBS_REGEX matches.
*
- * @param array $set
- * @return string
- * @private
+ * @param $set Array
+ * @return String
*/
private static function getTagAttributeCallback( $set ) {
if( isset( $set[6] ) ) {
* but note that we're not returning the value, but are returning
* XML source fragments that will be slapped into output.
*
- * @param string $text
- * @return string
- * @private
+ * @param $text String
+ * @return String
*/
private static function normalizeAttributeValue( $text ) {
+ # Normalize character references and whitespace first, then replace
+ # each literal double quote with the &quot; entity.
return str_replace( '"', '&quot;',
- preg_replace(
- '/\r\n|[\x20\x0d\x0a\x09]/',
- ' ',
+ self::normalizeWhitespace(
Sanitizer::normalizeCharReferences( $text ) ) );
}
+ /**
+  * Replace each whitespace unit in the text with one space character.
+  * A CR LF pair counts as a single unit; a lone space, CR, LF or tab
+  * counts as one unit each. Runs of whitespace are not collapsed.
+  *
+  * @param $text String
+  * @return String
+  */
+ private static function normalizeWhitespace( $text ) {
+     $whitespaceUnit = '/\r\n|[\x20\x0d\x0a\x09]/';
+     $singleSpace = ' ';
+     return preg_replace( $whitespaceUnit, $singleSpace, $text );
+ }
+
/**
* Ensure that any entities and character references are legal
* for XML and XHTML specifically. Any stray bits will be
* c. use &#x, not &#X
* d. fix or reject non-valid attributes
*
- * @param string $text
- * @return string
+ * @param $text String
+ * @return String
* @private
*/
static function normalizeCharReferences( $text ) {
$text );
}
/**
- * @param string $matches
- * @return string
+ * @param $matches String
+ * @return String
*/
static function normalizeCharReferencesCallback( $matches ) {
$ret = null;
/**
* If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD,
- * return the named entity reference as is. Otherwise, returns
- * HTML-escaped text of pseudo-entity source (eg &foo;)
+ * return the named entity reference as is. If the entity is a
+ * MediaWiki-specific alias, returns the HTML equivalent. Otherwise,
+ * returns HTML-escaped text of pseudo-entity source (eg &amp;foo;)
*
- * @param string $name
- * @return string
- * @static
+ * @param $name String
+ * @return String
*/
static function normalizeEntity( $name ) {
- global $wgHtmlEntities;
- if( isset( $wgHtmlEntities[$name] ) ) {
+ global $wgHtmlEntities, $wgHtmlEntityAliases;
+ if ( isset( $wgHtmlEntityAliases[$name] ) ) {
+ return "&{$wgHtmlEntityAliases[$name]};";
+ } elseif( isset( $wgHtmlEntities[$name] ) ) {
return "&$name;";
} else {
return "&$name;";
/**
* Returns true if a given Unicode codepoint is a valid character in XML.
- * @param int $codepoint
- * @return bool
+ * @param $codepoint Integer
+ * @return Boolean
*/
private static function validateCodepoint( $codepoint ) {
return ($codepoint == 0x09)
* Decode any character references, numeric or named entities,
* in the text and return a UTF-8 string.
*
- * @param string $text
- * @return string
- * @public
- * @static
+ * @param $text String
+ * @return String
*/
public static function decodeCharReferences( $text ) {
return preg_replace_callback(
}
/**
- * @param string $matches
- * @return string
+ * @param $matches String
+ * @return String
*/
static function decodeCharReferencesCallback( $matches ) {
if( $matches[1] != '' ) {
/**
* Return UTF-8 string for a codepoint if that is a valid
* character reference, otherwise U+FFFD REPLACEMENT CHARACTER.
- * @param int $codepoint
- * @return string
+ * @param $codepoint Integer
+ * @return String
* @private
*/
static function decodeChar( $codepoint ) {
* return the UTF-8 encoding of that character. Otherwise, returns
* pseudo-entity source (eg &foo;)
*
- * @param string $name
- * @return string
+ * @param $name String
+ * @return String
*/
static function decodeEntity( $name ) {
- global $wgHtmlEntities;
+ global $wgHtmlEntities, $wgHtmlEntityAliases;
+ if ( isset( $wgHtmlEntityAliases[$name] ) ) {
+ $name = $wgHtmlEntityAliases[$name];
+ }
if( isset( $wgHtmlEntities[$name] ) ) {
return codepointToUtf8( $wgHtmlEntities[$name] );
} else {
}
/**
- * Fetch the whitelist of acceptable attributes for a given
- * element name.
+ * Fetch the whitelist of acceptable attributes for a given element name.
*
- * @param string $element
- * @return array
+ * @param $element String
+ * @return Array
*/
static function attributeWhitelist( $element ) {
static $list;
}
/**
- * @todo Document it a bit
- * @return array
+ * Foreach array key (an allowed HTML element), return an array
+ * of allowed attributes
+ * @return Array
*/
static function setupAttributeWhitelist() {
$common = array( 'id', 'class', 'lang', 'dir', 'title', 'style' );
'td' => array_merge( $common, $tablecell, $tablealign ),
'th' => array_merge( $common, $tablecell, $tablealign ),
+ # 13.2
+ # Not usually allowed, but may be used for extension-style hooks
+ # such as <math> when it is rasterized
+ 'img' => array_merge( $common, array( 'alt' ) ),
+
# 15.2.1
'tt' => $common,
'b' => $common,
'rb' => $common,
'rt' => $common, #array_merge( $common, array( 'rbspan' ) ),
'rp' => $common,
+
+ # MathML root element, where used for extensions
+ # 'title' may not be 100% valid here; it's XHTML
+ # http://www.w3.org/TR/REC-MathML/
+ 'math' => array( 'class', 'style', 'id', 'title' ),
);
return $whitelist;
}
/**
* Take a fragment of (potentially invalid) HTML and return
- * a version with any tags removed, encoded suitably for literal
- * inclusion in an attribute value.
+ * a version with any tags removed, encoded as plain text.
*
- * @param string $text HTML fragment
- * @return string
+ * Warning: this return value must be further escaped for literal
+ * inclusion in HTML output as of 1.10!
+ *
+ * @param $text String: HTML fragment
+ * @return String
*/
static function stripAllTags( $text ) {
# Actual <tags>
$text = StringUtils::delimiterReplace( '<', '>', '', $text );
# Normalize &entities and whitespace
- $text = Sanitizer::normalizeAttributeValue( $text );
-
- # Will be placed into "double-quoted" attributes,
- # make sure remaining bits are safe.
- $text = str_replace(
- array('<', '>', '"'),
- array('<', '>', '"'),
- $text );
+ $text = self::decodeCharReferences( $text );
+ $text = self::normalizeWhitespace( $text );
return $text;
}
*
* Use for passing XHTML fragments to PHP's XML parsing functions
*
- * @return string
- * @static
+ * @return String
*/
static function hackDocType() {
global $wgHtmlEntities;
$out .= "]>\n";
return $out;
}
-
- static function cleanUrl( $url, $hostname=true ) {
+
+ static function cleanUrl( $url ) {
# Normalize any HTML entities in input. They will be
# re-escaped by makeExternalLink().
$url = Sanitizer::decodeCharReferences( $url );
# Escape any control characters introduced by the above step
$url = preg_replace( '/[\][<>"\\x00-\\x20\\x7F]/e', "urlencode('\\0')", $url );
-
+
# Validate hostname portion
$matches = array();
if( preg_match( '!^([^:]+:)(//[^/]+)?(.*)$!iD', $url, $matches ) ) {
list( /* $whole */, $protocol, $host, $rest ) = $matches;
-
+
// Characters that will be ignored in IDNs.
// http://tools.ietf.org/html/3454#section-3.1
// Strip them before further processing so blacklists and such work.
\xe2\x80\x8d| # 200d ZERO WIDTH JOINER
[\xef\xb8\x80-\xef\xb8\x8f] # fe00-fe00f VARIATION SELECTOR-1-16
/xuD";
-
+
$host = preg_replace( $strip, '', $host );
-
+
// @fixme: validate hostnames here
-
+
return $protocol . $host . $rest;
} else {
return $url;
}
}
-
-?>