* HTML sanitizer for %MediaWiki.
*
* Copyright © 2002-2005 Brion Vibber <brion@pobox.com> et al
- * http://www.mediawiki.org/
+ * https://www.mediawiki.org/
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* Cleans up HTML, removes dangerous tags and attributes, and
* removes HTML comments
* @private
- * @param $text String
- * @param $processCallback Callback to do any variable or parameter
- * replacements in HTML attribute values
- * @param array $args for the processing callback
- * @param array $extratags for any extra tags to include
- * @param array $removetags for any tags (default or extra) to exclude
+ * @param string $text
+ * @param callable $processCallback Callback to do any variable or parameter
+ * replacements in HTML attribute values
+ * @param array|bool $args Arguments for the processing callback
+ * @param array $extratags For any extra tags to include
+ * @param array $removetags For any tags (default or extra) to exclude
* @return string
*/
static function removeHTMLtags( $text, $processCallback = null,
$badtag = true;
} elseif ( $slash ) {
# Closing a tag... is it the one we just opened?
- $ot = @array_pop( $tagstack );
+ wfSuppressWarnings();
+ $ot = array_pop( $tagstack );
+ wfRestoreWarnings();
+
if ( $ot != $t ) {
if ( isset( $htmlsingleallowed[$ot] ) ) {
# Pop all elements with an optional close tag
}
}
} else {
- @array_push( $tagstack, $ot );
+ wfSuppressWarnings();
+ array_push( $tagstack, $ot );
+ wfRestoreWarnings();
+
# <li> can be nested in <ul> or <ol>, skip those cases:
if ( !isset( $htmllist[$ot] ) || !isset( $listtags[$t] ) ) {
$badtag = true;
} else {
# this might be possible using tidy itself
foreach ( $bits as $x ) {
- preg_match( '/^(\\/?)(\\w+)([^>]*?)(\\/{0,1}>)([^<]*)$/',
- $x, $regs );
- @list( /* $qbar */, $slash, $t, $params, $brace, $rest ) = $regs;
+ preg_match(
+ '/^(\\/?)(\\w+)([^>]*?)(\\/{0,1}>)([^<]*)$/',
+ $x,
+ $regs
+ );
+
+ wfSuppressWarnings();
+ list( /* $qbar */, $slash, $t, $params, $brace, $rest ) = $regs;
+ wfRestoreWarnings();
+
$badtag = false;
if ( isset( $htmlelements[$t = strtolower( $t )] ) ) {
if ( is_callable( $processCallback ) ) {
* trailing spaces and one of the newlines.
*
* @private
- * @param $text String
+ * @param string $text
* @return string
*/
static function removeHTMLcomments( $text ) {
# Remove the comment, leading and trailing
# spaces, and leave only one newline.
$text = substr_replace( $text, "\n", $spaceStart, $spaceLen + 1 );
- }
- else {
+ } else {
# Remove just the comment.
$text = substr_replace( $text, '', $start, $end - $start );
}
* where we may want to allow a tag within content but ONLY when it has
* specific attributes set.
*
- * @param $params
- * @param $element
+ * @param string $params
+ * @param string $element
* @return bool
*/
static function validateTag( $params, $element ) {
* - Unsafe style attributes are discarded
* - Invalid id attributes are re-encoded
*
- * @param $attribs Array
- * @param $element String
- * @return Array
+ * @param array $attribs
+ * @param string $element
+ * @return array
*
* @todo Check for legal values where the DTD limits things.
* @todo Check for unique id attribute :P
* - Unsafe style attributes are discarded
* - Invalid id attributes are re-encoded
*
- * @param $attribs Array
+ * @param array $attribs
* @param array $whitelist list of allowed attribute names
- * @return Array
+ * @return array
*
* @todo Check for legal values where the DTD limits things.
* @todo Check for unique id attribute :P
* will be combined (if they're both strings).
*
* @todo implement merging for other attributes such as style
- * @param $a Array
- * @param $b Array
+ * @param array $a
+ * @param array $b
* @return array
*/
static function mergeAttributes( $a, $b ) {
* clever input strings. These character references must
* be escaped before the return value is embedded in HTML.
*
- * @param $value String
- * @return String
+ * @param string $value
+ * @return string
*/
static function checkCss( $value ) {
// Decode character references like &#123;
// Normalize Halfwidth and Fullwidth Unicode block that IE6 might treat as ascii
$value = preg_replace_callback(
- '/[！-ｚ]/u', // U+FF01 to U+FF5A
+ '/[！-［］-ｚ]/u', // U+FF01 to U+FF5A, excluding U+FF3C (bug 58088)
function ( $matches ) {
$cp = utf8ToCodepoint( $matches[0] );
if ( $cp === false ) {
// Reject problematic keywords and control characters
if ( preg_match( '/[\000-\010\013\016-\037\177]/', $value ) ) {
return '/* invalid control char */';
- } elseif ( preg_match( '! expression | filter\s*: | accelerator\s*: | url\s*\( | image\s*\( | image-set\s*\( !ix', $value ) ) {
+ } elseif ( preg_match(
+ '! expression
+ | filter\s*:
+ | accelerator\s*:
+ | -o-link\s*:
+ | -o-link-source\s*:
+ | -o-replace\s*:
+ | url\s*\(
+ | image\s*\(
+ | image-set\s*\(
+ !ix', $value ) ) {
return '/* insecure input */';
}
return $value;
}
/**
- * @param $matches array
- * @return String
+ * @param array $matches
+ * @return string
*/
static function cssDecodeCallback( $matches ) {
if ( $matches[1] !== '' ) {
* - Unsafe style attributes are discarded
* - Prepends space if there are attributes.
*
- * @param $text String
- * @param $element String
- * @return String
+ * @param string $text
+ * @param string $element
+ * @return string
*/
static function fixTagAttributes( $text, $element ) {
if ( trim( $text ) == '' ) {
/**
* Encode an attribute value for HTML output.
- * @param $text String
- * @return HTML-encoded text fragment
+ * @param string $text
+ * @return string HTML-encoded text fragment
*/
static function encodeAttribute( $text ) {
$encValue = htmlspecialchars( $text, ENT_QUOTES );
/**
* Encode an attribute value for HTML tags, with extra armoring
* against further wiki processing.
- * @param $text String
- * @return HTML-encoded text fragment
+ * @param string $text
+ * @return string HTML-encoded text fragment
*/
static function safeEncodeAttribute( $text ) {
$encValue = Sanitizer::encodeAttribute( $text );
* (which don't work reliably in fragments cross-browser).
*
* @see http://www.w3.org/TR/html401/types.html#type-name Valid characters
- * in the id and
- * name attributes
- * @see http://www.w3.org/TR/html401/struct/links.html#h-12.2.3 Anchors with the id attribute
+ * in the id and name attributes
+ * @see http://www.w3.org/TR/html401/struct/links.html#h-12.2.3 Anchors with
+ * the id attribute
* @see http://www.whatwg.org/html/elements.html#the-id-attribute
* HTML5 definition of id attribute
*
* @param string $id id to escape
- * @param $options Mixed: string or array of strings (default is array()):
+ * @param string|array $options String or array of strings (default is array()):
* 'noninitial': This is a non-initial fragment of an id, not a full id,
* so don't pay attention if the first character isn't valid at the
* beginning of an id. Only matters if $wgExperimentalHtmlIds is
* 'legacy': Behave the way the old HTML 4-based ID escaping worked even
* if $wgExperimentalHtmlIds is used, so we can generate extra
* anchors and links won't break.
- * @return String
+ * @return string
*/
static function escapeId( $id, $options = array() ) {
global $wgExperimentalHtmlIds;
*
* @see http://www.w3.org/TR/CSS21/syndata.html Valid characters/format
*
- * @param $class String
- * @return String
+ * @param string $class
+ * @return string
*/
static function escapeClass( $class ) {
// Convert ugly stuff to underscores and kill underscores in ugly places
* Given HTML input, escape with htmlspecialchars but un-escape entities.
* This allows (generally harmless) entities like &#160; to survive.
*
- * @param string $html to escape
- * @return String: escaped input
+ * @param string $html HTML to escape
+ * @return string Escaped input
*/
static function escapeHtmlAllowEntities( $html ) {
$html = Sanitizer::decodeCharReferences( $html );
/**
* Regex replace callback for armoring links against further processing.
- * @param $matches Array
+ * @param array $matches
* @return string
*/
private static function armorLinksCallback( $matches ) {
* a partial tag string. Attribute names are forced to lowercase,
* character references are decoded to UTF-8 text.
*
- * @param $text String
- * @return Array
+ * @param string $text
+ * @return array
*/
public static function decodeTagAttributes( $text ) {
if ( trim( $text ) == '' ) {
* Build a partial tag string from an associative array of attribute
* names and values as returned by decodeTagAttributes.
*
- * @param $assoc_array Array
- * @return String
+ * @param array $assoc_array
+ * @return string
*/
public static function safeEncodeTagAttributes( $assoc_array ) {
$attribs = array();
* Pick the appropriate attribute value from a match set from the
* attribs regex matches.
*
- * @param $set Array
- * @throws MWException
- * @return String
+ * @param array $set
+ * @throws MWException when tag conditions are not met.
+ * @return string
*/
private static function getTagAttributeCallback( $set ) {
if ( isset( $set[6] ) ) {
* but note that we're not returning the value, but are returning
* XML source fragments that will be slapped into output.
*
- * @param $text String
- * @return String
+ * @param string $text
+ * @return string
+ * @todo Remove, unused?
*/
private static function normalizeAttributeValue( $text ) {
return str_replace( '"', '&quot;',
}
/**
- * @param $text string
- * @return mixed
+ * @param string $text
+ * @return string
*/
private static function normalizeWhitespace( $text ) {
return preg_replace(
* by Parser::stripSectionName(), for use in the id's that are used for
* section links.
*
- * @param $section String
- * @return String
+ * @param string $section
+ * @return string
*/
static function normalizeSectionNameWhitespace( $section ) {
return trim( preg_replace( '/[ _]+/', ' ', $section ) );
* c. use lower cased "&#x", not "&#X"
* d. fix or reject non-valid attributes
*
- * @param $text String
- * @return String
+ * @param string $text
+ * @return string
* @private
*/
static function normalizeCharReferences( $text ) {
array( 'Sanitizer', 'normalizeCharReferencesCallback' ),
$text );
}
+
/**
- * @param $matches String
- * @return String
+ * @param array $matches
+ * @return string
*/
static function normalizeCharReferencesCallback( $matches ) {
$ret = null;
* the HTML equivalent. Otherwise, returns HTML-escaped text of
* pseudo-entity source (eg &foo;)
*
- * @param $name String
- * @return String
+ * @param string $name
+ * @return string
*/
static function normalizeEntity( $name ) {
if ( isset( self::$htmlEntityAliases[$name] ) ) {
}
/**
- * @param $codepoint
+ * @param int $codepoint
* @return null|string
*/
static function decCharReference( $codepoint ) {
}
/**
- * @param $codepoint
+ * @param int $codepoint
* @return null|string
*/
static function hexCharReference( $codepoint ) {
/**
* Returns true if a given Unicode codepoint is a valid character in XML.
- * @param $codepoint Integer
- * @return Boolean
+ * @param int $codepoint
+ * @return bool
*/
private static function validateCodepoint( $codepoint ) {
return $codepoint == 0x09
* Decode any character references, numeric or named entities,
* in the text and return a UTF-8 string.
*
- * @param $text String
- * @return String
+ * @param string $text
+ * @return string
*/
public static function decodeCharReferences( $text ) {
return preg_replace_callback(
* This is useful for page titles, not for text to be displayed,
* MediaWiki allows HTML entities to escape normalization as a feature.
*
- * @param string $text (already normalized, containing entities)
- * @return String (still normalized, without entities)
+ * @param string $text Already normalized, containing entities
+ * @return string Still normalized, without entities
*/
public static function decodeCharReferencesAndNormalize( $text ) {
global $wgContLang;
}
/**
- * @param $matches String
- * @return String
+ * @param array $matches
+ * @return string
*/
static function decodeCharReferencesCallback( $matches ) {
if ( $matches[1] != '' ) {
/**
* Return UTF-8 string for a codepoint if that is a valid
* character reference, otherwise U+FFFD REPLACEMENT CHARACTER.
- * @param $codepoint Integer
- * @return String
+ * @param int $codepoint
+ * @return string
* @private
*/
static function decodeChar( $codepoint ) {
* return the UTF-8 encoding of that character. Otherwise, returns
* pseudo-entity source (eg "&foo;")
*
- * @param $name String
- * @return String
+ * @param string $name
+ * @return string
*/
static function decodeEntity( $name ) {
if ( isset( self::$htmlEntityAliases[$name] ) ) {
/**
* Fetch the whitelist of acceptable attributes for a given element name.
*
- * @param $element String
- * @return Array
+ * @param string $element
+ * @return array
*/
static function attributeWhitelist( $element ) {
$list = Sanitizer::setupAttributeWhitelist();
/**
* Foreach array key (an allowed HTML element), return an array
* of allowed attributes
- * @return Array
+ * @return array
*/
static function setupAttributeWhitelist() {
global $wgAllowRdfaAttributes, $wgAllowMicrodataAttributes;
-
static $whitelist, $staticInitialised;
+
$globalContext = implode( '-', compact( 'wgAllowRdfaAttributes', 'wgAllowMicrodataAttributes' ) );
- if ( isset( $whitelist ) && $staticInitialised == $globalContext ) {
+ if ( $whitelist !== null && $staticInitialised == $globalContext ) {
return $whitelist;
}
* inclusion in HTML output as of 1.10!
*
* @param string $text HTML fragment
- * @return String
+ * @return string
*/
static function stripAllTags( $text ) {
# Actual <tags>
*
* Use for passing XHTML fragments to PHP's XML parsing functions
*
- * @return String
+ * @return string
*/
static function hackDocType() {
$out = "<!DOCTYPE html [\n";
}
/**
- * @param $url string
+ * @param string $url
* @return mixed|string
*/
static function cleanUrl( $url ) {
}
/**
- * @param $matches array
+ * @param array $matches
* @return string
*/
static function cleanUrlCallback( $matches ) {
* @since 1.18
*
* @param string $addr E-mail address
- * @return Bool
+ * @return bool
*/
public static function validateEmail( $addr ) {
$result = null;
$rfc5322_atext = "a-z0-9!#$%&'*+\\-\/=?^_`{|}~";
$rfc1034_ldh_str = "a-z0-9\\-";
- $HTML5_email_regexp = "/
+ $html5_email_regexp = "/
^ # start of string
[$rfc5322_atext\\.]+ # user part which is liberal :p
@ # 'at' sign
$ # End of string
/ix"; // case Insensitive, eXtended
- return (bool)preg_match( $HTML5_email_regexp, $addr );
+ return (bool)preg_match( $html5_email_regexp, $addr );
}
}