/**
* Acceptable tag name charset from HTML5 parsing spec
- * http://www.w3.org/TR/html5/syntax.html#tag-open-state
+ * https://www.w3.org/TR/html5/syntax.html#tag-open-state
*/
const ELEMENT_BITS_REGEX = '!^(/?)([A-Za-z][^\t\n\v />\0]*+)([^>]*?)(/?>)([^<]*)$!';
/**
* List of all named character entities defined in HTML 4.01
- * http://www.w3.org/TR/html4/sgml/entities.html
+ * https://www.w3.org/TR/html4/sgml/entities.html
* As well as &apos; which is only defined starting in XHTML1.
*/
private static $htmlEntities = [
/**
* Regular expression to match HTML/XML attribute pairs within a tag.
* Allows some... latitude. Based on,
- * http://www.w3.org/TR/html5/syntax.html#before-attribute-value-state
+ * https://www.w3.org/TR/html5/syntax.html#before-attribute-value-state
* Used in Sanitizer::fixTagAttributes and Sanitizer::decodeTagAttributes
* @return string
*/
'kbd', 'samp', 'data', 'time', 'mark'
];
$htmlsingle = [
- 'br', 'wbr', 'hr', 'li', 'dt', 'dd'
- ];
- $htmlsingleonly = [ # Elements that cannot have close tags
- 'br', 'wbr', 'hr'
+ 'br', 'wbr', 'hr', 'li', 'dt', 'dd', 'meta', 'link'
];
- $htmlsingle[] = $htmlsingleonly[] = 'meta';
- $htmlsingle[] = $htmlsingleonly[] = 'link';
+ # Elements that cannot have close tags. This is (not coincidentally)
+ # also the list of tags for which the HTML 5 parsing algorithm
+ # requires you to "acknowledge the token's self-closing flag", i.e.
+ # a self-closing tag like <br/> is not an HTML 5 parse error only
+ # for this list.
+ $htmlsingleonly = [
+ 'br', 'wbr', 'hr', 'meta', 'link'
+ ];
$htmlnest = [ # Tags that can be nested--??
'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
* @param array|bool $args Arguments for the processing callback
* @param array $extratags For any extra tags to include
* @param array $removetags For any tags (default or extra) to exclude
+ * @param callable $warnCallback (Deprecated) Callback allowing the
+ * addition of a tracking category when bad input is encountered.
+ * DO NOT ADD NEW PARAMETERS AFTER $warnCallback, since it will be
+ * removed shortly.
* @return string
*/
public static function removeHTMLtags( $text, $processCallback = null,
- $args = [], $extratags = [], $removetags = []
+ $args = [], $extratags = [], $removetags = [], $warnCallback = null
) {
extract( self::getRecognizedTagData( $extratags, $removetags ) );
$badtag = true;
# Is it a self closed htmlpair ? (bug 5487)
} elseif ( $brace == '/>' && isset( $htmlpairs[$t] ) ) {
+ // Eventually we'll just remove the self-closing
+ // slash, in order to be consistent with HTML5
+ // semantics.
+ // $brace = '>';
+ // For now, let's just warn authors to clean up.
+ if ( is_callable( $warnCallback ) ) {
+ call_user_func_array( $warnCallback, [ 'deprecated-self-close-category' ] );
+ }
$badtag = true;
} elseif ( isset( $htmlsingleonly[$t] ) ) {
# Hack to force empty tag for unclosable elements
call_user_func_array( $processCallback, [ &$params, $args ] );
}
+ if ( $brace == '/>' && !( isset( $htmlsingle[$t] ) || isset( $htmlsingleonly[$t] ) ) ) {
+ // Eventually we'll just remove the self-closing
+ // slash, in order to be consistent with HTML5
+ // semantics.
+ // $brace = '>';
+ // For now, let's just warn authors to clean up.
+ if ( is_callable( $warnCallback ) ) {
+ call_user_func_array( $warnCallback, [ 'deprecated-self-close-category' ] );
+ }
+ }
if ( !Sanitizer::validateTag( $params, $t ) ) {
$badtag = true;
}
$newparams = Sanitizer::fixTagAttributes( $params, $t );
if ( !$badtag ) {
+ if ( $brace === '/>' && !isset( $htmlsingleonly[$t] ) ) {
+ # Interpret self-closing tags as empty tags even when
+ # HTML 5 would interpret them as start tags. Such input
+ # is commonly seen on Wikimedia wikis with this intention.
+ $brace = "></$t>";
+ }
+
$rest = str_replace( '>', '>', $rest );
$text .= "<$slash$t$newparams$brace$rest";
continue;
| url\s*\(
| image\s*\(
| image-set\s*\(
+ | attr\s*\([^)]+[\s,]+url
!ix', $value ) ) {
return '/* insecure input */';
}
* - Double attributes are discarded
* - Unsafe style attributes are discarded
* - Prepends space if there are attributes.
+ * - (Optionally) Sorts attributes by name.
*
* @param string $text
* @param string $element
+ * @param bool $sorted Whether to sort the attributes (default: false)
* @return string
*/
- static function fixTagAttributes( $text, $element ) {
+ static function fixTagAttributes( $text, $element, $sorted = false ) { // $sorted defaults to false, so existing two-argument calls skip the sort below
if ( trim( $text ) == '' ) {
return '';
}
$decoded = Sanitizer::decodeTagAttributes( $text );
$stripped = Sanitizer::validateTagAttributes( $decoded, $element );
+ if ( $sorted ) { // reordering is opt-in
+ ksort( $stripped ); // ksort() orders by array key; presumably the keys are attribute names -- confirm against validateTagAttributes()
+ }
+
return Sanitizer::safeEncodeTagAttributes( $stripped );
}
* ambiguous if it's part of something that looks like a percent escape
* (which don't work reliably in fragments cross-browser).
*
- * @see http://www.w3.org/TR/html401/types.html#type-name Valid characters
+ * @see https://www.w3.org/TR/html401/types.html#type-name Valid characters
* in the id and name attributes
- * @see http://www.w3.org/TR/html401/struct/links.html#h-12.2.3 Anchors with
+ * @see https://www.w3.org/TR/html401/struct/links.html#h-12.2.3 Anchors with
* the id attribute
- * @see http://www.whatwg.org/html/elements.html#the-id-attribute
+ * @see https://www.w3.org/TR/html5/dom.html#the-id-attribute
* HTML5 definition of id attribute
*
* @param string $id Id to escape
*
* @todo For extra validity, input should be validated UTF-8.
*
- * @see http://www.w3.org/TR/CSS21/syndata.html Valid characters/format
+ * @see https://www.w3.org/TR/CSS21/syndata.html Valid characters/format
*
* @param string $class
* @return string
} elseif ( !isset( $set[2] ) ) {
# In XHTML, attributes must have a value so return an empty string.
# See "Empty attribute syntax",
- # http://www.w3.org/TR/html5/syntax.html#syntax-attribute-name
+ # https://www.w3.org/TR/html5/syntax.html#syntax-attribute-name
return "";
} else {
throw new MWException( "Tag conditions not met. This should never happen and is a bug." );
# RDFa
# These attributes are specified in section 9 of
- # http://www.w3.org/TR/2008/REC-rdfa-syntax-20081014
+ # https://www.w3.org/TR/2008/REC-rdfa-syntax-20081014
'about',
'property',
'resource',
'typeof',
# Microdata. These are specified by
- # http://www.whatwg.org/html/microdata.html#the-microdata-model
+ # https://html.spec.whatwg.org/multipage/microdata.html#the-microdata-model
'itemid',
'itemprop',
'itemref',
];
# Numbers refer to sections in HTML 4.01 standard describing the element.
- # See: http://www.w3.org/TR/html4/
+ # See: https://www.w3.org/TR/html4/
$whitelist = [
# 7.5.4
'div' => $block,
# 9.3.2
'br' => array_merge( $common, [ 'clear' ] ),
- # http://www.whatwg.org/html/text-level-semantics.html#the-wbr-element
+ # https://www.w3.org/TR/html5/text-level-semantics.html#the-wbr-element
'wbr' => $common,
# 9.3.4
'hr' => array_merge( $common, [ 'width' ] ),
# HTML Ruby annotation text module, simple ruby only.
- # http://www.whatwg.org/html/text-level-semantics.html#the-ruby-element
+ # https://www.w3.org/TR/html5/text-level-semantics.html#the-ruby-element
'ruby' => $common,
# rbc
'rb' => $common,
# MathML root element, where used for extensions
# 'title' may not be 100% valid here; it's XHTML
- # http://www.w3.org/TR/REC-MathML/
+ # https://www.w3.org/TR/REC-MathML/
'math' => [ 'class', 'style', 'id', 'title' ],
# HTML 5 section 4.6
'bdi' => $common,
# HTML5 elements, defined by:
- # http://www.whatwg.org/html/
+ # https://html.spec.whatwg.org/multipage/semantics.html#the-data-element
'data' => array_merge( $common, [ 'value' ] ),
'time' => array_merge( $common, [ 'datetime' ] ),
'mark' => $common,
list( /* $whole */, $protocol, $host, $rest ) = $matches;
// Characters that will be ignored in IDNs.
- // http://tools.ietf.org/html/3454#section-3.1
+ // https://tools.ietf.org/html/rfc3454#section-3.1
// Strip them before further processing so blacklists and such work.
$strip = "/
\\s| # general whitespace