/**
* Regular expression to match HTML/XML attribute pairs within a tag.
- * Allows some... latitude.
+ * Allows some... latitude. Based on the HTML5 tokenizer state described at
+ * http://www.w3.org/TR/html5/syntax.html#before-attribute-value-state
* Used in Sanitizer::fixTagAttributes and Sanitizer::decodeTagAttributes
* @return string
*/
if ( self::$attribsRegex === null ) {
$attribFirst = '[:A-Z_a-z0-9]';
$attrib = '[:A-Z_a-z-.0-9]';
- $space = '[\x09\x0a\x0d\x20]';
+ $space = '[\x09\x0a\x0c\x0d\x20]';
self::$attribsRegex =
"/(?:^|$space)({$attribFirst}{$attrib}*)
($space*=$space*
(?:
# The attribute value: quoted or alone
- \"([^<\"]*)(?:\"|\$)
- | '([^<']*)(?:'|\$)
- | ([a-zA-Z0-9!#$%&()*,\\-.\\/:;<>?@[\\]^_`{|}~]+)
+ \"([^\"]*)(?:\"|\$)
+ | '([^']*)(?:'|\$)
+ | (((?!$space|>).)*)
)
)?(?=$space|\$)/sx";
}
* @return array
*/
public static function getRecognizedTagData( $extratags = [], $removetags = [] ) {
- global $wgAllowMicrodataAttributes, $wgAllowImageTag;
+ global $wgAllowImageTag;
static $htmlpairsStatic, $htmlsingle, $htmlsingleonly, $htmlnest, $tabletags,
$htmllist, $listtags, $htmlsingleallowed, $htmlelementsStatic, $staticInitialised;
// Base our staticInitialised variable off of the global config state so that if the globals
// are changed (like in the screwed up test system) we will re-initialise the settings.
- $globalContext = implode( '-', compact( 'wgAllowMicrodataAttributes', 'wgAllowImageTag' ) );
+ $globalContext = $wgAllowImageTag;
if ( !$staticInitialised || $staticInitialised != $globalContext ) {
$htmlpairsStatic = [ # Tags that must be closed
'b', 'bdi', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
'kbd', 'samp', 'data', 'time', 'mark'
];
$htmlsingle = [
- 'br', 'wbr', 'hr', 'li', 'dt', 'dd'
+ 'br', 'wbr', 'hr', 'li', 'dt', 'dd', 'meta', 'link'
];
- $htmlsingleonly = [ # Elements that cannot have close tags
- 'br', 'wbr', 'hr'
+
+ # Elements that cannot have close tags. This is (not coincidentally)
+ # also the list of tags for which the HTML 5 parsing algorithm
+ # requires you to "acknowledge the token's self-closing flag", i.e.
+ # a self-closing tag like <br/> is not an HTML 5 parse error only
+ # for this list.
+ $htmlsingleonly = [
+ 'br', 'wbr', 'hr', 'meta', 'link'
];
- if ( $wgAllowMicrodataAttributes ) {
- $htmlsingle[] = $htmlsingleonly[] = 'meta';
- $htmlsingle[] = $htmlsingleonly[] = 'link';
- }
+
$htmlnest = [ # Tags that can be nested--??
'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
'li', 'dl', 'dt', 'dd', 'font', 'big', 'small', 'sub', 'sup', 'span',
* @param array|bool $args Arguments for the processing callback
* @param array $extratags For any extra tags to include
* @param array $removetags For any tags (default or extra) to exclude
+ * @param callable|null $warnCallback (Deprecated) Callback allowing the
+ * addition of a tracking category when bad input is encountered.
+ * DO NOT ADD NEW PARAMETERS AFTER $warnCallback, since it will be
+ * removed shortly.
* @return string
*/
public static function removeHTMLtags( $text, $processCallback = null,
- $args = [], $extratags = [], $removetags = []
+ $args = [], $extratags = [], $removetags = [], $warnCallback = null
) {
extract( self::getRecognizedTagData( $extratags, $removetags ) );
$badtag = true;
# Is it a self closed htmlpair ? (bug 5487)
} elseif ( $brace == '/>' && isset( $htmlpairs[$t] ) ) {
+ // Eventually we'll just remove the self-closing
+ // slash, in order to be consistent with HTML5
+ // semantics.
+ // $brace = '>';
+ // For now, let's just warn authors to clean up.
+ if ( is_callable( $warnCallback ) ) {
+ call_user_func_array( $warnCallback, [ 'deprecated-self-close-category' ] );
+ }
$badtag = true;
} elseif ( isset( $htmlsingleonly[$t] ) ) {
# Hack to force empty tag for unclosable elements
call_user_func_array( $processCallback, [ &$params, $args ] );
}
+ if ( $brace == '/>' && !( isset( $htmlsingle[$t] ) || isset( $htmlsingleonly[$t] ) ) ) {
+ // Eventually we'll just remove the self-closing
+ // slash, in order to be consistent with HTML5
+ // semantics.
+ // $brace = '>';
+ // For now, let's just warn authors to clean up.
+ if ( is_callable( $warnCallback ) ) {
+ call_user_func_array( $warnCallback, [ 'deprecated-self-close-category' ] );
+ }
+ }
if ( !Sanitizer::validateTag( $params, $t ) ) {
$badtag = true;
}
$newparams = Sanitizer::fixTagAttributes( $params, $t );
if ( !$badtag ) {
+ if ( $brace === '/>' && !isset( $htmlsingleonly[$t] ) ) {
+ # Interpret self-closing tags as empty tags even when
+ # HTML 5 would interpret them as start tags. Such input
+ # is commonly seen on Wikimedia wikis with this intention.
+ $brace = "></$t>";
+ }
+
$rest = str_replace( '>', '>', $rest );
$text .= "<$slash$t$newparams$brace$rest";
continue;
* @todo Check for unique id attribute :P
*/
static function validateAttributes( $attribs, $whitelist ) {
- global $wgAllowRdfaAttributes, $wgAllowMicrodataAttributes;
-
$whitelist = array_flip( $whitelist );
$hrefExp = '/^(' . wfUrlProtocols() . ')[^\s]+$/';
$out = [];
foreach ( $attribs as $attribute => $value ) {
- # allow XML namespace declaration if RDFa is enabled
- if ( $wgAllowRdfaAttributes && preg_match( self::XMLNS_ATTRIBUTE_PATTERN, $attribute ) ) {
+ # Always allow XML namespace declarations so that RDFa can be used
+ if ( preg_match( self::XMLNS_ATTRIBUTE_PATTERN, $attribute ) ) {
if ( !preg_match( self::EVIL_URI_PATTERN, $value ) ) {
$out[$attribute] = $value;
}
$out[$attribute] = $value;
}
- if ( $wgAllowMicrodataAttributes ) {
- # itemtype, itemid, itemref don't make sense without itemscope
- if ( !array_key_exists( 'itemscope', $out ) ) {
- unset( $out['itemtype'] );
- unset( $out['itemid'] );
- unset( $out['itemref'] );
- }
- # TODO: Strip itemprop if we aren't descendants of an itemscope or pointed to by an itemref.
+ # itemtype, itemid, itemref don't make sense without itemscope
+ if ( !array_key_exists( 'itemscope', $out ) ) {
+ unset( $out['itemtype'] );
+ unset( $out['itemid'] );
+ unset( $out['itemref'] );
}
+ # TODO: Strip itemprop if we aren't descendants of an itemscope or pointed to by an itemref.
+
return $out;
}
* - Double attributes are discarded
* - Unsafe style attributes are discarded
* - Prepends space if there are attributes.
+ * - (Optionally) Sorts attributes by name.
*
* @param string $text
* @param string $element
+ * @param bool $sorted Whether to sort the attributes (default: false)
* @return string
*/
- static function fixTagAttributes( $text, $element ) {
+ static function fixTagAttributes( $text, $element, $sorted = false ) {
if ( trim( $text ) == '' ) {
return '';
}
$decoded = Sanitizer::decodeTagAttributes( $text );
$stripped = Sanitizer::validateTagAttributes( $decoded, $element );
+ if ( $sorted ) {
+ ksort( $stripped );
+ }
+
return Sanitizer::safeEncodeTagAttributes( $stripped );
}
* @return array
*/
static function setupAttributeWhitelist() {
- global $wgAllowRdfaAttributes, $wgAllowMicrodataAttributes;
- static $whitelist, $staticInitialised;
+ static $whitelist;
- $globalContext = implode( '-', compact( 'wgAllowRdfaAttributes', 'wgAllowMicrodataAttributes' ) );
-
- if ( $whitelist !== null && $staticInitialised == $globalContext ) {
+ if ( $whitelist !== null ) {
return $whitelist;
}
'aria-labelledby',
'aria-owns',
'role',
- ];
- if ( $wgAllowRdfaAttributes ) {
- # RDFa attributes as specified in section 9 of
+ # RDFa
+ # These attributes are specified in section 9 of
# http://www.w3.org/TR/2008/REC-rdfa-syntax-20081014
- $common = array_merge( $common, [
- 'about', 'property', 'resource', 'datatype', 'typeof',
- ] );
- }
+ 'about',
+ 'property',
+ 'resource',
+ 'datatype',
+ 'typeof',
- if ( $wgAllowMicrodataAttributes ) {
- # add HTML5 microdata tags as specified by
+ # Microdata. These are specified by
# http://www.whatwg.org/html/microdata.html#the-microdata-model
- $common = array_merge( $common, [
- 'itemid', 'itemprop', 'itemref', 'itemscope', 'itemtype'
- ] );
- }
+ 'itemid',
+ 'itemprop',
+ 'itemref',
+ 'itemscope',
+ 'itemtype',
+ ];
$block = array_merge( $common, [ 'align' ] );
$tablealign = [ 'align', 'valign' ];
'link' => [ 'itemprop', 'href' ],
];
- $staticInitialised = $globalContext;
-
return $whitelist;
}