<?php
/**
- * XHTML sanitizer for %MediaWiki.
+ * HTML sanitizer for %MediaWiki.
*
* Copyright © 2002-2005 Brion Vibber <brion@pobox.com> et al
* http://www.mediawiki.org/
*/
/**
- * XHTML sanitizer for MediaWiki
+ * HTML sanitizer for MediaWiki
* @ingroup Parser
*/
class Sanitizer {
* @return string
*/
static function removeHTMLtags( $text, $processCallback = null, $args = array(), $extratags = array(), $removetags = array() ) {
- global $wgUseTidy, $wgHtml5, $wgAllowMicrodataAttributes, $wgAllowImageTag;
+ global $wgUseTidy, $wgAllowMicrodataAttributes, $wgAllowImageTag;
static $htmlpairsStatic, $htmlsingle, $htmlsingleonly, $htmlnest, $tabletags,
$htmllist, $listtags, $htmlsingleallowed, $htmlelementsStatic, $staticInitialised;
// Base our staticInitialised variable off of the global config state so that if the globals
// are changed (like in the screwed up test system) we will re-initialise the settings.
- $globalContext = implode( '-', compact( 'wgHtml5', 'wgAllowMicrodataAttributes', 'wgAllowImageTag' ) );
+ $globalContext = implode( '-', compact( 'wgAllowMicrodataAttributes', 'wgAllowImageTag' ) );
if ( !$staticInitialised || $staticInitialised != $globalContext ) {
$htmlpairsStatic = array( # Tags that must be closed
'strike', 'strong', 'tt', 'var', 'div', 'center',
'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
'ruby', 'rt', 'rb', 'rp', 'p', 'span', 'abbr', 'dfn',
- 'kbd', 'samp'
+ 'kbd', 'samp', 'data', 'time', 'mark'
);
- if ( $wgHtml5 ) {
- $htmlpairsStatic = array_merge( $htmlpairsStatic, array( 'data', 'time', 'mark' ) );
- }
$htmlsingle = array(
'br', 'hr', 'li', 'dt', 'dd'
);
$htmlsingleonly = array( # Elements that cannot have close tags
'br', 'hr'
);
- if ( $wgHtml5 && $wgAllowMicrodataAttributes ) {
+ if ( $wgAllowMicrodataAttributes ) {
$htmlsingle[] = $htmlsingleonly[] = 'meta';
$htmlsingle[] = $htmlsingleonly[] = 'link';
}
$htmlnest = array( # Tags that can be nested--??
'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
- 'li', 'dl', 'dt', 'dd', 'font', 'big', 'small', 'sub', 'sup', 'span'
+ 'li', 'dl', 'dt', 'dd', 'font', 'big', 'small', 'sub', 'sup', 'span',
+ 'var', 'kbd', 'samp'
);
$tabletags = array( # Can only appear inside table, we will close them
'td', 'th', 'tr',
* @todo Check for unique id attribute :P
*/
static function validateAttributes( $attribs, $whitelist ) {
- global $wgAllowRdfaAttributes, $wgAllowMicrodataAttributes, $wgHtml5;
+ global $wgAllowRdfaAttributes, $wgAllowMicrodataAttributes;
$whitelist = array_flip( $whitelist );
$hrefExp = '/^(' . wfUrlProtocols() . ')[^\s]+$/';
continue;
}
- # Allow any attribute beginning with "data-", if in HTML5 mode
- if ( !( $wgHtml5 && preg_match( '/^data-/i', $attribute ) ) && !isset( $whitelist[$attribute] ) ) {
+ # Allow any attribute beginning with "data-"
+ if ( !preg_match( '/^data-/i', $attribute ) && !isset( $whitelist[$attribute] ) ) {
continue;
}
# WAI-ARIA
# http://www.w3.org/TR/wai-aria/
- # http://www.whatwg.org/specs/web-apps/current-work/multipage/elements.html#wai-aria
+ # http://www.whatwg.org/html/elements.html#wai-aria
# For now we only support role="presentation" until we work out what roles should be
# usable by content and we ensure that our code explicitly rejects patterns that
# violate HTML5's ARIA restrictions.
/**
* Pick apart some CSS and check it for forbidden or unsafe structures.
* Returns a sanitized string. This sanitized string will have
- * character references and escape sequences decoded, and comments
- * stripped. If the input is just too evil, only a comment complaining
- * about evilness will be returned.
+ * character references and escape sequences decoded and comments
+ * stripped (unless it is itself one valid comment, in which case the value
+ * will be passed through). If the input is just too evil, only a comment
+ * complaining about evilness will be returned.
*
* Currently URL references, 'expression', 'tps' are forbidden.
*
$value = preg_replace_callback( $decodeRegex,
array( __CLASS__, 'cssDecodeCallback' ), $value );
- // Remove any comments; IE gets token splitting wrong
- // This must be done AFTER decoding character references and
- // escape sequences, because those steps can introduce comments
- // This step cannot introduce character references or escape
- // sequences, because it replaces comments with spaces rather
- // than removing them completely.
- $value = StringUtils::delimiterReplace( '/*', '*/', ' ', $value );
-
- // Remove anything after a comment-start token, to guard against
- // incorrect client implementations.
- $commentPos = strpos( $value, '/*' );
- if ( $commentPos !== false ) {
- $value = substr( $value, 0, $commentPos );
+ // Let the value through if it's nothing but a single comment, to
+ // allow other functions which may reject it to pass some error
+ // message through.
+ if ( !preg_match( '! ^ \s* /\* [^*\\/]* \*/ \s* $ !x', $value ) ) {
+ // Remove any comments; IE gets token splitting wrong
+ // This must be done AFTER decoding character references and
+ // escape sequences, because those steps can introduce comments
+ // This step cannot introduce character references or escape
+ // sequences, because it replaces comments with spaces rather
+ // than removing them completely.
+ $value = StringUtils::delimiterReplace( '/*', '*/', ' ', $value );
+
+ // Remove anything after a comment-start token, to guard against
+ // incorrect client implementations.
+ $commentPos = strpos( $value, '/*' );
+ if ( $commentPos !== false ) {
+ $value = substr( $value, 0, $commentPos );
+ }
}
// Reject problematic keywords and control characters
$decoded = Sanitizer::decodeTagAttributes( $text );
$stripped = Sanitizer::validateTagAttributes( $decoded, $element );
- $attribs = array();
- foreach ( $stripped as $attribute => $value ) {
- $encAttribute = htmlspecialchars( $attribute );
- $encValue = Sanitizer::safeEncodeAttribute( $value );
-
- $attribs[] = "$encAttribute=\"$encValue\"";
- }
- return count( $attribs ) ? ' ' . implode( ' ', $attribs ) : '';
+ return Sanitizer::safeEncodeTagAttributes( $stripped );
}
/**
* in the id and
* name attributes
* @see http://www.w3.org/TR/html401/struct/links.html#h-12.2.3 Anchors with the id attribute
- * @see http://www.whatwg.org/specs/web-apps/current-work/multipage/elements.html#the-id-attribute
+ * @see http://www.whatwg.org/html/elements.html#the-id-attribute
* HTML5 definition of id attribute
*
* @param string $id id to escape
* @return String
*/
static function escapeId( $id, $options = array() ) {
- global $wgHtml5, $wgExperimentalHtmlIds;
+ global $wgExperimentalHtmlIds;
$options = (array)$options;
- if ( $wgHtml5 && $wgExperimentalHtmlIds && !in_array( 'legacy', $options ) ) {
+ if ( $wgExperimentalHtmlIds && !in_array( 'legacy', $options ) ) {
$id = Sanitizer::decodeCharReferences( $id );
$id = preg_replace( '/[ \t\n\r\f_\'"&#%]+/', '_', $id );
$id = trim( $id, '_' );
return $attribs;
}
+ /**
+  * Turn an associative array of attribute names and values (the shape
+  * returned by decodeTagAttributes) into a partial tag string.
+  *
+  * Each name is passed through htmlspecialchars() and each value through
+  * Sanitizer::safeEncodeAttribute(); the resulting name="value" pairs are
+  * joined with single spaces and prefixed with one space. An input with
+  * no entries yields an empty string.
+  *
+  * @param $assoc_array Array
+  * @return String
+  */
+ public static function safeEncodeTagAttributes( $assoc_array ) {
+ 	$encodedPairs = array();
+ 	foreach ( $assoc_array as $name => $rawValue ) {
+ 		$encodedPairs[] = htmlspecialchars( $name )
+ 			. '="' . Sanitizer::safeEncodeAttribute( $rawValue ) . '"';
+ 	}
+
+ 	if ( !$encodedPairs ) {
+ 		return '';
+ 	}
+ 	return ' ' . implode( ' ', $encodedPairs );
+ }
+
/**
* Pick the appropriate attribute value from a match set from the
* attribs regex matches.
* @return Boolean
*/
private static function validateCodepoint( $codepoint ) {
- return ($codepoint == 0x09)
- || ($codepoint == 0x0a)
- || ($codepoint == 0x0d)
- || ($codepoint >= 0x20 && $codepoint <= 0xd7ff)
- || ($codepoint >= 0xe000 && $codepoint <= 0xfffd)
- || ($codepoint >= 0x10000 && $codepoint <= 0x10ffff);
+ return $codepoint == 0x09 // horizontal tab
+ || $codepoint == 0x0a // line feed
+ || $codepoint == 0x0d // carriage return
+ || ( $codepoint >= 0x20 && $codepoint <= 0xd7ff ) // from space up to just below the surrogate block (U+D800)
+ || ( $codepoint >= 0xe000 && $codepoint <= 0xfffd ) // resumes after the surrogates; leaves out U+FFFE and U+FFFF
+ || ( $codepoint >= 0x10000 && $codepoint <= 0x10ffff ); // supplementary planes, up to the highest Unicode code point
}
/**
* @return Array
*/
static function setupAttributeWhitelist() {
- global $wgAllowRdfaAttributes, $wgHtml5, $wgAllowMicrodataAttributes;
+ global $wgAllowRdfaAttributes, $wgAllowMicrodataAttributes;
static $whitelist, $staticInitialised;
- $globalContext = implode( '-', compact( 'wgAllowRdfaAttributes', 'wgHtml5', 'wgAllowMicrodataAttributes' ) );
+ $globalContext = implode( '-', compact( 'wgAllowRdfaAttributes', 'wgAllowMicrodataAttributes' ) );
if ( isset( $whitelist ) && $staticInitialised == $globalContext ) {
return $whitelist;
) );
}
- if ( $wgHtml5 && $wgAllowMicrodataAttributes ) {
- # add HTML5 microdata tags as specified by http://www.whatwg.org/specs/web-apps/current-work/multipage/microdata.html#the-microdata-model
+ if ( $wgAllowMicrodataAttributes ) {
+ # Add HTML5 microdata attributes as specified by http://www.whatwg.org/html/microdata.html#the-microdata-model
$common = array_merge( $common, array(
'itemid', 'itemprop', 'itemref', 'itemscope', 'itemtype'
) );
# 15.3
'hr' => array_merge( $common, array( 'noshade', 'size', 'width' ) ),
- # XHTML Ruby annotation text module, simple ruby only.
- # http://www.w3c.org/TR/ruby/
+ # HTML Ruby annotation text module, simple ruby only.
+ # http://www.whatwg.org/html/text-level-semantics.html#the-ruby-element
'ruby' => $common,
# rbc
# rtc
# HTML 5 section 4.6
'bdi' => $common,
- );
-
- if ( $wgHtml5 ) {
# HTML5 elements, defined by:
- # http://www.whatwg.org/specs/web-apps/current-work/multipage/
- $whitelist += array(
- 'data' => array_merge( $common, array( 'value' ) ),
- 'time' => array_merge( $common, array( 'datetime' ) ),
- 'mark' => $common,
-
- // meta and link are only permitted by removeHTMLtags when Microdata
- // is enabled so we don't bother adding a conditional to hide these
- // Also meta and link are only valid in WikiText as Microdata elements
- // (ie: validateTag rejects tags missing the attributes needed for Microdata)
- // So we don't bother including $common attributes that have no purpose.
- 'meta' => array( 'itemprop', 'content' ),
- 'link' => array( 'itemprop', 'href' ),
- );
- }
+ # http://www.whatwg.org/html/
+ 'data' => array_merge( $common, array( 'value' ) ),
+ 'time' => array_merge( $common, array( 'datetime' ) ),
+ 'mark' => $common,
+
+ // meta and link are only permitted by removeHTMLtags when Microdata
+ // is enabled so we don't bother adding a conditional to hide these
+ // Also meta and link are only valid in WikiText as Microdata elements
+ // (ie: validateTag rejects tags missing the attributes needed for Microdata)
+ // So we don't bother including $common attributes that have no purpose.
+ 'meta' => array( 'itemprop', 'content' ),
+ 'link' => array( 'itemprop', 'href' ),
+ );
$staticInitialised = $globalContext;
* Does a string look like an e-mail address?
*
* This validates an email address using an HTML5 specification found at:
- * http://www.whatwg.org/specs/web-apps/current-work/multipage/states-of-the-type-attribute.html#valid-e-mail-address
+ * http://www.whatwg.org/html/states-of-the-type-attribute.html#valid-e-mail-address
* Which as of 2011-01-24 says:
*
* A valid e-mail address is a string that matches the ABNF production