* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
* http://www.gnu.org/copyleft/gpl.html
*
- * @addtogroup Parser
+ * @file
+ * @ingroup Parser
*/
/**
/**
* XHTML sanitizer for MediaWiki
- * @addtogroup Parser
+ * @ingroup Parser
*/
class Sanitizer {
/**
* @param array $args for the processing callback
* @return string
*/
- static function removeHTMLtags( $text, $processCallback = null, $args = array() ) {
+ static function removeHTMLtags( $text, $processCallback = null, $args = array(), $extratags = array() ) {
global $wgUseTidy;
static $htmlpairs, $htmlsingle, $htmlsingleonly, $htmlnest, $tabletags,
if ( !$staticInitialised ) {
- $htmlpairs = array( # Tags that must be closed
+ $htmlpairs = array_merge( $extratags, array( # Tags that must be closed
'b', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
'strike', 'strong', 'tt', 'var', 'div', 'center',
'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
'ruby', 'rt' , 'rb' , 'rp', 'p', 'span', 'u'
- );
+ ) );
$htmlsingle = array(
'br', 'hr', 'li', 'dt', 'dd'
);
$htmlelements = array_merge( $htmlsingle, $htmlpairs, $htmlnest );
# Convert them all to hashtables for faster lookup
- $vars = array( 'htmlpairs', 'htmlsingle', 'htmlsingleonly', 'htmlnest', 'tabletags',
+ $vars = array( 'htmlpairs', 'htmlsingle', 'htmlsingleonly', 'htmlnest', 'tabletags',
'htmllist', 'listtags', 'htmlsingleallowed', 'htmlelements' );
foreach ( $vars as $var ) {
$$var = array_flip( $$var );
$optstack = array();
array_push ($optstack, $ot);
while ( ( ( $ot = @array_pop( $tagstack ) ) != $t ) &&
- isset( $htmlsingleallowed[$ot] ) )
+ isset( $htmlsingleallowed[$ot] ) )
{
array_push ($optstack, $ot);
}
return Sanitizer::validateAttributes( $attribs,
Sanitizer::attributeWhitelist( $element ) );
}
-
+
/**
* Take an array of attribute names and values and normalize or discard
* illegal values for the given whitelist.
}
}
- if ( $attribute === 'id' )
- $value = Sanitizer::escapeId( $value );
+ if ( $attribute === 'id' ) {
+ global $wgEnforceHtmlIds;
+ $value = Sanitizer::escapeId( $value,
+ $wgEnforceHtmlIds ? 'noninitial' : 'xml' );
+ }
// If this attribute was previously set, override it.
// Output should only have one attribute of each name.
}
return $out;
}
-
+
/**
- * Merge two sets of HTML attributes.
- * Conflicting items in the second set will override those
- * in the first, except for 'class' attributes which will be
- * combined.
+ * Merge two sets of HTML attributes. Conflicting items in the second set
+ * will override those in the first, except for 'class' attributes which
+ * will be combined (if they're both strings).
*
* @todo implement merging for other attributes such as style
* @param array $a
*/
static function mergeAttributes( $a, $b ) {
$out = array_merge( $a, $b );
- if( isset( $a['class'] )
- && isset( $b['class'] )
- && $a['class'] !== $b['class'] ) {
-
- $out['class'] = implode( ' ',
- array_unique(
- preg_split( '/\s+/',
- $a['class'] . ' ' . $b['class'],
- -1,
- PREG_SPLIT_NO_EMPTY ) ) );
+ if( isset( $a['class'] ) && isset( $b['class'] )
+ && is_string( $a['class'] ) && is_string( $b['class'] )
+ && $a['class'] !== $b['class'] ) {
+ $classes = preg_split( '/\s+/', "{$a['class']} {$b['class']}",
+ -1, PREG_SPLIT_NO_EMPTY );
+ $out['class'] = implode( ' ', array_unique( $classes ) );
}
return $out;
}
-
+
/**
* Pick apart some CSS and check it for forbidden or unsafe structures.
* Returns a sanitized string, or false if it was just too evil.
// Remove any comments; IE gets token splitting wrong
$stripped = StringUtils::delimiterReplace( '/*', '*/', ' ', $stripped );
-
+
$value = $stripped;
// ... and continue checks
# haxx0r
return false;
}
-
+
return $value;
}
* @return HTML-encoded text fragment
*/
static function encodeAttribute( $text ) {
- $encValue = htmlspecialchars( $text );
+ $encValue = htmlspecialchars( $text, ENT_QUOTES );
// Whitespace is normalized during attribute decoding,
// so if we've been passed non-spaces we must encode them
* name attributes
* @see http://www.w3.org/TR/html401/struct/links.html#h-12.2.3 Anchors with the id attribute
*
- * @static
- *
- * @param string $id
+ * @param string $id Id to validate
+ * @param mixed $options String or array of strings (default is array()):
+ * 'noninitial': This is a non-initial fragment of an id, not a full id,
+ * so don't pay attention if the first character isn't valid at the
+ * beginning of an id.
+ * 'xml': Don't restrict the id to be HTML4-compatible. This option
+ * allows any alphabetic character to be used, per the XML standard.
+ * Therefore, it also completely changes the type of escaping: instead
+ * of weird dot-encoding, runs of invalid characters (mostly
+ * whitespace) are just compressed into a single underscore.
* @return string
*/
- static function escapeId( $id ) {
- static $replace = array(
- '%3A' => ':',
- '%' => '.'
- );
+ static function escapeId( $id, $options = array() ) {
+ $options = (array)$options;
+
+ if ( !in_array( 'xml', $options ) ) {
+ # HTML4-style escaping
+ static $replace = array(
+ '%3A' => ':',
+ '%' => '.'
+ );
- $id = urlencode( Sanitizer::decodeCharReferences( strtr( $id, ' ', '_' ) ) );
+ $id = urlencode( Sanitizer::decodeCharReferences( strtr( $id, ' ', '_' ) ) );
+ $id = str_replace( array_keys( $replace ), array_values( $replace ), $id );
+
+ if ( !preg_match( '/^[a-zA-Z]/', $id )
+ && !in_array( 'noninitial', $options ) ) {
+ // Initial character must be a letter!
+ $id = "x$id";
+ }
+ return $id;
+ }
- return str_replace( array_keys( $replace ), array_values( $replace ), $id );
+ # XML-style escaping. For the patterns used, see the XML 1.0 standard,
+ # 5th edition, NameStartChar and NameChar: <http://www.w3.org/TR/REC-xml/>
+ $nameStartChar = ':a-zA-Z_\xC0-\xD6\xD8-\xF6\xF8-\x{2FF}\x{370}-\x{37D}'
+ . '\x{37F}-\x{1FFF}\x{200C}-\x{200D}\x{2070}-\x{218F}\x{2C00}-\x{2FEF}'
+ . '\x{3001}-\x{D7FF}\x{F900}-\x{FDCF}\x{FDF0}-\x{FFFD}\x{10000}-\x{EFFFF}';
+ $nameChar = $nameStartChar . '.\-0-9\xB7\x{0300}-\x{036F}'
+ . '\x{203F}-\x{2040}';
+ # Replace _ as well so we don't get multiple consecutive underscores
+ $id = preg_replace( "/([^$nameChar]|_)+/u", '_', $id );
+ $id = trim( $id, '_' );
+
+ if ( !preg_match( "/^[$nameStartChar]/u", $id )
+ && !in_array( 'noninitial', $options ) ) {
+ $id = "_$id";
+ }
+
+ return $id;
}
/**
$class ), '_');
}
+ /**
+ * Given HTML input, escape with htmlspecialchars but un-escape entites.
+ * This allows (generally harmless) entities like to survive.
+ *
+ * @param string $html String to escape
+ * @return string Escaped input
+ */
+ static function escapeHtmlAllowEntities( $html ) {
+ # It seems wise to escape ' as well as ", as a matter of course. Can't
+ # hurt.
+ $html = htmlspecialchars( $html, ENT_QUOTES );
+ $html = str_replace( '&', '&', $html );
+ $html = Sanitizer::normalizeCharReferences( $html );
+ return $html;
+ }
+
/**
* Regex replace callback for armoring links against further processing.
* @param array $matches
* @param string
* @return array
*/
- static function decodeTagAttributes( $text ) {
+ public static function decodeTagAttributes( $text ) {
$attribs = array();
if( trim( $text ) == '' ) {
self::normalizeWhitespace(
Sanitizer::normalizeCharReferences( $text ) ) );
}
-
+
private static function normalizeWhitespace( $text ) {
return preg_replace(
'/\r\n|[\x20\x0d\x0a\x09]/',
/**
* If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD,
- * return the named entity reference as is. If the entity is a
- * MediaWiki-specific alias, returns the HTML equivalent. Otherwise,
+ * return the named entity reference as is. If the entity is a
+ * MediaWiki-specific alias, returns the HTML equivalent. Otherwise,
* returns HTML-escaped text of pseudo-entity source (eg &foo;)
*
* @param string $name
}
/**
- * @todo Document it a bit
+ * Foreach array key (an allowed HTML element), return an array
+ * of allowed attributes
* @return array
*/
static function setupAttributeWhitelist() {
# 11.2.6
'td' => array_merge( $common, $tablecell, $tablealign ),
'th' => array_merge( $common, $tablecell, $tablealign ),
-
+
# 13.2
# Not usually allowed, but may be used for extension-style hooks
# such as <math> when it is rasterized
'rb' => $common,
'rt' => $common, #array_merge( $common, array( 'rbspan' ) ),
'rp' => $common,
-
+
# MathML root element, where used for extensions
# 'title' may not be 100% valid here; it's XHTML
# http://www.w3.org/TR/REC-MathML/
return $out;
}
- static function cleanUrl( $url, $hostname=true ) {
+ static function cleanUrl( $url ) {
# Normalize any HTML entities in input. They will be
# re-escaped by makeExternalLink().
$url = Sanitizer::decodeCharReferences( $url );
}
}
-
-