X-Git-Url: https://git.heureux-cyclage.org/?a=blobdiff_plain;f=includes%2FSanitizer.php;h=7fdd1df5f04c515104376ecbaaeb7a01a68ca39d;hb=1e2199b76bc6641bf0d3827903b45ab35db98bd1;hp=8249f969f1810e060a317006312d9ac43394da80;hpb=ec7276ea08af439384440b5b3e35c315c16e929e;p=lhc%2Fweb%2Fwiklou.git diff --git a/includes/Sanitizer.php b/includes/Sanitizer.php index 8249f969f1..7fdd1df5f0 100644 --- a/includes/Sanitizer.php +++ b/includes/Sanitizer.php @@ -2,7 +2,7 @@ /** * XHTML sanitizer for MediaWiki * - * Copyright (C) 2002-2005 Brion Vibber et al + * Copyright © 2002-2005 Brion Vibber et al * http://www.mediawiki.org/ * * This program is free software; you can redistribute it and/or modify @@ -40,10 +40,11 @@ define( 'MW_CHAR_REFS_REGEX', * Allows some... latitude. * Used in Sanitizer::fixTagAttributes and Sanitizer::decodeTagAttributes */ -$attrib = '[A-Za-z0-9]'; +$attrib_first = '[:A-Z_a-z]'; +$attrib = '[:A-Z_a-z-.0-9]'; $space = '[\x09\x0a\x0d\x20]'; define( 'MW_ATTRIBS_REGEX', - "/(?:^|$space)($attrib+) + "/(?:^|$space)({$attrib_first}{$attrib}*) ($space*=$space* (?: # The attribute value: quoted or alone @@ -56,6 +57,16 @@ define( 'MW_ATTRIBS_REGEX', ) )?(?=$space|\$)/sx" ); +/** + * Regular expression to match URIs that could trigger script execution + */ +define( 'MW_EVIL_URI_PATTERN', '!(^|\s|\*/\s*)(javascript|vbscript)([^\w]|$)!i' ); + +/** + * Regular expression to match namespace attributes + */ +define( 'MW_XMLNS_ATTRIBUTE_PATTRN', "/^xmlns:$attrib+$/" ); + /** * List of all named character entities defined in HTML 4.01 * http://www.w3.org/TR/html4/sgml/entities.html @@ -335,11 +346,11 @@ class Sanitizer { * Cleans up HTML, removes dangerous tags and attributes, and * removes HTML comments * @private - * @param string $text - * @param callback $processCallback to do any variable or parameter replacements in HTML attribute values - * @param array $args for the processing callback - * @param array $extratags for any extra tags to include - * @param array 
$removetags for any tags (default or extra) to exclude + * @param $text String + * @param $processCallback Callback to do any variable or parameter replacements in HTML attribute values + * @param $args Array for the processing callback + * @param $extratags Array for any extra tags to include + * @param $removetags Array for any tags (default or extra) to exclude * @return string */ static function removeHTMLtags( $text, $processCallback = null, $args = array(), $extratags = array(), $removetags = array() ) { @@ -357,7 +368,7 @@ class Sanitizer { 'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's', 'strike', 'strong', 'tt', 'var', 'div', 'center', 'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre', - 'ruby', 'rt' , 'rb' , 'rp', 'p', 'span', 'u' + 'ruby', 'rt' , 'rb' , 'rp', 'p', 'span', 'u', 'abbr', 'dfn' ); $htmlsingle = array( 'br', 'hr', 'li', 'dt', 'dd' @@ -379,6 +390,12 @@ class Sanitizer { 'li', ); + global $wgAllowImageTag; + if ( $wgAllowImageTag ) { + $htmlsingle[] = 'img'; + $htmlsingleonly[] = 'img'; + } + $htmlsingleallowed = array_unique( array_merge( $htmlsingle, $tabletags ) ); $htmlelementsStatic = array_unique( array_merge( $htmlsingle, $htmlpairsStatic, $htmlnest ) ); @@ -394,43 +411,49 @@ class Sanitizer { $extratags = array_flip( $extratags ); $removetags = array_flip( $removetags ); $htmlpairs = array_merge( $extratags, $htmlpairsStatic ); - $htmlelements = array_diff( array_unique( array_merge( $extratags, $htmlelementsStatic ) ), $removetags ); + $htmlelements = array_diff_key( array_merge( $extratags, $htmlelementsStatic ) , $removetags ); # Remove HTML comments $text = Sanitizer::removeHTMLcomments( $text ); $bits = explode( '<', $text ); $text = str_replace( '>', '>', array_shift( $bits ) ); - if(!$wgUseTidy) { + if ( !$wgUseTidy ) { $tagstack = $tablestack = array(); foreach ( $bits as $x ) { $regs = array(); + # $slash: Does the current element start with a '/'? 
+ # $t: Current element name + # $params: String between element name and > + # $brace: Ending '>' or '/>' + # $rest: Everything until the next element of $bits if( preg_match( '!^(/?)(\\w+)([^>]*?)(/{0,1}>)([^<]*)$!', $x, $regs ) ) { list( /* $qbar */, $slash, $t, $params, $brace, $rest ) = $regs; } else { $slash = $t = $params = $brace = $rest = null; } - $badtag = 0 ; + $badtag = false; if ( isset( $htmlelements[$t = strtolower( $t )] ) ) { # Check our stack - if ( $slash ) { - # Closing a tag... - if( isset( $htmlsingleonly[$t] ) ) { - $badtag = 1; - } elseif ( ( $ot = @array_pop( $tagstack ) ) != $t ) { + if ( $slash && isset( $htmlsingleonly[$t] ) ) { + $badtag = true; + } elseif ( $slash ) { + # Closing a tag... is it the one we just opened? + $ot = @array_pop( $tagstack ); + if ( $ot != $t ) { if ( isset( $htmlsingleallowed[$ot] ) ) { # Pop all elements with an optional close tag # and see if we find a match below them $optstack = array(); - array_push ($optstack, $ot); - while ( ( ( $ot = @array_pop( $tagstack ) ) != $t ) && - isset( $htmlsingleallowed[$ot] ) ) - { - array_push ($optstack, $ot); + array_push( $optstack, $ot ); + $ot = @array_pop( $tagstack ); + while ( $ot != $t && isset( $htmlsingleallowed[$ot] ) ) { + array_push( $optstack, $ot ); + $ot = @array_pop( $tagstack ); } if ( $t != $ot ) { - # No match. Push the optinal elements back again - $badtag = 1; + # No match. Push the optional elements back again + $badtag = true; while ( $ot = @array_pop( $optstack ) ) { array_push( $tagstack, $ot ); } @@ -438,8 +461,8 @@ class Sanitizer { } else { @array_push( $tagstack, $ot ); #
<li> can be nested in <ul> or <ol>
        , skip those cases: - if(!(isset( $htmllist[$ot] ) && isset( $listtags[$t] ) )) { - $badtag = 1; + if ( !isset( $htmllist[$ot] ) || !isset( $listtags[$t] ) ) { + $badtag = true; } } } else { @@ -451,23 +474,23 @@ class Sanitizer { } else { # Keep track for later if ( isset( $tabletags[$t] ) && - ! in_array( 'table', $tagstack ) ) { - $badtag = 1; - } else if ( in_array( $t, $tagstack ) && - ! isset( $htmlnest [$t ] ) ) { - $badtag = 1 ; + !in_array( 'table', $tagstack ) ) { + $badtag = true; + } elseif ( in_array( $t, $tagstack ) && + !isset( $htmlnest [$t ] ) ) { + $badtag = true; # Is it a self closed htmlpair ? (bug 5487) - } else if( $brace == '/>' && + } elseif ( $brace == '/>' && isset( $htmlpairs[$t] ) ) { - $badtag = 1; - } elseif( isset( $htmlsingleonly[$t] ) ) { + $badtag = true; + } elseif ( isset( $htmlsingleonly[$t] ) ) { # Hack to force empty tag for uncloseable elements $brace = '/>'; - } else if( isset( $htmlsingle[$t] ) ) { + } elseif ( isset( $htmlsingle[$t] ) ) { # Hack to not close $htmlsingle tags - $brace = NULL; - } else if( isset( $tabletags[$t] ) - && in_array($t ,$tagstack) ) { + $brace = null; + } elseif ( isset( $tabletags[$t] ) + && in_array( $t, $tagstack ) ) { // New table tag but forgot to close the previous one $text .= ""; } else { @@ -487,7 +510,7 @@ class Sanitizer { # Strip non-approved attributes from the tag $newparams = Sanitizer::fixTagAttributes( $params, $t ); } - if ( ! $badtag ) { + if ( !$badtag ) { $rest = str_replace( '>', '>', $rest ); $close = ( $brace == '/>' && !$slash ) ? ' /' : ''; $text .= "<$slash$t$newparams$close>$rest"; @@ -530,7 +553,7 @@ class Sanitizer { * trailing spaces and one of the newlines. 
* * @private - * @param string $text + * @param $text String * @return string */ static function removeHTMLcomments( $text ) { @@ -576,9 +599,9 @@ class Sanitizer { * - Unsafe style attributes are discarded * - Invalid id attributes are reencoded * - * @param array $attribs - * @param string $element - * @return array + * @param $attribs Array + * @param $element String + * @return Array * * @todo Check for legal values where the DTD limits things. * @todo Check for unique id attribute :P @@ -596,40 +619,94 @@ class Sanitizer { * - Unsafe style attributes are discarded * - Invalid id attributes are reencoded * - * @param array $attribs - * @param array $whitelist list of allowed attribute names - * @return array + * @param $attribs Array + * @param $whitelist Array: list of allowed attribute names + * @return Array * * @todo Check for legal values where the DTD limits things. * @todo Check for unique id attribute :P */ static function validateAttributes( $attribs, $whitelist ) { + global $wgAllowRdfaAttributes, $wgAllowMicrodataAttributes, $wgHtml5; + $whitelist = array_flip( $whitelist ); + $hrefExp = '/^(' . wfUrlProtocols() . ')[^\s]+$/'; + $out = array(); foreach( $attribs as $attribute => $value ) { - if( !isset( $whitelist[$attribute] ) ) { + #allow XML namespace declaration if RDFa is enabled + if ( $wgAllowRdfaAttributes && preg_match( MW_XMLNS_ATTRIBUTE_PATTRN, $attribute ) ) { + if ( !preg_match( MW_EVIL_URI_PATTERN, $value ) ) { + $out[$attribute] = $value; + } + + continue; + } + + # Allow any attribute beginning with "data-", if in HTML5 mode + if ( !($wgHtml5 && preg_match( '/^data-/i', $attribute )) && !isset( $whitelist[$attribute] ) ) { continue; } + # Strip javascript "expression" from stylesheets. 
# http://msdn.microsoft.com/workshop/author/dhtml/overview/recalc.asp if( $attribute == 'style' ) { $value = Sanitizer::checkCss( $value ); - if( $value === false ) { - # haxx0r - continue; - } } if ( $attribute === 'id' ) { - global $wgEnforceHtmlIds; - $value = Sanitizer::escapeId( $value, - $wgEnforceHtmlIds ? 'noninitial' : 'xml' ); + $value = Sanitizer::escapeId( $value, 'noninitial' ); + } + + //RDFa and microdata properties allow URLs, URIs and/or CURIs. check them for sanity + if ( $attribute === 'rel' || $attribute === 'rev' || + $attribute === 'about' || $attribute === 'property' || $attribute === 'resource' || #RDFa + $attribute === 'datatype' || $attribute === 'typeof' || #RDFa + $attribute === 'itemid' || $attribute === 'itemprop' || $attribute === 'itemref' || #HTML5 microdata + $attribute === 'itemscope' || $attribute === 'itemtype' ) { #HTML5 microdata + + //Paranoia. Allow "simple" values but suppress javascript + if ( preg_match( MW_EVIL_URI_PATTERN, $value ) ) { + continue; + } + } + + # NOTE: even though elements using href/src are not allowed directly, supply + # validation code that can be used by tag hook handlers, etc + if ( $attribute === 'href' || $attribute === 'src' ) { + if ( !preg_match( $hrefExp, $value ) ) { + continue; //drop any href or src attributes not using an allowed protocol. + //NOTE: this also drops all relative URLs + } } // If this attribute was previously set, override it. // Output should only have one attribute of each name. $out[$attribute] = $value; } + + if ( $wgAllowMicrodataAttributes ) { + # There are some complicated validity constraints we need to + # enforce here. First of all, we don't want to allow non-standard + # itemtypes. 
+ $allowedTypes = array( + 'http://microformats.org/profile/hcard', + 'http://microformats.org/profile/hcalendar#vevent', + 'http://n.whatwg.org/work', + ); + if ( isset( $out['itemtype'] ) && !in_array( $out['itemtype'], + $allowedTypes ) ) { + # Kill everything + unset( $out['itemscope'] ); + } + # itemtype, itemid, itemref don't make sense without itemscope + if ( !array_key_exists( 'itemscope', $out ) ) { + unset( $out['itemtype'] ); + unset( $out['itemid'] ); + unset( $out['itemref'] ); + } + # TODO: Strip itemprop if we aren't descendants of an itemscope. + } return $out; } @@ -639,8 +716,8 @@ class Sanitizer { * will be combined (if they're both strings). * * @todo implement merging for other attributes such as style - * @param array $a - * @param array $b + * @param $a Array + * @param $b Array * @return array */ static function mergeAttributes( $a, $b ) { @@ -661,30 +738,108 @@ class Sanitizer { * * Currently URL references, 'expression', 'tps' are forbidden. * - * @param string $value - * @return mixed + * @param $value String + * @return Mixed */ static function checkCss( $value ) { - $stripped = Sanitizer::decodeCharReferences( $value ); + $value = Sanitizer::decodeCharReferences( $value ); // Remove any comments; IE gets token splitting wrong - $stripped = StringUtils::delimiterReplace( '/*', '*/', ' ', $stripped ); - - $value = $stripped; - - // ... and continue checks - $stripped = preg_replace( '!\\\\([0-9A-Fa-f]{1,6})[ \\n\\r\\t\\f]?!e', - 'codepointToUtf8(hexdec("$1"))', $stripped ); - $stripped = str_replace( '\\', '', $stripped ); - if( preg_match( '/(?:expression|tps*:\/\/|url\\s*\().*/is', - $stripped ) ) { - # haxx0r - return false; + $value = StringUtils::delimiterReplace( '/*', '*/', ' ', $value ); + + // Decode escape sequences and line continuation + // See the grammar in the CSS 2 spec, appendix D. 
+ static $decodeRegex; + if ( !$decodeRegex ) { + $space = '[\\x20\\t\\r\\n\\f]'; + $nl = '(?:\\n|\\r\\n|\\r|\\f)'; + $backslash = '\\\\'; + $decodeRegex = "/ $backslash + (?: + ($nl) | # 1. Line continuation + ([0-9A-Fa-f]{1,6})$space? | # 2. character number + (.) | # 3. backslash cancelling special meaning + () | # 4. backslash at end of string + )/xu"; + } + $value = preg_replace_callback( $decodeRegex, + array( __CLASS__, 'cssDecodeCallback' ), $value ); + + // Reject problematic keywords and control characters + if ( preg_match( '/[\000-\010\016-\037\177]/', $value ) ) { + return '/* invalid control char */'; + } elseif ( preg_match( '! expression | filter\s*: | accelerator\s*: | url\s*\( !ix', $value ) ) { + return '/* insecure input */'; } - return $value; } + static function cssDecodeCallback( $matches ) { + if ( $matches[1] !== '' ) { + // Line continuation + return ''; + } elseif ( $matches[2] !== '' ) { + $char = codepointToUtf8( hexdec( $matches[2] ) ); + } elseif ( $matches[3] !== '' ) { + $char = $matches[3]; + } else { + $char = '\\'; + } + if ( $char == "\n" || $char == '"' || $char == "'" || $char == '\\' ) { + // These characters need to be escaped in strings + // Clean up the escape sequence to avoid parsing errors by clients + return '\\' . dechex( ord( $char ) ) . ' '; + } else { + // Decode unnecessary escape + return $char; + } + } + + /** + * Take an associative array of attribute name/value pairs + * and generate a css style representing all the style-related + * attributes. If there already a style attribute in the array, + * it is also included in the value returned. 
+ */ + static function styleFromAttributes( $attributes ) { + $styles = array(); + + foreach ( $attributes as $attribute => $value ) { + if ( $attribute == 'bgcolor' ) { + $styles[] = "background-color: $value"; + } else if ( $attribute == 'border' ) { + $styles[] = "border-width: $value"; + } else if ( $attribute == 'align' ) { + $styles[] = "text-align: $value"; + } else if ( $attribute == 'valign' ) { + $styles[] = "vertical-align: $value"; + } else if ( $attribute == 'width' ) { + if ( preg_match( '/\d+/', $value ) === false ) { + $value .= 'px'; + } + + $styles[] = "width: $value"; + } else if ( $attribute == 'height' ) { + if ( preg_match( '/\d+/', $value ) === false ) { + $value .= 'px'; + } + + $styles[] = "height: $value"; + } else if ( $attribute == 'nowrap' ) { + if ( $value ) { + $styles[] = "white-space: nowrap"; + } + } + } + + if ( isset( $attributes[ 'style' ] ) ) { + $styles[] = $attributes[ 'style' ]; + } + + if ( !$styles ) return ''; + else return implode( '; ', $styles ); + } + /** * Take a tag soup fragment listing an HTML element's attributes * and normalize it to well-formed XML, discarding unwanted attributes. @@ -700,31 +855,73 @@ class Sanitizer { * - Unsafe style attributes are discarded * - Prepends space if there are attributes. * - * @param string $text - * @param string $element - * @return string + * @param $text String + * @param $element String + * @param $defaults Array (optional) associative array of default attributes to splice in. + * class and style attributes are combined. Otherwise, values from + * $attributes take precedence over values from $defaults. 
+ * @return String */ - static function fixTagAttributes( $text, $element ) { + static function fixTagAttributes( $text, $element, $defaults = null ) { if( trim( $text ) == '' ) { return ''; } - $stripped = Sanitizer::validateTagAttributes( - Sanitizer::decodeTagAttributes( $text ), $element ); + $decoded = Sanitizer::decodeTagAttributes( $text ); + $stripped = Sanitizer::validateTagAttributes( $decoded, $element ); + $attribs = Sanitizer::collapseTagAttributes( $stripped, $defaults ); - $attribs = array(); - foreach( $stripped as $attribute => $value ) { + return $attribs; + } + + /** + * Take an associative array or attribute name/value pairs + * and collapses it to well-formed XML. + * Does not filter attributes. + * Output is safe for further wikitext processing, with escaping of + * values that could trigger problems. + * + * - Double-quotes all attribute values + * - Prepends space if there are attributes. + * + * @param $attributes Array is an associative array of attribute name/value pairs. + * Assumed to be sanitized already. + * @param $defaults Array (optional) associative array of default attributes to splice in. + * class and style attributes are combined. Otherwise, values from + * $attributes take precedence over values from $defaults. + * @return String + */ + static function collapseTagAttributes( $attributes, $defaults = null ) { + if ( $defaults ) { + foreach( $defaults as $attribute => $value ) { + if ( isset( $attributes[ $attribute ] ) ) { + if ( $attribute == 'class' ) { + $value .= ' '. $attributes[ $attribute ]; + } else if ( $attribute == 'style' ) { + $value .= '; ' . 
$attributes[ $attribute ]; + } else { + continue; + } + } + + $attributes[ $attribute ] = $value; + } + } + + $chunks = array(); + + foreach( $attributes as $attribute => $value ) { $encAttribute = htmlspecialchars( $attribute ); $encValue = Sanitizer::safeEncodeAttribute( $value ); - $attribs[] = "$encAttribute=\"$encValue\""; + $chunks[] = "$encAttribute=\"$encValue\""; } - return count( $attribs ) ? ' ' . implode( ' ', $attribs ) : ''; + return count( $chunks ) ? ' ' . implode( ' ', $chunks ) : ''; } /** * Encode an attribute value for HTML output. - * @param $text + * @param $text String * @return HTML-encoded text fragment */ static function encodeAttribute( $text ) { @@ -745,7 +942,7 @@ class Sanitizer { /** * Encode an attribute value for HTML tags, with extra armoring * against further wiki processing. - * @param $text + * @param $text String * @return HTML-encoded text fragment */ static function safeEncodeAttribute( $text ) { @@ -776,63 +973,66 @@ class Sanitizer { } /** - * Given a value escape it so that it can be used in an id attribute and - * return it, this does not validate the value however (see first link) + * Given a value, escape it so that it can be used in an id attribute and + * return it. This will use HTML5 validation if $wgExperimentalHtmlIds is + * true, allowing anything but ASCII whitespace. Otherwise it will use + * HTML 4 rules, which means a narrow subset of ASCII, with bad characters + * escaped with lots of dots. + * + * To ensure we don't have to bother escaping anything, we also strip ', ", + * & even if $wgExperimentalIds is true. TODO: Is this the best tactic? + * We also strip # because it upsets IE, and % because it could be + * ambiguous if it's part of something that looks like a percent escape + * (which don't work reliably in fragments cross-browser). 
* * @see http://www.w3.org/TR/html401/types.html#type-name Valid characters * in the id and * name attributes * @see http://www.w3.org/TR/html401/struct/links.html#h-12.2.3 Anchors with the id attribute + * @see http://www.whatwg.org/specs/web-apps/current-work/multipage/elements.html#the-id-attribute + * HTML5 definition of id attribute * - * @param string $id Id to validate - * @param mixed $options String or array of strings (default is array()): + * @param $id String: id to escape + * @param $options Mixed: string or array of strings (default is array()): * 'noninitial': This is a non-initial fragment of an id, not a full id, * so don't pay attention if the first character isn't valid at the - * beginning of an id. - * 'xml': Don't restrict the id to be HTML4-compatible. This option - * allows any alphabetic character to be used, per the XML standard. - * Therefore, it also completely changes the type of escaping: instead - * of weird dot-encoding, runs of invalid characters (mostly - * whitespace) are just compressed into a single underscore. - * @return string + * beginning of an id. Only matters if $wgExperimentalHtmlIds is + * false. + * 'legacy': Behave the way the old HTML 4-based ID escaping worked even + * if $wgExperimentalHtmlIds is used, so we can generate extra + * anchors and links won't break. + * @return String */ static function escapeId( $id, $options = array() ) { + global $wgHtml5, $wgExperimentalHtmlIds; $options = (array)$options; - if ( !in_array( 'xml', $options ) ) { - # HTML4-style escaping - static $replace = array( - '%3A' => ':', - '%' => '.' - ); - - $id = urlencode( Sanitizer::decodeCharReferences( strtr( $id, ' ', '_' ) ) ); - $id = str_replace( array_keys( $replace ), array_values( $replace ), $id ); - - if ( !preg_match( '/^[a-zA-Z]/', $id ) - && !in_array( 'noninitial', $options ) ) { - // Initial character must be a letter! 
- $id = "x$id"; + if ( $wgHtml5 && $wgExperimentalHtmlIds && !in_array( 'legacy', $options ) ) { + $id = Sanitizer::decodeCharReferences( $id ); + $id = preg_replace( '/[ \t\n\r\f_\'"&#%]+/', '_', $id ); + $id = trim( $id, '_' ); + if ( $id === '' ) { + # Must have been all whitespace to start with. + return '_'; + } else { + return $id; } - return $id; } - # XML-style escaping. For the patterns used, see the XML 1.0 standard, - # 5th edition, NameStartChar and NameChar: - $nameStartChar = ':a-zA-Z_\xC0-\xD6\xD8-\xF6\xF8-\x{2FF}\x{370}-\x{37D}' - . '\x{37F}-\x{1FFF}\x{200C}-\x{200D}\x{2070}-\x{218F}\x{2C00}-\x{2FEF}' - . '\x{3001}-\x{D7FF}\x{F900}-\x{FDCF}\x{FDF0}-\x{FFFD}\x{10000}-\x{EFFFF}'; - $nameChar = $nameStartChar . '.\-0-9\xB7\x{0300}-\x{036F}' - . '\x{203F}-\x{2040}'; - # Replace _ as well so we don't get multiple consecutive underscores - $id = preg_replace( "/([^$nameChar]|_)+/u", '_', $id ); - $id = trim( $id, '_' ); - - if ( !preg_match( "/^[$nameStartChar]/u", $id ) - && !in_array( 'noninitial', $options ) ) { - $id = "_$id"; - } + # HTML4-style escaping + static $replace = array( + '%3A' => ':', + '%' => '.' + ); + + $id = urlencode( Sanitizer::decodeCharReferences( strtr( $id, ' ', '_' ) ) ); + $id = str_replace( array_keys( $replace ), array_values( $replace ), $id ); + if ( !preg_match( '/^[a-zA-Z]/', $id ) + && !in_array( 'noninitial', $options ) ) { + // Initial character must be a letter! + $id = "x$id"; + } return $id; } @@ -844,8 +1044,8 @@ class Sanitizer { * * @see http://www.w3.org/TR/CSS21/syndata.html Valid characters/format * - * @param string $class - * @return string + * @param $class String + * @return String */ static function escapeClass( $class ) { // Convert ugly stuff to underscores and kill underscores in ugly places @@ -857,25 +1057,23 @@ class Sanitizer { /** * Given HTML input, escape with htmlspecialchars but un-escape entites. - * This allows (generally harmless) entities like   to survive. 
+ * This allows (generally harmless) entities like   to survive. * - * @param string $html String to escape - * @return string Escaped input + * @param $html String to escape + * @return String: escaped input */ static function escapeHtmlAllowEntities( $html ) { + $html = Sanitizer::decodeCharReferences( $html ); # It seems wise to escape ' as well as ", as a matter of course. Can't # hurt. $html = htmlspecialchars( $html, ENT_QUOTES ); - $html = str_replace( '&', '&', $html ); - $html = Sanitizer::normalizeCharReferences( $html ); return $html; } /** * Regex replace callback for armoring links against further processing. - * @param array $matches + * @param $matches Array * @return string - * @private */ private static function armorLinksCallback( $matches ) { return str_replace( ':', ':', $matches[1] ); @@ -886,16 +1084,15 @@ class Sanitizer { * a partial tag string. Attribute names are forces to lowercase, * character references are decoded to UTF-8 text. * - * @param string - * @return array + * @param $text String + * @return Array */ public static function decodeTagAttributes( $text ) { - $attribs = array(); - if( trim( $text ) == '' ) { - return $attribs; + return array(); } + $attribs = array(); $pairs = array(); if( !preg_match_all( MW_ATTRIBS_REGEX, @@ -923,9 +1120,8 @@ class Sanitizer { * Pick the appropriate attribute value from a match set from the * MW_ATTRIBS_REGEX matches. * - * @param array $set - * @return string - * @private + * @param $set Array + * @return String */ private static function getTagAttributeCallback( $set ) { if( isset( $set[6] ) ) { @@ -957,9 +1153,8 @@ class Sanitizer { * but note that we're not returning the value, but are returning * XML source fragments that will be slapped into output. 
* - * @param string $text - * @return string - * @private + * @param $text String + * @return String */ private static function normalizeAttributeValue( $text ) { return str_replace( '"', '"', @@ -974,6 +1169,18 @@ class Sanitizer { $text ); } + /** + * Normalizes whitespace in a section name, such as might be returned + * by Parser::stripSectionName(), for use in the id's that are used for + * section links. + * + * @param $section String + * @return String + */ + static function normalizeSectionNameWhitespace( $section ) { + return trim( preg_replace( '/[ _]+/', ' ', $section ) ); + } + /** * Ensure that any entities and character references are legal * for XML and XHTML specifically. Any stray bits will be @@ -984,8 +1191,8 @@ class Sanitizer { * c. use &#x, not &#X * d. fix or reject non-valid attributes * - * @param string $text - * @return string + * @param $text String + * @return String * @private */ static function normalizeCharReferences( $text ) { @@ -995,8 +1202,8 @@ class Sanitizer { $text ); } /** - * @param string $matches - * @return string + * @param $matches String + * @return String */ static function normalizeCharReferencesCallback( $matches ) { $ret = null; @@ -1022,9 +1229,8 @@ class Sanitizer { * MediaWiki-specific alias, returns the HTML equivalent. Otherwise, * returns HTML-escaped text of pseudo-entity source (eg &foo;) * - * @param string $name - * @return string - * @static + * @param $name String + * @return String */ static function normalizeEntity( $name ) { global $wgHtmlEntities, $wgHtmlEntityAliases; @@ -1057,8 +1263,8 @@ class Sanitizer { /** * Returns true if a given Unicode codepoint is a valid character in XML. 
- * @param int $codepoint - * @return bool + * @param $codepoint Integer + * @return Boolean */ private static function validateCodepoint( $codepoint ) { return ($codepoint == 0x09) @@ -1073,10 +1279,8 @@ class Sanitizer { * Decode any character references, numeric or named entities, * in the text and return a UTF-8 string. * - * @param string $text - * @return string - * @public - * @static + * @param $text String + * @return String */ public static function decodeCharReferences( $text ) { return preg_replace_callback( @@ -1086,8 +1290,32 @@ class Sanitizer { } /** - * @param string $matches - * @return string + * Decode any character references, numeric or named entities, + * in the next and normalize the resulting string. (bug 14952) + * + * This is useful for page titles, not for text to be displayed, + * MediaWiki allows HTML entities to escape normalization as a feature. + * + * @param $text String (already normalized, containing entities) + * @return String (still normalized, without entities) + */ + public static function decodeCharReferencesAndNormalize( $text ) { + global $wgContLang; + $text = preg_replace_callback( + MW_CHAR_REFS_REGEX, + array( 'Sanitizer', 'decodeCharReferencesCallback' ), + $text, /* limit */ -1, $count ); + + if ( $count ) { + return $wgContLang->normalize( $text ); + } else { + return $text; + } + } + + /** + * @param $matches String + * @return String */ static function decodeCharReferencesCallback( $matches ) { if( $matches[1] != '' ) { @@ -1106,8 +1334,8 @@ class Sanitizer { /** * Return UTF-8 string for a codepoint if that is a valid * character reference, otherwise U+FFFD REPLACEMENT CHARACTER. - * @param int $codepoint - * @return string + * @param $codepoint Integer + * @return String * @private */ static function decodeChar( $codepoint ) { @@ -1123,8 +1351,8 @@ class Sanitizer { * return the UTF-8 encoding of that character. 
Otherwise, returns * pseudo-entity source (eg &foo;) * - * @param string $name - * @return string + * @param $name Strings + * @return String */ static function decodeEntity( $name ) { global $wgHtmlEntities, $wgHtmlEntityAliases; @@ -1139,11 +1367,10 @@ class Sanitizer { } /** - * Fetch the whitelist of acceptable attributes for a given - * element name. + * Fetch the whitelist of acceptable attributes for a given element name. * - * @param string $element - * @return array + * @param $element String + * @return Array */ static function attributeWhitelist( $element ) { static $list; @@ -1158,10 +1385,27 @@ class Sanitizer { /** * Foreach array key (an allowed HTML element), return an array * of allowed attributes - * @return array + * @return Array */ static function setupAttributeWhitelist() { + global $wgAllowRdfaAttributes, $wgHtml5, $wgAllowMicrodataAttributes; + $common = array( 'id', 'class', 'lang', 'dir', 'title', 'style' ); + + if ( $wgAllowRdfaAttributes ) { + #RDFa attributes as specified in section 9 of http://www.w3.org/TR/2008/REC-rdfa-syntax-20081014 + $common = array_merge( $common, array( + 'about', 'property', 'resource', 'datatype', 'typeof', + ) ); + } + + if ( $wgHtml5 && $wgAllowMicrodataAttributes ) { + # add HTML5 microdata tages as pecified by http://www.whatwg.org/specs/web-apps/current-work/multipage/microdata.html#the-microdata-model + $common = array_merge( $common, array( + 'itemid', 'itemprop', 'itemref', 'itemscope', 'itemtype' + ) ); + } + $block = array_merge( $common, array( 'align' ) ); $tablealign = array( 'align', 'char', 'charoff', 'valign' ); $tablecell = array( 'abbr', @@ -1202,12 +1446,12 @@ class Sanitizer { 'em' => $common, 'strong' => $common, 'cite' => $common, - # dfn + 'dfn' => $common, 'code' => $common, # samp # kbd 'var' => $common, - # abbr + 'abbr' => $common, # acronym # 9.2.2 @@ -1267,10 +1511,14 @@ class Sanitizer { 'td' => array_merge( $common, $tablecell, $tablealign ), 'th' => array_merge( $common, 
$tablecell, $tablealign ), + # 12.2 # NOTE: is not allowed directly, but the attrib whitelist is used from the Parser object + 'a' => array_merge( $common, array( 'href', 'rel', 'rev' ) ), # rel/rev esp. for RDFa + # 13.2 # Not usually allowed, but may be used for extension-style hooks - # such as when it is rasterized - 'img' => array_merge( $common, array( 'alt' ) ), + # such as when it is rasterized, or if $wgAllowImageTag is + # true + 'img' => array_merge( $common, array( 'alt', 'src', 'width', 'height' ) ), # 15.2.1 'tt' => $common, @@ -1313,8 +1561,8 @@ class Sanitizer { * Warning: this return value must be further escaped for literal * inclusion in HTML output as of 1.10! * - * @param string $text HTML fragment - * @return string + * @param $text String: HTML fragment + * @return String */ static function stripAllTags( $text ) { # Actual @@ -1334,8 +1582,7 @@ class Sanitizer { * * Use for passing XHTML fragments to PHP's XML parsing functions * - * @return string - * @static + * @return String */ static function hackDocType() { global $wgHtmlEntities; @@ -1353,7 +1600,7 @@ class Sanitizer { $url = Sanitizer::decodeCharReferences( $url ); # Escape any control characters introduced by the above step - $url = preg_replace( '/[\][<>"\\x00-\\x20\\x7F]/e', "urlencode('\\0')", $url ); + $url = preg_replace( '/[\][<>"\\x00-\\x20\\x7F\|]/e', "urlencode('\\0')", $url ); # Validate hostname portion $matches = array(); @@ -1381,7 +1628,7 @@ class Sanitizer { $host = preg_replace( $strip, '', $host ); - // @fixme: validate hostnames here + // @todo Fixme: validate hostnames here return $protocol . $host . $rest; } else {