X-Git-Url: https://git.heureux-cyclage.org/?p=lhc%2Fweb%2Fwiklou.git;a=blobdiff_plain;f=includes%2FSanitizer.php;h=ed09701d4bf714181d35663b34da2967f4e18755;hp=b08bc6942581dccb70025eaaa763b8a4d6e56b0d;hb=89539f2aa1b158fdcc703ad053e2580cb97a6385;hpb=b85207c9c014b4a639cf3ce978cbb5f59799ef2a diff --git a/includes/Sanitizer.php b/includes/Sanitizer.php index b08bc69425..ed09701d4b 100644 --- a/includes/Sanitizer.php +++ b/includes/Sanitizer.php @@ -56,6 +56,21 @@ class Sanitizer { const EVIL_URI_PATTERN = '!(^|\s|\*/\s*)(javascript|vbscript)([^\w]|$)!i'; const XMLNS_ATTRIBUTE_PATTERN = "/^xmlns:[:A-Z_a-z-.0-9]+$/"; + /** + * Tells escapeUrlForHtml() to encode the ID using the wiki's primary encoding. + * + * @since 1.30 + */ + const ID_PRIMARY = 0; + + /** + * Tells escapeUrlForHtml() to encode the ID using the fallback encoding, or return false + * if no fallback is configured. + * + * @since 1.30 + */ + const ID_FALLBACK = 1; + /** * List of all named character entities defined in HTML 4.01 * https://www.w3.org/TR/html4/sgml/entities.html @@ -465,7 +480,7 @@ class Sanitizer { extract( self::getRecognizedTagData( $extratags, $removetags ) ); # Remove HTML comments - $text = Sanitizer::removeHTMLcomments( $text ); + $text = self::removeHTMLcomments( $text ); $bits = explode( '<', $text ); $text = str_replace( '>', '>', array_shift( $bits ) ); if ( !MWTidy::isEnabled() ) { @@ -583,12 +598,12 @@ class Sanitizer { call_user_func_array( $processCallback, [ &$params, $args ] ); } - if ( !Sanitizer::validateTag( $params, $t ) ) { + if ( !self::validateTag( $params, $t ) ) { $badtag = true; } # Strip non-approved attributes from the tag - $newparams = Sanitizer::fixTagAttributes( $params, $t ); + $newparams = self::fixTagAttributes( $params, $t ); } if ( !$badtag ) { $rest = str_replace( '>', '>', $rest ); @@ -629,11 +644,11 @@ class Sanitizer { call_user_func_array( $warnCallback, [ 'deprecated-self-close-category' ] ); } } - if ( !Sanitizer::validateTag( $params, $t ) ) { + if ( !self::validateTag( $params, $t ) ) { $badtag = true; } - $newparams = Sanitizer::fixTagAttributes( $params, $t ); + $newparams = self::fixTagAttributes( $params, $t ); if ( !$badtag ) { if ( $brace === '/>' && !isset( $htmlsingleonly[$t] ) ) { # Interpret self-closing tags as empty tags even when @@ -710,7 +725,7 @@ class Sanitizer { * @return bool */ static function validateTag( $params, $element ) { - $params = Sanitizer::decodeTagAttributes( $params ); + $params = self::decodeTagAttributes( $params ); if ( $element == 'meta' || $element == 'link' ) { if ( !isset( $params['itemprop'] ) ) { @@ -746,8 +761,8 @@ class Sanitizer { * @todo Check for unique id attribute :P */ static function validateTagAttributes( $attribs, $element ) { - return Sanitizer::validateAttributes( $attribs, - Sanitizer::attributeWhitelist( $element ) ); + return self::validateAttributes( $attribs, + self::attributeWhitelist( $element ) ); } /** @@ -795,12 +810,12 @@ class Sanitizer { # Strip javascript "expression" from stylesheets. # https://msdn.microsoft.com/en-us/library/ms537634.aspx if ( $attribute == 'style' ) { - $value = Sanitizer::checkCss( $value ); + $value = self::checkCss( $value ); } # Escape HTML id attributes if ( $attribute === 'id' ) { - $value = Sanitizer::escapeId( $value, 'noninitial' ); + $value = self::escapeIdForAttribute( $value, self::ID_PRIMARY ); } # Escape HTML id reference lists @@ -809,7 +824,7 @@ class Sanitizer { || $attribute === 'aria-labelledby' || $attribute === 'aria-owns' ) { - $value = Sanitizer::escapeIdReferenceList( $value, 'noninitial' ); + $value = self::escapeIdReferenceList( $value, 'noninitial' ); } // RDFa and microdata properties allow URLs, URIs and/or CURIs. @@ -907,7 +922,7 @@ class Sanitizer { */ public static function normalizeCss( $value ) { // Decode character references like { - $value = Sanitizer::decodeCharReferences( $value ); + $value = self::decodeCharReferences( $value ); // Decode escape sequences and line continuation // See the grammar in the CSS 2 spec, appendix D. @@ -1087,14 +1102,14 @@ class Sanitizer { return ''; } - $decoded = Sanitizer::decodeTagAttributes( $text ); - $stripped = Sanitizer::validateTagAttributes( $decoded, $element ); + $decoded = self::decodeTagAttributes( $text ); + $stripped = self::validateTagAttributes( $decoded, $element ); if ( $sorted ) { ksort( $stripped ); } - return Sanitizer::safeEncodeTagAttributes( $stripped ); + return self::safeEncodeTagAttributes( $stripped ); } /** @@ -1124,7 +1139,7 @@ class Sanitizer { * @return string HTML-encoded text fragment */ static function safeEncodeAttribute( $text ) { - $encValue = Sanitizer::encodeAttribute( $text ); + $encValue = self::encodeAttribute( $text ); # Templates and links may be expanded in later parsing, # creating invalid or dangerous output. Suppress this. @@ -1164,6 +1179,8 @@ class Sanitizer { * ambiguous if it's part of something that looks like a percent escape * (which don't work reliably in fragments cross-browser). * + * @deprecated since 1.30, use one of this class' escapeIdFor*() functions + * * @see https://www.w3.org/TR/html401/types.html#type-name Valid characters * in the id and name attributes * @see https://www.w3.org/TR/html401/struct/links.html#h-12.2.3 Anchors with @@ -1186,7 +1203,7 @@ class Sanitizer { global $wgExperimentalHtmlIds; $options = (array)$options; - $id = Sanitizer::decodeCharReferences( $id ); + $id = self::decodeCharReferences( $id ); if ( $wgExperimentalHtmlIds && !in_array( 'legacy', $options ) ) { $id = preg_replace( '/[ \t\n\r\f_\'"&#%]+/', '_', $id ); @@ -1215,21 +1232,146 @@ class Sanitizer { return $id; } + /** + * Given a section name or other user-generated or otherwise unsafe string, escapes it to be + * a valid HTML id attribute. + * + * WARNING: unlike escapeId(), the output of this function is not guaranteed to be HTML safe, + * be sure to use proper escaping. + * + * @param string $id String to escape + * @param int $mode One of ID_* constants, specifying whether the primary or fallback encoding + * should be used. + * @return string|bool Escaped ID or false if fallback encoding is requested but it's not + * configured. + * + * @since 1.30 + */ + public static function escapeIdForAttribute( $id, $mode = self::ID_PRIMARY ) { + global $wgFragmentMode; + + if ( !isset( $wgFragmentMode[$mode] ) ) { + if ( $mode === self::ID_PRIMARY ) { + throw new UnexpectedValueException( '$wgFragmentMode is configured with no primary mode' ); + } + return false; + } + + $internalMode = $wgFragmentMode[$mode]; + + return self::escapeIdInternal( $id, $internalMode ); + } + + /** + * Given a section name or other user-generated or otherwise unsafe string, escapes it to be + * a valid URL fragment. + * + * WARNING: unlike escapeId(), the output of this function is not guaranteed to be HTML safe, + * be sure to use proper escaping. + * + * @param string $id String to escape + * @return string Escaped ID + * + * @since 1.30 + */ + public static function escapeIdForLink( $id ) { + global $wgFragmentMode; + + if ( !isset( $wgFragmentMode[self::ID_PRIMARY] ) ) { + throw new UnexpectedValueException( '$wgFragmentMode is configured with no primary mode' ); + } + + $mode = $wgFragmentMode[self::ID_PRIMARY]; + + $id = self::escapeIdInternal( $id, $mode ); + $id = self::urlEscapeId( $id, $mode ); + + return $id; + } + + /** + * Given a section name or other user-generated or otherwise unsafe string, escapes it to be + * a valid URL fragment for external interwikis. + * + * @param string $id String to escape + * @return string Escaped ID + * + * @since 1.30 + */ + public static function escapeIdForExternalInterwiki( $id ) { + global $wgExternalInterwikiFragmentMode; + + $id = self::escapeIdInternal( $id, $wgExternalInterwikiFragmentMode ); + $id = self::urlEscapeId( $id, $wgExternalInterwikiFragmentMode ); + + return $id; + } + + /** + * Helper for escapeIdFor*() functions. URL-escapes the ID if needed. + * + * @param string $id String to escape + * @param string $mode One of modes from $wgFragmentMode + * @return string + */ + private static function urlEscapeId( $id, $mode ) { + if ( $mode === 'html5' ) { + $id = urlencode( $id ); + $id = str_replace( '%3A', ':', $id ); + } + + return $id; + } + + /** + * Helper for escapeIdFor*() functions. Performs most of the actual escaping. + * + * @param string $id String to escape + * @param string $mode One of modes from $wgFragmentMode + * @return string + */ + private static function escapeIdInternal( $id, $mode ) { + $id = self::decodeCharReferences( $id ); + + switch ( $mode ) { + case 'html5': + $id = str_replace( ' ', '_', $id ); + break; + case 'legacy': + // This corresponds to 'noninitial' mode of the old escapeId() + static $replace = [ + '%3A' => ':', + '%' => '.' + ]; + + $id = urlencode( str_replace( ' ', '_', $id ) ); + $id = strtr( $id, $replace ); + break; + case 'html5-legacy': + $id = preg_replace( '/[ \t\n\r\f_\'"&#%]+/', '_', $id ); + $id = trim( $id, '_' ); + if ( $id === '' ) { + // Must have been all whitespace to start with. + $id = '_'; + } + break; + default: + throw new InvalidArgumentException( "Invalid mode '$mode' passed to '" . __METHOD__ ); + } + + return $id; + } + /** * Given a string containing a space delimited list of ids, escape each id * to match ids escaped by the escapeId() function. * + * @todo wfDeprecated() uses of $options in 1.31, remove completely in 1.32 + * * @since 1.27 * * @param string $referenceString Space delimited list of ids - * @param string|array $options String or array of strings (default is array()): - * 'noninitial': This is a non-initial fragment of an id, not a full id, - * so don't pay attention if the first character isn't valid at the - * beginning of an id. Only matters if $wgExperimentalHtmlIds is - * false. - * 'legacy': Behave the way the old HTML 4-based ID escaping worked even - * if $wgExperimentalHtmlIds is used, so we can generate extra - * anchors and links won't break. + * @param string|array $options Deprecated and does nothing. * @return string */ static function escapeIdReferenceList( $referenceString, $options = [] ) { @@ -1238,7 +1380,7 @@ class Sanitizer { # Escape each token as an id foreach ( $references as &$ref ) { - $ref = Sanitizer::escapeId( $ref, $options ); + $ref = self::escapeIdForAttribute( $ref ); } # Merge the array back to a space delimited list string @@ -1275,7 +1417,7 @@ class Sanitizer { * @return string Escaped input */ static function escapeHtmlAllowEntities( $html ) { - $html = Sanitizer::decodeCharReferences( $html ); + $html = self::decodeCharReferences( $html ); # It seems wise to escape ' as well as ", as a matter of course. Can't # hurt. Use ENT_SUBSTITUTE so that incorrectly truncated multibyte characters # don't cause the entire string to disappear. @@ -1317,14 +1459,14 @@ class Sanitizer { foreach ( $pairs as $set ) { $attribute = strtolower( $set[1] ); - $value = Sanitizer::getTagAttributeCallback( $set ); + $value = self::getTagAttributeCallback( $set ); // Normalize whitespace $value = preg_replace( '/[\t\r\n ]+/', ' ', $value ); $value = trim( $value ); // Decode character references - $attribs[$attribute] = Sanitizer::decodeCharReferences( $value ); + $attribs[$attribute] = self::decodeCharReferences( $value ); } return $attribs; } @@ -1340,7 +1482,7 @@ class Sanitizer { $attribs = []; foreach ( $assoc_array as $attribute => $value ) { $encAttribute = htmlspecialchars( $attribute ); - $encValue = Sanitizer::safeEncodeAttribute( $value ); + $encValue = self::safeEncodeAttribute( $value ); $attribs[] = "$encAttribute=\"$encValue\""; } @@ -1427,11 +1569,11 @@ class Sanitizer { static function normalizeCharReferencesCallback( $matches ) { $ret = null; if ( $matches[1] != '' ) { - $ret = Sanitizer::normalizeEntity( $matches[1] ); + $ret = self::normalizeEntity( $matches[1] ); } elseif ( $matches[2] != '' ) { - $ret = Sanitizer::decCharReference( $matches[2] ); + $ret = self::decCharReference( $matches[2] ); } elseif ( $matches[3] != '' ) { - $ret = Sanitizer::hexCharReference( $matches[3] ); + $ret = self::hexCharReference( $matches[3] ); } if ( is_null( $ret ) ) { return htmlspecialchars( $matches[0] ); @@ -1468,7 +1610,7 @@ class Sanitizer { */ static function decCharReference( $codepoint ) { $point = intval( $codepoint ); - if ( Sanitizer::validateCodepoint( $point ) ) { + if ( self::validateCodepoint( $point ) ) { return sprintf( '&#%d;', $point ); } else { return null; @@ -1481,7 +1623,7 @@ class Sanitizer { */ static function hexCharReference( $codepoint ) { $point = hexdec( $codepoint ); - if ( Sanitizer::validateCodepoint( $point ) ) { + if ( self::validateCodepoint( $point ) ) { return sprintf( '&#x%x;', $point ); } else { return null; @@ -1535,7 +1677,10 @@ class Sanitizer { $text = preg_replace_callback( self::CHAR_REFS_REGEX, [ 'Sanitizer', 'decodeCharReferencesCallback' ], - $text, /* limit */ -1, $count ); + $text, + -1, //limit + $count + ); if ( $count ) { return $wgContLang->normalize( $text ); @@ -1550,11 +1695,11 @@ class Sanitizer { */ static function decodeCharReferencesCallback( $matches ) { if ( $matches[1] != '' ) { - return Sanitizer::decodeEntity( $matches[1] ); + return self::decodeEntity( $matches[1] ); } elseif ( $matches[2] != '' ) { - return Sanitizer::decodeChar( intval( $matches[2] ) ); + return self::decodeChar( intval( $matches[2] ) ); } elseif ( $matches[3] != '' ) { - return Sanitizer::decodeChar( hexdec( $matches[3] ) ); + return self::decodeChar( hexdec( $matches[3] ) ); } # Last case should be an ampersand by itself return $matches[0]; @@ -1568,7 +1713,7 @@ class Sanitizer { * @private */ static function decodeChar( $codepoint ) { - if ( Sanitizer::validateCodepoint( $codepoint ) ) { + if ( self::validateCodepoint( $codepoint ) ) { return UtfNormal\Utils::codepointToUtf8( $codepoint ); } else { return UtfNormal\Constants::UTF8_REPLACEMENT; @@ -1601,7 +1746,7 @@ class Sanitizer { * @return array */ static function attributeWhitelist( $element ) { - $list = Sanitizer::setupAttributeWhitelist(); + $list = self::setupAttributeWhitelist(); return isset( $list[$element] ) ? $list[$element] : []; @@ -1772,7 +1917,7 @@ class Sanitizer { # Not usually allowed, but may be used for extension-style hooks # such as when it is rasterized, or if $wgAllowImageTag is # true - 'img' => array_merge( $common, [ 'alt', 'src', 'width', 'height' ] ), + 'img' => array_merge( $common, [ 'alt', 'src', 'width', 'height', 'srcset' ] ), 'video' => array_merge( $common, [ 'poster', 'controls', 'preload', 'width', 'height' ] ), 'source' => array_merge( $common, [ 'type', 'src' ] ), @@ -1809,6 +1954,10 @@ class Sanitizer { # https://www.w3.org/TR/REC-MathML/ 'math' => [ 'class', 'style', 'id', 'title' ], + // HTML 5 section 4.5 + 'figure' => $common, + 'figcaption' => $common, + # HTML 5 section 4.6 'bdi' => $common, @@ -1824,7 +1973,7 @@ class Sanitizer { // (ie: validateTag rejects tags missing the attributes needed for Microdata) // So we don't bother including $common attributes that have no purpose. 'meta' => [ 'itemprop', 'content' ], - 'link' => [ 'itemprop', 'href' ], + 'link' => [ 'itemprop', 'href', 'title' ], ]; return $whitelist; @@ -1876,7 +2025,7 @@ class Sanitizer { static function cleanUrl( $url ) { # Normalize any HTML entities in input. They will be # re-escaped by makeExternalLink(). - $url = Sanitizer::decodeCharReferences( $url ); + $url = self::decodeCharReferences( $url ); # Escape any control characters introduced by the above step $url = preg_replace_callback( '/[\][<>"\\x00-\\x20\\x7F\|]/',