X-Git-Url: https://git.heureux-cyclage.org/?a=blobdiff_plain;f=includes%2FSanitizer.php;h=8424432f94595ca1edd415300761f1bda1331702;hb=956c2430c7b8fceb289aaeacc8d6c2e0def2c56e;hp=c4883ba289e02db9325c9072500a0cd43baf0134;hpb=e94c89e7e3374d28a21e531a171c210eebf4b245;p=lhc%2Fweb%2Fwiklou.git diff --git a/includes/Sanitizer.php b/includes/Sanitizer.php index c4883ba289..907da16054 100644 --- a/includes/Sanitizer.php +++ b/includes/Sanitizer.php @@ -56,6 +56,21 @@ class Sanitizer { const EVIL_URI_PATTERN = '!(^|\s|\*/\s*)(javascript|vbscript)([^\w]|$)!i'; const XMLNS_ATTRIBUTE_PATTERN = "/^xmlns:[:A-Z_a-z-.0-9]+$/"; + /** + * Tells escapeUrlForHtml() to encode the ID using the wiki's primary encoding. + * + * @since 1.30 + */ + const ID_PRIMARY = 0; + + /** + * Tells escapeUrlForHtml() to encode the ID using the fallback encoding, or return false + * if no fallback is configured. + * + * @since 1.30 + */ + const ID_FALLBACK = 1; + /** * List of all named character entities defined in HTML 4.01 * https://www.w3.org/TR/html4/sgml/entities.html @@ -339,8 +354,8 @@ class Sanitizer { */ static function getAttribsRegex() { if ( self::$attribsRegex === null ) { - $attribFirst = '[:A-Z_a-z0-9]'; - $attrib = '[:A-Z_a-z-.0-9]'; + $attribFirst = "[:_\p{L}\p{N}]"; + $attrib = "[:_\.\-\p{L}\p{N}]"; $space = '[\x09\x0a\x0c\x0d\x20]'; self::$attribsRegex = "/(?:^|$space)({$attribFirst}{$attrib}*) @@ -351,7 +366,7 @@ class Sanitizer { | '([^']*)(?:'|\$) | (((?!$space|>).)*) ) - )?(?=$space|\$)/sx"; + )?(?=$space|\$)/sxu"; } return self::$attribsRegex; } @@ -465,7 +480,7 @@ class Sanitizer { extract( self::getRecognizedTagData( $extratags, $removetags ) ); # Remove HTML comments - $text = Sanitizer::removeHTMLcomments( $text ); + $text = self::removeHTMLcomments( $text ); $bits = explode( '<', $text ); $text = str_replace( '>', '>', array_shift( $bits ) ); if ( !MWTidy::isEnabled() ) { @@ -583,12 +598,12 @@ class Sanitizer { call_user_func_array( $processCallback, [ &$params, $args ] ); } - if ( !Sanitizer::validateTag( $params, $t ) ) { + if ( !self::validateTag( $params, $t ) ) { $badtag = true; } # Strip non-approved attributes from the tag - $newparams = Sanitizer::fixTagAttributes( $params, $t ); + $newparams = self::fixTagAttributes( $params, $t ); } if ( !$badtag ) { $rest = str_replace( '>', '>', $rest ); @@ -629,11 +644,11 @@ class Sanitizer { call_user_func_array( $warnCallback, [ 'deprecated-self-close-category' ] ); } } - if ( !Sanitizer::validateTag( $params, $t ) ) { + if ( !self::validateTag( $params, $t ) ) { $badtag = true; } - $newparams = Sanitizer::fixTagAttributes( $params, $t ); + $newparams = self::fixTagAttributes( $params, $t ); if ( !$badtag ) { if ( $brace === '/>' && !isset( $htmlsingleonly[$t] ) ) { # Interpret self-closing tags as empty tags even when @@ -710,7 +725,7 @@ class Sanitizer { * @return bool */ static function validateTag( $params, $element ) { - $params = Sanitizer::decodeTagAttributes( $params ); + $params = self::decodeTagAttributes( $params ); if ( $element == 'meta' || $element == 'link' ) { if ( !isset( $params['itemprop'] ) ) { @@ -746,8 +761,8 @@ class Sanitizer { * @todo Check for unique id attribute :P */ static function validateTagAttributes( $attribs, $element ) { - return Sanitizer::validateAttributes( $attribs, - Sanitizer::attributeWhitelist( $element ) ); + return self::validateAttributes( $attribs, + self::attributeWhitelist( $element ) ); } /** @@ -782,28 +797,25 @@ class Sanitizer { # Allow any attribute beginning with "data-" # However: - # * data-ooui is reserved for ooui - # * data-mw and data-parsoid are reserved for parsoid - # * data-mw- is reserved for extensions (or core) if - # they need to communicate some data to the client and want to be - # sure that it isn't coming from an untrusted user. + # * Disallow data attributes used by MediaWiki code # * Ensure that the attribute is not namespaced by banning # colons. - if ( !preg_match( '/^data-(?!ooui|mw|parsoid)[^:]*$/i', $attribute ) + if ( !preg_match( '/^data-[^:]*$/i', $attribute ) && !isset( $whitelist[$attribute] ) + || self::isReservedDataAttribute( $attribute ) ) { continue; } # Strip javascript "expression" from stylesheets. - # http://msdn.microsoft.com/workshop/author/dhtml/overview/recalc.asp + # https://msdn.microsoft.com/en-us/library/ms537634.aspx if ( $attribute == 'style' ) { - $value = Sanitizer::checkCss( $value ); + $value = self::checkCss( $value ); } # Escape HTML id attributes if ( $attribute === 'id' ) { - $value = Sanitizer::escapeId( $value, 'noninitial' ); + $value = self::escapeIdForAttribute( $value, Sanitizer::ID_PRIMARY ); } # Escape HTML id reference lists @@ -812,7 +824,7 @@ class Sanitizer { || $attribute === 'aria-labelledby' || $attribute === 'aria-owns' ) { - $value = Sanitizer::escapeIdReferenceList( $value, 'noninitial' ); + $value = self::escapeIdReferenceList( $value, 'noninitial' ); } // RDFa and microdata properties allow URLs, URIs and/or CURIs. @@ -858,6 +870,24 @@ class Sanitizer { return $out; } + /** + * Given an attribute name, checks whether it is a reserved data attribute + * (such as data-mw-foo) which is unavailable to user-generated HTML so MediaWiki + * core and extension code can safely use it to communicate with frontend code. + * @param string $attr Attribute name. + * @return bool + */ + public static function isReservedDataAttribute( $attr ) { + // data-ooui is reserved for ooui. + // data-mw and data-parsoid are reserved for parsoid. + // data-mw- is reserved for extensions (or core) if + // they need to communicate some data to the client and want to be + // sure that it isn't coming from an untrusted user. + // We ignore the possibility of namespaces since user-generated HTML + // can't use them anymore. + return (bool)preg_match( '/^data-(ooui|mw|parsoid)/i', $attr ); + } + /** * Merge two sets of HTML attributes. Conflicting items in the second set * will override those in the first, except for 'class' attributes which @@ -891,9 +921,8 @@ class Sanitizer { * @return string normalized css */ public static function normalizeCss( $value ) { - // Decode character references like { - $value = Sanitizer::decodeCharReferences( $value ); + $value = self::decodeCharReferences( $value ); // Decode escape sequences and line continuation // See the grammar in the CSS 2 spec, appendix D. @@ -1073,14 +1102,14 @@ class Sanitizer { return ''; } - $decoded = Sanitizer::decodeTagAttributes( $text ); - $stripped = Sanitizer::validateTagAttributes( $decoded, $element ); + $decoded = self::decodeTagAttributes( $text ); + $stripped = self::validateTagAttributes( $decoded, $element ); if ( $sorted ) { ksort( $stripped ); } - return Sanitizer::safeEncodeTagAttributes( $stripped ); + return self::safeEncodeTagAttributes( $stripped ); } /** @@ -1110,7 +1139,7 @@ class Sanitizer { * @return string HTML-encoded text fragment */ static function safeEncodeAttribute( $text ) { - $encValue = Sanitizer::encodeAttribute( $text ); + $encValue = self::encodeAttribute( $text ); # Templates and links may be expanded in later parsing, # creating invalid or dangerous output. Suppress this. @@ -1150,6 +1179,8 @@ class Sanitizer { * ambiguous if it's part of something that looks like a percent escape * (which don't work reliably in fragments cross-browser). * + * @deprecated since 1.30, use one of this class' escapeIdFor*() functions + * * @see https://www.w3.org/TR/html401/types.html#type-name Valid characters * in the id and name attributes * @see https://www.w3.org/TR/html401/struct/links.html#h-12.2.3 Anchors with @@ -1172,7 +1203,7 @@ class Sanitizer { global $wgExperimentalHtmlIds; $options = (array)$options; - $id = Sanitizer::decodeCharReferences( $id ); + $id = self::decodeCharReferences( $id ); if ( $wgExperimentalHtmlIds && !in_array( 'legacy', $options ) ) { $id = preg_replace( '/[ \t\n\r\f_\'"&#%]+/', '_', $id ); @@ -1192,7 +1223,7 @@ class Sanitizer { ]; $id = urlencode( strtr( $id, ' ', '_' ) ); - $id = str_replace( array_keys( $replace ), array_values( $replace ), $id ); + $id = strtr( $id, $replace ); if ( !preg_match( '/^[a-zA-Z]/', $id ) && !in_array( 'noninitial', $options ) ) { // Initial character must be a letter! @@ -1201,21 +1232,146 @@ class Sanitizer { return $id; } + /** + * Given a section name or other user-generated or otherwise unsafe string, escapes it to be + * a valid HTML id attribute. + * + * WARNING: unlike escapeId(), the output of this function is not guaranteed to be HTML safe, + * be sure to use proper escaping. + * + * @param string $id String to escape + * @param int $mode One of ID_* constants, specifying whether the primary or fallback encoding + * should be used. + * @return string|bool Escaped ID or false if fallback encoding is requested but it's not + * configured. + * + * @since 1.30 + */ + public static function escapeIdForAttribute( $id, $mode = self::ID_PRIMARY ) { + global $wgFragmentMode; + + if ( !isset( $wgFragmentMode[$mode] ) ) { + if ( $mode === self::ID_PRIMARY ) { + throw new UnexpectedValueException( '$wgFragmentMode is configured with no primary mode' ); + } + return false; + } + + $internalMode = $wgFragmentMode[$mode]; + + return self::escapeIdInternal( $id, $internalMode ); + } + + /** + * Given a section name or other user-generated or otherwise unsafe string, escapes it to be + * a valid URL fragment. + * + * WARNING: unlike escapeId(), the output of this function is not guaranteed to be HTML safe, + * be sure to use proper escaping. + * + * @param string $id String to escape + * @return string Escaped ID + * + * @since 1.30 + */ + public static function escapeIdForLink( $id ) { + global $wgFragmentMode; + + if ( !isset( $wgFragmentMode[self::ID_PRIMARY] ) ) { + throw new UnexpectedValueException( '$wgFragmentMode is configured with no primary mode' ); + } + + $mode = $wgFragmentMode[self::ID_PRIMARY]; + + $id = self::escapeIdInternal( $id, $mode ); + $id = self::urlEscapeId( $id, $mode ); + + return $id; + } + + /** + * Given a section name or other user-generated or otherwise unsafe string, escapes it to be + * a valid URL fragment for external interwikis. + * + * @param string $id String to escape + * @return string Escaped ID + * + * @since 1.30 + */ + public static function escapeIdForExternalInterwiki( $id ) { + global $wgExternalInterwikiFragmentMode; + + $id = self::escapeIdInternal( $id, $wgExternalInterwikiFragmentMode ); + $id = self::urlEscapeId( $id, $wgExternalInterwikiFragmentMode ); + + return $id; + } + + /** + * Helper for escapeIdFor*() functions. URL-escapes the ID if needed. + * + * @param string $id String to escape + * @param string $mode One of modes from $wgFragmentMode + * @return string + */ + private static function urlEscapeId( $id, $mode ) { + if ( $mode === 'html5' ) { + $id = urlencode( $id ); + $id = str_replace( '%3A', ':', $id ); + } + + return $id; + } + + /** + * Helper for escapeIdFor*() functions. Performs most of the actual escaping. + * + * @param string $id String to escape + * @param string $mode One of modes from $wgFragmentMode + * @return string + */ + private static function escapeIdInternal( $id, $mode ) { + $id = Sanitizer::decodeCharReferences( $id ); + + switch ( $mode ) { + case 'html5': + $id = str_replace( ' ', '_', $id ); + break; + case 'legacy': + // This corresponds to 'noninitial' mode of the old escapeId() + static $replace = [ + '%3A' => ':', + '%' => '.' + ]; + + $id = urlencode( str_replace( ' ', '_', $id ) ); + $id = strtr( $id, $replace ); + break; + case 'html5-legacy': + $id = preg_replace( '/[ \t\n\r\f_\'"&#%]+/', '_', $id ); + $id = trim( $id, '_' ); + if ( $id === '' ) { + // Must have been all whitespace to start with. + $id = '_'; + } + break; + default: + throw new InvalidArgumentException( "Invalid mode '$mode' passed to '" . __METHOD__ ); + } + + return $id; + } + /** * Given a string containing a space delimited list of ids, escape each id * to match ids escaped by the escapeId() function. * + * @todo wfDeprecated() uses of $options in 1.31, remove completely in 1.32 + * * @since 1.27 * * @param string $referenceString Space delimited list of ids - * @param string|array $options String or array of strings (default is array()): - * 'noninitial': This is a non-initial fragment of an id, not a full id, - * so don't pay attention if the first character isn't valid at the - * beginning of an id. Only matters if $wgExperimentalHtmlIds is - * false. - * 'legacy': Behave the way the old HTML 4-based ID escaping worked even - * if $wgExperimentalHtmlIds is used, so we can generate extra - * anchors and links won't break. + * @param string|array $options Deprecated and does nothing. * @return string */ static function escapeIdReferenceList( $referenceString, $options = [] ) { @@ -1224,7 +1380,7 @@ class Sanitizer { # Escape each token as an id foreach ( $references as &$ref ) { - $ref = Sanitizer::escapeId( $ref, $options ); + $ref = self::escapeIdForAttribute( $ref ); } # Merge the array back to a space delimited list string @@ -1261,7 +1417,7 @@ class Sanitizer { * @return string Escaped input */ static function escapeHtmlAllowEntities( $html ) { - $html = Sanitizer::decodeCharReferences( $html ); + $html = self::decodeCharReferences( $html ); # It seems wise to escape ' as well as ", as a matter of course. Can't # hurt. Use ENT_SUBSTITUTE so that incorrectly truncated multibyte characters # don't cause the entire string to disappear. @@ -1303,14 +1459,14 @@ class Sanitizer { foreach ( $pairs as $set ) { $attribute = strtolower( $set[1] ); - $value = Sanitizer::getTagAttributeCallback( $set ); + $value = self::getTagAttributeCallback( $set ); // Normalize whitespace $value = preg_replace( '/[\t\r\n ]+/', ' ', $value ); $value = trim( $value ); // Decode character references - $attribs[$attribute] = Sanitizer::decodeCharReferences( $value ); + $attribs[$attribute] = self::decodeCharReferences( $value ); } return $attribs; } @@ -1326,7 +1482,7 @@ class Sanitizer { $attribs = []; foreach ( $assoc_array as $attribute => $value ) { $encAttribute = htmlspecialchars( $attribute ); - $encValue = Sanitizer::safeEncodeAttribute( $value ); + $encValue = self::safeEncodeAttribute( $value ); $attribs[] = "$encAttribute=\"$encValue\""; } @@ -1413,11 +1569,11 @@ class Sanitizer { static function normalizeCharReferencesCallback( $matches ) { $ret = null; if ( $matches[1] != '' ) { - $ret = Sanitizer::normalizeEntity( $matches[1] ); + $ret = self::normalizeEntity( $matches[1] ); } elseif ( $matches[2] != '' ) { - $ret = Sanitizer::decCharReference( $matches[2] ); + $ret = self::decCharReference( $matches[2] ); } elseif ( $matches[3] != '' ) { - $ret = Sanitizer::hexCharReference( $matches[3] ); + $ret = self::hexCharReference( $matches[3] ); } if ( is_null( $ret ) ) { return htmlspecialchars( $matches[0] ); @@ -1454,7 +1610,7 @@ class Sanitizer { */ static function decCharReference( $codepoint ) { $point = intval( $codepoint ); - if ( Sanitizer::validateCodepoint( $point ) ) { + if ( self::validateCodepoint( $point ) ) { return sprintf( '&#%d;', $point ); } else { return null; @@ -1467,7 +1623,7 @@ class Sanitizer { */ static function hexCharReference( $codepoint ) { $point = hexdec( $codepoint ); - if ( Sanitizer::validateCodepoint( $point ) ) { + if ( self::validateCodepoint( $point ) ) { return sprintf( '&#x%x;', $point ); } else { return null; @@ -1536,11 +1692,11 @@ class Sanitizer { */ static function decodeCharReferencesCallback( $matches ) { if ( $matches[1] != '' ) { - return Sanitizer::decodeEntity( $matches[1] ); + return self::decodeEntity( $matches[1] ); } elseif ( $matches[2] != '' ) { - return Sanitizer::decodeChar( intval( $matches[2] ) ); + return self::decodeChar( intval( $matches[2] ) ); } elseif ( $matches[3] != '' ) { - return Sanitizer::decodeChar( hexdec( $matches[3] ) ); + return self::decodeChar( hexdec( $matches[3] ) ); } # Last case should be an ampersand by itself return $matches[0]; @@ -1554,7 +1710,7 @@ class Sanitizer { * @private */ static function decodeChar( $codepoint ) { - if ( Sanitizer::validateCodepoint( $codepoint ) ) { + if ( self::validateCodepoint( $codepoint ) ) { return UtfNormal\Utils::codepointToUtf8( $codepoint ); } else { return UtfNormal\Constants::UTF8_REPLACEMENT; @@ -1587,7 +1743,7 @@ class Sanitizer { * @return array */ static function attributeWhitelist( $element ) { - $list = Sanitizer::setupAttributeWhitelist(); + $list = self::setupAttributeWhitelist(); return isset( $list[$element] ) ? $list[$element] : []; @@ -1862,7 +2018,7 @@ class Sanitizer { static function cleanUrl( $url ) { # Normalize any HTML entities in input. They will be # re-escaped by makeExternalLink(). - $url = Sanitizer::decodeCharReferences( $url ); + $url = self::decodeCharReferences( $url ); # Escape any control characters introduced by the above step $url = preg_replace_callback( '/[\][<>"\\x00-\\x20\\x7F\|]/',