X-Git-Url: https://git.heureux-cyclage.org/?a=blobdiff_plain;f=includes%2FSanitizer.php;h=8424432f94595ca1edd415300761f1bda1331702;hb=956c2430c7b8fceb289aaeacc8d6c2e0def2c56e;hp=c4883ba289e02db9325c9072500a0cd43baf0134;hpb=e94c89e7e3374d28a21e531a171c210eebf4b245;p=lhc%2Fweb%2Fwiklou.git

diff --git a/includes/Sanitizer.php b/includes/Sanitizer.php
index c4883ba289..907da16054 100644
--- a/includes/Sanitizer.php
+++ b/includes/Sanitizer.php
@@ -56,6 +56,21 @@ class Sanitizer {
 	const EVIL_URI_PATTERN = '!(^|\s|\*/\s*)(javascript|vbscript)([^\w]|$)!i';
 	const XMLNS_ATTRIBUTE_PATTERN = "/^xmlns:[:A-Z_a-z-.0-9]+$/";
 
+	/**
+	 * Tells escapeIdForAttribute() to encode the ID using the wiki's primary encoding.
+	 *
+	 * @since 1.30
+	 */
+	const ID_PRIMARY = 0;
+
+	/**
+	 * Tells escapeIdForAttribute() to encode the ID using the fallback encoding, or return false
+	 * if no fallback is configured.
+	 *
+	 * @since 1.30
+	 */
+	const ID_FALLBACK = 1;
+
 	/**
 	 * List of all named character entities defined in HTML 4.01
 	 * https://www.w3.org/TR/html4/sgml/entities.html
@@ -339,8 +354,8 @@ class Sanitizer {
 	 */
 	static function getAttribsRegex() {
 		if ( self::$attribsRegex === null ) {
-			$attribFirst = '[:A-Z_a-z0-9]';
-			$attrib = '[:A-Z_a-z-.0-9]';
+			$attribFirst = "[:_\p{L}\p{N}]";
+			$attrib = "[:_\.\-\p{L}\p{N}]";
 			$space = '[\x09\x0a\x0c\x0d\x20]';
 			self::$attribsRegex =
 				"/(?:^|$space)({$attribFirst}{$attrib}*)
@@ -351,7 +366,7 @@ class Sanitizer {
 						| '([^']*)(?:'|\$)
 						| (((?!$space|>).)*)
 					)
-				)?(?=$space|\$)/sx";
+				)?(?=$space|\$)/sxu";
 		}
 		return self::$attribsRegex;
 	}
@@ -465,7 +480,7 @@ class Sanitizer {
 		extract( self::getRecognizedTagData( $extratags, $removetags ) );
 
 		# Remove HTML comments
-		$text = Sanitizer::removeHTMLcomments( $text );
+		$text = self::removeHTMLcomments( $text );
 		$bits = explode( '<', $text );
 		$text = str_replace( '>', '&gt;', array_shift( $bits ) );
 		if ( !MWTidy::isEnabled() ) {
@@ -583,12 +598,12 @@ class Sanitizer {
 							call_user_func_array( $processCallback, [ &$params, $args ] );
 						}
 
-						if ( !Sanitizer::validateTag( $params, $t ) ) {
+						if ( !self::validateTag( $params, $t ) ) {
 							$badtag = true;
 						}
 
 						# Strip non-approved attributes from the tag
-						$newparams = Sanitizer::fixTagAttributes( $params, $t );
+						$newparams = self::fixTagAttributes( $params, $t );
 					}
 					if ( !$badtag ) {
 						$rest = str_replace( '>', '&gt;', $rest );
@@ -629,11 +644,11 @@ class Sanitizer {
 								call_user_func_array( $warnCallback, [ 'deprecated-self-close-category' ] );
 							}
 						}
-						if ( !Sanitizer::validateTag( $params, $t ) ) {
+						if ( !self::validateTag( $params, $t ) ) {
 							$badtag = true;
 						}
 
-						$newparams = Sanitizer::fixTagAttributes( $params, $t );
+						$newparams = self::fixTagAttributes( $params, $t );
 						if ( !$badtag ) {
 							if ( $brace === '/>' && !isset( $htmlsingleonly[$t] ) ) {
 								# Interpret self-closing tags as empty tags even when
@@ -710,7 +725,7 @@ class Sanitizer {
 	 * @return bool
 	 */
 	static function validateTag( $params, $element ) {
-		$params = Sanitizer::decodeTagAttributes( $params );
+		$params = self::decodeTagAttributes( $params );
 
 		if ( $element == 'meta' || $element == 'link' ) {
 			if ( !isset( $params['itemprop'] ) ) {
@@ -746,8 +761,8 @@ class Sanitizer {
 	 * @todo Check for unique id attribute :P
 	 */
 	static function validateTagAttributes( $attribs, $element ) {
-		return Sanitizer::validateAttributes( $attribs,
-			Sanitizer::attributeWhitelist( $element ) );
+		return self::validateAttributes( $attribs,
+			self::attributeWhitelist( $element ) );
 	}
 
 	/**
@@ -782,28 +797,25 @@ class Sanitizer {
 
 			# Allow any attribute beginning with "data-"
 			# However:
-			# * data-ooui is reserved for ooui
-			# * data-mw and data-parsoid are reserved for parsoid
-			# * data-mw-<name here> is reserved for extensions (or core) if
-			#   they need to communicate some data to the client and want to be
-			#   sure that it isn't coming from an untrusted user.
+			# * Disallow data attributes used by MediaWiki code
 			# * Ensure that the attribute is not namespaced by banning
 			#   colons.
-			if ( !preg_match( '/^data-(?!ooui|mw|parsoid)[^:]*$/i', $attribute )
+			if ( !preg_match( '/^data-[^:]*$/i', $attribute )
 				&& !isset( $whitelist[$attribute] )
+				|| self::isReservedDataAttribute( $attribute )
 			) {
 				continue;
 			}
 
 			# Strip javascript "expression" from stylesheets.
-			# http://msdn.microsoft.com/workshop/author/dhtml/overview/recalc.asp
+			# https://msdn.microsoft.com/en-us/library/ms537634.aspx
 			if ( $attribute == 'style' ) {
-				$value = Sanitizer::checkCss( $value );
+				$value = self::checkCss( $value );
 			}
 
 			# Escape HTML id attributes
 			if ( $attribute === 'id' ) {
-				$value = Sanitizer::escapeId( $value, 'noninitial' );
+				$value = self::escapeIdForAttribute( $value, Sanitizer::ID_PRIMARY );
 			}
 
 			# Escape HTML id reference lists
@@ -812,7 +824,7 @@ class Sanitizer {
 				|| $attribute === 'aria-labelledby'
 				|| $attribute === 'aria-owns'
 			) {
-				$value = Sanitizer::escapeIdReferenceList( $value, 'noninitial' );
+				$value = self::escapeIdReferenceList( $value, 'noninitial' );
 			}
 
 			// RDFa and microdata properties allow URLs, URIs and/or CURIs.
@@ -858,6 +870,24 @@ class Sanitizer {
 		return $out;
 	}
 
+	/**
+	 * Reports whether an attribute name falls in the set of data attributes that are
+	 * reserved for MediaWiki itself (for example data-mw-foo). User-generated HTML may not
+	 * use these, which lets core and extension code trust them when talking to the frontend.
+	 * @param string $attr Attribute name.
+	 * @return bool True if the name starts with data-ooui, data-mw or data-parsoid
+	 */
+	public static function isReservedDataAttribute( $attr ) {
+		// Reserved prefixes, matched case-insensitively at the start of the name:
+		// - data-ooui belongs to ooui;
+		// - data-mw and data-parsoid belong to parsoid;
+		// - data-mw-<name here> is for extensions (or core) passing data to the
+		//   client that must be known not to come from an untrusted user.
+		// Namespaced attribute names are not considered here, because
+		// user-generated HTML can no longer use them.
+		return preg_match( '/^data-(ooui|mw|parsoid)/i', $attr ) === 1;
+	}
+
 	/**
 	 * Merge two sets of HTML attributes.  Conflicting items in the second set
 	 * will override those in the first, except for 'class' attributes which
@@ -891,9 +921,8 @@ class Sanitizer {
 	 * @return string normalized css
 	 */
 	public static function normalizeCss( $value ) {
-
 		// Decode character references like &#123;
-		$value = Sanitizer::decodeCharReferences( $value );
+		$value = self::decodeCharReferences( $value );
 
 		// Decode escape sequences and line continuation
 		// See the grammar in the CSS 2 spec, appendix D.
@@ -1073,14 +1102,14 @@ class Sanitizer {
 			return '';
 		}
 
-		$decoded = Sanitizer::decodeTagAttributes( $text );
-		$stripped = Sanitizer::validateTagAttributes( $decoded, $element );
+		$decoded = self::decodeTagAttributes( $text );
+		$stripped = self::validateTagAttributes( $decoded, $element );
 
 		if ( $sorted ) {
 			ksort( $stripped );
 		}
 
-		return Sanitizer::safeEncodeTagAttributes( $stripped );
+		return self::safeEncodeTagAttributes( $stripped );
 	}
 
 	/**
@@ -1110,7 +1139,7 @@ class Sanitizer {
 	 * @return string HTML-encoded text fragment
 	 */
 	static function safeEncodeAttribute( $text ) {
-		$encValue = Sanitizer::encodeAttribute( $text );
+		$encValue = self::encodeAttribute( $text );
 
 		# Templates and links may be expanded in later parsing,
 		# creating invalid or dangerous output. Suppress this.
@@ -1150,6 +1179,8 @@ class Sanitizer {
 	 * ambiguous if it's part of something that looks like a percent escape
 	 * (which don't work reliably in fragments cross-browser).
 	 *
+	 * @deprecated since 1.30, use one of this class' escapeIdFor*() functions
+	 *
 	 * @see https://www.w3.org/TR/html401/types.html#type-name Valid characters
 	 *   in the id and name attributes
 	 * @see https://www.w3.org/TR/html401/struct/links.html#h-12.2.3 Anchors with
@@ -1172,7 +1203,7 @@ class Sanitizer {
 		global $wgExperimentalHtmlIds;
 		$options = (array)$options;
 
-		$id = Sanitizer::decodeCharReferences( $id );
+		$id = self::decodeCharReferences( $id );
 
 		if ( $wgExperimentalHtmlIds && !in_array( 'legacy', $options ) ) {
 			$id = preg_replace( '/[ \t\n\r\f_\'"&#%]+/', '_', $id );
@@ -1192,7 +1223,7 @@ class Sanitizer {
 		];
 
 		$id = urlencode( strtr( $id, ' ', '_' ) );
-		$id = str_replace( array_keys( $replace ), array_values( $replace ), $id );
+		$id = strtr( $id, $replace );
 
 		if ( !preg_match( '/^[a-zA-Z]/', $id ) && !in_array( 'noninitial', $options ) ) {
 			// Initial character must be a letter!
@@ -1201,21 +1232,146 @@ class Sanitizer {
 		return $id;
 	}
 
+	/**
+	 * Turns a section name, or any other untrusted string, into a value that can be used as
+	 * an HTML id attribute, using the encoding that $wgFragmentMode assigns to $mode.
+	 *
+	 * WARNING: unlike escapeId(), the result is not guaranteed to be HTML safe. Always
+	 * HTML-escape it before placing it in output.
+	 *
+	 * @param string $id String to escape
+	 * @param int $mode One of the ID_* constants, selecting primary or fallback encoding
+	 * @return string|bool Escaped ID, or false when the fallback encoding is requested but
+	 *     none is configured.
+	 * @throws UnexpectedValueException If $wgFragmentMode has no primary mode
+	 *
+	 * @since 1.30
+	 */
+	public static function escapeIdForAttribute( $id, $mode = self::ID_PRIMARY ) {
+		global $wgFragmentMode;
+
+		if ( isset( $wgFragmentMode[$mode] ) ) {
+			return self::escapeIdInternal( $id, $wgFragmentMode[$mode] );
+		}
+
+		if ( $mode !== self::ID_PRIMARY ) {
+			return false;
+		}
+
+		// Lacking a fallback is fine, but a primary mode must always be configured.
+		throw new UnexpectedValueException( '$wgFragmentMode is configured with no primary mode' );
+	}
+
+	/**
+	 * Turns a section name, or any other untrusted string, into a URL fragment, using the
+	 * wiki's primary encoding from $wgFragmentMode.
+	 *
+	 * WARNING: unlike escapeId(), the result is not guaranteed to be HTML safe. Always
+	 * HTML-escape it before placing it in output.
+	 *
+	 * @param string $id String to escape
+	 * @return string Escaped ID
+	 * @throws UnexpectedValueException If $wgFragmentMode has no primary mode
+	 * @since 1.30
+	 */
+	public static function escapeIdForLink( $id ) {
+		global $wgFragmentMode;
+
+		if ( !isset( $wgFragmentMode[self::ID_PRIMARY] ) ) {
+			throw new UnexpectedValueException( '$wgFragmentMode is configured with no primary mode' );
+		}
+
+		$primaryMode = $wgFragmentMode[self::ID_PRIMARY];
+
+		// Two passes with the same mode: first make the string a valid ID,
+		// then apply whatever URL-escaping that mode calls for.
+		$escaped = self::escapeIdInternal( $id, $primaryMode );
+		return self::urlEscapeId( $escaped, $primaryMode );
+	}
+
+	/**
+	 * Turns a section name, or any other untrusted string, into a URL fragment for links to
+	 * external interwikis, using the encoding named by $wgExternalInterwikiFragmentMode.
+	 *
+	 * @param string $id String to escape
+	 * @return string Escaped ID
+	 *
+	 * @since 1.30
+	 */
+	public static function escapeIdForExternalInterwiki( $id ) {
+		global $wgExternalInterwikiFragmentMode;
+
+		$mode = $wgExternalInterwikiFragmentMode;
+		$escaped = self::escapeIdInternal( $id, $mode );
+
+		return self::urlEscapeId( $escaped, $mode );
+	}
+
+	/**
+	 * Helper for escapeIdFor*() functions. URL-escapes the ID when the mode is 'html5'.
+	 *
+	 * @param string $id ID to URL-escape
+	 * @param string $mode One of modes from $wgFragmentMode
+	 * @return string
+	 */
+	private static function urlEscapeId( $id, $mode ) {
+		if ( $mode !== 'html5' ) {
+			// Every other mode is returned exactly as it was passed in.
+			return $id;
+		}
+		// Percent-encode the ID, then turn encoded colons back into literal ones.
+		return str_replace( '%3A', ':', urlencode( $id ) );
+	}
+
+	/**
+	 * Helper for escapeIdFor*() functions. Decodes character references, then escapes per $mode.
+	 *
+	 * @param string $id String to escape
+	 * @param string $mode One of modes from $wgFragmentMode
+	 * @return string
+	 * @throws InvalidArgumentException If $mode is not 'html5', 'legacy' or 'html5-legacy'
+	 */
+	private static function escapeIdInternal( $id, $mode ) {
+		$id = self::decodeCharReferences( $id );
+
+		switch ( $mode ) {
+			case 'html5':
+				$id = str_replace( ' ', '_', $id );
+				break;
+			case 'legacy':
+				// This corresponds to 'noninitial' mode of the old escapeId()
+				static $replace = [
+					'%3A' => ':',
+					'%' => '.'
+				];
+				$id = urlencode( str_replace( ' ', '_', $id ) );
+				$id = strtr( $id, $replace );
+				break;
+			case 'html5-legacy':
+				$id = preg_replace( '/[ \t\n\r\f_\'"&#%]+/', '_', $id );
+				$id = trim( $id, '_' );
+				if ( $id === '' ) {
+					// Must have been all whitespace to start with.
+					$id = '_';
+				}
+				break;
+			default:
+				throw new InvalidArgumentException( "Invalid mode '$mode' passed to " . __METHOD__ );
+		}
+
+		return $id;
+	}
+
 	/**
 	 * Given a string containing a space delimited list of ids, escape each id
 	 * to match ids escaped by the escapeId() function.
 	 *
+	 * @todo wfDeprecated() uses of $options in 1.31, remove completely in 1.32
+	 *
 	 * @since 1.27
 	 *
 	 * @param string $referenceString Space delimited list of ids
-	 * @param string|array $options String or array of strings (default is array()):
-	 *   'noninitial': This is a non-initial fragment of an id, not a full id,
-	 *       so don't pay attention if the first character isn't valid at the
-	 *       beginning of an id.  Only matters if $wgExperimentalHtmlIds is
-	 *       false.
-	 *   'legacy': Behave the way the old HTML 4-based ID escaping worked even
-	 *       if $wgExperimentalHtmlIds is used, so we can generate extra
-	 *       anchors and links won't break.
+	 * @param string|array $options Deprecated and does nothing.
 	 * @return string
 	 */
 	static function escapeIdReferenceList( $referenceString, $options = [] ) {
@@ -1224,7 +1380,7 @@ class Sanitizer {
 
 		# Escape each token as an id
 		foreach ( $references as &$ref ) {
-			$ref = Sanitizer::escapeId( $ref, $options );
+			$ref = self::escapeIdForAttribute( $ref );
 		}
 
 		# Merge the array back to a space delimited list string
@@ -1261,7 +1417,7 @@ class Sanitizer {
 	 * @return string Escaped input
 	 */
 	static function escapeHtmlAllowEntities( $html ) {
-		$html = Sanitizer::decodeCharReferences( $html );
+		$html = self::decodeCharReferences( $html );
 		# It seems wise to escape ' as well as ", as a matter of course.  Can't
 		# hurt. Use ENT_SUBSTITUTE so that incorrectly truncated multibyte characters
 		# don't cause the entire string to disappear.
@@ -1303,14 +1459,14 @@ class Sanitizer {
 
 		foreach ( $pairs as $set ) {
 			$attribute = strtolower( $set[1] );
-			$value = Sanitizer::getTagAttributeCallback( $set );
+			$value = self::getTagAttributeCallback( $set );
 
 			// Normalize whitespace
 			$value = preg_replace( '/[\t\r\n ]+/', ' ', $value );
 			$value = trim( $value );
 
 			// Decode character references
-			$attribs[$attribute] = Sanitizer::decodeCharReferences( $value );
+			$attribs[$attribute] = self::decodeCharReferences( $value );
 		}
 		return $attribs;
 	}
@@ -1326,7 +1482,7 @@ class Sanitizer {
 		$attribs = [];
 		foreach ( $assoc_array as $attribute => $value ) {
 			$encAttribute = htmlspecialchars( $attribute );
-			$encValue = Sanitizer::safeEncodeAttribute( $value );
+			$encValue = self::safeEncodeAttribute( $value );
 
 			$attribs[] = "$encAttribute=\"$encValue\"";
 		}
@@ -1413,11 +1569,11 @@ class Sanitizer {
 	static function normalizeCharReferencesCallback( $matches ) {
 		$ret = null;
 		if ( $matches[1] != '' ) {
-			$ret = Sanitizer::normalizeEntity( $matches[1] );
+			$ret = self::normalizeEntity( $matches[1] );
 		} elseif ( $matches[2] != '' ) {
-			$ret = Sanitizer::decCharReference( $matches[2] );
+			$ret = self::decCharReference( $matches[2] );
 		} elseif ( $matches[3] != '' ) {
-			$ret = Sanitizer::hexCharReference( $matches[3] );
+			$ret = self::hexCharReference( $matches[3] );
 		}
 		if ( is_null( $ret ) ) {
 			return htmlspecialchars( $matches[0] );
@@ -1454,7 +1610,7 @@ class Sanitizer {
 	 */
 	static function decCharReference( $codepoint ) {
 		$point = intval( $codepoint );
-		if ( Sanitizer::validateCodepoint( $point ) ) {
+		if ( self::validateCodepoint( $point ) ) {
 			return sprintf( '&#%d;', $point );
 		} else {
 			return null;
@@ -1467,7 +1623,7 @@ class Sanitizer {
 	 */
 	static function hexCharReference( $codepoint ) {
 		$point = hexdec( $codepoint );
-		if ( Sanitizer::validateCodepoint( $point ) ) {
+		if ( self::validateCodepoint( $point ) ) {
 			return sprintf( '&#x%x;', $point );
 		} else {
 			return null;
@@ -1536,11 +1692,11 @@ class Sanitizer {
 	 */
 	static function decodeCharReferencesCallback( $matches ) {
 		if ( $matches[1] != '' ) {
-			return Sanitizer::decodeEntity( $matches[1] );
+			return self::decodeEntity( $matches[1] );
 		} elseif ( $matches[2] != '' ) {
-			return Sanitizer::decodeChar( intval( $matches[2] ) );
+			return self::decodeChar( intval( $matches[2] ) );
 		} elseif ( $matches[3] != '' ) {
-			return Sanitizer::decodeChar( hexdec( $matches[3] ) );
+			return self::decodeChar( hexdec( $matches[3] ) );
 		}
 		# Last case should be an ampersand by itself
 		return $matches[0];
@@ -1554,7 +1710,7 @@ class Sanitizer {
 	 * @private
 	 */
 	static function decodeChar( $codepoint ) {
-		if ( Sanitizer::validateCodepoint( $codepoint ) ) {
+		if ( self::validateCodepoint( $codepoint ) ) {
 			return UtfNormal\Utils::codepointToUtf8( $codepoint );
 		} else {
 			return UtfNormal\Constants::UTF8_REPLACEMENT;
@@ -1587,7 +1743,7 @@ class Sanitizer {
 	 * @return array
 	 */
 	static function attributeWhitelist( $element ) {
-		$list = Sanitizer::setupAttributeWhitelist();
+		$list = self::setupAttributeWhitelist();
 		return isset( $list[$element] )
 			? $list[$element]
 			: [];
@@ -1862,7 +2018,7 @@ class Sanitizer {
 	static function cleanUrl( $url ) {
 		# Normalize any HTML entities in input. They will be
 		# re-escaped by makeExternalLink().
-		$url = Sanitizer::decodeCharReferences( $url );
+		$url = self::decodeCharReferences( $url );
 
 		# Escape any control characters introduced by the above step
 		$url = preg_replace_callback( '/[\][<>"\\x00-\\x20\\x7F\|]/',