X-Git-Url: https://git.heureux-cyclage.org/?p=lhc%2Fweb%2Fwiklou.git;a=blobdiff_plain;f=includes%2FSanitizer.php;h=ed09701d4bf714181d35663b34da2967f4e18755;hp=b08bc6942581dccb70025eaaa763b8a4d6e56b0d;hb=89539f2aa1b158fdcc703ad053e2580cb97a6385;hpb=b85207c9c014b4a639cf3ce978cbb5f59799ef2a

diff --git a/includes/Sanitizer.php b/includes/Sanitizer.php
index b08bc69425..ed09701d4b 100644
--- a/includes/Sanitizer.php
+++ b/includes/Sanitizer.php
@@ -56,6 +56,21 @@ class Sanitizer {
 	const EVIL_URI_PATTERN = '!(^|\s|\*/\s*)(javascript|vbscript)([^\w]|$)!i';
 	const XMLNS_ATTRIBUTE_PATTERN = "/^xmlns:[:A-Z_a-z-.0-9]+$/";
 
+	/**
+	 * Tells escapeIdForAttribute() to encode the ID using the wiki's primary encoding.
+	 *
+	 * @since 1.30
+	 */
+	const ID_PRIMARY = 0;
+
+	/**
+	 * Tells escapeIdForAttribute() to encode the ID using the fallback encoding, or return false
+	 * if no fallback is configured.
+	 *
+	 * @since 1.30
+	 */
+	const ID_FALLBACK = 1;
+
 	/**
 	 * List of all named character entities defined in HTML 4.01
 	 * https://www.w3.org/TR/html4/sgml/entities.html
@@ -465,7 +480,7 @@ class Sanitizer {
 		extract( self::getRecognizedTagData( $extratags, $removetags ) );
 
 		# Remove HTML comments
-		$text = Sanitizer::removeHTMLcomments( $text );
+		$text = self::removeHTMLcomments( $text );
 		$bits = explode( '<', $text );
 		$text = str_replace( '>', '&gt;', array_shift( $bits ) );
 		if ( !MWTidy::isEnabled() ) {
@@ -583,12 +598,12 @@ class Sanitizer {
 							call_user_func_array( $processCallback, [ &$params, $args ] );
 						}
 
-						if ( !Sanitizer::validateTag( $params, $t ) ) {
+						if ( !self::validateTag( $params, $t ) ) {
 							$badtag = true;
 						}
 
 						# Strip non-approved attributes from the tag
-						$newparams = Sanitizer::fixTagAttributes( $params, $t );
+						$newparams = self::fixTagAttributes( $params, $t );
 					}
 					if ( !$badtag ) {
 						$rest = str_replace( '>', '&gt;', $rest );
@@ -629,11 +644,11 @@ class Sanitizer {
 								call_user_func_array( $warnCallback, [ 'deprecated-self-close-category' ] );
 							}
 						}
-						if ( !Sanitizer::validateTag( $params, $t ) ) {
+						if ( !self::validateTag( $params, $t ) ) {
 							$badtag = true;
 						}
 
-						$newparams = Sanitizer::fixTagAttributes( $params, $t );
+						$newparams = self::fixTagAttributes( $params, $t );
 						if ( !$badtag ) {
 							if ( $brace === '/>' && !isset( $htmlsingleonly[$t] ) ) {
 								# Interpret self-closing tags as empty tags even when
@@ -710,7 +725,7 @@ class Sanitizer {
 	 * @return bool
 	 */
 	static function validateTag( $params, $element ) {
-		$params = Sanitizer::decodeTagAttributes( $params );
+		$params = self::decodeTagAttributes( $params );
 
 		if ( $element == 'meta' || $element == 'link' ) {
 			if ( !isset( $params['itemprop'] ) ) {
@@ -746,8 +761,8 @@ class Sanitizer {
 	 * @todo Check for unique id attribute :P
 	 */
 	static function validateTagAttributes( $attribs, $element ) {
-		return Sanitizer::validateAttributes( $attribs,
-			Sanitizer::attributeWhitelist( $element ) );
+		return self::validateAttributes( $attribs,
+			self::attributeWhitelist( $element ) );
 	}
 
 	/**
@@ -795,12 +810,12 @@ class Sanitizer {
 			# Strip javascript "expression" from stylesheets.
 			# https://msdn.microsoft.com/en-us/library/ms537634.aspx
 			if ( $attribute == 'style' ) {
-				$value = Sanitizer::checkCss( $value );
+				$value = self::checkCss( $value );
 			}
 
 			# Escape HTML id attributes
 			if ( $attribute === 'id' ) {
-				$value = Sanitizer::escapeId( $value, 'noninitial' );
+				$value = self::escapeIdForAttribute( $value, self::ID_PRIMARY );
 			}
 
 			# Escape HTML id reference lists
@@ -809,7 +824,7 @@ class Sanitizer {
 				|| $attribute === 'aria-labelledby'
 				|| $attribute === 'aria-owns'
 			) {
-				$value = Sanitizer::escapeIdReferenceList( $value, 'noninitial' );
+				$value = self::escapeIdReferenceList( $value, 'noninitial' );
 			}
 
 			// RDFa and microdata properties allow URLs, URIs and/or CURIs.
@@ -907,7 +922,7 @@ class Sanitizer {
 	 */
 	public static function normalizeCss( $value ) {
 		// Decode character references like &#123;
-		$value = Sanitizer::decodeCharReferences( $value );
+		$value = self::decodeCharReferences( $value );
 
 		// Decode escape sequences and line continuation
 		// See the grammar in the CSS 2 spec, appendix D.
@@ -1087,14 +1102,14 @@ class Sanitizer {
 			return '';
 		}
 
-		$decoded = Sanitizer::decodeTagAttributes( $text );
-		$stripped = Sanitizer::validateTagAttributes( $decoded, $element );
+		$decoded = self::decodeTagAttributes( $text );
+		$stripped = self::validateTagAttributes( $decoded, $element );
 
 		if ( $sorted ) {
 			ksort( $stripped );
 		}
 
-		return Sanitizer::safeEncodeTagAttributes( $stripped );
+		return self::safeEncodeTagAttributes( $stripped );
 	}
 
 	/**
@@ -1124,7 +1139,7 @@ class Sanitizer {
 	 * @return string HTML-encoded text fragment
 	 */
 	static function safeEncodeAttribute( $text ) {
-		$encValue = Sanitizer::encodeAttribute( $text );
+		$encValue = self::encodeAttribute( $text );
 
 		# Templates and links may be expanded in later parsing,
 		# creating invalid or dangerous output. Suppress this.
@@ -1164,6 +1179,8 @@ class Sanitizer {
 	 * ambiguous if it's part of something that looks like a percent escape
 	 * (which don't work reliably in fragments cross-browser).
 	 *
+	 * @deprecated since 1.30, use one of this class' escapeIdFor*() functions
+	 *
 	 * @see https://www.w3.org/TR/html401/types.html#type-name Valid characters
 	 *   in the id and name attributes
 	 * @see https://www.w3.org/TR/html401/struct/links.html#h-12.2.3 Anchors with
@@ -1186,7 +1203,7 @@ class Sanitizer {
 		global $wgExperimentalHtmlIds;
 		$options = (array)$options;
 
-		$id = Sanitizer::decodeCharReferences( $id );
+		$id = self::decodeCharReferences( $id );
 
 		if ( $wgExperimentalHtmlIds && !in_array( 'legacy', $options ) ) {
 			$id = preg_replace( '/[ \t\n\r\f_\'"&#%]+/', '_', $id );
@@ -1215,21 +1232,146 @@ class Sanitizer {
 		return $id;
 	}
 
+	/**
+	 * Given a section name or other user-generated or otherwise unsafe string, escapes it to be
+	 * a valid HTML id attribute.
+	 *
+	 * WARNING: unlike escapeId(), the output of this function is not guaranteed to be HTML safe,
+	 * be sure to use proper escaping.
+	 *
+	 * @param string $id String to escape
+	 * @param int $mode One of ID_* constants, specifying whether the primary or fallback encoding
+	 *     should be used (the constant is used as the index into $wgFragmentMode).
+	 * @return string|bool Escaped ID or false if fallback encoding is requested but it's not
+	 *     configured.
+	 * @throws UnexpectedValueException If $wgFragmentMode has no entry for the primary encoding
+	 * @since 1.30
+	 */
+	public static function escapeIdForAttribute( $id, $mode = self::ID_PRIMARY ) {
+		global $wgFragmentMode;
+
+		if ( !isset( $wgFragmentMode[$mode] ) ) {
+			if ( $mode === self::ID_PRIMARY ) {
+				throw new UnexpectedValueException( '$wgFragmentMode is configured with no primary mode' );
+			}
+			return false;
+		}
+
+		$internalMode = $wgFragmentMode[$mode];
+
+		return self::escapeIdInternal( $id, $internalMode );
+	}
+
+	/**
+	 * Given a section name or other user-generated or otherwise unsafe string, escapes it to be
+	 * a valid URL fragment.
+	 *
+	 * WARNING: unlike escapeId(), the output of this function is not guaranteed to be HTML safe,
+	 * be sure to use proper escaping.
+	 *
+	 * @param string $id String to escape
+	 * @return string Escaped ID, additionally URL-encoded when the primary mode is 'html5'
+	 * @throws UnexpectedValueException If $wgFragmentMode has no entry for the primary encoding
+	 * @since 1.30
+	 */
+	public static function escapeIdForLink( $id ) {
+		global $wgFragmentMode;
+
+		if ( !isset( $wgFragmentMode[self::ID_PRIMARY] ) ) {
+			throw new UnexpectedValueException( '$wgFragmentMode is configured with no primary mode' );
+		}
+
+		$mode = $wgFragmentMode[self::ID_PRIMARY];
+
+		$id = self::escapeIdInternal( $id, $mode );
+		$id = self::urlEscapeId( $id, $mode );
+
+		return $id;
+	}
+
+	/**
+	 * Given a section name or other user-generated or otherwise unsafe string, escapes it to be
+	 * a valid URL fragment for external interwikis, as set by $wgExternalInterwikiFragmentMode.
+	 *
+	 * @param string $id String to escape
+	 * @return string Escaped ID, additionally URL-encoded when the configured mode is 'html5'
+	 * @throws InvalidArgumentException If $wgExternalInterwikiFragmentMode is not a known mode
+	 * @since 1.30
+	 */
+	public static function escapeIdForExternalInterwiki( $id ) {
+		global $wgExternalInterwikiFragmentMode;
+
+		$id = self::escapeIdInternal( $id, $wgExternalInterwikiFragmentMode );
+		$id = self::urlEscapeId( $id, $wgExternalInterwikiFragmentMode );
+
+		return $id;
+	}
+
+	/**
+	 * Helper for escapeIdFor*() functions. URL-escapes the ID, but only in 'html5' mode.
+	 *
+	 * @param string $id String to escape
+	 * @param string $mode One of modes from $wgFragmentMode
+	 * @return string urlencode()d ID with ':' kept literal in 'html5' mode, otherwise $id as is
+	 */
+	private static function urlEscapeId( $id, $mode ) {
+		if ( $mode === 'html5' ) {
+			$id = urlencode( $id );
+			$id = str_replace( '%3A', ':', $id );
+		}
+
+		return $id;
+	}
+
+	/**
+	 * Helper for escapeIdFor*() functions. Performs most of the actual escaping.
+	 *
+	 * @param string $id String to escape
+	 * @param string $mode One of modes from $wgFragmentMode
+	 * @return string Escaped ID
+	 * @throws InvalidArgumentException If $mode is not 'html5', 'legacy' or 'html5-legacy'
+	 */
+	private static function escapeIdInternal( $id, $mode ) {
+		$id = self::decodeCharReferences( $id );
+
+		switch ( $mode ) {
+			case 'html5':
+				$id = str_replace( ' ', '_', $id );
+				break;
+			case 'legacy':
+				// This corresponds to 'noninitial' mode of the old escapeId()
+				static $replace = [
+					'%3A' => ':',
+					'%' => '.'
+				];
+				$id = urlencode( str_replace( ' ', '_', $id ) );
+				$id = strtr( $id, $replace );
+				break;
+			case 'html5-legacy':
+				$id = preg_replace( '/[ \t\n\r\f_\'"&#%]+/', '_', $id );
+				$id = trim( $id, '_' );
+				if ( $id === '' ) {
+					// Must have been all whitespace to start with.
+					$id = '_';
+				}
+				break;
+			default:
+				throw new InvalidArgumentException( "Invalid mode '$mode' passed to " . __METHOD__ );
+		}
+
+		return $id;
+	}
+
 	/**
 	 * Given a string containing a space delimited list of ids, escape each id
 	 * to match ids escaped by the escapeId() function.
 	 *
+	 * @todo wfDeprecated() uses of $options in 1.31, remove completely in 1.32
+	 *
 	 * @since 1.27
 	 *
 	 * @param string $referenceString Space delimited list of ids
-	 * @param string|array $options String or array of strings (default is array()):
-	 *   'noninitial': This is a non-initial fragment of an id, not a full id,
-	 *       so don't pay attention if the first character isn't valid at the
-	 *       beginning of an id.  Only matters if $wgExperimentalHtmlIds is
-	 *       false.
-	 *   'legacy': Behave the way the old HTML 4-based ID escaping worked even
-	 *       if $wgExperimentalHtmlIds is used, so we can generate extra
-	 *       anchors and links won't break.
+	 * @param string|array $options Deprecated and does nothing.
 	 * @return string
 	 */
 	static function escapeIdReferenceList( $referenceString, $options = [] ) {
@@ -1238,7 +1380,7 @@ class Sanitizer {
 
 		# Escape each token as an id
 		foreach ( $references as &$ref ) {
-			$ref = Sanitizer::escapeId( $ref, $options );
+			$ref = self::escapeIdForAttribute( $ref );
 		}
 
 		# Merge the array back to a space delimited list string
@@ -1275,7 +1417,7 @@ class Sanitizer {
 	 * @return string Escaped input
 	 */
 	static function escapeHtmlAllowEntities( $html ) {
-		$html = Sanitizer::decodeCharReferences( $html );
+		$html = self::decodeCharReferences( $html );
 		# It seems wise to escape ' as well as ", as a matter of course.  Can't
 		# hurt. Use ENT_SUBSTITUTE so that incorrectly truncated multibyte characters
 		# don't cause the entire string to disappear.
@@ -1317,14 +1459,14 @@ class Sanitizer {
 
 		foreach ( $pairs as $set ) {
 			$attribute = strtolower( $set[1] );
-			$value = Sanitizer::getTagAttributeCallback( $set );
+			$value = self::getTagAttributeCallback( $set );
 
 			// Normalize whitespace
 			$value = preg_replace( '/[\t\r\n ]+/', ' ', $value );
 			$value = trim( $value );
 
 			// Decode character references
-			$attribs[$attribute] = Sanitizer::decodeCharReferences( $value );
+			$attribs[$attribute] = self::decodeCharReferences( $value );
 		}
 		return $attribs;
 	}
@@ -1340,7 +1482,7 @@ class Sanitizer {
 		$attribs = [];
 		foreach ( $assoc_array as $attribute => $value ) {
 			$encAttribute = htmlspecialchars( $attribute );
-			$encValue = Sanitizer::safeEncodeAttribute( $value );
+			$encValue = self::safeEncodeAttribute( $value );
 
 			$attribs[] = "$encAttribute=\"$encValue\"";
 		}
@@ -1427,11 +1569,11 @@ class Sanitizer {
 	static function normalizeCharReferencesCallback( $matches ) {
 		$ret = null;
 		if ( $matches[1] != '' ) {
-			$ret = Sanitizer::normalizeEntity( $matches[1] );
+			$ret = self::normalizeEntity( $matches[1] );
 		} elseif ( $matches[2] != '' ) {
-			$ret = Sanitizer::decCharReference( $matches[2] );
+			$ret = self::decCharReference( $matches[2] );
 		} elseif ( $matches[3] != '' ) {
-			$ret = Sanitizer::hexCharReference( $matches[3] );
+			$ret = self::hexCharReference( $matches[3] );
 		}
 		if ( is_null( $ret ) ) {
 			return htmlspecialchars( $matches[0] );
@@ -1468,7 +1610,7 @@ class Sanitizer {
 	 */
 	static function decCharReference( $codepoint ) {
 		$point = intval( $codepoint );
-		if ( Sanitizer::validateCodepoint( $point ) ) {
+		if ( self::validateCodepoint( $point ) ) {
 			return sprintf( '&#%d;', $point );
 		} else {
 			return null;
@@ -1481,7 +1623,7 @@ class Sanitizer {
 	 */
 	static function hexCharReference( $codepoint ) {
 		$point = hexdec( $codepoint );
-		if ( Sanitizer::validateCodepoint( $point ) ) {
+		if ( self::validateCodepoint( $point ) ) {
 			return sprintf( '&#x%x;', $point );
 		} else {
 			return null;
@@ -1535,7 +1677,10 @@ class Sanitizer {
 		$text = preg_replace_callback(
 			self::CHAR_REFS_REGEX,
 			[ 'Sanitizer', 'decodeCharReferencesCallback' ],
-			$text, /* limit */ -1, $count );
+			$text,
+			-1, //limit
+			$count
+		);
 
 		if ( $count ) {
 			return $wgContLang->normalize( $text );
@@ -1550,11 +1695,11 @@ class Sanitizer {
 	 */
 	static function decodeCharReferencesCallback( $matches ) {
 		if ( $matches[1] != '' ) {
-			return Sanitizer::decodeEntity( $matches[1] );
+			return self::decodeEntity( $matches[1] );
 		} elseif ( $matches[2] != '' ) {
-			return Sanitizer::decodeChar( intval( $matches[2] ) );
+			return self::decodeChar( intval( $matches[2] ) );
 		} elseif ( $matches[3] != '' ) {
-			return Sanitizer::decodeChar( hexdec( $matches[3] ) );
+			return self::decodeChar( hexdec( $matches[3] ) );
 		}
 		# Last case should be an ampersand by itself
 		return $matches[0];
@@ -1568,7 +1713,7 @@ class Sanitizer {
 	 * @private
 	 */
 	static function decodeChar( $codepoint ) {
-		if ( Sanitizer::validateCodepoint( $codepoint ) ) {
+		if ( self::validateCodepoint( $codepoint ) ) {
 			return UtfNormal\Utils::codepointToUtf8( $codepoint );
 		} else {
 			return UtfNormal\Constants::UTF8_REPLACEMENT;
@@ -1601,7 +1746,7 @@ class Sanitizer {
 	 * @return array
 	 */
 	static function attributeWhitelist( $element ) {
-		$list = Sanitizer::setupAttributeWhitelist();
+		$list = self::setupAttributeWhitelist();
 		return isset( $list[$element] )
 			? $list[$element]
 			: [];
@@ -1772,7 +1917,7 @@ class Sanitizer {
 			# Not usually allowed, but may be used for extension-style hooks
 			# such as <math> when it is rasterized, or if $wgAllowImageTag is
 			# true
-			'img'        => array_merge( $common, [ 'alt', 'src', 'width', 'height' ] ),
+			'img'        => array_merge( $common, [ 'alt', 'src', 'width', 'height', 'srcset' ] ),
 
 			'video'      => array_merge( $common, [ 'poster', 'controls', 'preload', 'width', 'height' ] ),
 			'source'     => array_merge( $common, [ 'type', 'src' ] ),
@@ -1809,6 +1954,10 @@ class Sanitizer {
 			# https://www.w3.org/TR/REC-MathML/
 			'math'       => [ 'class', 'style', 'id', 'title' ],
 
+			// HTML 5 section 4.5
+			'figure'     => $common,
+			'figcaption' => $common,
+
 			# HTML 5 section 4.6
 			'bdi' => $common,
 
@@ -1824,7 +1973,7 @@ class Sanitizer {
 			// (ie: validateTag rejects tags missing the attributes needed for Microdata)
 			// So we don't bother including $common attributes that have no purpose.
 			'meta' => [ 'itemprop', 'content' ],
-			'link' => [ 'itemprop', 'href' ],
+			'link' => [ 'itemprop', 'href', 'title' ],
 		];
 
 		return $whitelist;
@@ -1876,7 +2025,7 @@ class Sanitizer {
 	static function cleanUrl( $url ) {
 		# Normalize any HTML entities in input. They will be
 		# re-escaped by makeExternalLink().
-		$url = Sanitizer::decodeCharReferences( $url );
+		$url = self::decodeCharReferences( $url );
 
 		# Escape any control characters introduced by the above step
 		$url = preg_replace_callback( '/[\][<>"\\x00-\\x20\\x7F\|]/',