X-Git-Url: https://git.heureux-cyclage.org/?a=blobdiff_plain;f=includes%2FSanitizer.php;h=44e4e3eb914244348fc243ab0bafb59040d78b3b;hb=22806b0a4509e97b56fb52b387e17e3c80fb7eb2;hp=8f1fc99fcde07fec42629d7b934960a3154891fd;hpb=425090d4eb0b9de89ad6818f40ab8295368f645e;p=lhc%2Fweb%2Fwiklou.git

diff --git a/includes/Sanitizer.php b/includes/Sanitizer.php
index 8f1fc99fcd..5f6abee485 100644
--- a/includes/Sanitizer.php
+++ b/includes/Sanitizer.php
@@ -41,7 +41,7 @@ class Sanitizer {
 
 	/**
 	 * Acceptable tag name charset from HTML5 parsing spec
-	 * http://www.w3.org/TR/html5/syntax.html#tag-open-state
+	 * https://www.w3.org/TR/html5/syntax.html#tag-open-state
 	 */
 	const ELEMENT_BITS_REGEX = '!^(/?)([A-Za-z][^\t\n\v />\0]*+)([^>]*?)(/?>)([^<]*)$!';
 
@@ -58,7 +58,7 @@ class Sanitizer {
 
 	/**
 	 * List of all named character entities defined in HTML 4.01
-	 * http://www.w3.org/TR/html4/sgml/entities.html
+	 * https://www.w3.org/TR/html4/sgml/entities.html
 	 * As well as &apos; which is only defined starting in XHTML1.
 	 */
 	private static $htmlEntities = [
@@ -333,7 +333,7 @@ class Sanitizer {
 	/**
 	 * Regular expression to match HTML/XML attribute pairs within a tag.
 	 * Allows some... latitude. Based on,
-	 * http://www.w3.org/TR/html5/syntax.html#before-attribute-value-state
+	 * https://www.w3.org/TR/html5/syntax.html#before-attribute-value-state
 	 * Used in Sanitizer::fixTagAttributes and Sanitizer::decodeTagAttributes
 	 * @return string
 	 */
@@ -344,12 +344,12 @@ class Sanitizer {
 			$space = '[\x09\x0a\x0c\x0d\x20]';
 			self::$attribsRegex =
 				"/(?:^|$space)({$attribFirst}{$attrib}*)
-				  ($space*=$space*
+					($space*=$space*
 					(?:
-					 # The attribute value: quoted or alone
-					  \"([^\"]*)(?:\"|\$)
-					 | '([^']*)(?:'|\$)
-					 |  (((?!$space|>).)*)
+						# The attribute value: quoted or alone
+						\"([^\"]*)(?:\"|\$)
+						| '([^']*)(?:'|\$)
+						| (((?!$space|>).)*)
 					)
 				)?(?=$space|\$)/sx";
 		}
@@ -545,7 +545,7 @@ class Sanitizer {
 							$badtag = true;
 						} elseif ( in_array( $t, $tagstack ) && !isset( $htmlnest[$t] ) ) {
 							$badtag = true;
-						#  Is it a self closed htmlpair ? (bug 5487)
+						#  Is it a self closed htmlpair ? (T7487)
 						} elseif ( $brace == '/>' && isset( $htmlpairs[$t] ) ) {
 							// Eventually we'll just remove the self-closing
 							// slash, in order to be consistent with HTML5
@@ -922,7 +922,7 @@ class Sanitizer {
 
 		// Normalize Halfwidth and Fullwidth Unicode block that IE6 might treat as ascii
 		$value = preg_replace_callback(
-			'/[！-［］-ｚ]/u', // U+FF01 to U+FF5A, excluding U+FF3C (bug 58088)
+			'/[！-［］-ｚ]/u', // U+FF01 to U+FF5A, excluding U+FF3C (T60088)
 			function ( $matches ) {
 				$cp = UtfNormal\Utils::utf8ToCodepoint( $matches[0] );
 				if ( $cp === false ) {
@@ -1015,6 +1015,7 @@ class Sanitizer {
 				| url\s*\(
 				| image\s*\(
 				| image-set\s*\(
+				| attr\s*\([^)]+[\s,]+url
 			!ix', $value ) ) {
 			return '/* insecure input */';
 		}
@@ -1118,6 +1119,7 @@ class Sanitizer {
 			'>'    => '&gt;',   // we've received invalid input
 			'"'    => '&quot;', // which should have been escaped.
 			'{'    => '&#123;',
+			'}'    => '&#125;', // prevent unpaired language conversion syntax
 			'['    => '&#91;',
 			"''"   => '&#39;&#39;',
 			'ISBN' => '&#73;SBN',
@@ -1148,11 +1150,11 @@ class Sanitizer {
 	 * ambiguous if it's part of something that looks like a percent escape
 	 * (which don't work reliably in fragments cross-browser).
 	 *
-	 * @see http://www.w3.org/TR/html401/types.html#type-name Valid characters
+	 * @see https://www.w3.org/TR/html401/types.html#type-name Valid characters
 	 *   in the id and name attributes
-	 * @see http://www.w3.org/TR/html401/struct/links.html#h-12.2.3 Anchors with
+	 * @see https://www.w3.org/TR/html401/struct/links.html#h-12.2.3 Anchors with
 	 *   the id attribute
-	 * @see http://www.whatwg.org/html/elements.html#the-id-attribute
+	 * @see https://www.w3.org/TR/html5/dom.html#the-id-attribute
 	 *   HTML5 definition of id attribute
 	 *
 	 * @param string $id Id to escape
@@ -1238,7 +1240,7 @@ class Sanitizer {
 	 *
 	 * @todo For extra validity, input should be validated UTF-8.
 	 *
-	 * @see http://www.w3.org/TR/CSS21/syndata.html Valid characters/format
+	 * @see https://www.w3.org/TR/CSS21/syndata.html Valid characters/format
 	 *
 	 * @param string $class
 	 * @return string
@@ -1261,8 +1263,9 @@ class Sanitizer {
 	static function escapeHtmlAllowEntities( $html ) {
 		$html = Sanitizer::decodeCharReferences( $html );
 		# It seems wise to escape ' as well as ", as a matter of course.  Can't
-		# hurt.
-		$html = htmlspecialchars( $html, ENT_QUOTES );
+		# hurt. Use ENT_SUBSTITUTE so that incorrectly truncated multibyte characters
+		# don't cause the entire string to disappear.
+		$html = htmlspecialchars( $html, ENT_QUOTES | ENT_SUBSTITUTE );
 		return $html;
 	}
 
@@ -1351,7 +1354,7 @@ class Sanitizer {
 		} elseif ( !isset( $set[2] ) ) {
 			# In XHTML, attributes must have a value so return an empty string.
 			# See "Empty attribute syntax",
-			# http://www.w3.org/TR/html5/syntax.html#syntax-attribute-name
+			# https://www.w3.org/TR/html5/syntax.html#syntax-attribute-name
 			return "";
 		} else {
 			throw new MWException( "Tag conditions not met. This should never happen and is a bug." );
@@ -1505,7 +1508,7 @@ class Sanitizer {
 
 	/**
 	 * Decode any character references, numeric or named entities,
-	 * in the next and normalize the resulting string. (bug 14952)
+	 * in the next and normalize the resulting string. (T16952)
 	 *
 	 * This is useful for page titles, not for text to be displayed,
 	 * MediaWiki allows HTML entities to escape normalization as a feature.
@@ -1621,7 +1624,7 @@ class Sanitizer {
 
 			# RDFa
 			# These attributes are specified in section 9 of
-			# http://www.w3.org/TR/2008/REC-rdfa-syntax-20081014
+			# https://www.w3.org/TR/2008/REC-rdfa-syntax-20081014
 			'about',
 			'property',
 			'resource',
@@ -1629,7 +1632,7 @@ class Sanitizer {
 			'typeof',
 
 			# Microdata. These are specified by
-			# http://www.whatwg.org/html/microdata.html#the-microdata-model
+			# https://html.spec.whatwg.org/multipage/microdata.html#the-microdata-model
 			'itemid',
 			'itemprop',
 			'itemref',
@@ -1653,7 +1656,7 @@ class Sanitizer {
 		];
 
 		# Numbers refer to sections in HTML 4.01 standard describing the element.
-		# See: http://www.w3.org/TR/html4/
+		# See: https://www.w3.org/TR/html4/
 		$whitelist = [
 			# 7.5.4
 			'div'        => $block,
@@ -1700,7 +1703,7 @@ class Sanitizer {
 			# 9.3.2
 			'br'         => array_merge( $common, [ 'clear' ] ),
 
-			# http://www.whatwg.org/html/text-level-semantics.html#the-wbr-element
+			# https://www.w3.org/TR/html5/text-level-semantics.html#the-wbr-element
 			'wbr'        => $common,
 
 			# 9.3.4
@@ -1775,7 +1778,7 @@ class Sanitizer {
 			'hr'         => array_merge( $common, [ 'width' ] ),
 
 			# HTML Ruby annotation text module, simple ruby only.
-			# http://www.whatwg.org/html/text-level-semantics.html#the-ruby-element
+			# https://www.w3.org/TR/html5/text-level-semantics.html#the-ruby-element
 			'ruby'       => $common,
 			# rbc
 			'rb'         => $common,
@@ -1785,14 +1788,14 @@ class Sanitizer {
 
 			# MathML root element, where used for extensions
 			# 'title' may not be 100% valid here; it's XHTML
-			# http://www.w3.org/TR/REC-MathML/
+			# https://www.w3.org/TR/REC-MathML/
 			'math'       => [ 'class', 'style', 'id', 'title' ],
 
 			# HTML 5 section 4.6
 			'bdi' => $common,
 
 			# HTML5 elements, defined by:
-			# http://www.whatwg.org/html/
+			# https://html.spec.whatwg.org/multipage/semantics.html#the-data-element
 			'data' => array_merge( $common, [ 'value' ] ),
 			'time' => array_merge( $common, [ 'datetime' ] ),
 			'mark' => $common,
@@ -1867,7 +1870,7 @@ class Sanitizer {
 			list( /* $whole */, $protocol, $host, $rest ) = $matches;
 
 			// Characters that will be ignored in IDNs.
-			// http://tools.ietf.org/html/3454#section-3.1
+			// https://tools.ietf.org/html/rfc3454#section-3.1
 			// Strip them before further processing so blacklists and such work.
 			$strip = "/
 				\\s|          # general whitespace
@@ -1923,7 +1926,7 @@ class Sanitizer {
 	 *   3.5.
 	 *
 	 * This function is an implementation of the specification as requested in
-	 * bug 22449.
+	 * T24449.
 	 *
 	 * Client-side forms will use the same standard validation rules via JS or
 	 * HTML 5 validation; additional restrictions can be enforced server-side
@@ -1946,7 +1949,7 @@ class Sanitizer {
 
 		// Please note strings below are enclosed in brackets [], this make the
 		// hyphen "-" a range indicator. Hence it is double backslashed below.
-		// See bug 26948
+		// See T28948
 		$rfc5322_atext = "a-z0-9!#$%&'*+\\-\/=?^_`{|}~";
 		$rfc1034_ldh_str = "a-z0-9\\-";