X-Git-Url: https://git.heureux-cyclage.org/?a=blobdiff_plain;f=includes%2FSanitizer.php;h=44e4e3eb914244348fc243ab0bafb59040d78b3b;hb=22806b0a4509e97b56fb52b387e17e3c80fb7eb2;hp=8f1fc99fcde07fec42629d7b934960a3154891fd;hpb=425090d4eb0b9de89ad6818f40ab8295368f645e;p=lhc%2Fweb%2Fwiklou.git diff --git a/includes/Sanitizer.php b/includes/Sanitizer.php index 8f1fc99fcd..5f6abee485 100644 --- a/includes/Sanitizer.php +++ b/includes/Sanitizer.php @@ -41,7 +41,7 @@ class Sanitizer { /** * Acceptable tag name charset from HTML5 parsing spec - * http://www.w3.org/TR/html5/syntax.html#tag-open-state + * https://www.w3.org/TR/html5/syntax.html#tag-open-state */ const ELEMENT_BITS_REGEX = '!^(/?)([A-Za-z][^\t\n\v />\0]*+)([^>]*?)(/?>)([^<]*)$!'; @@ -58,7 +58,7 @@ class Sanitizer { /** * List of all named character entities defined in HTML 4.01 - * http://www.w3.org/TR/html4/sgml/entities.html + * https://www.w3.org/TR/html4/sgml/entities.html * As well as ' which is only defined starting in XHTML1. */ private static $htmlEntities = [ @@ -333,7 +333,7 @@ class Sanitizer { /** * Regular expression to match HTML/XML attribute pairs within a tag. * Allows some... latitude. Based on, - * http://www.w3.org/TR/html5/syntax.html#before-attribute-value-state + * https://www.w3.org/TR/html5/syntax.html#before-attribute-value-state * Used in Sanitizer::fixTagAttributes and Sanitizer::decodeTagAttributes * @return string */ @@ -344,12 +344,12 @@ class Sanitizer { $space = '[\x09\x0a\x0c\x0d\x20]'; self::$attribsRegex = "/(?:^|$space)({$attribFirst}{$attrib}*) - ($space*=$space* + ($space*=$space* (?: - # The attribute value: quoted or alone - \"([^\"]*)(?:\"|\$) - | '([^']*)(?:'|\$) - | (((?!$space|>).)*) + # The attribute value: quoted or alone + \"([^\"]*)(?:\"|\$) + | '([^']*)(?:'|\$) + | (((?!$space|>).)*) ) )?(?=$space|\$)/sx"; } @@ -545,7 +545,7 @@ class Sanitizer { $badtag = true; } elseif ( in_array( $t, $tagstack ) && !isset( $htmlnest[$t] ) ) { $badtag = true; - #  Is it a self closed htmlpair ? (bug 5487) + #  Is it a self closed htmlpair ? (T7487) } elseif ( $brace == '/>' && isset( $htmlpairs[$t] ) ) { // Eventually we'll just remove the self-closing // slash, in order to be consistent with HTML5 @@ -922,7 +922,7 @@ class Sanitizer { // Normalize Halfwidth and Fullwidth Unicode block that IE6 might treat as ascii $value = preg_replace_callback( - '/[!-[]-z]/u', // U+FF01 to U+FF5A, excluding U+FF3C (bug 58088) + '/[!-[]-z]/u', // U+FF01 to U+FF5A, excluding U+FF3C (T60088) function ( $matches ) { $cp = UtfNormal\Utils::utf8ToCodepoint( $matches[0] ); if ( $cp === false ) { @@ -1015,6 +1015,7 @@ class Sanitizer { | url\s*\( | image\s*\( | image-set\s*\( + | attr\s*\([^)]+[\s,]+url !ix', $value ) ) { return '/* insecure input */'; } @@ -1118,6 +1119,7 @@ class Sanitizer { '>' => '>', // we've received invalid input '"' => '"', // which should have been escaped. '{' => '{', + '}' => '}', // prevent unpaired language conversion syntax '[' => '[', "''" => '''', 'ISBN' => 'ISBN', @@ -1148,11 +1150,11 @@ class Sanitizer { * ambiguous if it's part of something that looks like a percent escape * (which don't work reliably in fragments cross-browser). * - * @see http://www.w3.org/TR/html401/types.html#type-name Valid characters + * @see https://www.w3.org/TR/html401/types.html#type-name Valid characters * in the id and name attributes - * @see http://www.w3.org/TR/html401/struct/links.html#h-12.2.3 Anchors with + * @see https://www.w3.org/TR/html401/struct/links.html#h-12.2.3 Anchors with * the id attribute - * @see http://www.whatwg.org/html/elements.html#the-id-attribute + * @see https://www.w3.org/TR/html5/dom.html#the-id-attribute * HTML5 definition of id attribute * * @param string $id Id to escape @@ -1238,7 +1240,7 @@ class Sanitizer { * * @todo For extra validity, input should be validated UTF-8. * - * @see http://www.w3.org/TR/CSS21/syndata.html Valid characters/format + * @see https://www.w3.org/TR/CSS21/syndata.html Valid characters/format * * @param string $class * @return string @@ -1261,8 +1263,9 @@ class Sanitizer { static function escapeHtmlAllowEntities( $html ) { $html = Sanitizer::decodeCharReferences( $html ); # It seems wise to escape ' as well as ", as a matter of course. Can't - # hurt. - $html = htmlspecialchars( $html, ENT_QUOTES ); + # hurt. Use ENT_SUBSTITUTE so that incorrectly truncated multibyte characters + # don't cause the entire string to disappear. + $html = htmlspecialchars( $html, ENT_QUOTES | ENT_SUBSTITUTE ); return $html; } @@ -1351,7 +1354,7 @@ class Sanitizer { } elseif ( !isset( $set[2] ) ) { # In XHTML, attributes must have a value so return an empty string. # See "Empty attribute syntax", - # http://www.w3.org/TR/html5/syntax.html#syntax-attribute-name + # https://www.w3.org/TR/html5/syntax.html#syntax-attribute-name return ""; } else { throw new MWException( "Tag conditions not met. This should never happen and is a bug." ); @@ -1505,7 +1508,7 @@ class Sanitizer { /** * Decode any character references, numeric or named entities, - * in the next and normalize the resulting string. (bug 14952) + * in the next and normalize the resulting string. (T16952) * * This is useful for page titles, not for text to be displayed, * MediaWiki allows HTML entities to escape normalization as a feature. @@ -1621,7 +1624,7 @@ class Sanitizer { # RDFa # These attributes are specified in section 9 of - # http://www.w3.org/TR/2008/REC-rdfa-syntax-20081014 + # https://www.w3.org/TR/2008/REC-rdfa-syntax-20081014 'about', 'property', 'resource', @@ -1629,7 +1632,7 @@ class Sanitizer { 'typeof', # Microdata. These are specified by - # http://www.whatwg.org/html/microdata.html#the-microdata-model + # https://html.spec.whatwg.org/multipage/microdata.html#the-microdata-model 'itemid', 'itemprop', 'itemref', @@ -1653,7 +1656,7 @@ class Sanitizer { ]; # Numbers refer to sections in HTML 4.01 standard describing the element. - # See: http://www.w3.org/TR/html4/ + # See: https://www.w3.org/TR/html4/ $whitelist = [ # 7.5.4 'div' => $block, @@ -1700,7 +1703,7 @@ class Sanitizer { # 9.3.2 'br' => array_merge( $common, [ 'clear' ] ), - # http://www.whatwg.org/html/text-level-semantics.html#the-wbr-element + # https://www.w3.org/TR/html5/text-level-semantics.html#the-wbr-element 'wbr' => $common, # 9.3.4 @@ -1775,7 +1778,7 @@ class Sanitizer { 'hr' => array_merge( $common, [ 'width' ] ), # HTML Ruby annotation text module, simple ruby only. - # http://www.whatwg.org/html/text-level-semantics.html#the-ruby-element + # https://www.w3.org/TR/html5/text-level-semantics.html#the-ruby-element 'ruby' => $common, # rbc 'rb' => $common, @@ -1785,14 +1788,14 @@ class Sanitizer { # MathML root element, where used for extensions # 'title' may not be 100% valid here; it's XHTML - # http://www.w3.org/TR/REC-MathML/ + # https://www.w3.org/TR/REC-MathML/ 'math' => [ 'class', 'style', 'id', 'title' ], # HTML 5 section 4.6 'bdi' => $common, # HTML5 elements, defined by: - # http://www.whatwg.org/html/ + # https://html.spec.whatwg.org/multipage/semantics.html#the-data-element 'data' => array_merge( $common, [ 'value' ] ), 'time' => array_merge( $common, [ 'datetime' ] ), 'mark' => $common, @@ -1867,7 +1870,7 @@ class Sanitizer { list( /* $whole */, $protocol, $host, $rest ) = $matches; // Characters that will be ignored in IDNs. - // http://tools.ietf.org/html/3454#section-3.1 + // https://tools.ietf.org/html/rfc3454#section-3.1 // Strip them before further processing so blacklists and such work. $strip = "/ \\s| # general whitespace @@ -1923,7 +1926,7 @@ class Sanitizer { * 3.5. * * This function is an implementation of the specification as requested in - * bug 22449. + * T24449. * * Client-side forms will use the same standard validation rules via JS or * HTML 5 validation; additional restrictions can be enforced server-side @@ -1946,7 +1949,7 @@ class Sanitizer { // Please note strings below are enclosed in brackets [], this make the // hyphen "-" a range indicator. Hence it is double backslashed below. - // See bug 26948 + // See T28948 $rfc5322_atext = "a-z0-9!#$%&'*+\\-\/=?^_`{|}~"; $rfc1034_ldh_str = "a-z0-9\\-";