T106578: Update Sanitizer to match legal HTML5 character entities.
author C. Scott Ananian <cscott@cscott.net>
Wed, 22 Jul 2015 20:07:27 +0000 (15:07 -0500)
committer Tim Starling <tstarling@wikimedia.org>
Tue, 18 Aug 2015 23:05:10 +0000 (23:05 +0000)
Invalid HTML5 character entities become instances of UTF8_REPLACEMENT,
so we also ensure that checkCSS notices this and emits the proper
human-friendly sanitization notice.

Change-Id: I76cef7c772b1e3eba0af8dab6403e9100beab03a

includes/Sanitizer.php
tests/parser/parserTests.txt

index 30981c3..e8f06c4 100644 (file)
@@ -966,7 +966,8 @@ class Sanitizer {
                $value = self::normalizeCss( $value );
 
                // Reject problematic keywords and control characters
-               if ( preg_match( '/[\000-\010\013\016-\037\177]/', $value ) ) {
+               if ( preg_match( '/[\000-\010\013\016-\037\177]/', $value ) ||
+                       strpos( $value, UtfNormal\Constants::UTF8_REPLACEMENT ) !== false ) {
                        return '/* invalid control char */';
                } elseif ( preg_match(
                        '! expression
@@ -1399,15 +1400,19 @@ class Sanitizer {
        }
 
        /**
-        * Returns true if a given Unicode codepoint is a valid character in XML.
+        * Returns true if a given Unicode codepoint is a valid character in
+        * both HTML5 and XML.
         * @param int $codepoint
         * @return bool
         */
        private static function validateCodepoint( $codepoint ) {
+               # U+000C is valid in HTML5 but not allowed in XML.
+               # U+000D is valid in XML but not allowed in HTML5.
+               # U+007F - U+009F are disallowed in HTML5 (control characters).
                return $codepoint == 0x09
                        || $codepoint == 0x0a
-                       || $codepoint == 0x0d
-                       || ( $codepoint >= 0x20 && $codepoint <= 0xd7ff )
+                       || ( $codepoint >= 0x20 && $codepoint <= 0x7e )
+                       || ( $codepoint >= 0xa0 && $codepoint <= 0xd7ff )
                        || ( $codepoint >= 0xe000 && $codepoint <= 0xfffd )
                        || ( $codepoint >= 0x10000 && $codepoint <= 0x10ffff );
        }
index f6ca577..266b2b0 100644 (file)
@@ -15831,7 +15831,7 @@ CSS line continuation 2
 !! wikitext
 <div style="background-image: u\&#13;rl(test.jpg); "></div>
 !! html
-<div style="/* insecure input */"></div>
+<div style="/* invalid control char */"></div>
 
 !! end
 
@@ -18164,6 +18164,38 @@ parsoid=wt2html,wt2wt,html2html
 <p><span typeof="mw:Entity">î</span><span typeof="mw:Entity">î</span></p>
 !! end
 
+# See: http://www.w3.org/TR/html5/syntax.html#character-references
+# Note that U+000C (form feed) is not a valid XML character, so
+# it is banned even though allowed in HTML5.
+!! test
+Illegal character references (T106578)
+!! wikitext
+; Null: &#00;
+; FF: &#xC;
+; CR: &#xD;
+; Control (low): &#8;
+; Control (high): &#x7F; &#x9F;
+; Surrogate: &#xD83D;&#xDCA9;
+; This is an okay astral character: &#x1F4A9;
+!! html+tidy
+<dl>
+<dt>Null</dt>
+<dd>&amp;#00;</dd>
+<dt>FF</dt>
+<dd>&amp;#xC;</dd>
+<dt>CR</dt>
+<dd>&amp;#xD;</dd>
+<dt>Control (low)</dt>
+<dd>&amp;#8;</dd>
+<dt>Control (high)</dt>
+<dd>&amp;#x7F; &amp;#x9F;</dd>
+<dt>Surrogate</dt>
+<dd>&amp;#xD83D;&amp;#xDCA9;</dd>
+<dt>This is an okay astral character</dt>
+<dd>💩</dd>
+</dl>
+!! end
+
 !! test
 __FORCETOC__ override
 !! wikitext