Non-word characters shouldn't terminate tag names on the tidy side either
author Arlo Breault <abreault@wikimedia.org>
Wed, 7 Jan 2015 20:46:59 +0000 (12:46 -0800)
committer Arlo Breault <abreault@wikimedia.org>
Tue, 3 Feb 2015 20:41:55 +0000 (12:41 -0800)
 * Follow-up to Iceec404f46703065bf080dd2cbfed1f88c204fa5.

 * The accepted charset is changed to match the HTML5 parsing spec at:
   http://dev.w3.org/html5/spec-preview/tokenization.html#tag-open-state

 * The equivalent change in Parsoid is I462c336f9a00c8ccd11f3220a8738389e8ba7c7c.

Change-Id: I69cb000538fe195dd77273da5f91697fe1e7d283

includes/Sanitizer.php
tests/parser/parserTests.txt

index f79e94d..a2de004 100644 (file)
@@ -39,6 +39,12 @@ class Sanitizer {
                 |&\#[xX]([0-9A-Fa-f]+);
                 |(&)/x';
 
+       /**
+        * Acceptable tag name charset from HTML5 parsing spec
+        * http://dev.w3.org/html5/spec-preview/tokenization.html#tag-open-state
+        */
+       const ELEMENT_BITS_REGEX = '!^(/?)([A-Za-z][^\t\n\v />\0]*+)([^>]*?)(/?>)([^<]*)$!';
+
        /**
         * Blacklist for evil uris like javascript:
         * WARNING: DO NOT use this in any place that actually requires blacklisting
@@ -444,7 +450,7 @@ class Sanitizer {
                                # $params: String between element name and >
                                # $brace: Ending '>' or '/>'
                                # $rest: Everything until the next element of $bits
-                               if ( preg_match( '!^(/?)([^\\s/>]+)([^>]*?)(/{0,1}>)([^<]*)$!', $x, $regs ) ) {
+                               if ( preg_match( self::ELEMENT_BITS_REGEX, $x, $regs ) ) {
                                        list( /* $qbar */, $slash, $t, $params, $brace, $rest ) = $regs;
                                } else {
                                        $slash = $t = $params = $brace = $rest = null;
@@ -567,11 +573,7 @@ class Sanitizer {
                } else {
                        # this might be possible using tidy itself
                        foreach ( $bits as $x ) {
-                               preg_match(
-                                       '/^(\\/?)(\\w+)([^>]*?)(\\/{0,1}>)([^<]*)$/',
-                                       $x,
-                                       $regs
-                               );
+                               preg_match( self::ELEMENT_BITS_REGEX, $x, $regs );
 
                                wfSuppressWarnings();
                                list( /* $qbar */, $slash, $t, $params, $brace, $rest ) = $regs;
index 18aeabb..04e39ee 100644 (file)
@@ -1191,28 +1191,49 @@ Ruby markup (W3C-style)
 </p>
 !! end
 
-# There is a tidy bug here: http://sourceforge.net/p/tidy/bugs/946/
+# The next two test different paths in the sanitizer.
 !! test
 Non-word characters don't terminate tag names (bug 17663, 40670, 52022)
 !! wikitext
-<b→> doesn't work! </b→>
+<b→> doesn't terminate </b→>
 
-<bä> doesn't work! </bä>
+<bä> doesn't terminate </bä>
 
-<boo> works fine </boo>
+<boo> doesn't terminate </boo>
 
-<s.foo>s.foo</s.foo>
+<s.foo> doesn't terminate </s.foo>
 
 <sub-ID#1>
 !! html
-<p>&lt;b→&gt; doesn't work! &lt;/b→&gt;
-</p><p>&lt;bä&gt; doesn't work! &lt;/bä&gt;
-</p><p>&lt;boo&gt; works fine &lt;/boo&gt;
-</p><p>&lt;s.foo&gt;s.foo&lt;/s.foo&gt;
+<p>&lt;b→&gt; doesn't terminate &lt;/b→&gt;
+</p><p>&lt;bä&gt; doesn't terminate &lt;/bä&gt;
+</p><p>&lt;boo&gt; doesn't terminate &lt;/boo&gt;
+</p><p>&lt;s.foo&gt; doesn't terminate &lt;/s.foo&gt;
 </p><p>&lt;sub-ID#1&gt;
 </p>
 !! end
 
+# There is a tidy bug here: http://sourceforge.net/p/tidy/bugs/946/
+!! test
+Non-word characters don't terminate tag names + tidy
+!! wikitext
+<b→> doesn't terminate </b→>
+
+<bä> doesn't terminate </bä>
+
+<boo> doesn't terminate </boo>
+
+<s.foo> doesn't terminate </s.foo>
+
+<sub-ID#1>
+!! html+tidy
+<p>&lt;b→&gt; doesn't terminate &lt;/b→&gt;</p>
+<p>&lt;bä&gt; doesn't terminate &lt;/bä&gt;</p>
+<p>&lt;boo&gt; doesn't terminate &lt;/boo&gt;</p>
+<p>&lt;s.foo&gt; doesn't terminate &lt;/s.foo&gt;</p>
+<p>&lt;sub-ID#1&gt;</p>
+!! end
+
 !! test
 Isolated close tags should be treated as literal text (bug 52760)
 !! wikitext