From 8e8b15afc684e65946b5f6101e74ced518d2eee6 Mon Sep 17 00:00:00 2001
From: Arlo Breault
Date: Wed, 7 Jan 2015 12:46:59 -0800
Subject: [PATCH] Non-word characters shouldn't terminate tag names on the tidy side too

 * Follow up to Iceec404f46703065bf080dd2cbfed1f88c204fa5.

 * The accepted charset is changed to match the HTML5 parsing spec at:
   http://dev.w3.org/html5/spec-preview/tokenization.html#tag-open-state

 * Equivalent in parsoid at I462c336f9a00c8ccd11f3220a8738389e8ba7c7c.

Change-Id: I69cb000538fe195dd77273da5f91697fe1e7d283
---
 includes/Sanitizer.php       | 14 +++++++------
 tests/parser/parserTests.txt | 39 +++++++++++++++++++++++++++---------
 2 files changed, 38 insertions(+), 15 deletions(-)

diff --git a/includes/Sanitizer.php b/includes/Sanitizer.php
index f79e94d459..a2de0044a6 100644
--- a/includes/Sanitizer.php
+++ b/includes/Sanitizer.php
@@ -39,6 +39,12 @@ class Sanitizer {
 		 |&\#[xX]([0-9A-Fa-f]+);
 		 |(&)/x';
 
+	/**
+	 * Acceptable tag name charset from HTML5 parsing spec
+	 * http://dev.w3.org/html5/spec-preview/tokenization.html#tag-open-state
+	 */
+	const ELEMENT_BITS_REGEX = '!^(/?)([A-Za-z][^\t\n\v />\0]*+)([^>]*?)(/?>)([^<]*)$!';
+
 	/**
 	 * Blacklist for evil uris like javascript:
 	 * WARNING: DO NOT use this in any place that actually requires blacklisting
@@ -444,7 +450,7 @@ class Sanitizer {
 				# $params: String between element name and >
 				# $brace: Ending '>' or '/>'
 				# $rest: Everything until the next element of $bits
-				if ( preg_match( '!^(/?)([^\\s/>]+)([^>]*?)(/{0,1}>)([^<]*)$!', $x, $regs ) ) {
+				if ( preg_match( self::ELEMENT_BITS_REGEX, $x, $regs ) ) {
 					list( /* $qbar */, $slash, $t, $params, $brace, $rest ) = $regs;
 				} else {
 					$slash = $t = $params = $brace = $rest = null;
@@ -567,11 +573,7 @@ class Sanitizer {
 		} else {
 			# this might be possible using tidy itself
 			foreach ( $bits as $x ) {
-				preg_match(
-					'/^(\\/?)(\\w+)([^>]*?)(\\/{0,1}>)([^<]*)$/',
-					$x,
-					$regs
-				);
+				preg_match( self::ELEMENT_BITS_REGEX, $x, $regs );
 
 				wfSuppressWarnings();
 				list( /* $qbar */, $slash, $t, $params, $brace, $rest ) = $regs;
diff --git a/tests/parser/parserTests.txt b/tests/parser/parserTests.txt
index 18aeabb540..04e39ee3fb 100644
--- a/tests/parser/parserTests.txt
+++ b/tests/parser/parserTests.txt
@@ -1191,28 +1191,49 @@ Ruby markup (W3C-style)
 </p>
 !! end
 
-# There is a tidy bug here: http://sourceforge.net/p/tidy/bugs/946/
+# The next two test different paths in the sanitizer.
 !! test
 Non-word characters don't terminate tag names (bug 17663, 40670, 52022)
 !! wikitext
-<b→> doesn't work! </b→>
+<b→> doesn't terminate </b→>
 
-<bä> doesn't work! </bä>
+<bä> doesn't terminate </bä>
 
-<boo> works fine </boo>
+<boo> doesn't terminate </boo>
 
-<s.foo>s.foo</s.foo>
+<s.foo> doesn't terminate </s.foo>
 
 <sub-ID#1>
 !! html
-<p>&lt;b→&gt; doesn't work! &lt;/b→&gt;
-</p><p>&lt;bä&gt; doesn't work! &lt;/bä&gt;
-</p><p>&lt;boo&gt; works fine &lt;/boo&gt;
-</p><p>&lt;s.foo&gt;s.foo&lt;/s.foo&gt;
+<p>&lt;b→&gt; doesn't terminate &lt;/b→&gt;
+</p><p>&lt;bä&gt; doesn't terminate &lt;/bä&gt;
+</p><p>&lt;boo&gt; doesn't terminate &lt;/boo&gt;
+</p><p>&lt;s.foo&gt; doesn't terminate &lt;/s.foo&gt;
 </p><p>&lt;sub-ID#1&gt;
 </p>
 !! end
 
+# There is a tidy bug here: http://sourceforge.net/p/tidy/bugs/946/
+!! test
+Non-word characters don't terminate tag names + tidy
+!! wikitext
+<b→> doesn't terminate </b→>
+
+<bä> doesn't terminate </bä>
+
+<boo> doesn't terminate </boo>
+
+<s.foo> doesn't terminate </s.foo>
+
+<sub-ID#1>
+!! html+tidy
+<p>&lt;b→&gt; doesn't terminate &lt;/b→&gt;</p>
+<p>&lt;bä&gt; doesn't terminate &lt;/bä&gt;</p>
+<p>&lt;boo&gt; doesn't terminate &lt;/boo&gt;</p>
+<p>&lt;s.foo&gt; doesn't terminate &lt;/s.foo&gt;</p>
+<p>&lt;sub-ID#1&gt;</p>
+!! end
+
 !! test
 Isolated close tags should be treated as literal text (bug 52760)
 !! wikitext
-- 
2.20.1