* bug 26437: fix for Sanitizer::decodeCharReferences converting invalid hex character...
author Brion Vibber <brion@users.mediawiki.org>
Mon, 27 Dec 2010 03:21:43 +0000 (03:21 +0000)
committer Brion Vibber <brion@users.mediawiki.org>
Mon, 27 Dec 2010 03:21:43 +0000 (03:21 +0000)
Patch by Umherirrender: https://bugzilla.wikimedia.org/attachment.cgi?id=7931&action=edit

Also added a parser regression test case: "HTML Hex character encoding bogus encoding (bug 26437 regression check)"

includes/Sanitizer.php
tests/parser/parserTests.txt

index fba0156..ab67010 100644 (file)
@@ -31,8 +31,7 @@
 define( 'MW_CHAR_REFS_REGEX',
        '/&([A-Za-z0-9\x80-\xff]+);
         |&\#([0-9]+);
-        |&\#x([0-9A-Za-z]+);
-        |&\#X([0-9A-Za-z]+);
+        |&\#[xX]([0-9A-Fa-f]+);
         |(&)/x' );
 
 /**
@@ -1127,8 +1126,6 @@ class Sanitizer {
                        $ret = Sanitizer::decCharReference( $matches[2] );
                } elseif( $matches[3] != ''  ) {
                        $ret = Sanitizer::hexCharReference( $matches[3] );
-               } elseif( $matches[4] != '' ) {
-                       $ret = Sanitizer::hexCharReference( $matches[4] );
                }
                if( is_null( $ret ) ) {
                        return htmlspecialchars( $matches[0] );
@@ -1238,8 +1235,6 @@ class Sanitizer {
                        return  Sanitizer::decodeChar( intval( $matches[2] ) );
                } elseif( $matches[3] != ''  ) {
                        return  Sanitizer::decodeChar( hexdec( $matches[3] ) );
-               } elseif( $matches[4] != '' ) {
-                       return  Sanitizer::decodeChar( hexdec( $matches[4] ) );
                }
                # Last case should be an ampersand by itself
                return $matches[0];
index 7b21f3f..9ebcba3 100644 (file)
@@ -6764,6 +6764,24 @@ HTML Hex character encoding (spells the word "JavaScript")
 </p>
 !! end
 
+!! test
+HTML Hex character encoding bogus encoding (bug 26437 regression check)
+!! input
+&#xsee;&#XSEE;
+!! result
+<p>&amp;#xsee;&amp;#XSEE;
+</p>
+!! end
+
+!! test
+HTML Hex character encoding mixed case
+!! input
+&#xEE;&#Xee;
+!! result
+<p>&#xee;&#xee;
+</p>
+!! end
+
 !! test
 __FORCETOC__ override
 !! input