Sanitizer: Allow attribute names to use any Unicode "Letter" or "Number"
authorEd Sanders <esanders@wikimedia.org>
Thu, 22 Jun 2017 04:43:36 +0000 (21:43 -0700)
committerLegoktm <legoktm@member.fsf.org>
Sun, 25 Jun 2017 10:00:05 +0000 (10:00 +0000)
Bug: T73386
Change-Id: If712841ba56c5d8f30bbbad500403446a165b07c

includes/Sanitizer.php
tests/phpunit/includes/SanitizerTest.php

index 8920e92..01fe60e 100644 (file)
@@ -339,8 +339,8 @@ class Sanitizer {
         */
        static function getAttribsRegex() {
                if ( self::$attribsRegex === null ) {
-                       $attribFirst = '[:A-Z_a-z0-9]';
-                       $attrib = '[:A-Z_a-z-.0-9]';
+                       $attribFirst = "[:_\p{L}\p{N}]";
+                       $attrib = "[:_\.\-\p{L}\p{N}]";
                        $space = '[\x09\x0a\x0c\x0d\x20]';
                        self::$attribsRegex =
                                "/(?:^|$space)({$attribFirst}{$attrib}*)
@@ -351,7 +351,7 @@ class Sanitizer {
                                                | '([^']*)(?:'|\$)
                                                | (((?!$space|>).)*)
                                        )
-                               )?(?=$space|\$)/sx";
+                               )?(?=$space|\$)/sxu";
                }
                return self::$attribsRegex;
        }
index c237c50..abcf1d4 100644 (file)
@@ -178,6 +178,10 @@ class SanitizerTest extends MediaWikiTestCase {
        public static function provideTagAttributesToDecode() {
                return [
                        [ [ 'foo' => 'bar' ], 'foo=bar', 'Unquoted attribute' ],
+                       [ [ 'עברית' => 'bar' ], 'עברית=bar', 'Non-Latin attribute' ],
+                       [ [ '६' => 'bar' ], '६=bar', 'Devanagari number' ],
+                       [ [ '搭𨋢' => 'bar' ], '搭𨋢=bar', 'Non-BMP character' ],
+                       [ [], 'ńgh=bar', 'Combining accent is not allowed' ],
                        [ [ 'foo' => 'bar' ], '    foo   =   bar    ', 'Spaced attribute' ],
                        [ [ 'foo' => 'bar' ], 'foo="bar"', 'Double-quoted attribute' ],
                        [ [ 'foo' => 'bar' ], 'foo=\'bar\'', 'Single-quoted attribute' ],