/**
 * Utility method for converting a character sequence from bytes to Unicode.
 *
 * Primary use case is converting $wgLegalTitleChars to a sequence usable in
 * JavaScript, as PHP uses UTF-8 bytes where JavaScript uses Unicode code units.
 *
 * The input is read as the inside of a regex character class (without the
 * enclosing brackets). It is split into tokens, where a token is either a
 * single byte or a backslash escape (\xHH, a three-digit octal escape, one of
 * the control-character letters \a \b \e \f \n \r \t, or any other escaped
 * byte, which is taken literally). The tokens are scanned with a look-behind
 * of two tokens so that "start-end" ranges can be recognised:
 *
 * - ASCII tokens are re-emitted; control bytes and 0x7f become \xHH, and
 *   the characters - \ [ ] ^ are backslash-escaped.
 * - A range that starts below 0x80 and ends at or above it is clipped to
 *   end at \x7F.
 * - A range whose start is greater than its end is dropped.
 * - Bytes at or above 0x80 are never emitted individually; if any appear,
 *   a single \u0080-\uFFFF range is appended to the result.
 *
 * @param string $byteClass
 * @return string
 */
public static function convertByteClassToUnicodeClass( $byteClass ) {
	// Letter escapes that denote one control byte inside a PCRE character
	// class (within a class, \b is backspace rather than a word boundary).
	static $controlEscapes = array(
		'a' => "\x07",
		'b' => "\x08",
		'e' => "\x1b",
		'f' => "\x0c",
		'n' => "\x0a",
		'r' => "\x0d",
		't' => "\x09",
	);

	$length = strlen( $byteClass );
	// Input token queue (0 = current, 1 = previous, 2 = the one before that)
	$x0 = $x1 = $x2 = '';
	// Decoded integer codepoints, same queue positions
	$ord0 = $ord1 = $ord2 = 0;
	// Re-encoded queue, same queue positions
	$r0 = $r1 = $r2 = '';
	// Output
	$out = '';
	// Set once any byte >= 0x80 has been seen
	$allowUnicode = false;

	for ( $pos = 0; $pos < $length; $pos++ ) {
		// Shift the queues down
		$x2 = $x1;
		$x1 = $x0;
		$ord2 = $ord1;
		$ord1 = $ord0;
		$r2 = $r1;
		$r1 = $r0;

		// Load the current input token ($x0) and its decoded byte ($d0)
		$inChar = $byteClass[$pos];
		if ( $inChar === '\\' ) {
			if ( preg_match( '/x([0-9a-fA-F]{2})/A', $byteClass, $m, 0, $pos + 1 ) ) {
				// Two-digit hex escape
				$x0 = $inChar . $m[0];
				$d0 = chr( hexdec( $m[1] ) );
				$pos += strlen( $m[0] );
			} elseif ( preg_match( '/[0-7]{3}/A', $byteClass, $m, 0, $pos + 1 ) ) {
				// Three-digit octal escape
				$x0 = $inChar . $m[0];
				$d0 = chr( octdec( $m[0] ) );
				$pos += strlen( $m[0] );
			} elseif ( $pos + 1 >= $length ) {
				// Trailing backslash with nothing to escape: keep it literally
				$x0 = $d0 = '\\';
			} else {
				$escaped = $byteClass[$pos + 1];
				$x0 = $inChar . $escaped;
				// Decode control-character letters; any other escaped byte
				// stands for itself.
				$d0 = isset( $controlEscapes[$escaped] ) ? $controlEscapes[$escaped] : $escaped;
				$pos += 1;
			}
		} else {
			$x0 = $d0 = $inChar;
		}
		$ord0 = ord( $d0 );

		// Load the current re-encoded value
		if ( $ord0 < 32 || $ord0 === 0x7f ) {
			$r0 = sprintf( '\x%02x', $ord0 );
		} elseif ( $ord0 >= 0x80 ) {
			// Allow unicode if a single high-bit character appears
			$r0 = sprintf( '\x%02x', $ord0 );
			$allowUnicode = true;
		} elseif ( strpos( '-\\[]^', $d0 ) !== false ) {
			$r0 = '\\' . $d0;
		} else {
			$r0 = $d0;
		}

		// Do the output. Only an unescaped "-" token (exactly '-') between
		// two other tokens forms a range.
		if ( $x0 !== '' && $x1 === '-' && $x2 !== '' ) {
			if ( $ord2 > $ord0 ) {
				// Empty range: emit nothing
			} elseif ( $ord0 >= 0x80 ) {
				// Unicode range
				$allowUnicode = true;
				if ( $ord2 < 0x80 ) {
					// Keep the non-unicode section of the range
					$out .= "$r2-\\x7F";
				}
			} else {
				// Normal range
				$out .= "$r2-$r0";
			}
			// Reset the token and re-encoded queues so the consumed range is
			// not emitted again. $ord0/$ord1 are left stale on purpose: the
			// strings they gate are now empty.
			$x0 = $x1 = $r0 = $r1 = '';
		} elseif ( $ord2 < 0x80 ) {
			// ASCII character that turned out not to start a range
			$out .= $r2;
		}
	}

	// Flush the two tokens still waiting in the queue
	if ( $ord1 < 0x80 ) {
		$out .= $r1;
	}
	if ( $ord0 < 0x80 ) {
		$out .= $r0;
	}
	if ( $allowUnicode ) {
		$out .= '\u0080-\uFFFF';
	}
	return $out;
}
/**
 * Data provider for testConvertByteClassToUnicodeClass().
 *
 * Each row is array( $byteClass, $unicodeClass ): a byte-based character
 * class and the class expected back from
 * Title::convertByteClassToUnicodeClass().
 *
 * @return array
 */
public static function provideConvertByteClassToUnicodeClass() {
	return array(
		// Literals, escaped punctuation, ASCII ranges and a \x80-\xFF range
		array(
			' %!"$&\'()*,\\-.\\/0-9:;=?@A-Z\\\\^_`a-z~\\x80-\\xFF+',
			' %!"$&\'()*,\\-./0-9:;=?@A-Z\\\\\\^_`a-z~+\\u0080-\\uFFFF',
		),
		// Range from an ASCII literal up to \xFF is clipped at \x7F
		array(
			'QWERTYf-\\xFF+',
			'QWERTYf-\\x7F+\\u0080-\\uFFFF',
		),
		// Same, with the ASCII start written as a hex escape
		array(
			'QWERTY\\x66-\\xFD+',
			'QWERTYf-\\x7F+\\u0080-\\uFFFF',
		),
		// Pure ASCII input comes back unchanged
		array(
			'QWERTYf-y+',
			'QWERTYf-y+',
		),
		// Range ending exactly at \x80
		array(
			'QWERTYf-\\x80+',
			'QWERTYf-\\x7F+\\u0080-\\uFFFF',
		),
		// Hex-escaped ASCII byte after a range is decoded to its literal
		array(
			'QWERTY\\x66-\\x80+\\x23',
			'QWERTYf-\\x7F+#\\u0080-\\uFFFF',
		),
		// Lone high-bit escape after a range is not emitted on its own
		array(
			'QWERTY\\x66-\\x80+\\xD3',
			'QWERTYf-\\x7F+\\u0080-\\uFFFF',
		),
		// Escaped backslash followed by a high-bit escape
		array(
			'\\\\\\x99',
			'\\\\\\u0080-\\uFFFF',
		),
		// Leading dash is a literal and gets escaped
		array(
			'-\\x99',
			'\\-\\u0080-\\uFFFF',
		),
		// Escaped dash does not form a range
		array(
			'QWERTY\\-\\x99',
			'QWERTY\\-\\u0080-\\uFFFF',
		),
		// Escaped backslash followed by the literal text "x99"
		array(
			'\\\\x99',
			'\\\\x99',
		),
		// Short range crossing the 0x80 boundary
		array(
			'A-\\x9F',
			'A-\\x7F\\u0080-\\uFFFF',
		),
		// ASCII-only range is kept, high-bit-only range is dropped
		array(
			'\\x66-\\x77QWERTY\\x88-\\x91FXZ',
			'f-wQWERTYFXZ\\u0080-\\uFFFF',
		),
		// Boundary-crossing range is clipped, high-bit-only range is dropped
		array(
			'\\x66-\\x99QWERTY\\xAA-\\xEEFXZ',
			'f-\\x7FQWERTYFXZ\\u0080-\\uFFFF',
		),
	);
}

/**
 * Checks that each byte class from the provider converts to exactly the
 * expected Unicode class.
 *
 * assertSame() is used so the strings must match exactly; no loose
 * comparison can hide a difference.
 *
 * @dataProvider provideConvertByteClassToUnicodeClass
 * @covers Title::convertByteClassToUnicodeClass
 */
public function testConvertByteClassToUnicodeClass( $byteClass, $unicodeClass ) {
	$this->assertSame( $unicodeClass, Title::convertByteClassToUnicodeClass( $byteClass ) );
}