break;
default:
- // FIXME: add support for unicode and unicode escape sequence \uHHHH
- if (preg_match('/^[$\w]+/', $input, $match))
+ // Fast path for identifiers: word chars followed by whitespace or various other tokens.
+ // Note we don't need to exclude digits in the first char, as they've already been found
+ // above.
+ if (!preg_match('/^[$\w]+(?=[\s\/\|\^\&<>\+\-\*%=!.;,\?:~\[\]\{\}\(\)@])/', $input, $match))
{
- $tt = in_array($match[0], $this->keywords) ? $match[0] : TOKEN_IDENTIFIER;
+ // Character classes per ECMA-262 edition 5.1 section 7.6
+ // Per spec, must accept Unicode 3.0, *may* accept later versions.
+ // We'll take whatever PCRE understands, which should be more recent.
+ $identifierStartChars = "\\p{L}\\p{Nl}" . # UnicodeLetter
+ "\$" .
+ "_";
+ $identifierPartChars = $identifierStartChars .
+ "\\p{Mn}\\p{Mc}" . # UnicodeCombiningMark
+ "\\p{Nd}" . # UnicodeDigit
+ "\\p{Pc}"; # UnicodeConnectorPunctuation
+ $unicodeEscape = "\\\\u[0-9A-Fa-f]{4}"; # \uHHHH: backslash, 'u', exactly four hex digits (no stray '-' in the class)
+ $identifierRegex = "/^" .
+ "(?:[$identifierStartChars]|$unicodeEscape)" .
+ "(?:[$identifierPartChars]|$unicodeEscape)*" .
+ "/uS";
+ if (preg_match($identifierRegex, $input, $match))
+ {
+ if (strpos($match[0], '\\') !== false) {
+ // Per ECMA-262 edition 5.1, section 7.6 escape sequences should behave as if they were
+ // the original chars, but only within the boundaries of the identifier.
+ $decoded = preg_replace_callback('/\\\\u([0-9A-Fa-f]{4})/',
+ array(__CLASS__, 'unicodeEscapeCallback'),
+ $match[0]);
+
+ // Since our original regex didn't de-escape the originals, we need to check for validity again.
+ // No need to worry about token boundaries, as anything outside the identifier is illegal!
+ if (!preg_match("/^[$identifierStartChars][$identifierPartChars]*$/u", $decoded)) {
+ throw $this->newSyntaxError('Illegal token');
+ }
+
+ // Per spec it _ought_ to work to use these escapes for keywords as well...
+ // but IE rejects them as invalid, while Firefox and Chrome treat them as identifiers
+ // that don't match the keyword.
+ if (in_array($decoded, $this->keywords)) {
+ throw $this->newSyntaxError('Illegal token');
+ }
+
+ // TODO: save the decoded form for output?
+ }
+ }
+ else
+ throw $this->newSyntaxError('Illegal token');
}
- else
- throw $this->newSyntaxError('Illegal token');
+ $tt = in_array($match[0], $this->keywords) ? $match[0] : TOKEN_IDENTIFIER;
}
}
{
return new Exception('Parse error: ' . $m . ' in file \'' . $this->filename . '\' on line ' . $this->lineno);
}
+
+ public static function unicodeEscapeCallback($m) // preg_replace_callback() handler; $m[1] is the four hex digits captured from a \uHHHH escape
+ {
+ return html_entity_decode('&#x' . $m[1]. ';', ENT_QUOTES, 'UTF-8'); // wrap the digits as the numeric entity &#xHHHH; and decode it to a UTF-8 string
+ }
}
class JSToken
// newline insertion after 1000 chars: break after the "++", not before
array( str_repeat( ';', 996 ) . "if(x++);", str_repeat( ';', 996 ) . "if(x++\n);" ),
+
+ // Unicode letter characters should pass through ok in identifiers (bug 31187)
+ array( "var KaŝSkatolVal = {}", 'var KaŝSkatolVal={}'),
+ // And also per spec unicode char escape values should work in identifiers,
+ // as long as it's a valid char. In future it might get normalized.
+ array( "var Ka\\u015dSkatolVal = {}", 'var Ka\\u015dSkatolVal={}'),
);
}
--- /dev/null
+/* Some misc JavaScript compatibility tests, just to make sure the environments we run in are consistent */
+
+module( 'mediawiki.jscompat' );
+
+test( 'Variable with Unicode letter in name', function() {
+ expect(3); // three assertions: the literal name plus its two escaped spellings
+ var orig = "some token";
+ var ŝablono = orig;
+ deepEqual( ŝablono, orig, 'ŝablono' );
+ deepEqual( \u015dablono, orig, '\\u015dablono' ); // same identifier, first letter written as a \u escape with a lowercase hex digit
+ deepEqual( \u015Dablono, orig, '\\u015Dablono' ); // same identifier, escape written with an uppercase hex digit
+});
+
+/*
+// Not that we need this. ;)
+// This fails on IE 6-8
+// Works on IE 9, Firefox 6, Chrome 14
+test( 'Keyword workaround: "if" as variable name using Unicode escapes', function() {
+ var orig = "another token";
+ \u0069\u0066 = orig;
+ deepEqual( \u0069\u0066, orig, '\\u0069\\u0066' );
+});
+*/
+
+/*
+// Not that we need this. ;)
+// This fails on IE 6-9
+// Works on Firefox 6, Chrome 14
+test( 'Keyword workaround: "if" as member variable name using Unicode escapes', function() {
+ var orig = "another token";
+ var foo = {};
+ foo.\u0069\u0066 = orig;
+ deepEqual( foo.\u0069\u0066, orig, 'foo.\\u0069\\u0066' );
+});
+*/