Adding JavaScript CLDR plural parser.
author Niklas Laxström <niklas.laxstrom@gmail.com>
Wed, 22 Aug 2012 13:15:44 +0000 (13:15 +0000)
committer Santhosh Thottingal <santhosh.thottingal@gmail.com>
Thu, 23 Aug 2012 06:51:54 +0000 (12:21 +0530)
The JavaScript code of the parser was written by Santhosh.
The original project is hosted at GitHub:
https://github.com/santhoshtr/CLDRPluralRuleParser

Introduces resourceloader modules: mediawiki.cldr and
mediawiki.libs.pluralruleparser.

hi.js and ar.js are removed since they contained only a convertPlural method. More
[lang].js files need their convertPlural removed, but that is not done in this commit.

The actual rules will be taken straight from CLDR and they are not
integrated in this commit yet.

Change-Id: I1aa3b081f4dad68515fd6cd46e4ab2dbdb3291eb

resources/Resources.php
resources/mediawiki.language/languages/ar.js [deleted file]
resources/mediawiki.language/languages/he.js
resources/mediawiki.language/languages/hi.js [deleted file]
resources/mediawiki.language/mediawiki.cldr.js [new file with mode: 0644]
resources/mediawiki.language/mediawiki.language.js
resources/mediawiki.libs/CLDRPluralRuleParser.js [new file with mode: 0644]
tests/qunit/QUnitTestResources.php
tests/qunit/suites/resources/mediawiki/mediawiki.cldr.test.js [new file with mode: 0644]

index 380a099..5a24355 100644 (file)
@@ -671,7 +671,6 @@ return array(
                'scripts' => 'resources/mediawiki.language/mediawiki.language.js',
                'languageScripts' => array(
                        'am' => 'resources/mediawiki.language/languages/am.js',
-                       'ar' => 'resources/mediawiki.language/languages/ar.js',
                        'bat-smg' => 'resources/mediawiki.language/languages/bat-smg.js',
                        'be' => 'resources/mediawiki.language/languages/be.js',
                        'be-tarask' => 'resources/mediawiki.language/languages/be-tarask.js',
@@ -687,7 +686,6 @@ return array(
                        'gd' => 'resources/mediawiki.language/languages/gd.js',
                        'gv' => 'resources/mediawiki.language/languages/gv.js',
                        'he' => 'resources/mediawiki.language/languages/he.js',
-                       'hi' => 'resources/mediawiki.language/languages/hi.js',
                        'hr' => 'resources/mediawiki.language/languages/hr.js',
                        'hsb' => 'resources/mediawiki.language/languages/hsb.js',
                        'hu' => 'resources/mediawiki.language/languages/hu.js',
@@ -719,7 +717,21 @@ return array(
                        'uk' => 'resources/mediawiki.language/languages/uk.js',
                        'wa' => 'resources/mediawiki.language/languages/wa.js',
                ),
-               'dependencies' => array( 'mediawiki.language.data' ),
+               'dependencies' => array(
+                               'mediawiki.language.data',
+                               'mediawiki.cldr'
+                       ),
+       ),
+
+       'mediawiki.cldr' => array(
+               'scripts' => 'resources/mediawiki.language/mediawiki.cldr.js',
+               'dependencies' => array(
+                       'mediawiki.libs.pluralruleparser',
+               ),
+       ),
+
+       'mediawiki.libs.pluralruleparser' => array(
+               'scripts' => 'resources/mediawiki.libs/CLDRPluralRuleParser.js',
        ),
 
        'mediawiki.language.init' => array(
diff --git a/resources/mediawiki.language/languages/ar.js b/resources/mediawiki.language/languages/ar.js
deleted file mode 100644 (file)
index d21df7e..0000000
+++ /dev/null
@@ -1,24 +0,0 @@
-/**
- * Arabic (العربية) language functions
- */
-
-mediaWiki.language.convertPlural = function( count, forms ) {
-       forms = mediaWiki.language.preConvertPlural( forms, 6 );
-       if ( count == 0 ) {
-               return forms[0];
-       }
-       if ( count == 1 ) {
-               return forms[1];
-       }
-       if ( count == 2 ) {
-               return forms[2];
-       }
-       if ( count % 100 >= 3 && count % 100 <= 10 ) {
-               return forms[3];
-       }
-       if ( count % 100 >= 11 && count % 100 <= 99 ) {
-               return forms[4];
-       }
-       return forms[5];
-};
-
index e737a7c..d35f77e 100644 (file)
@@ -2,17 +2,6 @@
  * Hebrew (עברית) language functions
  */
 
-mediaWiki.language.convertPlural = function( count, forms ) {
-       forms = mediaWiki.language.preConvertPlural( forms, 3 );
-       if ( count == 1 ) {
-               return forms[0];
-       }
-       if ( count == 2 && forms[2] ) {
-               return forms[2];
-       }
-       return forms[1];
-};
-
 mediaWiki.language.convertGrammar = function( word, form ) {
        var grammarForms = mw.language.getData( 'he', 'grammarForms' );
        if ( grammarForms && grammarForms[form] ) {
diff --git a/resources/mediawiki.language/languages/hi.js b/resources/mediawiki.language/languages/hi.js
deleted file mode 100644 (file)
index a22a0e1..0000000
+++ /dev/null
@@ -1,8 +0,0 @@
-/**
- * Hindi (हिन्दी) language functions
- */
-
-mediaWiki.language.convertPlural = function( count, forms ) {
-       forms = mediaWiki.language.preConvertPlural( forms, 2 );
-       return ( count <= 1 ) ? forms[0] : forms[1];
-};
diff --git a/resources/mediawiki.language/mediawiki.cldr.js b/resources/mediawiki.language/mediawiki.cldr.js
new file mode 100644 (file)
index 0000000..6660eca
--- /dev/null
@@ -0,0 +1,28 @@
+/**
+ * Helpers for working with CLDR data.
+ */
+( function( mw ) {
+       "use strict";
+
+       mw.cldr = {
+               /**
+                * Find which plural form applies to a number.
+                * Rules are tried in order; the index of the first rule that
+                * evaluates truthy is returned. If no rule matches, the result
+                * is pluralRules.length, i.e. the "other" form.
+                * @param number Number to classify
+                * @param pluralRules List of plural rules, in plural form order
+                * @return Index of the plural form to use
+                */
+               getPluralForm: function( number, pluralRules ) {
+                       var ruleCount = pluralRules.length;
+                       var index;
+                       for ( index = 0; index < ruleCount; index++ ) {
+                               if ( mw.libs.pluralRuleParser( pluralRules[index], number ) ) {
+                                       return index;
+                               }
+                       }
+                       return ruleCount;
+               }
+       };
+} )( mediaWiki );
index 1234637..935d4ff 100644 (file)
@@ -43,11 +43,19 @@ var language = {
         * @param forms array List of plural forms
         * @return string Correct form for quantifier in this language
         */
-       convertPlural: function ( count, forms ){
+       convertPlural: function( count, forms ) {
+               var pluralFormIndex = 0;
                if ( !forms || forms.length === 0 ) {
                        return '';
                }
-               return ( parseInt( count, 10 ) === 1 ) ? forms[0] : forms[1];
+               var pluralRules = mw.language.getData( mw.config.get( 'wgUserLanguage' ), 'pluralRules' );
+               if ( !pluralRules ) {
+                       // No CLDR rules: singular for 1 (count may be a numeric string), else the next available form.
+                       return forms[ parseInt( count, 10 ) === 1 ? 0 : Math.min( 1, forms.length - 1 ) ];
+               }
+               pluralFormIndex = mw.cldr.getPluralForm( count, pluralRules );
+               pluralFormIndex = Math.min( pluralFormIndex, forms.length - 1 ); // fewer forms than rules: reuse the last form
+               return forms[pluralFormIndex];
        },
 
        /**
diff --git a/resources/mediawiki.libs/CLDRPluralRuleParser.js b/resources/mediawiki.libs/CLDRPluralRuleParser.js
new file mode 100644 (file)
index 0000000..bb1491d
--- /dev/null
@@ -0,0 +1,306 @@
+/* This is cldrpluralparser 1.0, ported to MediaWiki ResourceLoader */
+
+/**
+* cldrpluralparser.js
+* A parser engine for CLDR plural rules.
+*
+* Copyright 2012 GPLV3+, Santhosh Thottingal
+*
+* @version 0.1.0-alpha
+* @source https://github.com/santhoshtr/CLDRPluralRuleParser
+* @author Santhosh Thottingal <santhosh.thottingal@gmail.com>
+* @author Timo Tijhof
+* @author Amir Aharoni
+*/
+
+/**
+ * Evaluates a plural rule in CLDR syntax for a number
+ * @param rule
+ * @param number
+ * @return true|false|null
+ */
+( function( mw ) {
+
+/**
+ * Evaluate one plural rule in CLDR syntax against a number.
+ *
+ * The rule is parsed and evaluated in a single pass by a small
+ * recursive-descent parser built from parser combinators. Every parser is
+ * a function that either consumes input (advancing `pos`) and returns a
+ * non-null value, or returns null and leaves `pos` where it was.
+ *
+ * @param rule String, the plural rule, e.g. "n mod 10 in 2..4"
+ * @param number Number or numeric string; it is truncated with parseInt,
+ *  so decimal operands are not supported
+ * @return true|false when the start of the rule could be parsed, null
+ *  otherwise. Unparsed trailing input is ignored, not reported.
+ */
+function pluralRuleParser(rule, number) {
+       /*
+       Syntax: see http://unicode.org/reports/tr35/#Language_Plural_Rules
+       -----------------------------------------------------------------
+
+       condition     = and_condition ('or' and_condition)*
+       and_condition = relation ('and' relation)*
+       relation      = is_relation | in_relation | within_relation | 'n' <EOL>
+       is_relation   = expr 'is' ('not')? value
+       in_relation   = expr ('not')? 'in' range_list
+       within_relation = expr ('not')? 'within' range_list
+       expr          = 'n' ('mod' value)?
+       range_list    = (range | value) (',' range_list)*
+       value         = digit+
+       digit         = 0|1|2|3|4|5|6|7|8|9
+       range         = value'..'value
+
+       */
+       // Indicates current position in the rule as we parse through it.
+       // Shared among all parsing functions below.
+       var pos = 0;
+
+       var whitespace = makeRegexParser(/^\s+/);
+       var digits = makeRegexParser(/^\d+/);
+
+       var _n_ = makeStringParser('n');
+       var _is_ = makeStringParser('is');
+       var _mod_ = makeStringParser('mod');
+       var _not_ = makeStringParser('not');
+       var _in_ = makeStringParser('in');
+       var _within_ = makeStringParser('within');
+       var _range_ = makeStringParser('..');
+       var _comma_ = makeStringParser(',');
+       var _or_ = makeStringParser('or');
+       var _and_ = makeStringParser('and');
+
+       // Try parsers until one works, if none work return null
+       function choice(parserSyntax) {
+               return function () {
+                       for (var i = 0; i < parserSyntax.length; i++) {
+                               var result = parserSyntax[i]();
+                               if (result !== null) {
+                                       return result;
+                               }
+                       }
+                       return null;
+               };
+       }
+
+       // Try several parsers in a row and collect their results.
+       // All must succeed; otherwise rewind and return null.
+       // This is the only eager combinator: it runs when called instead of
+       // returning a parser.
+       function sequence(parserSyntax) {
+               var originalPos = pos;
+               var result = [];
+               for (var i = 0; i < parserSyntax.length; i++) {
+                       var res = parserSyntax[i]();
+                       if (res === null) {
+                               pos = originalPos;
+                               return null;
+                       }
+                       result.push(res);
+               }
+               return result;
+       }
+
+       // Run the same parser over and over until it fails.
+       // Must succeed a minimum of n times; otherwise, return null.
+       function nOrMore(n, p) {
+               return function () {
+                       var originalPos = pos;
+                       var result = [];
+                       var parsed = p();
+                       while (parsed !== null) {
+                               result.push(parsed);
+                               parsed = p();
+                       }
+                       if (result.length < n) {
+                               pos = originalPos;
+                               return null;
+                       }
+                       return result;
+               };
+       }
+
+       // Helpers -- just make parsers out of simpler JS builtin types
+
+       // Parser matching the literal string s at the current position.
+       function makeStringParser(s) {
+               var len = s.length;
+               return function () {
+                       var result = null;
+                       if (rule.substr(pos, len) === s) {
+                               result = s;
+                               pos += len;
+                       }
+                       return result;
+               };
+       }
+
+       // Parser matching regex (expected to be anchored with ^) against the
+       // remaining input.
+       function makeRegexParser(regex) {
+               return function () {
+                       var matches = rule.substr(pos).match(regex);
+                       if (matches === null) {
+                               return null;
+                       }
+                       pos += matches[0].length;
+                       return matches[0];
+               };
+       }
+
+       // True when a nOrMore(0, not) result contains a 'not' keyword.
+       function isNegated(nots) {
+               return nots[0] === 'not';
+       }
+
+       // Operand 'n': evaluates to the integer value of `number`.
+       function n() {
+               if (_n_() === null) {
+                       return null;
+               }
+               return parseInt(number, 10);
+       }
+
+       // expr = 'n' ('mod' value)? -- the longer alternative is tried first.
+       var expression = choice([mod, n]);
+
+       function mod() {
+               var result = sequence([n, whitespace, _mod_, whitespace, digits]);
+               if (result === null) {
+                       return null;
+               }
+               return result[0] % parseInt(result[4], 10);
+       }
+
+       // The optional 'not' keyword together with its leading whitespace.
+       function not() {
+               var result = sequence([whitespace, _not_]);
+               if (result === null) {
+                       return null;
+               }
+               return result[1];
+       }
+
+       // is_relation = expr 'is' ('not')? value
+       function is() {
+               var result = sequence([expression, whitespace, _is_, nOrMore(0, not), whitespace, digits]);
+               if (result === null) {
+                       return null;
+               }
+               var equal = result[0] === parseInt(result[5], 10);
+               return isNegated(result[3]) ? !equal : equal;
+       }
+
+       // A single value, represented as the degenerate range [v, v].
+       function value() {
+               var result = digits();
+               if (result === null) {
+                       return null;
+               }
+               var v = parseInt(result, 10);
+               return [v, v];
+       }
+
+       // range = value'..'value, kept as [low, high] bounds. Storing the
+       // bounds (instead of expanding every member) keeps 'within' correct
+       // and avoids building large arrays for wide ranges.
+       function range() {
+               var result = sequence([digits, _range_, digits]);
+               if (result === null) {
+                       return null;
+               }
+               return [parseInt(result[0], 10), parseInt(result[2], 10)];
+       }
+
+       var rangeItem = choice([range, value]);
+
+       // ',' followed by one more range or value
+       function rangeTail() {
+               var result = sequence([_comma_, rangeItem]);
+               if (result === null) {
+                       return null;
+               }
+               return result[1];
+       }
+
+       // range_list = (range | value) (',' range_list)*
+       // Returns an array of [low, high] pairs.
+       function rangeList() {
+               var result = sequence([rangeItem, nOrMore(0, rangeTail)]);
+               if (result === null) {
+                       return null;
+               }
+               return [result[0]].concat(result[1]);
+       }
+
+       // Whether x lies inside any of the [low, high] pairs. A NaN operand
+       // never matches because every comparison with NaN is false.
+       function inRanges(ranges, x) {
+               for (var i = 0; i < ranges.length; i++) {
+                       if (ranges[i][0] <= x && x <= ranges[i][1]) {
+                               return true;
+                       }
+               }
+               return false;
+       }
+
+       // in_relation = expr ('not')? 'in' range_list
+       function _in() {
+               var result = sequence([expression, nOrMore(0, not), whitespace, _in_, whitespace, rangeList]);
+               if (result === null) {
+                       return null;
+               }
+               var matched = inRanges(result[5], result[0]);
+               return isNegated(result[1]) ? !matched : matched;
+       }
+
+       // within_relation = expr ('not')? 'within' range_list
+       // The operand is always an integer here (see n()), so this evaluates
+       // the same way as 'in'.
+       function within() {
+               var result = sequence([expression, nOrMore(0, not), whitespace, _within_, whitespace, rangeList]);
+               if (result === null) {
+                       return null;
+               }
+               var matched = inRanges(result[5], result[0]);
+               return isNegated(result[1]) ? !matched : matched;
+       }
+
+       var relation = choice([is, _in, within]);
+
+       // 'and' followed by one more relation
+       function andTail() {
+               var result = sequence([whitespace, _and_, whitespace, relation]);
+               if (result === null) {
+                       return null;
+               }
+               return result[3];
+       }
+
+       // and_condition = relation ('and' relation)*
+       function and() {
+               var result = sequence([relation, nOrMore(0, andTail)]);
+               if (result === null) {
+                       return null;
+               }
+               var all = result[0];
+               for (var i = 0; i < result[1].length; i++) {
+                       all = all && result[1][i];
+               }
+               return all;
+       }
+
+       // 'or' followed by one more and_condition
+       function orTail() {
+               var result = sequence([whitespace, _or_, whitespace, and]);
+               if (result === null) {
+                       return null;
+               }
+               return result[3];
+       }
+
+       // condition = and_condition ('or' and_condition)*
+       // 'and' binds tighter than 'or', as the grammar above requires.
+       function condition() {
+               var result = sequence([and, nOrMore(0, orTail)]);
+               if (result === null) {
+                       return null;
+               }
+               var any = result[0];
+               for (var i = 0; i < result[1].length; i++) {
+                       any = any || result[1][i];
+               }
+               return any;
+       }
+
+       /*
+        * A fully successful parse returns non-null with pos at the end of
+        * the rule. This code has never raised an error for anything else:
+        * a rule that cannot be parsed yields null, and input left over
+        * after a parsable prefix is ignored.
+        * n.b. This is part of language infrastructure, so we do not throw an internationalizable message.
+        */
+       return condition();
+}
+
+/* For module loaders, e.g. NodeJS, NPM */
+if (typeof module !== 'undefined' && module.exports) {
+       module.exports = pluralRuleParser;
+}
+
+/* pluralRuleParser ends here */
+mw.libs.pluralRuleParser = pluralRuleParser;
+
+} )( mediaWiki );
index 1cd085f..59ae73c 100644 (file)
@@ -30,6 +30,7 @@ return array(
                        'tests/qunit/suites/resources/mediawiki.api/mediawiki.api.parse.test.js',
                        'tests/qunit/suites/resources/mediawiki.special/mediawiki.special.recentchanges.test.js',
                        'tests/qunit/suites/resources/mediawiki/mediawiki.language.test.js',
+                       'tests/qunit/suites/resources/mediawiki/mediawiki.cldr.test.js',
                ),
                'dependencies' => array(
                        'jquery.autoEllipsis',
@@ -55,6 +56,7 @@ return array(
                        'mediawiki.util',
                        'mediawiki.special.recentchanges',
                        'mediawiki.language',
+                       'mediawiki.cldr',
                ),
                'position' => 'top',
        )
diff --git a/tests/qunit/suites/resources/mediawiki/mediawiki.cldr.test.js b/tests/qunit/suites/resources/mediawiki/mediawiki.cldr.test.js
new file mode 100644 (file)
index 0000000..09a11b0
--- /dev/null
@@ -0,0 +1,67 @@
+// QUnit module under which the tests in this file are grouped.
+module( 'mediawiki.cldr' );
+
+// Smoke test: asserts that mw.cldr is defined (truthy).
+test( '-- Initial check', function() {
+       expect( 1 );
+       ok( mw.cldr, 'mw.cldr defined' );
+} );
+
+// Plural test cases keyed by language code. Every description is unique so
+// that a failing case can be identified from the test report.
+var pluralTestcases = {
+       /*
+        * Sample:
+        *"languagecode" : [
+        *      [ number, [ "form1", "form2", ... ],  "expected", "description" ],
+        * ]
+        */
+       "en": [
+               [ 0, [ "one", "other" ], "other", "English plural test- 0 is other" ],
+               [ 1, [ "one", "other" ], "one", "English plural test- 1 is one" ]
+       ],
+       "hi": [
+               [ 0, [ "one", "other" ], "one", "Hindi plural test- 0 is one" ],
+               [ 1, [ "one", "other" ], "one", "Hindi plural test- 1 is one" ],
+               [ 2, [ "one", "other" ], "other", "Hindi plural test- 2 is other" ]
+       ],
+       "he": [
+               [ 0, [ "one", "other" ], "other", "Hebrew plural test- 0 is other" ],
+               [ 1, [ "one", "other" ], "one", "Hebrew plural test- 1 is one" ],
+               [ 2, [ "one", "other" ], "other", "Hebrew plural test- 2 is other with 2 forms" ],
+               [ 2, [ "one", "dual", "other" ], "dual", "Hebrew plural test- 2 is dual with 3 forms" ]
+       ],
+       "ar": [
+               [ 0, [ "zero", "one", "two", "few", "many", "other" ], "zero", "Arabic plural test - 0 is zero" ],
+               [ 1, [ "zero", "one", "two", "few", "many", "other" ], "one", "Arabic plural test - 1 is one" ],
+               [ 2, [ "zero", "one", "two", "few", "many", "other" ], "two", "Arabic plural test - 2 is two" ],
+               [ 3, [ "zero", "one", "two", "few", "many", "other" ], "few", "Arabic plural test - 3 is few" ],
+               [ 9, [ "zero", "one", "two", "few", "many", "other" ], "few", "Arabic plural test - 9 is few" ],
+               // Same count passed as a string instead of a number
+               [ "9", [ "zero", "one", "two", "few", "many", "other" ], "few", "Arabic plural test - string 9 is few" ],
+               [ 110, [ "zero", "one", "two", "few", "many", "other" ], "few", "Arabic plural test - 110 is few" ],
+               [ 11, [ "zero", "one", "two", "few", "many", "other" ], "many", "Arabic plural test - 11 is many" ],
+               [ 15, [ "zero", "one", "two", "few", "many", "other" ], "many", "Arabic plural test - 15 is many" ],
+               [ 99, [ "zero", "one", "two", "few", "many", "other" ], "many", "Arabic plural test - 99 is many" ],
+               [ 9999, [ "zero", "one", "two", "few", "many", "other" ], "many", "Arabic plural test - 9999 is many" ],
+               [ 100, [ "zero", "one", "two", "few", "many", "other" ], "other", "Arabic plural test - 100 is other" ],
+               [ 102, [ "zero", "one", "two", "few", "many", "other" ], "other", "Arabic plural test - 102 is other" ],
+               [ 1000, [ "zero", "one", "two", "few", "many", "other" ], "other", "Arabic plural test - 1000 is other" ]
+               // FIXME plural rules for decimal numbers does not work
+               // [ 1.7, [ "zero", "one", "two", "few", "many", "other" ], "other", "Arabic plural test - 1.7 is other" ],
+       ]
+};
+
+/**
+ * Register one QUnit test that runs every plural test case of a language
+ * through mw.language.convertPlural.
+ * @param langCode Language code, used in the test title
+ * @param tests Array of [ number, forms, expected, description ] cases
+ */
+function pluralTest( langCode, tests ) {
+       QUnit.test('-- Plural Test for ' + langCode, function( assert ) {
+               var testCase;
+               var index = 0;
+               QUnit.expect( tests.length );
+               while ( index < tests.length ) {
+                       testCase = tests[index];
+                       // testCase: [ number, forms, expected form, description ]
+                       assert.equal( mw.language.convertPlural( testCase[0], testCase[1] ), testCase[2], testCase[3] );
+                       index++;
+               }
+       } );
+}
+
+// Register plural tests only for the language that matches wgUserLanguage;
+// test cases for every other language are skipped.
+$.each( pluralTestcases, function( langCode, tests ) {
+       if ( langCode === mw.config.get( 'wgUserLanguage' ) ) {
+               pluralTest( langCode, tests );
+       }
+} );