Make the code for grammar data processing common
author: Amir E. Aharoni <amir.aharoni@mail.huji.ac.il>
Mon, 28 Sep 2015 10:26:08 +0000 (13:26 +0300)
committer: Amire80 <amir.aharoni@mail.huji.ac.il>
Fri, 16 Dec 2016 13:52:14 +0000 (15:52 +0200)
This makes the code for processing JSON files with
grammar transformations reusable by different languages
and applies the same logic to Russian and Hebrew.
It will be done to other languages in further patches.

This patch is not supposed to change any functionality,
and the tests are intact (except a comment in the test
for Hebrew - the class doesn't exist any longer).

PHP:
* Move the JSON grammar transformation data processing logic
  from LanguageRu.php to convertGrammar() in Language.php.
  By default all these data files are supposed to be
  processed identically, so the code should be common.
  If there is no JSON data file, nothing new happens.
* LanguageRu's own convertGrammar() method is removed.
* The LanguageHe class is removed, now that all its functionality
  is handled by generic JSON data processing in the Language class.
  LanguageHe.php file is removed from the repo and from autoloading.

JavaScript:
* Move the JSON grammar transformation data processing logic
  from ru.js to mediawiki.language.js.
* JavaScript grammar code files he.js and ru.js are removed
  from the repo and from Resources.php, because all the data
  is in JSON, and the default logic in mediawiki.language.js
  works for both languages.

Bug: T115217
Change-Id: I5e75467121c3d791bb84f9e6fdfcf07c1840f81a

autoload.php
languages/Language.php
languages/classes/LanguageHe.php [deleted file]
languages/classes/LanguageRu.php
languages/data/grammarTransformations/he.json [new file with mode: 0644]
resources/Resources.php
resources/src/mediawiki.language/languages/he.js [deleted file]
resources/src/mediawiki.language/languages/ru.js [deleted file]
resources/src/mediawiki.language/mediawiki.language.js
tests/phpunit/languages/classes/LanguageHeTest.php

index 6dbcc1d..e1b808a 100644 (file)
@@ -695,7 +695,6 @@ $wgAutoloadLocalClasses = [
        'LanguageFi' => __DIR__ . '/languages/classes/LanguageFi.php',
        'LanguageGa' => __DIR__ . '/languages/classes/LanguageGa.php',
        'LanguageGan' => __DIR__ . '/languages/classes/LanguageGan.php',
-       'LanguageHe' => __DIR__ . '/languages/classes/LanguageHe.php',
        'LanguageHsb' => __DIR__ . '/languages/classes/LanguageHsb.php',
        'LanguageHu' => __DIR__ . '/languages/classes/LanguageHu.php',
        'LanguageHy' => __DIR__ . '/languages/classes/LanguageHy.php',
index bc5ab7e..ac8d4cb 100644 (file)
@@ -3737,6 +3737,43 @@ class Language {
                        return $wgGrammarForms[$this->getCode()][$case][$word];
                }
 
+               $grammarTransformations = $this->getGrammarTransformations();
+
+               if ( isset( $grammarTransformations[$case] ) ) {
+                       $forms = $grammarTransformations[$case];
+
+                       // Some names of grammar rules are aliases for other rules.
+                       // In such cases the value is a string rather than object,
+                       // so load the actual rules.
+                       if ( is_string( $forms ) ) {
+                               $forms = $grammarTransformations[$forms];
+                       }
+
+                       foreach ( array_values( $forms ) as $rule ) {
+                               $form = $rule[0];
+
+                               if ( $form === '@metadata' ) {
+                                       continue;
+                               }
+
+                               $replacement = $rule[1];
+
+                               $regex = '/' . addcslashes( $form, '/' ) . '/u';
+                               $patternMatches = preg_match( $regex, $word );
+
+                               if ( $patternMatches === false ) {
+                                       wfLogWarning(
+                                               'An error occurred while processing grammar. ' .
+                                               "Word: '$word'. Regex: /$form/."
+                                       );
+                               } elseif ( $patternMatches === 1 ) {
+                                       $word = preg_replace( $regex, $replacement, $word );
+
+                                       break;
+                               }
+                       }
+               }
+
                return $word;
        }
 
diff --git a/languages/classes/LanguageHe.php b/languages/classes/LanguageHe.php
deleted file mode 100644 (file)
index a6aaf6d..0000000
+++ /dev/null
@@ -1,70 +0,0 @@
-<?php
-/**
- * Hebrew (עברית) specific code.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License along
- * with this program; if not, write to the Free Software Foundation, Inc.,
- * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
- * http://www.gnu.org/copyleft/gpl.html
- *
- * @file
- * @author Rotem Liss
- * @ingroup Language
- */
-
-/**
- * Hebrew (עברית)
- *
- * @ingroup Language
- */
-class LanguageHe extends Language {
-
-       /**
-        * Convert grammar forms of words.
-        *
-        * Available cases:
-        * "prefixed" (or "תחילית") - when the word has a prefix
-        *
-        * @param string $word The word to convert
-        * @param string $case The case
-        *
-        * @return string
-        */
-       public function convertGrammar( $word, $case ) {
-               global $wgGrammarForms;
-               if ( isset( $wgGrammarForms['he'][$case][$word] ) ) {
-                       return $wgGrammarForms['he'][$case][$word];
-               }
-
-               switch ( $case ) {
-                       case 'prefixed':
-                       case 'תחילית':
-                               # Duplicate the "Waw" if prefixed, but not if it is already double.
-                               if ( substr( $word, 0, 2 ) === "ו" && substr( $word, 0, 4 ) !== "וו" ) {
-                                       $word = "ו" . $word;
-                               }
-
-                               # Remove the "He" article if prefixed.
-                               if ( substr( $word, 0, 2 ) === "ה" ) {
-                                       $word = substr( $word, 2 );
-                               }
-
-                               # Add a hyphen (maqaf) before non-Hebrew letters.
-                               if ( substr( $word, 0, 2 ) < "א" || substr( $word, 0, 2 ) > "ת" ) {
-                                       $word = "־" . $word;
-                               }
-               }
-
-               return $word;
-       }
-}
index 62de390..7b15721 100644 (file)
  * @ingroup Language
  */
 class LanguageRu extends Language {
-       /**
-        * Convert from the nominative form of a noun to some other case
-        * Invoked with {{grammar:case|word}}
-        *
-        * @param string $word
-        * @param string $case
-        * @return string
-        */
-       function convertGrammar( $word, $case ) {
-               global $wgGrammarForms;
-               if ( isset( $wgGrammarForms['ru'][$case][$word] ) ) {
-                       return $wgGrammarForms['ru'][$case][$word];
-               }
-
-               $grammarTransformations = $this->getGrammarTransformations();
-
-               if ( isset( $grammarTransformations[$case] ) ) {
-                       foreach ( array_values( $grammarTransformations[$case] ) as $rule ) {
-                               $form = $rule[0];
-
-                               if ( $form === '@metadata' ) {
-                                       continue;
-                               }
-
-                               $replacement = $rule[1];
-
-                               $regex = "/$form/";
-
-                               if ( preg_match( $regex, $word ) ) {
-                                       $word = preg_replace( $regex, $replacement, $word );
-
-                                       break;
-                               }
-                       }
-               }
-
-               return $word;
-       }
-
        /**
         * Four-digit number should be without group commas (spaces)
         * See manual of style at https://ru.wikipedia.org/wiki/Википедия:Оформление_статей
diff --git a/languages/data/grammarTransformations/he.json b/languages/data/grammarTransformations/he.json
new file mode 100644 (file)
index 0000000..50620b1
--- /dev/null
@@ -0,0 +1,26 @@
+{
+       "@metadata": {
+               "authors": [
+                       "Rotem Liss",
+                       "Amir E. Aharoni (amir.aharoni@mail.huji.ac.il)"
+               ]
+       },
+       "prefixed": "תחילית",
+       "תחילית": [
+               [ "@metadata", [
+                       "comment", "הכפלת ו, מחיקת ה הידיעה, הוספת מקף"
+               ] ],
+               [
+                       "^(ו[^ו].+)$",
+                       "ו$1"
+               ],
+               [
+                       "^ה(.+)$",
+                       "$1"
+               ],
+               [
+                       "^([^א-ת].+)$",
+                       "־$1"
+               ]
+       ]
+}
index 4932a29..d1dd668 100644 (file)
@@ -1592,13 +1592,11 @@ return [
                        'dsb' => 'resources/src/mediawiki.language/languages/dsb.js',
                        'fi' => 'resources/src/mediawiki.language/languages/fi.js',
                        'ga' => 'resources/src/mediawiki.language/languages/ga.js',
-                       'he' => 'resources/src/mediawiki.language/languages/he.js',
                        'hsb' => 'resources/src/mediawiki.language/languages/hsb.js',
                        'hu' => 'resources/src/mediawiki.language/languages/hu.js',
                        'hy' => 'resources/src/mediawiki.language/languages/hy.js',
                        'la' => 'resources/src/mediawiki.language/languages/la.js',
                        'os' => 'resources/src/mediawiki.language/languages/os.js',
-                       'ru' => 'resources/src/mediawiki.language/languages/ru.js',
                        'sl' => 'resources/src/mediawiki.language/languages/sl.js',
                        'uk' => 'resources/src/mediawiki.language/languages/uk.js',
                ],
diff --git a/resources/src/mediawiki.language/languages/he.js b/resources/src/mediawiki.language/languages/he.js
deleted file mode 100644 (file)
index 5bf8c4d..0000000
+++ /dev/null
@@ -1,29 +0,0 @@
-/*!
- * Hebrew (עברית) language functions
- */
-
-mediaWiki.language.convertGrammar = function ( word, form ) {
-       var grammarForms = mediaWiki.language.getData( 'he', 'grammarForms' );
-       if ( grammarForms && grammarForms[ form ] ) {
-               return grammarForms[ form ][ word ];
-       }
-       switch ( form ) {
-               case 'prefixed':
-               case 'תחילית': // the same word in Hebrew
-                       // Duplicate prefixed "Waw", but only if it's not already double
-                       if ( word.slice( 0, 1 ) === 'ו' && word.slice( 0, 2 ) !== 'וו' ) {
-                               word = 'ו' + word;
-                       }
-
-                       // Remove the "He" if prefixed
-                       if ( word.slice( 0, 1 ) === 'ה' ) {
-                               word = word.slice( 1 );
-                       }
-
-                       // Add a hyphen (maqaf) before numbers and non-Hebrew letters
-                       if ( word.slice( 0, 1 ) < 'א' || word.slice( 0, 1 ) > 'ת' ) {
-                               word = '־' + word;
-                       }
-       }
-       return word;
-};
diff --git a/resources/src/mediawiki.language/languages/ru.js b/resources/src/mediawiki.language/languages/ru.js
deleted file mode 100644 (file)
index 09d7c0b..0000000
+++ /dev/null
@@ -1,38 +0,0 @@
-/*!
- * Russian (Русский) language functions
- */
-
-mediaWiki.language.convertGrammar = function ( word, form ) {
-       'use strict';
-
-       var forms, transformations, i, rule, sourcePattern, regexp, replacement;
-
-       forms = mediaWiki.language.getData( 'ru', 'grammarForms' );
-       if ( forms && forms[ form ] ) {
-               return forms[ form ][ word ];
-       }
-
-       transformations = mediaWiki.language.getData( 'ru', 'grammarTransformations' );
-
-       if ( !transformations[ form ] ) {
-               return word;
-       }
-
-       for ( i = 0; i < transformations[ form ].length; i++ ) {
-               rule = transformations[ form ][ i ];
-               sourcePattern = rule[ 0 ];
-
-               if ( sourcePattern === '@metadata' ) {
-                       continue;
-               }
-
-               regexp = new RegExp( sourcePattern );
-               replacement = rule[ 1 ];
-
-               if ( word.match( regexp ) ) {
-                       return word.replace( regexp, replacement );
-               }
-       }
-
-       return word;
-};
index fc2af3d..3726a68 100644 (file)
 
                /**
                 * Grammatical transformations, needed for inflected languages.
-                * Invoked by putting `{{grammar:form|word}}` in a message.
+                * Invoked by putting `{{grammar:case|word}}` in a message.
                 *
                 * The rules can be defined in $wgGrammarForms global or computed
                 * dynamically by overriding this method per language.
                 * @return {string}
                 */
                convertGrammar: function ( word, form ) {
-                       var grammarForms = mw.language.getData( mw.config.get( 'wgUserLanguage' ), 'grammarForms' );
-                       if ( grammarForms && grammarForms[ form ] ) {
-                               return grammarForms[ form ][ word ] || word;
+                       var userLanguage, forms, transformations,
+                               patterns, i, rule, sourcePattern, regexp, replacement;
+
+                       userLanguage = mw.config.get( 'wgUserLanguage' );
+
+                       forms = mw.language.getData( userLanguage, 'grammarForms' );
+                       if ( forms && forms[ form ] ) {
+                               return forms[ form ][ word ];
+                       }
+
+                       transformations = mediaWiki.language.getData( userLanguage, 'grammarTransformations' );
+
+                       if ( !( transformations && transformations[ form ] ) ) {
+                               return word;
+                       }
+
+                       patterns = transformations[ form ];
+
+                       // Some names of grammar rules are aliases for other rules.
+                       // In such cases the value is a string rather than object,
+                       // so load the actual rules.
+                       if ( typeof patterns === 'string' ) {
+                               patterns = transformations[ patterns ];
                        }
+
+                       for ( i = 0; i < patterns.length; i++ ) {
+                               rule = patterns[ i ];
+                               sourcePattern = rule[ 0 ];
+
+                               if ( sourcePattern === '@metadata' ) {
+                                       continue;
+                               }
+
+                               regexp = new RegExp( sourcePattern );
+                               replacement = rule[ 1 ];
+
+                               if ( word.match( regexp ) ) {
+                                       return word.replace( regexp, replacement );
+                               }
+                       }
+
                        return word;
                },
 
index 771cda5..c1b774a 100644 (file)
@@ -5,7 +5,7 @@
  * @file
  */
 
-/** Tests for MediaWiki languages/classes/LanguageHe.php */
+/** Tests for MediaWiki Hebrew grammar transformation handling */
 class LanguageHeTest extends LanguageClassesTestCase {
        /**
         * The most common usage for the plural forms is two forms,