Make the code for grammar data processing common

author Amir E. Aharoni <amir.aharoni@mail.huji.ac.il>

Mon, 28 Sep 2015 10:26:08 +0000 (13:26 +0300)

committer Amire80 <amir.aharoni@mail.huji.ac.il>

Fri, 16 Dec 2016 13:52:14 +0000 (15:52 +0200)
author Amir E. Aharoni <amir.aharoni@mail.huji.ac.il>
Mon, 28 Sep 2015 10:26:08 +0000 (13:26 +0300)
committer Amire80 <amir.aharoni@mail.huji.ac.il>
Fri, 16 Dec 2016 13:52:14 +0000 (15:52 +0200)
diff --git a/autoload.php b/autoload.php

index 6dbcc1d..e1b808a 100644 (file)
--- a/autoload.php
+++ b/autoload.php
@@ -695,7 +695,6 @@ $wgAutoloadLocalClasses = [
         'LanguageFi' => __DIR__ . '/languages/classes/LanguageFi.php',
         'LanguageGa' => __DIR__ . '/languages/classes/LanguageGa.php',
         'LanguageGan' => __DIR__ . '/languages/classes/LanguageGan.php',
-       'LanguageHe' => __DIR__ . '/languages/classes/LanguageHe.php',
         'LanguageHsb' => __DIR__ . '/languages/classes/LanguageHsb.php',
         'LanguageHu' => __DIR__ . '/languages/classes/LanguageHu.php',
         'LanguageHy' => __DIR__ . '/languages/classes/LanguageHy.php',
diff --git a/languages/Language.php b/languages/Language.php

index bc5ab7e..ac8d4cb 100644 (file)
--- a/languages/Language.php
+++ b/languages/Language.php
@@ -3737,6 +3737,43 @@ class Language {
                         return $wgGrammarForms[$this->getCode()][$case][$word];
                 }
  
+               $grammarTransformations = $this->getGrammarTransformations();
+
+               if ( isset( $grammarTransformations[$case] ) ) {
+                       $forms = $grammarTransformations[$case];
+
+                       // Some names of grammar rules are aliases for other rules.
+                       // In such cases the value is a string rather than object,
+                       // so load the actual rules.
+                       if ( is_string( $forms ) ) {
+                               $forms = $grammarTransformations[$forms];
+                       }
+
+                       foreach ( array_values( $forms ) as $rule ) {
+                               $form = $rule[0];
+
+                               if ( $form === '@metadata' ) {
+                                       continue;
+                               }
+
+                               $replacement = $rule[1];
+
+                               $regex = '/' . addcslashes( $form, '/' ) . '/u';
+                               $patternMatches = preg_match( $regex, $word );
+
+                               if ( $patternMatches === false ) {
+                                       wfLogWarning(
+                                               'An error occurred while processing grammar. ' .
+                                               "Word: '$word'. Regex: /$form/."
+                                       );
+                               } elseif ( $patternMatches === 1 ) {
+                                       $word = preg_replace( $regex, $replacement, $word );
+
+                                       break;
+                               }
+                       }
+               }
+
                 return $word;
         }
  
diff --git a/languages/classes/LanguageHe.php b/languages/classes/LanguageHe.php

deleted file mode 100644 (file)

index a6aaf6d..0000000
--- a/languages/classes/LanguageHe.php
+++ /dev/null
@@ -1,70 +0,0 @@
-<?php
-/**
- * Hebrew (עברית) specific code.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License along
- * with this program; if not, write to the Free Software Foundation, Inc.,
- * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
- * http://www.gnu.org/copyleft/gpl.html
- *
- * @file
- * @author Rotem Liss
- * @ingroup Language
- */
-
-/**
- * Hebrew (עברית)
- *
- * @ingroup Language
- */
-class LanguageHe extends Language {
-
-       /**
-        * Convert grammar forms of words.
-        *
-        * Available cases:
-        * "prefixed" (or "תחילית") - when the word has a prefix
-        *
-        * @param string $word The word to convert
-        * @param string $case The case
-        *
-        * @return string
-        */
-       public function convertGrammar( $word, $case ) {
-               global $wgGrammarForms;
-               if ( isset( $wgGrammarForms['he'][$case][$word] ) ) {
-                       return $wgGrammarForms['he'][$case][$word];
-               }
-
-               switch ( $case ) {
-                       case 'prefixed':
-                       case 'תחילית':
-                               # Duplicate the "Waw" if prefixed, but not if it is already double.
-                               if ( substr( $word, 0, 2 ) === "ו" && substr( $word, 0, 4 ) !== "וו" ) {
-                                       $word = "ו" . $word;
-                               }
-
-                               # Remove the "He" article if prefixed.
-                               if ( substr( $word, 0, 2 ) === "ה" ) {
-                                       $word = substr( $word, 2 );
-                               }
-
-                               # Add a hyphen (maqaf) before non-Hebrew letters.
-                               if ( substr( $word, 0, 2 ) < "א" || substr( $word, 0, 2 ) > "ת" ) {
-                                       $word = "־" . $word;
-                               }
-               }
-
-               return $word;
-       }
-}
diff --git a/languages/classes/LanguageRu.php b/languages/classes/LanguageRu.php

index 62de390..7b15721 100644 (file)
--- a/languages/classes/LanguageRu.php
+++ b/languages/classes/LanguageRu.php
@@ -31,45 +31,6 @@
   * @ingroup Language
   */
  class LanguageRu extends Language {
-       /**
-        * Convert from the nominative form of a noun to some other case
-        * Invoked with {{grammar:case|word}}
-        *
-        * @param string $word
-        * @param string $case
-        * @return string
-        */
-       function convertGrammar( $word, $case ) {
-               global $wgGrammarForms;
-               if ( isset( $wgGrammarForms['ru'][$case][$word] ) ) {
-                       return $wgGrammarForms['ru'][$case][$word];
-               }
-
-               $grammarTransformations = $this->getGrammarTransformations();
-
-               if ( isset( $grammarTransformations[$case] ) ) {
-                       foreach ( array_values( $grammarTransformations[$case] ) as $rule ) {
-                               $form = $rule[0];
-
-                               if ( $form === '@metadata' ) {
-                                       continue;
-                               }
-
-                               $replacement = $rule[1];
-
-                               $regex = "/$form/";
-
-                               if ( preg_match( $regex, $word ) ) {
-                                       $word = preg_replace( $regex, $replacement, $word );
-
-                                       break;
-                               }
-                       }
-               }
-
-               return $word;
-       }
-
         /**
          * Four-digit number should be without group commas (spaces)
          * See manual of style at https://ru.wikipedia.org/wiki/Википедия:Оформление_статей
diff --git a/languages/data/grammarTransformations/he.json b/languages/data/grammarTransformations/he.json

new file mode 100644 (file)

index 0000000..50620b1
--- /dev/null
+++ b/languages/data/grammarTransformations/he.json
@@ -0,0 +1,26 @@
+{
+       "@metadata": {
+               "authors": [
+                       "Rotem Liss",
+                       "Amir E. Aharoni (amir.aharoni@mail.huji.ac.il)"
+               ]
+       },
+       "prefixed": "תחילית",
+       "תחילית": [
+               [ "@metadata", [
+                       "comment", "הכפלת ו, מחיקת ה הידיעה, הוספת מקף"
+               ] ],
+               [
+                       "^(ו[^ו].+)$",
+                       "ו$1"
+               ],
+               [
+                       "^ה(.+)$",
+                       "$1"
+               ],
+               [
+                       "^([^א-ת].+)$",
+                       "־$1"
+               ]
+       ]
+}
diff --git a/resources/Resources.php b/resources/Resources.php

index 4932a29..d1dd668 100644 (file)
--- a/resources/Resources.php
+++ b/resources/Resources.php
@@ -1592,13 +1592,11 @@ return [
                         'dsb' => 'resources/src/mediawiki.language/languages/dsb.js',
                         'fi' => 'resources/src/mediawiki.language/languages/fi.js',
                         'ga' => 'resources/src/mediawiki.language/languages/ga.js',
-                       'he' => 'resources/src/mediawiki.language/languages/he.js',
                         'hsb' => 'resources/src/mediawiki.language/languages/hsb.js',
                         'hu' => 'resources/src/mediawiki.language/languages/hu.js',
                         'hy' => 'resources/src/mediawiki.language/languages/hy.js',
                         'la' => 'resources/src/mediawiki.language/languages/la.js',
                         'os' => 'resources/src/mediawiki.language/languages/os.js',
-                       'ru' => 'resources/src/mediawiki.language/languages/ru.js',
                         'sl' => 'resources/src/mediawiki.language/languages/sl.js',
                         'uk' => 'resources/src/mediawiki.language/languages/uk.js',
                 ],
diff --git a/resources/src/mediawiki.language/languages/he.js b/resources/src/mediawiki.language/languages/he.js

deleted file mode 100644 (file)

index 5bf8c4d..0000000
--- a/resources/src/mediawiki.language/languages/he.js
+++ /dev/null
@@ -1,29 +0,0 @@
-/*!
- * Hebrew (עברית) language functions
- */
-
-mediaWiki.language.convertGrammar = function ( word, form ) {
-       var grammarForms = mediaWiki.language.getData( 'he', 'grammarForms' );
-       if ( grammarForms && grammarForms[ form ] ) {
-               return grammarForms[ form ][ word ];
-       }
-       switch ( form ) {
-               case 'prefixed':
-               case 'תחילית': // the same word in Hebrew
-                       // Duplicate prefixed "Waw", but only if it's not already double
-                       if ( word.slice( 0, 1 ) === 'ו' && word.slice( 0, 2 ) !== 'וו' ) {
-                               word = 'ו' + word;
-                       }
-
-                       // Remove the "He" if prefixed
-                       if ( word.slice( 0, 1 ) === 'ה' ) {
-                               word = word.slice( 1 );
-                       }
-
-                       // Add a hyphen (maqaf) before numbers and non-Hebrew letters
-                       if ( word.slice( 0, 1 ) < 'א' || word.slice( 0, 1 ) > 'ת' ) {
-                               word = '־' + word;
-                       }
-       }
-       return word;
-};
diff --git a/resources/src/mediawiki.language/languages/ru.js b/resources/src/mediawiki.language/languages/ru.js

deleted file mode 100644 (file)

index 09d7c0b..0000000
--- a/resources/src/mediawiki.language/languages/ru.js
+++ /dev/null
@@ -1,38 +0,0 @@
-/*!
- * Russian (Русский) language functions
- */
-
-mediaWiki.language.convertGrammar = function ( word, form ) {
-       'use strict';
-
-       var forms, transformations, i, rule, sourcePattern, regexp, replacement;
-
-       forms = mediaWiki.language.getData( 'ru', 'grammarForms' );
-       if ( forms && forms[ form ] ) {
-               return forms[ form ][ word ];
-       }
-
-       transformations = mediaWiki.language.getData( 'ru', 'grammarTransformations' );
-
-       if ( !transformations[ form ] ) {
-               return word;
-       }
-
-       for ( i = 0; i < transformations[ form ].length; i++ ) {
-               rule = transformations[ form ][ i ];
-               sourcePattern = rule[ 0 ];
-
-               if ( sourcePattern === '@metadata' ) {
-                       continue;
-               }
-
-               regexp = new RegExp( sourcePattern );
-               replacement = rule[ 1 ];
-
-               if ( word.match( regexp ) ) {
-                       return word.replace( regexp, replacement );
-               }
-       }
-
-       return word;
-};
diff --git a/resources/src/mediawiki.language/mediawiki.language.js b/resources/src/mediawiki.language/mediawiki.language.js

index fc2af3d..3726a68 100644 (file)
--- a/resources/src/mediawiki.language/mediawiki.language.js
+++ b/resources/src/mediawiki.language/mediawiki.language.js
@@ -109,7 +109,7 @@
  
                 /**
                  * Grammatical transformations, needed for inflected languages.
-                * Invoked by putting `{{grammar:form|word}}` in a message.
+                * Invoked by putting `{{grammar:case|word}}` in a message.
                  *
                  * The rules can be defined in $wgGrammarForms global or computed
                  * dynamically by overriding this method per language.
@@ -119,10 +119,47 @@
                  * @return {string}
                  */
                 convertGrammar: function ( word, form ) {
-                       var grammarForms = mw.language.getData( mw.config.get( 'wgUserLanguage' ), 'grammarForms' );
-                       if ( grammarForms && grammarForms[ form ] ) {
-                               return grammarForms[ form ][ word ] || word;
+                       var userLanguage, forms, transformations,
+                               patterns, i, rule, sourcePattern, regexp, replacement;
+
+                       userLanguage = mw.config.get( 'wgUserLanguage' );
+
+                       forms = mw.language.getData( userLanguage, 'grammarForms' );
+                       if ( forms && forms[ form ] ) {
+                               return forms[ form ][ word ];
+                       }
+
+                       transformations = mediaWiki.language.getData( userLanguage, 'grammarTransformations' );
+
+                       if ( !( transformations && transformations[ form ] ) ) {
+                               return word;
+                       }
+
+                       patterns = transformations[ form ];
+
+                       // Some names of grammar rules are aliases for other rules.
+                       // In such cases the value is a string rather than object,
+                       // so load the actual rules.
+                       if ( typeof patterns === 'string' ) {
+                               patterns = transformations[ patterns ];
                         }
+
+                       for ( i = 0; i < patterns.length; i++ ) {
+                               rule = patterns[ i ];
+                               sourcePattern = rule[ 0 ];
+
+                               if ( sourcePattern === '@metadata' ) {
+                                       continue;
+                               }
+
+                               regexp = new RegExp( sourcePattern );
+                               replacement = rule[ 1 ];
+
+                               if ( word.match( regexp ) ) {
+                                       return word.replace( regexp, replacement );
+                               }
+                       }
+
                         return word;
                 },
  
diff --git a/tests/phpunit/languages/classes/LanguageHeTest.php b/tests/phpunit/languages/classes/LanguageHeTest.php

index 771cda5..c1b774a 100644 (file)
--- a/tests/phpunit/languages/classes/LanguageHeTest.php
+++ b/tests/phpunit/languages/classes/LanguageHeTest.php
@@ -5,7 +5,7 @@
   * @file
   */
  
-/** Tests for MediaWiki languages/classes/LanguageHe.php */
+/** Tests for MediaWiki Hebrew grammar transformation handling */
  class LanguageHeTest extends LanguageClassesTestCase {
         /**
          * The most common usage for the plural forms is two forms,
author	Amir E. Aharoni <amir.aharoni@mail.huji.ac.il>
	Mon, 28 Sep 2015 10:26:08 +0000 (13:26 +0300)
committer	Amire80 <amir.aharoni@mail.huji.ac.il>
	Fri, 16 Dec 2016 13:52:14 +0000 (15:52 +0200)
autoload.php		patch | blob | history
languages/Language.php		patch | blob | history
languages/classes/LanguageHe.php	[deleted file]	patch | blob | history
languages/classes/LanguageRu.php		patch | blob | history
languages/data/grammarTransformations/he.json	[new file with mode: 0644]	patch | blob
resources/Resources.php		patch | blob | history
resources/src/mediawiki.language/languages/he.js	[deleted file]	patch | blob | history
resources/src/mediawiki.language/languages/ru.js	[deleted file]	patch | blob | history
resources/src/mediawiki.language/mediawiki.language.js		patch | blob | history
tests/phpunit/languages/classes/LanguageHeTest.php		patch | blob | history