bugfix for wfBCP47 and code coverage
authorAntoine Musso <hashar@users.mediawiki.org>
Sun, 6 Feb 2011 14:47:35 +0000 (14:47 +0000)
committerAntoine Musso <hashar@users.mediawiki.org>
Sun, 6 Feb 2011 14:47:35 +0000 (14:47 +0000)
Language codes are case insensitive. BCP 47 recommends nice
formatting nonetheless. This patch enhances our formatting:
- tags preceded by the private tag 'x' are now lower case
- 4 letter tags are now lower case with the first letter upper cased

Please note the RFC seems to have a bug for az-Arab-x-AZE-derbend,
which should be az-Arab-x-aze-derbend. I have changed our test
to reflect this and added a comment for later reference.

includes/GlobalFunctions.php
tests/phpunit/includes/GlobalTest.php

index 9af9b92..dcc3f23 100644 (file)
@@ -3397,6 +3397,7 @@ function wfShorthandToInteger( $string = '' ) {
 
 /**
  * Get the normalised IETF language tag
+ * See unit test for examples.
  * @param $code String: The language code.
  * @return $langCode String: The language code which complying with BCP 47 standards.
  */
@@ -3404,12 +3405,15 @@ function wfBCP47( $code ) {
        $codeSegment = explode( '-', $code );
        foreach ( $codeSegment as $segNo => $seg ) {
                if ( count( $codeSegment ) > 0 ) {
+                       // when previous segment is x, it is a private segment and should be lc 
+                       if( $segNo > 0 && strtolower( $codeSegment[($segNo - 1)] ) == 'x') {
+                               $codeBCP[$segNo] = strtolower( $seg );
                        // ISO 3166 country code
-                       if ( ( strlen( $seg ) == 2 ) && ( $segNo > 0 ) ) {
+                       } elseif ( ( strlen( $seg ) == 2 ) && ( $segNo > 0 ) ) {
                                $codeBCP[$segNo] = strtoupper( $seg );
                        // ISO 15924 script code
                        } elseif ( ( strlen( $seg ) == 4 ) && ( $segNo > 0 ) ) {
-                               $codeBCP[$segNo] = ucfirst( $seg );
+                               $codeBCP[$segNo] = ucfirst( strtolower( $seg ) );
                        // Use lowercase for other cases
                        } else {
                                $codeBCP[$segNo] = strtolower( $seg );
index 1a4918e..a7ba796 100644 (file)
@@ -632,6 +632,133 @@ class GlobalTest extends MediaWikiTestCase {
        
        }
 
+       /**
+        * test @see wfBCP47().
+        * Please note that the BCP explicitly states that language codes are
+        * case insensitive, though there are some exceptions to the rule :)
+        * This test is used to verify our formatting against all lower case
+        * and all upper case language codes.
+        *
+        * @see http://tools.ietf.org/html/bcp47
+        * @dataProvider provideLanguageCodes()
+        */
+       function testBCP47( $code, $expected ) {
+               $code = strtolower( $code );
+               $this->assertEquals( $expected, wfBCP47($code),
+                       "Applying BCP47 standard to lower case '$code'"
+               );
+
+               $code = strtoupper( $code );
+               $this->assertEquals( $expected, wfBCP47($code),
+                       "Applying BCP47 standard to upper case '$code'"
+               );
+       }
+
+       /**
+        * Array format is ($code, $expected)
+        */
+       function provideLanguageCodes() {
+               return array(
+                       // Extracted from BCP47 (list not exhaustive)
+                       # 2.1.1
+                       array( 'en-ca-x-ca'    , 'en-CA-x-ca'     ),
+                       array( 'sgn-be-fr'     , 'sgn-BE-FR'      ),
+                       array( 'az-latn-x-latn', 'az-Latn-x-latn' ),
+                       # 2.2
+                       array( 'sr-Latn-RS', 'sr-Latn-RS' ),
+                       array( 'az-arab-ir', 'az-Arab-IR' ),
+
+                       # 2.2.5
+                       array( 'sl-nedis'  , 'sl-nedis'   ),
+                       array( 'de-ch-1996', 'de-CH-1996' ),
+
+                       # 2.2.6
+                       array(
+                               'en-latn-gb-boont-r-extended-sequence-x-private',
+                               'en-Latn-GB-boont-r-extended-sequence-x-private'
+                       ),
+
+                       // Examples from BCP47 Appendix A
+                       # Simple language subtag:
+                       array( 'DE', 'de' ),
+                       array( 'fR', 'fr' ),
+                       array( 'ja', 'ja' ),
+
+                       # Language subtag plus script subtag:
+                       array( 'zh-hans', 'zh-Hans'),
+                       array( 'sr-cyrl', 'sr-Cyrl'),
+                       array( 'sr-latn', 'sr-Latn'),
+
+                       # Extended language subtags and their primary language subtag
+                       # counterparts:
+                       array( 'zh-cmn-hans-cn', 'zh-cmn-Hans-CN' ),
+                       array( 'cmn-hans-cn'   , 'cmn-Hans-CN'    ),
+                       array( 'zh-yue-hk'     , 'zh-yue-HK'      ),
+                       array( 'yue-hk'        , 'yue-HK'         ),
+
+                       # Language-Script-Region:
+                       array( 'zh-hans-cn', 'zh-Hans-CN' ),
+                       array( 'sr-latn-RS', 'sr-Latn-RS' ),
+
+                       # Language-Variant:
+                       array( 'sl-rozaj'      , 'sl-rozaj'       ),
+                       array( 'sl-rozaj-biske', 'sl-rozaj-biske' ),
+                       array( 'sl-nedis'      , 'sl-nedis'       ),
+
+                       # Language-Region-Variant:
+                       array( 'de-ch-1901'  , 'de-CH-1901'  ),
+                       array( 'sl-it-nedis' , 'sl-IT-nedis' ),
+
+                       # Language-Script-Region-Variant:
+                       array( 'hy-latn-it-arevela', 'hy-Latn-IT-arevela' ),
+
+                       # Language-Region:
+                       array( 'de-de' , 'de-DE' ),
+                       array( 'en-us' , 'en-US' ),
+                       array( 'es-419', 'es-419'),
+
+                       # Private use subtags:
+                       array( 'de-ch-x-phonebk'      , 'de-CH-x-phonebk' ),
+                       array( 'az-arab-x-aze-derbend', 'az-Arab-x-aze-derbend' ),
+                       /**
+                        * The test above deliberately differs from the BCP, whose example is:
+                        *  az-Arab-x-AZE-derbend
+                        * AZE being private, it should be lower case, hence the test above.
+                        * To follow the BCP example literally, the test would be:
+                       #array( 'az-arab-x-aze-derbend', 'az-Arab-x-AZE-derbend' ),
+                        */
+
+                       # Private use registry values:
+                       array( 'x-whatever', 'x-whatever' ),
+                       array( 'qaa-qaaa-qm-x-southern', 'qaa-Qaaa-QM-x-southern' ),
+                       array( 'de-qaaa'   , 'de-Qaaa'    ),
+                       array( 'sr-latn-qm', 'sr-Latn-QM' ),
+                       array( 'sr-qaaa-rs', 'sr-Qaaa-RS' ),
+
+                       # Tags that use extensions
+                       array( 'en-us-u-islamcal', 'en-US-u-islamcal' ),
+                       array( 'zh-cn-a-myext-x-private', 'zh-CN-a-myext-x-private' ),
+                       array( 'en-a-myext-b-another', 'en-a-myext-b-another' ),
+
+                       # Invalid:
+                       // de-419-DE
+                       // a-DE
+                       // ar-a-aaa-b-bbb-a-ccc
+       
+               /*      
+                       // ISO 15924 :
+                       array( 'sr-Cyrl', 'sr-Cyrl' ),
+                       array( 'SR-lATN', 'sr-Latn' ), # FIXME fix our function?
+                       array( 'fr-latn', 'fr-Latn' ),
+                       // Use lowercase for single segment
+                       // ISO 3166-1-alpha-2 code
+                       array( 'US', 'us' ),  # USA
+                       array( 'uS', 'us' ),  # USA
+                       array( 'Fr', 'fr' ),  # France
+                       array( 'va', 'va' ),  # Holy See (Vatican City State)
+                */);
+       }
+
        /* TODO: many more! */
 }