From: Giuseppe Lavagetto Date: Tue, 9 Apr 2019 17:02:03 +0000 (+0200) Subject: Add ability to override mb_strtoupper in Language::ucfirst X-Git-Tag: 1.34.0-rc.0~1941^2 X-Git-Url: https://git.heureux-cyclage.org/?p=lhc%2Fweb%2Fwiklou.git;a=commitdiff_plain;h=d46835ef4f877b03a9d48aa392dc23ae37042756 Add ability to override mb_strtoupper in Language::ucfirst Different PHP versions treat unicode differently, and specifically some wiki resources become unreachable if mb_strtoupper's behavior has changed. This patch allows to introduce an override table that allows to smooth the transition period. It also provides maintenance scripts to generate such an override table. Bug: T219279 Change-Id: I0503ff4207fded4648c58c7b50e67c55422a4849 --- diff --git a/autoload.php b/autoload.php index ab36d84157..4f41c8af91 100644 --- a/autoload.php +++ b/autoload.php @@ -564,6 +564,8 @@ $wgAutoloadLocalClasses = [ 'GenerateNormalizerDataAr' => __DIR__ . '/maintenance/language/generateNormalizerDataAr.php', 'GenerateNormalizerDataMl' => __DIR__ . '/maintenance/language/generateNormalizerDataMl.php', 'GenerateSitemap' => __DIR__ . '/maintenance/generateSitemap.php', + 'GenerateUcfirstOverrides' => __DIR__ . '/maintenance/language/generateUcfirstOverrides.php', + 'GenerateUpperCharTable' => __DIR__ . '/maintenance/language/generateUpperCharTable.php', 'GenericArrayObject' => __DIR__ . '/includes/libs/GenericArrayObject.php', 'GenericParameterJob' => __DIR__ . '/includes/jobqueue/GenericParameterJob.php', 'GetConfiguration' => __DIR__ . '/maintenance/getConfiguration.php', diff --git a/includes/DefaultSettings.php b/includes/DefaultSettings.php index 4547009b5c..7f841d2fe6 100644 --- a/includes/DefaultSettings.php +++ b/includes/DefaultSettings.php @@ -3194,6 +3194,19 @@ $wgLocaltimezone = null; */ $wgLocalTZoffset = null; +/** + * List of Unicode characters for which capitalization is overridden in + * Language::ucfirst. 
The characters should be
+ * represented as char_to_convert => conversion_override. See T219279 for details
+ * on why this is useful during php version transitions.
+ *
+ * @warning: EXPERIMENTAL!
+ *
+ * @since 1.34
+ * @var array
+ */
+$wgOverrideUcfirstCharacters = [];
+
 /** @} */ # End of language/charset settings
 
 /*************************************************************************//**
diff --git a/languages/Language.php b/languages/Language.php
index a9bbc20c1d..1b5580cada 100644
--- a/languages/Language.php
+++ b/languages/Language.php
@@ -2713,7 +2713,7 @@ class Language {
 	public function uc( $str, $first = false ) {
 		if ( $first ) {
 			if ( $this->isMultibyte( $str ) ) {
-				return mb_strtoupper( mb_substr( $str, 0, 1 ) ) . mb_substr( $str, 1 );
+				return $this->mbUpperChar( mb_substr( $str, 0, 1 ) ) . mb_substr( $str, 1 );
 			} else {
 				return ucfirst( $str );
 			}
@@ -2722,6 +2722,31 @@ class Language {
 		}
 	}
 
+	/**
+	 * Uppercase a single character, giving $wgOverrideUcfirstCharacters
+	 * precedence over mb_strtoupper().
+	 *
+	 * The result of mb_strtoupper() for some characters differs between PHP
+	 * versions. When that happens, resources can become unreachable on-wiki
+	 * (see discussion at T219279). A conversion table makes it possible to
+	 * manage the transition period between such versions.
+	 *
+	 * @since 1.34
+	 *
+	 * @param string $char Character to convert
+	 *
+	 * @return string The entry of $wgOverrideUcfirstCharacters for $char when
+	 *  one exists, otherwise the result of mb_strtoupper( $char )
+	 */
+	protected function mbUpperChar( $char ) {
+		global $wgOverrideUcfirstCharacters;
+
+		// array_key_exists() rather than isset(): an entry wins even if its value is null
+		return array_key_exists( $char, $wgOverrideUcfirstCharacters )
+			? $wgOverrideUcfirstCharacters[$char]
+			: mb_strtoupper( $char );
+	}
+
 	/**
 	 * @param string $str
 	 * @return mixed|string
diff --git a/maintenance/language/generateUcfirstOverrides.php b/maintenance/language/generateUcfirstOverrides.php
new file mode 100644
index 0000000000..c1e93f4cde
--- /dev/null
+++ b/maintenance/language/generateUcfirstOverrides.php
@@ -0,0 +1,83 @@
+<?php
+/**
+ * Generate a php file containing an array of
+ *   utf8_lowercase => utf8_uppercase
+ * overrides.
Takes two json files generated with generateUpperCharTable.php + * as input. + * + * Example run: + * # this will prepare a file to use to make hhvm's Language::ucfirst work like php7's + * + * $ php7.2 maintenance/language/generateUpperCharTable.php --outfile php7.2.json + * $ hhvm --php maintenance/language/generateUpperCharTable.php --outfile hhvm.json + * $ hhvm maintenance/language/generateUcfirstOverrides.php \ + * --override hhvm.json --with php7.2.json --outfile test.php + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * http://www.gnu.org/copyleft/gpl.html + * + * @file + * @ingroup MaintenanceLanguage + */ + +require_once __DIR__ . 
'/../Maintenance.php';
+
+/**
+ * Maintenance script that compares two lowercase => uppercase tables written
+ * by generateUpperCharTable.php and writes the entries that differ to a php
+ * source file.
+ *
+ * @ingroup MaintenanceLanguage
+ */
+class GenerateUcfirstOverrides extends Maintenance {
+
+	/**
+	 * Declare the description and the three required options of the script.
+	 */
+	public function __construct() {
+		parent::__construct();
+		$this->addDescription(
+			'Generates a php source file containing a definition for mb_strtoupper overrides' );
+		$this->addOption( 'outfile', 'Output file', true, true, 'o' );
+		$this->addOption( 'override', 'Char table we want to override', true, true );
+		$this->addOption( 'with', 'Char table we want to obtain', true, true );
+	}
+
+	/**
+	 * Collect every character whose uppercase form differs between the
+	 * --override table and the --with table, and write the result to --outfile.
+	 *
+	 * Characters missing from the --with table are skipped.
+	 */
+	public function execute() {
+		$outfile = $this->getOption( 'outfile' );
+		$from = $this->loadJson( $this->getOption( 'override' ) );
+		$to = $this->loadJson( $this->getOption( 'with' ) );
+		$overrides = [];
+
+		foreach ( $from as $lc => $uc ) {
+			$ref = $to[$lc] ?? null;
+			if ( $ref !== null && $ref !== $uc ) {
+				// Store the value we want to obtain (--with). Storing $uc would
+				// only repeat what the table being overridden already yields.
+				$overrides[$lc] = $ref;
+			}
+		}
+		// NOTE(review): StaticArrayWriter is referenced unqualified and this file
+		// has no `use` statement - confirm the class resolves from the global namespace.
+		$writer = new StaticArrayWriter();
+		file_put_contents(
+			$outfile,
+			$writer->create( $overrides, 'File created by generateUcfirstOverrides.php' )
+		);
+	}
+
+	/**
+	 * Read a json file and decode it into an associative array.
+	 *
+	 * An unreadable file or invalid json is reported through fatalError().
+	 *
+	 * @param string $filename Path of the json file to load
+	 * @return array Decoded table, char => uppercase char when the file was
+	 *  written by generateUpperCharTable.php
+	 */
+	private function loadJson( $filename ) {
+		$data = file_get_contents( $filename );
+		if ( $data === false ) {
+			$msg = sprintf( "Could not load data from file '%s'\n", $filename );
+			$this->fatalError( $msg );
+		}
+		// Decode to an associative array: execute() indexes the tables with [],
+		// which is a fatal error on the stdClass returned by default.
+		$json = json_decode( $data, true );
+		if ( $json === null ) {
+			$msg = sprintf( "Invalid json in the data file %s\n", $filename );
+			$this->fatalError( $msg, 2 );
+		}
+		return $json;
+	}
+}
+
+$maintClass = GenerateUcfirstOverrides::class;
+require_once RUN_MAINTENANCE_IF_MAIN;
diff --git a/maintenance/language/generateUpperCharTable.php b/maintenance/language/generateUpperCharTable.php
new file mode 100644
index 0000000000..b03d704551
--- /dev/null
+++ b/maintenance/language/generateUpperCharTable.php
@@ -0,0 +1,49 @@
+<?php
+/**
+ * Generate a json file containing an array of
+ *   utf8_lowercase => utf8_uppercase
+ * for all of the utf-8 range.
This provides the input for generateUcfirstOverrides.php
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ * http://www.gnu.org/copyleft/gpl.html
+ *
+ * @file
+ * @ingroup MaintenanceLanguage
+ */
+
+require_once __DIR__ . '/../Maintenance.php';
+
+/**
+ * Maintenance script that writes, as json, the result of mb_strtoupper() for
+ * every code point from 0 to 0x10ffff except the surrogates.
+ *
+ * @ingroup MaintenanceLanguage
+ */
+class GenerateUpperCharTable extends Maintenance {
+
+	/**
+	 * Declare the description and the output file option of the script.
+	 */
+	public function __construct() {
+		parent::__construct();
+		$this->addDescription( 'Generates the lowercase => uppercase json table' );
+		$this->addOption( 'outfile', 'Output file', true, true, 'o' );
+	}
+
+	/**
+	 * Build the char => mb_strtoupper( char ) table and write it to --outfile.
+	 *
+	 * A table that cannot be encoded is reported through fatalError() instead
+	 * of being written out as an empty file.
+	 */
+	public function execute() {
+		$outfile = $this->getOption( 'outfile', 'upperchar.json' );
+		$toUpperTable = [];
+		for ( $i = 0; $i <= 0x10ffff; $i++ ) {
+			// Surrogate code points (U+D800 to U+DFFF) are not valid in UTF-8, and
+			// json_encode() fails on any string that is not valid UTF-8.
+			if ( $i >= 0xd800 && $i <= 0xdfff ) {
+				continue;
+			}
+			$char = UtfNormal\Utils::codepointToUtf8( $i );
+			$upper = mb_strtoupper( $char );
+			$toUpperTable[$char] = $upper;
+		}
+		$json = json_encode( $toUpperTable );
+		if ( $json === false ) {
+			$this->fatalError( 'Could not encode the table as json: ' . json_last_error_msg() . "\n" );
+		}
+		file_put_contents( $outfile, $json );
+	}
+}
+
+$maintClass = GenerateUpperCharTable::class;
+require_once RUN_MAINTENANCE_IF_MAIN;
diff --git a/tests/phpunit/languages/LanguageTest.php b/tests/phpunit/languages/LanguageTest.php
index dca1363885..050f07db1c 100644
--- a/tests/phpunit/languages/LanguageTest.php
+++ b/tests/phpunit/languages/LanguageTest.php
@@ -1909,4 +1909,27 @@ class LanguageTest extends LanguageClassesTestCase {
 		$ar2 = new LanguageAr();
 		$this->assertTrue( $ar1->equals( $ar2 ),
'ar equals ar' );
 	}
+
+	/**
+	 * Check the result of Language::ucfirst(), optionally with
+	 * $wgOverrideUcfirstCharacters set for the duration of the test.
+	 *
+	 * @dataProvider provideUcfirst
+	 * @covers Language::ucfirst
+	 *
+	 * @param string $orig Input string
+	 * @param string $expected Expected result of ucfirst()
+	 * @param string $desc Assertion message
+	 * @param array|bool $overrides Value for $wgOverrideUcfirstCharacters;
+	 *  anything that is not an array leaves the global untouched
+	 */
+	public function testUcfirst( $orig, $expected, $desc, $overrides = false ) {
+		$lang = new Language();
+		if ( is_array( $overrides ) ) {
+			$this->setMwGlobals( [ 'wgOverrideUcfirstCharacters' => $overrides ] );
+		}
+		// assertSame() takes ( expected, actual ); the reverse order mislabels
+		// the two values in failure output.
+		$this->assertSame( $expected, $lang->ucfirst( $orig ), $desc );
+	}
+
+	/**
+	 * Data sets for testUcfirst: input, expected output, message, overrides.
+	 *
+	 * @return array[]
+	 */
+	public static function provideUcfirst() {
+		return [
+			[ 'alice', 'Alice', 'simple ASCII string', false ],
+			[ 'århus', 'Århus', 'unicode string', false ],
+			// overrides do not affect ASCII characters
+			[ 'foo', 'Foo', 'ASCII is not overriden', [ 'f' => 'b' ] ],
+			// but they do affect non-ascii ones
+			[ 'èl', 'Ll', 'Non-ASCII is overridden', [ 'è' => 'L' ] ],
+		];
+	}
 }