From 76ca6c9b184da0c9a5f9c9e701e6257dbbdb077d Mon Sep 17 00:00:00 2001 From: Tim Starling Date: Wed, 10 Apr 2019 15:33:57 +1000 Subject: [PATCH] Rehabilitate DateFormatter MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit This code is surprisingly little changed since I added the class in November 2003, and needs some modernisation. * Remove the "linked" option, unused since 1.21. Similarly, make the "match-whole" option implied. This allows the regexes to be simplified. Nothing will be broken, according to CodeSearch. * Instead of ucfirst(), use the canonical month name from the language. This will work with e.g. French which does not capitalise month names. * Stop caching DateFormatter instances in APC. Caching was added in 2005 when initialisation was being done on every request, but now it is only needed when parsing a page with {{#formatdate}}, which is rare, and the constructor overhead is only 200µs after Language object data initialisation. Instead, use an in-process cache via a factory service. * Add docs and extra tests. * Remove todo note obsolete since 38 minutes after the original commit. * Rename many variables. * Use double-slash comments * Don't store the Language object, just get arrays. * Use mb_strtolower() instead of Language::lc() -- any customisation of Language::lc() would break PCRE case-insensitive matching. * Use named subpatterns instead of "keys" * Remove the ISO1/ISO2 distinction, the only difference was linking. 
* Use closure variables instead of temporary object members Change-Id: I25fb1203dba2930724d7bc28ad0d51f59f88e1ea --- autoload.php | 1 + includes/MediaWikiServices.php | 9 + includes/ServiceWiring.php | 4 + includes/parser/CoreParserFunctions.php | 2 +- includes/parser/DateFormatter.php | 441 ++++++++++------------- includes/parser/DateFormatterFactory.php | 18 + tests/parser/parserTests.txt | 43 +++ 7 files changed, 268 insertions(+), 250 deletions(-) create mode 100644 includes/parser/DateFormatterFactory.php diff --git a/autoload.php b/autoload.php index 5fda2171eb..e713017200 100644 --- a/autoload.php +++ b/autoload.php @@ -364,6 +364,7 @@ $wgAutoloadLocalClasses = [ 'DatabaseUpdater' => __DIR__ . '/includes/installer/DatabaseUpdater.php', 'DateFormats' => __DIR__ . '/maintenance/language/date-formats.php', 'DateFormatter' => __DIR__ . '/includes/parser/DateFormatter.php', + 'DateFormatterFactory' => __DIR__ . '/includes/parser/DateFormatterFactory.php', 'DeadendPagesPage' => __DIR__ . '/includes/specials/SpecialDeadendpages.php', 'DeduplicateArchiveRevId' => __DIR__ . '/maintenance/deduplicateArchiveRevId.php', 'DeferrableCallback' => __DIR__ . '/includes/deferred/DeferrableCallback.php', diff --git a/includes/MediaWikiServices.php b/includes/MediaWikiServices.php index 8c60dc7e52..22f0c63498 100644 --- a/includes/MediaWikiServices.php +++ b/includes/MediaWikiServices.php @@ -7,6 +7,7 @@ use Config; use ConfigFactory; use CryptHKDF; use CryptRand; +use DateFormatterFactory; use EventRelayerGroup; use GenderCache; use GlobalVarConfig; @@ -526,6 +527,14 @@ class MediaWikiServices extends ServiceContainer { return $this->getService( 'CryptRand' ); } + /** + * @since 1.33 + * @return DateFormatterFactory + */ + public function getDateFormatterFactory() { + return $this->getService( 'DateFormatterFactory' ); + } + /** * @since 1.28 * @return LoadBalancer The main DB load balancer for the local wiki. 
diff --git a/includes/ServiceWiring.php b/includes/ServiceWiring.php index 722bac1cb5..8bb8e25db9 100644 --- a/includes/ServiceWiring.php +++ b/includes/ServiceWiring.php @@ -138,6 +138,10 @@ return [ return new CryptRand(); }, + 'DateFormatterFactory' => function () : DateFormatterFactory { + return new DateFormatterFactory; + }, + 'DBLoadBalancer' => function ( MediaWikiServices $services ) : Wikimedia\Rdbms\LoadBalancer { // just return the default LB from the DBLoadBalancerFactory service return $services->getDBLoadBalancerFactory()->getMainLB(); diff --git a/includes/parser/CoreParserFunctions.php b/includes/parser/CoreParserFunctions.php index d1d1a9c442..b2b7486918 100644 --- a/includes/parser/CoreParserFunctions.php +++ b/includes/parser/CoreParserFunctions.php @@ -113,7 +113,7 @@ class CoreParserFunctions { */ public static function formatDate( $parser, $date, $defaultPref = null ) { $lang = $parser->getFunctionLang(); - $df = DateFormatter::getInstance( $lang ); + $df = MediaWikiServices::getInstance()->getDateFormatterFactory()->get( $lang ); $date = trim( $date ); diff --git a/includes/parser/DateFormatter.php b/includes/parser/DateFormatter.php index c9bbc43a98..b0c41d91c5 100644 --- a/includes/parser/DateFormatter.php +++ b/includes/parser/DateFormatter.php @@ -24,322 +24,266 @@ use MediaWiki\MediaWikiServices; /** - * Date formatter, recognises dates in plain text and formats them according to user preferences. - * @todo preferences, OutputPage + * Date formatter. Recognises dates and formats them according to a specified preference. + * + * This class was originally introduced to detect and transform dates in free text. It is now + * only used by the {{#dateformat}} parser function. This is a very rudimentary date formatter; + * Language::sprintfDate() has many more features and is the correct choice for most new code. + * The main advantage of this date formatter is that it is able to format incomplete dates with an + * unspecified year. 
+ * @ingroup Parser */ class DateFormatter { - private $mSource, $mTarget; - private $monthNames = ''; - + /** @var string[] Date format regexes indexed by the class constants */ private $regexes; - private $rules, $xMonths, $preferences; - private $lang, $mLinked; + /** + * @var int[][] Array of special rules. The first key is the preference ID + * (one of the class constants), the second key is the detected source + * format, and the value is the ID of the target format that will be used + * in that case. + */ + private $rules = []; - /** @var string[] */ - private $keys; + /** + * @var int[] Month numbers by lowercase name + */ + private $xMonths = []; - /** @var string[] */ - private $targets; + /** + * @var string[] Month names by number + */ + private $monthNames = []; + /** + * @var int[] A map of descriptive preference text to internal format ID + */ + private $preferenceIDs; + + /** @var string[] Format strings similar to those used by date(), indexed by ID */ + private $targetFormats; + + /** Used as a preference ID for rules that apply regardless of preference */ const ALL = -1; + + /** No preference: the date may be left in the same format as the input */ const NONE = 0; + + /** e.g. January 15, 2001 */ const MDY = 1; + + /** e.g. 15 January 2001 */ const DMY = 2; + + /** e.g. 2001 January 15 */ const YMD = 3; - const ISO1 = 4; + + /** e.g. 2001-01-15 */ + const ISO = 4; + + /** The highest ID that is a valid user preference */ const LASTPREF = 4; - const ISO2 = 5; - const YDM = 6; - const DM = 7; - const MD = 8; - const LAST = 8; + + /** e.g. 2001, 15 January */ + const YDM = 5; + + /** e.g. 15 January */ + const DM = 6; + + /** e.g. 
January 15 */ + const MD = 7; + + /** The highest ID that is a valid target format */ + const LAST = 7; /** * @param Language $lang In which language to format the date */ public function __construct( Language $lang ) { - $this->lang = $lang; - - $this->monthNames = $this->getMonthRegex(); + $monthRegexParts = []; for ( $i = 1; $i <= 12; $i++ ) { - $this->xMonths[$this->lang->lc( $this->lang->getMonthName( $i ) )] = $i; - $this->xMonths[$this->lang->lc( $this->lang->getMonthAbbreviation( $i ) )] = $i; + $monthName = $lang->getMonthName( $i ); + $monthAbbrev = $lang->getMonthAbbreviation( $i ); + $this->monthNames[$i] = $monthName; + $monthRegexParts[] = preg_quote( $monthName, '/' ); + $monthRegexParts[] = preg_quote( $monthAbbrev, '/' ); + $this->xMonths[mb_strtolower( $monthName )] = $i; + $this->xMonths[mb_strtolower( $monthAbbrev )] = $i; } - $this->regexTrail = '(?![a-z])/iu'; - - # Partial regular expressions - $this->prxDM = '\[\[(\d{1,2})[ _](' . $this->monthNames . ')\]\]'; - $this->prxMD = '\[\[(' . $this->monthNames . 
')[ _](\d{1,2})\]\]'; - $this->prxY = '\[\[(\d{1,4}([ _]BC|))\]\]'; - $this->prxISO1 = '\[\[(-?\d{4})]]-\[\[(\d{2})-(\d{2})\]\]'; - $this->prxISO2 = '\[\[(-?\d{4})-(\d{2})-(\d{2})\]\]'; - - # Real regular expressions - $this->regexes[self::DMY] = "/{$this->prxDM}(?: *, *| +){$this->prxY}{$this->regexTrail}"; - $this->regexes[self::YDM] = "/{$this->prxY}(?: *, *| +){$this->prxDM}{$this->regexTrail}"; - $this->regexes[self::MDY] = "/{$this->prxMD}(?: *, *| +){$this->prxY}{$this->regexTrail}"; - $this->regexes[self::YMD] = "/{$this->prxY}(?: *, *| +){$this->prxMD}{$this->regexTrail}"; - $this->regexes[self::DM] = "/{$this->prxDM}{$this->regexTrail}"; - $this->regexes[self::MD] = "/{$this->prxMD}{$this->regexTrail}"; - $this->regexes[self::ISO1] = "/{$this->prxISO1}{$this->regexTrail}"; - $this->regexes[self::ISO2] = "/{$this->prxISO2}{$this->regexTrail}"; - - # Extraction keys - # See the comments in replace() for the meaning of the letters - $this->keys[self::DMY] = 'jFY'; - $this->keys[self::YDM] = 'Y jF'; - $this->keys[self::MDY] = 'FjY'; - $this->keys[self::YMD] = 'Y Fj'; - $this->keys[self::DM] = 'jF'; - $this->keys[self::MD] = 'Fj'; - $this->keys[self::ISO1] = 'ymd'; # y means ISO year - $this->keys[self::ISO2] = 'ymd'; - - # Target date formats - $this->targets[self::DMY] = '[[F j|j F]] [[Y]]'; - $this->targets[self::YDM] = '[[Y]], [[F j|j F]]'; - $this->targets[self::MDY] = '[[F j]], [[Y]]'; - $this->targets[self::YMD] = '[[Y]] [[F j]]'; - $this->targets[self::DM] = '[[F j|j F]]'; - $this->targets[self::MD] = '[[F j]]'; - $this->targets[self::ISO1] = '[[Y|y]]-[[F j|m-d]]'; - $this->targets[self::ISO2] = '[[y-m-d]]'; - - # Rules - # pref source target + // Partial regular expressions + $monthNames = implode( '|', $monthRegexParts ); + $dm = "(?<day>\d{1,2})[ _](?<monthName>{$monthNames})"; + $md = "(?<monthName>{$monthNames})[ _](?<day>\d{1,2})"; + $y = '(?<year>\d{1,4}([ _]BC|))'; + $iso = '(?<isoYear>-?\d{4})-(?<isoMonth>\d{2})-(?<isoDay>\d{2})'; + + $this->regexes = [ + self::DMY => "/^{$dm}(?: *, *| +){$y}$/iu", 
self::YDM => "/^{$y}(?: *, *| +){$dm}$/iu", + self::MDY => "/^{$md}(?: *, *| +){$y}$/iu", + self::YMD => "/^{$y}(?: *, *| +){$md}$/iu", + self::DM => "/^{$dm}$/iu", + self::MD => "/^{$md}$/iu", + self::ISO => "/^{$iso}$/iu", + ]; + + // Target date formats + $this->targetFormats = [ + self::DMY => 'j F Y', + self::YDM => 'Y, j F', + self::MDY => 'F j, Y', + self::YMD => 'Y F j', + self::DM => 'j F', + self::MD => 'F j', + self::ISO => 'y-m-d', + ]; + + // Rules + // pref source target $this->rules[self::DMY][self::MD] = self::DM; $this->rules[self::ALL][self::MD] = self::MD; $this->rules[self::MDY][self::DM] = self::MD; $this->rules[self::ALL][self::DM] = self::DM; - $this->rules[self::NONE][self::ISO2] = self::ISO1; + $this->rules[self::NONE][self::ISO] = self::ISO; - $this->preferences = [ + $this->preferenceIDs = [ 'default' => self::NONE, 'dmy' => self::DMY, 'mdy' => self::MDY, 'ymd' => self::YMD, - 'ISO 8601' => self::ISO1, + 'ISO 8601' => self::ISO, ]; } /** * Get a DateFormatter object * + * @deprecated since 1.33 use MediaWikiServices::getDateFormatterFactory() + * * @param Language|null $lang In which language to format the date * Defaults to the site content language * @return DateFormatter */ public static function getInstance( Language $lang = null ) { - global $wgMainCacheType; - $lang = $lang ?? MediaWikiServices::getInstance()->getContentLanguage(); - $cache = ObjectCache::getLocalServerInstance( $wgMainCacheType ); - - static $dateFormatter = false; - if ( !$dateFormatter ) { - $dateFormatter = $cache->getWithSetCallback( - $cache->makeKey( 'dateformatter', $lang->getCode() ), - $cache::TTL_HOUR, - function () use ( $lang ) { - return new DateFormatter( $lang ); - } - ); - } - - return $dateFormatter; + return MediaWikiServices::getInstance()->getDateFormatterFactory()->get( $lang ); } /** - * @param string $preference User preference + * @param string $preference User preference, must be one of "default", + * "dmy", "mdy", "ymd" or "ISO 8601". 
* @param string $text Text to reformat - * @param array $options Array can contain 'linked' and/or 'match-whole' + * @param array $options Ignored. Since 1.33, 'match-whole' is implied, and + * 'linked' has been removed. * * @return string */ - public function reformat( $preference, $text, $options = [ 'linked' ] ) { - $linked = in_array( 'linked', $options ); - $match_whole = in_array( 'match-whole', $options ); - - if ( isset( $this->preferences[$preference] ) ) { - $preference = $this->preferences[$preference]; + public function reformat( $preference, $text, $options = [] ) { + if ( isset( $this->preferenceIDs[$preference] ) ) { + $preference = $this->preferenceIDs[$preference]; } else { $preference = self::NONE; } - for ( $i = 1; $i <= self::LAST; $i++ ) { - $this->mSource = $i; - if ( isset( $this->rules[$preference][$i] ) ) { + for ( $source = 1; $source <= self::LAST; $source++ ) { + if ( isset( $this->rules[$preference][$source] ) ) { # Specific rules - $this->mTarget = $this->rules[$preference][$i]; - } elseif ( isset( $this->rules[self::ALL][$i] ) ) { + $target = $this->rules[$preference][$source]; + } elseif ( isset( $this->rules[self::ALL][$source] ) ) { # General rules - $this->mTarget = $this->rules[self::ALL][$i]; + $target = $this->rules[self::ALL][$source]; } elseif ( $preference ) { # User preference - $this->mTarget = $preference; + $target = $preference; } else { # Default - $this->mTarget = $i; + $target = $source; } - $regex = $this->regexes[$i]; + $regex = $this->regexes[$source]; - // Horrible hack - if ( !$linked ) { - $regex = str_replace( [ '\[\[', '\]\]' ], '', $regex ); - } - - if ( $match_whole ) { - // Let's hope this works - $regex = preg_replace( '!^/!', '/^', $regex ); - $regex = str_replace( $this->regexTrail, - '$' . 
$this->regexTrail, $regex ); - } + $text = preg_replace_callback( $regex, + function ( $match ) use ( $target ) { + $format = $this->targetFormats[$target]; - // Another horrible hack - $this->mLinked = $linked; - $text = preg_replace_callback( $regex, [ $this, 'replace' ], $text ); - unset( $this->mLinked ); - } - return $text; - } + $text = ''; - /** - * Regexp replacement callback - * - * @param array $matches - * @return string - */ - private function replace( $matches ) { - # Extract information from $matches - $linked = $this->mLinked ?? true; - - $bits = []; - $key = $this->keys[$this->mSource]; - $keyLength = strlen( $key ); - for ( $p = 0; $p < $keyLength; $p++ ) { - if ( $key[$p] != ' ' ) { - $bits[$key[$p]] = $matches[$p + 1]; - } - } - - return $this->formatDate( $bits, $matches[0], $linked ); - } - - /** - * @param array $bits - * @param string $orig Original input string, to be returned - * on formatting failure. - * @param bool $link - * @return string - */ - private function formatDate( $bits, $orig, $link = true ) { - $format = $this->targets[$this->mTarget]; - - if ( !$link ) { - // strip piped links - $format = preg_replace( '/\[\[[^|]+\|([^\]]+)\]\]/', '$1', $format ); - // strip remaining links - $format = str_replace( [ '[[', ']]' ], '', $format ); - } - - # Construct new date - $text = ''; - $fail = false; - - // Pre-generate y/Y stuff because we need the year for the title. 
- if ( !isset( $bits['y'] ) && isset( $bits['Y'] ) ) { - $bits['y'] = $this->makeIsoYear( $bits['Y'] ); - } - if ( !isset( $bits['Y'] ) && isset( $bits['y'] ) ) { - $bits['Y'] = $this->makeNormalYear( $bits['y'] ); - } - - if ( !isset( $bits['m'] ) ) { - $m = $this->makeIsoMonth( $bits['F'] ); - if ( $m === false ) { - $fail = true; - } else { - $bits['m'] = $m; - } - } - - if ( !isset( $bits['d'] ) ) { - $bits['d'] = sprintf( '%02d', $bits['j'] ); - } - - $formatLength = strlen( $format ); - for ( $p = 0; $p < $formatLength; $p++ ) { - $char = $format[$p]; - switch ( $char ) { - case 'd': # ISO day of month - $text .= $bits['d']; - break; - case 'm': # ISO month - $text .= $bits['m']; - break; - case 'y': # ISO year - $text .= $bits['y']; - break; - case 'j': # ordinary day of month - if ( !isset( $bits['j'] ) ) { - $text .= intval( $bits['d'] ); - } else { - $text .= $bits['j']; + // Pre-generate y/Y stuff because we need the year for the title. + if ( !isset( $match['isoYear'] ) && isset( $match['year'] ) ) { + $match['isoYear'] = $this->makeIsoYear( $match['year'] ); + } + if ( !isset( $match['year'] ) && isset( $match['isoYear'] ) ) { + $match['year'] = $this->makeNormalYear( $match['isoYear'] ); } - break; - case 'F': # long month - if ( !isset( $bits['F'] ) ) { - $m = intval( $bits['m'] ); - if ( $m > 12 || $m < 1 ) { - $fail = true; + + if ( !isset( $match['isoMonth'] ) ) { + $m = $this->makeIsoMonth( $match['monthName'] ); + if ( $m === false ) { + // Fail + return $match[0]; } else { - $text .= $this->lang->getMonthName( $m ); + $match['isoMonth'] = $m; } - } else { - $text .= ucfirst( $bits['F'] ); } - break; - case 'Y': # ordinary (optional BC) year - $text .= $bits['Y']; - break; - default: - $text .= $char; - } - } - if ( $fail ) { - // This occurs when parsing a date with day or month outside the bounds - // of possibilities. 
- return $orig; - } - $isoBits = []; - if ( isset( $bits['y'] ) ) { - $isoBits[] = $bits['y']; - } - $isoBits[] = $bits['m']; - $isoBits[] = $bits['d']; - $isoDate = implode( '-', $isoBits ); + if ( !isset( $match['isoDay'] ) ) { + $match['isoDay'] = sprintf( '%02d', $match['day'] ); + } + + $formatLength = strlen( $format ); + for ( $p = 0; $p < $formatLength; $p++ ) { + $char = $format[$p]; + switch ( $char ) { + case 'd': // ISO day of month + $text .= $match['isoDay']; + break; + case 'm': // ISO month + $text .= $match['isoMonth']; + break; + case 'y': // ISO year + $text .= $match['isoYear']; + break; + case 'j': // ordinary day of month + if ( !isset( $match['day'] ) ) { + $text .= intval( $match['isoDay'] ); + } else { + $text .= $match['day']; + } + break; + case 'F': // long month + $m = intval( $match['isoMonth'] ); + if ( $m > 12 || $m < 1 ) { + // Fail + return $match[0]; + } else { + $text .= $this->monthNames[$m]; + } + break; + case 'Y': // ordinary (optional BC) year + $text .= $match['year']; + break; + default: + $text .= $char; + } + } - // Output is not strictly HTML (it's wikitext), but is whitelisted. - $text = Html::rawElement( 'span', - [ 'class' => 'mw-formatted-date', 'title' => $isoDate ], $text ); + $isoBits = []; + if ( isset( $match['isoYear'] ) ) { + $isoBits[] = $match['isoYear']; + } + $isoBits[] = $match['isoMonth']; + $isoBits[] = $match['isoDay']; + $isoDate = implode( '-', $isoBits ); - return $text; - } + // Output is not strictly HTML (it's wikitext), but is whitelisted. 
+ $text = Html::rawElement( 'span', + [ 'class' => 'mw-formatted-date', 'title' => $isoDate ], $text ); - /** - * Return a regex that can be used to find month names in string - * @return string regex to find the months with - */ - private function getMonthRegex() { - $names = []; - for ( $i = 1; $i <= 12; $i++ ) { - $names[] = preg_quote( $this->lang->getMonthName( $i ), '/' ); - $names[] = preg_quote( $this->lang->getMonthAbbreviation( $i ), '/' ); + return $text; + }, $text + ); } - return implode( '|', $names ); + return $text; } /** @@ -348,7 +292,7 @@ class DateFormatter { * @return string|false ISO month name, or false if the input was invalid */ private function makeIsoMonth( $monthName ) { - $isoMonth = $this->xMonths[$this->lang->lc( $monthName )] ?? false; + $isoMonth = $this->xMonths[mb_strtolower( $monthName )] ?? false; if ( $isoMonth === false ) { return false; } @@ -361,12 +305,11 @@ class DateFormatter { * @return string ISO year name */ private function makeIsoYear( $year ) { - # Assumes the year is in a nice format, as enforced by the regex + // Assumes the year is in a nice format, as enforced by the regex if ( substr( $year, -2 ) == 'BC' ) { $num = intval( substr( $year, 0, -3 ) ) - 1; - # PHP bug note: sprintf( "%04d", -1 ) fails poorly + // PHP bug note: sprintf( "%04d", -1 ) fails poorly $text = sprintf( '-%04d', $num ); - } else { $text = sprintf( '%04d', $year ); } @@ -374,7 +317,7 @@ class DateFormatter { } /** - * Make a year one from an ISO year, for instance: '400 BC' from '-0399'. + * Make a year from an ISO year, for instance: '400 BC' from '-0399'. * @param string $iso ISO year * @return int|string int representing year number in case of AD dates, or string containing * year number and 'BC' at the end otherwise. 
diff --git a/includes/parser/DateFormatterFactory.php b/includes/parser/DateFormatterFactory.php new file mode 100644 index 0000000000..d18ecf41a3 --- /dev/null +++ b/includes/parser/DateFormatterFactory.php @@ -0,0 +1,18 @@ +<?php + +class DateFormatterFactory { + /** @var DateFormatter[] */ + private $instances; + + /** + * @param Language $lang + * @return DateFormatter + */ + public function get( Language $lang ) { + $code = $lang->getCode(); + if ( !isset( $this->instances[$code] ) ) { + $this->instances[$code] = new DateFormatter( $lang ); + } + return $this->instances[$code]; + } +} diff --git a/tests/parser/parserTests.txt b/tests/parser/parserTests.txt index ee33f1d7d1..0facec2513 100644 --- a/tests/parser/parserTests.txt +++ b/tests/parser/parserTests.txt @@ -24146,6 +24146,49 @@ language=nl title=[[MediaWiki:Common.css]]

!! end +!! test +formatdate with invalid month +!! wikitext +{{#formatdate:2019-22-22|dmy}} +!! html +

2019-22-22 +

+!! end + +!! test +formatdate: dots in month name do not match any char (T220563) +!! options +language=de +!! wikitext +{{#formatdate:jun. 3|dmy}} +{{#formatdate:junx 3|dmy}} +!! html +

3 Juni +junx 3 +

+!! end + +!! test +formatdate uses correct capitalisation in French +!! options +language=fr +!! wikitext +{{#formatdate:Juin 3|dmy}} +!! html +

3 juin +

+!! end + +!! test +formatdate uses correct capitalisation in English +!! wikitext +{{#formatdate:june 3|dmy}} +!! html +

3 June +

+!! end + + # # # -- 2.20.1