Merge "mediawiki.searchSuggest: Blacklist Konqueror < 4.11"
[lhc/web/wiklou.git] / languages / utils / CLDRPluralRuleEvaluator.php
1 <?php
2 /**
3 * Parse and evaluate a plural rule.
4 *
5 * UTS #35 Revision 33
6 * http://www.unicode.org/reports/tr35/tr35-33/tr35-numbers.html#Language_Plural_Rules
7 *
8 * @author Niklas Laxstrom, Tim Starling
9 *
10 * @copyright Copyright © 2010-2012, Niklas Laxström
11 * @license http://www.gnu.org/copyleft/gpl.html GNU General Public License 2.0
12 * or later
13 *
14 * This program is free software; you can redistribute it and/or modify
15 * it under the terms of the GNU General Public License as published by
16 * the Free Software Foundation; either version 2 of the License, or
17 * (at your option) any later version.
18 *
19 * This program is distributed in the hope that it will be useful,
20 * but WITHOUT ANY WARRANTY; without even the implied warranty of
21 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22 * GNU General Public License for more details.
23 *
24 * You should have received a copy of the GNU General Public License along
25 * with this program; if not, write to the Free Software Foundation, Inc.,
26 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
27 * http://www.gnu.org/copyleft/gpl.html
28 *
29 *
30 * @file
31 * @since 1.20
32 */
33 class CLDRPluralRuleEvaluator {
34 /**
35 * Evaluate a number against a set of plural rules. If a rule passes,
36 * return the index of plural rule.
37 *
38 * @param int $number The number to be evaluated against the rules
39 * @param array $rules The associative array of plural rules in pluralform => rule format.
40 * @return int The index of the plural form which passed the evaluation
41 */
42 public static function evaluate( $number, array $rules ) {
43 $rules = self::compile( $rules );
44 return self::evaluateCompiled( $number, $rules );
45 }
46
47 /**
48 * Convert a set of rules to a compiled form which is optimised for
49 * fast evaluation. The result will be an array of strings, and may be cached.
50 *
51 * @param array $rules The rules to compile
52 * @return array An array of compile rules.
53 */
54 public static function compile( array $rules ) {
55 // We can't use array_map() for this because it generates a warning if
56 // there is an exception.
57 foreach ( $rules as &$rule ) {
58 $rule = CLDRPluralRuleConverter::convert( $rule );
59 }
60 return $rules;
61 }
62
63 /**
64 * Evaluate a compiled set of rules returned by compile(). Do not allow
65 * the user to edit the compiled form, or else PHP errors may result.
66 *
67 * @param string $number The number to be evaluated against the rules, in English, or it
68 * may be a type convertible to string.
69 * @param array $rules The associative array of plural rules in pluralform => rule format.
70 * @return int The index of the plural form which passed the evaluation
71 */
72 public static function evaluateCompiled( $number, array $rules ) {
73 // Calculate the values of the operand symbols
74 $number = strval( $number );
75 if ( !preg_match( '/^ -? ( ([0-9]+) (?: \. ([0-9]+) )? )$/x', $number, $m ) ) {
76 wfDebug( __METHOD__.': invalid number input, returning "other"' );
77 return count( $rules );
78 }
79 if ( !isset( $m[3] ) ) {
80 $operandSymbols = array(
81 'n' => intval( $m[1] ),
82 'i' => intval( $m[1] ),
83 'v' => 0,
84 'w' => 0,
85 'f' => 0,
86 't' => 0
87 );
88 } else {
89 $absValStr = $m[1];
90 $intStr = $m[2];
91 $fracStr = $m[3];
92 $operandSymbols = array(
93 'n' => floatval( $absValStr ),
94 'i' => intval( $intStr ),
95 'v' => strlen( $fracStr ),
96 'w' => strlen( rtrim( $fracStr, '0' ) ),
97 'f' => intval( $fracStr ),
98 't' => intval( rtrim( $fracStr, '0' ) ),
99 );
100 }
101
102 // The compiled form is RPN, with tokens strictly delimited by
103 // spaces, so this is a simple RPN evaluator.
104 foreach ( $rules as $i => $rule ) {
105 $stack = array();
106 $zero = ord( '0' );
107 $nine = ord( '9' );
108 foreach ( StringUtils::explode( ' ', $rule ) as $token ) {
109 $ord = ord( $token );
110 if ( isset( $operandSymbols[$token] ) ) {
111 $stack[] = $operandSymbols[$token];
112 } elseif ( $ord >= $zero && $ord <= $nine ) {
113 $stack[] = intval( $token );
114 } else {
115 $right = array_pop( $stack );
116 $left = array_pop( $stack );
117 $result = self::doOperation( $token, $left, $right );
118 $stack[] = $result;
119 }
120 }
121 if ( $stack[0] ) {
122 return $i;
123 }
124 }
125 // None of the provided rules match. The number belongs to category
126 // 'other', which comes last.
127 return count( $rules );
128 }
129
130 /**
131 * Do a single operation
132 *
133 * @param string $token The token string
134 * @param mixed $left The left operand. If it is an object, its state may be destroyed.
135 * @param mixed $right The right operand
136 * @throws CLDRPluralRuleError
137 * @return mixed The operation result
138 */
139 private static function doOperation( $token, $left, $right ) {
140 if ( in_array( $token, array( 'in', 'not-in', 'within', 'not-within' ) ) ) {
141 if ( !( $right instanceof CLDRPluralRuleEvaluator_Range ) ) {
142 $right = new CLDRPluralRuleEvaluator_Range( $right );
143 }
144 }
145 switch ( $token ) {
146 case 'or':
147 return $left || $right;
148 case 'and':
149 return $left && $right;
150 case 'is':
151 return $left == $right;
152 case 'is-not':
153 return $left != $right;
154 case 'in':
155 return $right->isNumberIn( $left );
156 case 'not-in':
157 return !$right->isNumberIn( $left );
158 case 'within':
159 return $right->isNumberWithin( $left );
160 case 'not-within':
161 return !$right->isNumberWithin( $left );
162 case 'mod':
163 if ( is_int( $left ) ) {
164 return (int)fmod( $left, $right );
165 }
166 return fmod( $left, $right );
167 case ',':
168 if ( $left instanceof CLDRPluralRuleEvaluator_Range ) {
169 $range = $left;
170 } else {
171 $range = new CLDRPluralRuleEvaluator_Range( $left );
172 }
173 $range->add( $right );
174 return $range;
175 case '..':
176 return new CLDRPluralRuleEvaluator_Range( $left, $right );
177 default:
178 throw new CLDRPluralRuleError( "Invalid RPN token" );
179 }
180 }
181 }
182
183 /**
184 * Evaluator helper class representing a range list.
185 */
186 class CLDRPluralRuleEvaluator_Range {
187 /**
188 * The parts
189 *
190 * @var array
191 */
192 public $parts = array();
193
194 /**
195 * Initialize a new instance of CLDRPluralRuleEvaluator_Range
196 *
197 * @param int $start The start of the range
198 * @param int|bool $end The end of the range, or false if the range is not bounded.
199 */
200 function __construct( $start, $end = false ) {
201 if ( $end === false ) {
202 $this->parts[] = $start;
203 } else {
204 $this->parts[] = array( $start, $end );
205 }
206 }
207
208 /**
209 * Determine if the given number is inside the range.
210 *
211 * @param int $number The number to check
212 * @param bool $integerConstraint If true, also asserts the number is an integer; otherwise, number simply has to be inside the range.
213 * @return bool True if the number is inside the range; otherwise, false.
214 */
215 function isNumberIn( $number, $integerConstraint = true ) {
216 foreach ( $this->parts as $part ) {
217 if ( is_array( $part ) ) {
218 if ( ( !$integerConstraint || floor( $number ) === (float)$number )
219 && $number >= $part[0] && $number <= $part[1]
220 ) {
221 return true;
222 }
223 } else {
224 if ( $number == $part ) {
225 return true;
226 }
227 }
228 }
229 return false;
230 }
231
232 /**
233 * Readable alias for isNumberIn( $number, false ), and the implementation
234 * of the "within" operator.
235 *
236 * @param int $number The number to check
237 * @return bool True if the number is inside the range; otherwise, false.
238 */
239 function isNumberWithin( $number ) {
240 return $this->isNumberIn( $number, false );
241 }
242
243 /**
244 * Add another part to this range.
245 *
246 * @param CLDRPluralRuleEvaluator_Range|int $other The part to add, either
247 * a range object itself or a single number.
248 */
249 function add( $other ) {
250 if ( $other instanceof self ) {
251 $this->parts = array_merge( $this->parts, $other->parts );
252 } else {
253 $this->parts[] = $other;
254 }
255 }
256
257 /**
258 * Returns the string representation of the rule evaluator range.
259 * The purpose of this method is to help debugging.
260 *
261 * @return string The string representation of the rule evaluator range
262 */
263 function __toString() {
264 $s = 'Range(';
265 foreach ( $this->parts as $i => $part ) {
266 if ( $i ) {
267 $s .= ', ';
268 }
269 if ( is_array( $part ) ) {
270 $s .= $part[0] . '..' . $part[1];
271 } else {
272 $s .= $part;
273 }
274 }
275 $s .= ')';
276 return $s;
277 }
278
279 }
280
281 /**
282 * Helper class for converting rules to reverse polish notation (RPN).
283 */
284 class CLDRPluralRuleConverter {
285 /**
286 * The input string
287 *
288 * @var string
289 */
290 public $rule;
291
292 /**
293 * The current position
294 *
295 * @var int
296 */
297 public $pos;
298
299 /**
300 * The past-the-end position
301 *
302 * @var int
303 */
304 public $end;
305
306 /**
307 * The operator stack
308 *
309 * @var array
310 */
311 public $operators = array();
312
313 /**
314 * The operand stack
315 *
316 * @var array
317 */
318 public $operands = array();
319
320 /**
321 * Precedence levels. Note that there's no need to worry about associativity
322 * for the level 4 operators, since they return boolean and don't accept
323 * boolean inputs.
324 */
325 static $precedence = array(
326 'or' => 2,
327 'and' => 3,
328 'is' => 4,
329 'is-not' => 4,
330 'in' => 4,
331 'not-in' => 4,
332 'within' => 4,
333 'not-within' => 4,
334 'mod' => 5,
335 ',' => 6,
336 '..' => 7,
337 );
338
339 /**
340 * A character list defining whitespace, for use in strspn() etc.
341 */
342 const WHITESPACE_CLASS = " \t\r\n";
343
344 /**
345 * Same for digits. Note that the grammar given in UTS #35 doesn't allow
346 * negative numbers or decimal separators.
347 */
348 const NUMBER_CLASS = '0123456789';
349
350 /**
351 * A character list of symbolic operands.
352 */
353 const OPERAND_SYMBOLS = 'nivwft';
354
355 /**
356 * An anchored regular expression which matches a word at the current offset.
357 */
358 const WORD_REGEX = '/[a-zA-Z@]+/A';
359
360 /**
361 * Convert a rule to RPN. This is the only public entry point.
362 *
363 * @param string $rule The rule to convert
364 * @return string The RPN representation of the rule
365 */
366 public static function convert( $rule ) {
367 $parser = new self( $rule );
368 return $parser->doConvert();
369 }
370
371 /**
372 * Private constructor.
373 */
374 protected function __construct( $rule ) {
375 $this->rule = $rule;
376 $this->pos = 0;
377 $this->end = strlen( $rule );
378 }
379
380 /**
381 * Do the operation.
382 *
383 * @return string The RPN representation of the rule (e.g. "5 3 mod n is")
384 */
385 protected function doConvert() {
386 $expectOperator = true;
387
388 // Iterate through all tokens, saving the operators and operands to a
389 // stack per Dijkstra's shunting yard algorithm.
390 /** @var CLDRPluralRuleConverter_Operator $token */
391 while ( false !== ( $token = $this->nextToken() ) ) {
392 // In this grammar, there are only binary operators, so every valid
393 // rule string will alternate between operator and operand tokens.
394 $expectOperator = !$expectOperator;
395
396 if ( $token instanceof CLDRPluralRuleConverter_Expression ) {
397 // Operand
398 if ( $expectOperator ) {
399 $token->error( 'unexpected operand' );
400 }
401 $this->operands[] = $token;
402 continue;
403 } else {
404 // Operator
405 if ( !$expectOperator ) {
406 $token->error( 'unexpected operator' );
407 }
408 // Resolve higher precedence levels
409 $lastOp = end( $this->operators );
410 while ( $lastOp && self::$precedence[$token->name] <= self::$precedence[$lastOp->name] ) {
411 $this->doOperation( $lastOp, $this->operands );
412 array_pop( $this->operators );
413 $lastOp = end( $this->operators );
414 }
415 $this->operators[] = $token;
416 }
417 }
418
419 // Finish off the stack
420 while ( $op = array_pop( $this->operators ) ) {
421 $this->doOperation( $op, $this->operands );
422 }
423
424 // Make sure the result is sane. The first case is possible for an empty
425 // string input, the second should be unreachable.
426 if ( !count( $this->operands ) ) {
427 $this->error( 'condition expected' );
428 } elseif ( count( $this->operands ) > 1 ) {
429 $this->error( 'missing operator or too many operands' );
430 }
431
432 $value = $this->operands[0];
433 if ( $value->type !== 'boolean' ) {
434 $this->error( 'the result must have a boolean type' );
435 }
436
437 return $this->operands[0]->rpn;
438 }
439
440 /**
441 * Fetch the next token from the input string.
442 *
443 * @return CLDRPluralRuleConverter_Fragment The next token
444 */
445 protected function nextToken() {
446 if ( $this->pos >= $this->end ) {
447 return false;
448 }
449
450 // Whitespace
451 $length = strspn( $this->rule, self::WHITESPACE_CLASS, $this->pos );
452 $this->pos += $length;
453
454 if ( $this->pos >= $this->end ) {
455 return false;
456 }
457
458 // Number
459 $length = strspn( $this->rule, self::NUMBER_CLASS, $this->pos );
460 if ( $length !== 0 ) {
461 $token = $this->newNumber( substr( $this->rule, $this->pos, $length ), $this->pos );
462 $this->pos += $length;
463 return $token;
464 }
465
466 // Two-character operators
467 $op2 = substr( $this->rule, $this->pos, 2 );
468 if ( $op2 === '..' || $op2 === '!=' ) {
469 $token = $this->newOperator( $op2, $this->pos, 2 );
470 $this->pos += 2;
471 return $token;
472 }
473
474 // Single-character operators
475 $op1 = $this->rule[$this->pos];
476 if ( $op1 === ',' || $op1 === '=' || $op1 === '%' ) {
477 $token = $this->newOperator( $op1, $this->pos, 1 );
478 $this->pos ++;
479 return $token;
480 }
481
482 // Word
483 if ( !preg_match( self::WORD_REGEX, $this->rule, $m, 0, $this->pos ) ) {
484 $this->error( 'unexpected character "' . $this->rule[$this->pos] . '"' );
485 }
486 $word1 = strtolower( $m[0] );
487 $word2 = '';
488 $nextTokenPos = $this->pos + strlen( $word1 );
489 if ( $word1 === 'not' || $word1 === 'is' ) {
490 // Look ahead one word
491 $nextTokenPos += strspn( $this->rule, self::WHITESPACE_CLASS, $nextTokenPos );
492 if ( $nextTokenPos < $this->end
493 && preg_match( self::WORD_REGEX, $this->rule, $m, 0, $nextTokenPos )
494 ) {
495 $word2 = strtolower( $m[0] );
496 $nextTokenPos += strlen( $word2 );
497 }
498 }
499
500 // Two-word operators like "is not" take precedence over single-word operators like "is"
501 if ( $word2 !== '' ) {
502 $bothWords = "{$word1}-{$word2}";
503 if ( isset( self::$precedence[$bothWords] ) ) {
504 $token = $this->newOperator( $bothWords, $this->pos, $nextTokenPos - $this->pos );
505 $this->pos = $nextTokenPos;
506 return $token;
507 }
508 }
509
510 // Single-word operators
511 if ( isset( self::$precedence[$word1] ) ) {
512 $token = $this->newOperator( $word1, $this->pos, strlen( $word1 ) );
513 $this->pos += strlen( $word1 );
514 return $token;
515 }
516
517 // The single-character operand symbols
518 if ( strpos( self::OPERAND_SYMBOLS, $word1 ) !== false ) {
519 $token = $this->newNumber( $word1, $this->pos );
520 $this->pos ++;
521 return $token;
522 }
523
524 // Samples
525 if ( $word1 === '@integer' || $word1 === '@decimal' ) {
526 // Samples are like comments, they have no effect on rule evaluation.
527 // They run from the first sample indicator to the end of the string.
528 $this->pos = $this->end;
529 return false;
530 }
531
532 $this->error( 'unrecognised word' );
533 }
534
535 /**
536 * For the binary operator $op, pop its operands off the stack and push
537 * a fragment with rpn and type members describing the result of that
538 * operation.
539 *
540 * @param CLDRPluralRuleConverter_Operator $op
541 */
542 protected function doOperation( $op ) {
543 if ( count( $this->operands ) < 2 ) {
544 $op->error( 'missing operand' );
545 }
546 $right = array_pop( $this->operands );
547 $left = array_pop( $this->operands );
548 $result = $op->operate( $left, $right );
549 $this->operands[] = $result;
550 }
551
552 /**
553 * Create a numerical expression object
554 *
555 * @param string $text
556 * @param int $pos
557 * @return CLDRPluralRuleConverter_Expression The numerical expression
558 */
559 protected function newNumber( $text, $pos ) {
560 return new CLDRPluralRuleConverter_Expression( $this, 'number', $text, $pos, strlen( $text ) );
561 }
562
563 /**
564 * Create a binary operator
565 *
566 * @param string $type
567 * @param int $pos
568 * @param int $length
569 * @return CLDRPluralRuleConverter_Operator The operator
570 */
571 protected function newOperator( $type, $pos, $length ) {
572 return new CLDRPluralRuleConverter_Operator( $this, $type, $pos, $length );
573 }
574
575 /**
576 * Throw an error
577 */
578 protected function error( $message ) {
579 throw new CLDRPluralRuleError( $message );
580 }
581 }
582
583 /**
584 * Helper for CLDRPluralRuleConverter.
585 * The base class for operators and expressions, describing a region of the input string.
586 */
587 class CLDRPluralRuleConverter_Fragment {
588 public $parser, $pos, $length, $end;
589
590 function __construct( $parser, $pos, $length ) {
591 $this->parser = $parser;
592 $this->pos = $pos;
593 $this->length = $length;
594 $this->end = $pos + $length;
595 }
596
597 public function error( $message ) {
598 $text = $this->getText();
599 throw new CLDRPluralRuleError( "$message at position " . ( $this->pos + 1 ) . ": \"$text\"" );
600 }
601
602 public function getText() {
603 return substr( $this->parser->rule, $this->pos, $this->length );
604 }
605 }
606
607 /**
608 * Helper for CLDRPluralRuleConverter.
609 * An expression object, representing a region of the input string (for error
610 * messages), the RPN notation used to evaluate it, and the result type for
611 * validation.
612 */
613 class CLDRPluralRuleConverter_Expression extends CLDRPluralRuleConverter_Fragment {
614 /** @var string */
615 public $type;
616
617 /** @var string */
618 public $rpn;
619
620 function __construct( $parser, $type, $rpn, $pos, $length ) {
621 parent::__construct( $parser, $pos, $length );
622 $this->type = $type;
623 $this->rpn = $rpn;
624 }
625
626 public function isType( $type ) {
627 if ( $type === 'range' && ( $this->type === 'range' || $this->type === 'number' ) ) {
628 return true;
629 }
630 if ( $type === $this->type ) {
631 return true;
632 }
633 return false;
634 }
635 }
636
637 /**
638 * Helper for CLDRPluralRuleConverter.
639 * An operator object, representing a region of the input string (for error
640 * messages), and the binary operator at that location.
641 */
642 class CLDRPluralRuleConverter_Operator extends CLDRPluralRuleConverter_Fragment {
643 /** @var string The name */
644 public $name;
645
646 /**
647 * Each op type has three characters: left operand type, right operand type and result type
648 *
649 * b = boolean
650 * n = number
651 * r = range
652 *
653 * A number is a kind of range.
654 *
655 * @var array
656 */
657 static $opTypes = array(
658 'or' => 'bbb',
659 'and' => 'bbb',
660 'is' => 'nnb',
661 'is-not' => 'nnb',
662 'in' => 'nrb',
663 'not-in' => 'nrb',
664 'within' => 'nrb',
665 'not-within' => 'nrb',
666 'mod' => 'nnn',
667 ',' => 'rrr',
668 '..' => 'nnr',
669 );
670
671 /**
672 * Map converting from the abbrevation to the full form.
673 *
674 * @var array
675 */
676 static $typeSpecMap = array(
677 'b' => 'boolean',
678 'n' => 'number',
679 'r' => 'range',
680 );
681
682 /**
683 * Map for converting the new operators introduced in Rev 33 to the old forms
684 */
685 static $aliasMap = array(
686 '%' => 'mod',
687 '!=' => 'not-in',
688 '=' => 'in'
689 );
690
691 /**
692 * Initialize a new instance of a CLDRPluralRuleConverter_Operator object
693 *
694 * @param CLDRPluralRuleConverter $parser The parser
695 * @param string $name The operator name
696 * @param int $pos The length
697 * @param int $length
698 */
699 function __construct( $parser, $name, $pos, $length ) {
700 parent::__construct( $parser, $pos, $length );
701 if ( isset( self::$aliasMap[$name] ) ) {
702 $name = self::$aliasMap[$name];
703 }
704 $this->name = $name;
705 }
706
707 /**
708 * Compute the operation
709 *
710 * @param CLDRPluralRuleConverter_Expression $left The left part of the expression
711 * @param CLDRPluralRuleConverter_Expression $right The right part of the expression
712 * @return CLDRPluralRuleConverter_Expression The result of the operation
713 */
714 public function operate( $left, $right ) {
715 $typeSpec = self::$opTypes[$this->name];
716
717 $leftType = self::$typeSpecMap[$typeSpec[0]];
718 $rightType = self::$typeSpecMap[$typeSpec[1]];
719 $resultType = self::$typeSpecMap[$typeSpec[2]];
720
721 $start = min( $this->pos, $left->pos, $right->pos );
722 $end = max( $this->end, $left->end, $right->end );
723 $length = $end - $start;
724
725 $newExpr = new CLDRPluralRuleConverter_Expression( $this->parser, $resultType,
726 "{$left->rpn} {$right->rpn} {$this->name}",
727 $start, $length );
728
729 if ( !$left->isType( $leftType ) ) {
730 $newExpr->error( "invalid type for left operand: expected $leftType, got {$left->type}" );
731 }
732
733 if ( !$right->isType( $rightType ) ) {
734 $newExpr->error( "invalid type for right operand: expected $rightType, got {$right->type}" );
735 }
736 return $newExpr;
737 }
738 }
739
740 /**
741 * The exception class for all the classes in this file. This will be thrown
742 * back to the caller if there is any validation error.
743 */
744 class CLDRPluralRuleError extends MWException {
745 function __construct( $message ) {
746 parent::__construct( 'CLDR plural rule error: ' . $message );
747 }
748 }