Merge "Add no-dimensions option to MediaTransformOutput::toHtml"
[lhc/web/wiklou.git] / languages / utils / CLDRPluralRuleEvaluator.php
1 <?php
2 /**
3 * Parse and evaluate a plural rule.
4 *
5 * http://unicode.org/reports/tr35/#Language_Plural_Rules
6 *
7 * @author Niklas Laxstrom, Tim Starling
8 *
9 * @copyright Copyright © 2010-2012, Niklas Laxström
10 * @license http://www.gnu.org/copyleft/gpl.html GNU General Public License 2.0
11 * or later
12 *
13 * This program is free software; you can redistribute it and/or modify
14 * it under the terms of the GNU General Public License as published by
15 * the Free Software Foundation; either version 2 of the License, or
16 * (at your option) any later version.
17 *
18 * This program is distributed in the hope that it will be useful,
19 * but WITHOUT ANY WARRANTY; without even the implied warranty of
20 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 * GNU General Public License for more details.
22 *
23 * You should have received a copy of the GNU General Public License along
24 * with this program; if not, write to the Free Software Foundation, Inc.,
25 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
26 * http://www.gnu.org/copyleft/gpl.html
27 *
28 *
29 * @file
30 * @since 1.20
31 */
32 class CLDRPluralRuleEvaluator {
33 /**
34 * Evaluate a number against a set of plural rules. If a rule passes,
35 * return the index of plural rule.
36 *
37 * @param int The number to be evaluated against the rules
38 * @param array The associative array of plural rules in pluralform => rule format.
39 * @return int The index of the plural form which passed the evaluation
40 */
41 public static function evaluate( $number, array $rules ) {
42 $rules = self::compile( $rules );
43 return self::evaluateCompiled( $number, $rules );
44 }
45
46 /**
47 * Convert a set of rules to a compiled form which is optimised for
48 * fast evaluation. The result will be an array of strings, and may be cached.
49 *
50 * @param array $rules The rules to compile
51 * @return array An array of compile rules.
52 */
53 public static function compile( array $rules ) {
54 // We can't use array_map() for this because it generates a warning if
55 // there is an exception.
56 foreach ( $rules as &$rule ) {
57 $rule = CLDRPluralRuleConverter::convert( $rule );
58 }
59 return $rules;
60 }
61
62 /**
63 * Evaluate a compiled set of rules returned by compile(). Do not allow
64 * the user to edit the compiled form, or else PHP errors may result.
65 *
66 * @param int The number to be evaluated against the rules
67 * @param array The associative array of plural rules in pluralform => rule format.
68 * @return int The index of the plural form which passed the evaluation
69 */
70 public static function evaluateCompiled( $number, array $rules ) {
71 // The compiled form is RPN, with tokens strictly delimited by
72 // spaces, so this is a simple RPN evaluator.
73 foreach ( $rules as $i => $rule ) {
74 $stack = array();
75 $zero = ord( '0' );
76 $nine = ord( '9' );
77 foreach ( StringUtils::explode( ' ', $rule ) as $token ) {
78 $ord = ord( $token );
79 if ( $token === 'n' ) {
80 $stack[] = $number;
81 } elseif ( $ord >= $zero && $ord <= $nine ) {
82 $stack[] = intval( $token );
83 } else {
84 $right = array_pop( $stack );
85 $left = array_pop( $stack );
86 $result = self::doOperation( $token, $left, $right );
87 $stack[] = $result;
88 }
89 }
90 if ( $stack[0] ) {
91 return $i;
92 }
93 }
94 // None of the provided rules match. The number belongs to caregory
95 // 'other' which comes last.
96 return count( $rules );
97 }
98
99 /**
100 * Do a single operation
101 *
102 * @param string $token The token string
103 * @param mixed $left The left operand. If it is an object, its state may be destroyed.
104 * @param mixed $right The right operand
105 * @throws CLDRPluralRuleError
106 * @return mixed The operation result
107 */
108 private static function doOperation( $token, $left, $right ) {
109 if ( in_array( $token, array( 'in', 'not-in', 'within', 'not-within' ) ) ) {
110 if ( !( $right instanceof CLDRPluralRuleEvaluator_Range ) ) {
111 $right = new CLDRPluralRuleEvaluator_Range( $right );
112 }
113 }
114 switch ( $token ) {
115 case 'or':
116 return $left || $right;
117 case 'and':
118 return $left && $right;
119 case 'is':
120 return $left == $right;
121 case 'is-not':
122 return $left != $right;
123 case 'in':
124 return $right->isNumberIn( $left );
125 case 'not-in':
126 return !$right->isNumberIn( $left );
127 case 'within':
128 return $right->isNumberWithin( $left );
129 case 'not-within':
130 return !$right->isNumberWithin( $left );
131 case 'mod':
132 if ( is_int( $left ) ) {
133 return (int)fmod( $left, $right );
134 }
135 return fmod( $left, $right );
136 case ',':
137 if ( $left instanceof CLDRPluralRuleEvaluator_Range ) {
138 $range = $left;
139 } else {
140 $range = new CLDRPluralRuleEvaluator_Range( $left );
141 }
142 $range->add( $right );
143 return $range;
144 case '..':
145 return new CLDRPluralRuleEvaluator_Range( $left, $right );
146 default:
147 throw new CLDRPluralRuleError( "Invalid RPN token" );
148 }
149 }
150 }
151
152 /**
153 * Evaluator helper class representing a range list.
154 */
155 class CLDRPluralRuleEvaluator_Range {
156 /**
157 * The parts
158 *
159 * @var array
160 */
161 public $parts = array();
162
163 /**
164 * Initialize a new instance of CLDRPluralRuleEvaluator_Range
165 *
166 * @param int $start The start of the range
167 * @param int|bool $end The end of the range, or false if the range is not bounded.
168 */
169 function __construct( $start, $end = false ) {
170 if ( $end === false ) {
171 $this->parts[] = $start;
172 } else {
173 $this->parts[] = array( $start, $end );
174 }
175 }
176
177 /**
178 * Determine if the given number is inside the range.
179 *
180 * @param int $number The number to check
181 * @param bool $integerConstraint If true, also asserts the number is an integer; otherwise, number simply has to be inside the range.
182 * @return bool True if the number is inside the range; otherwise, false.
183 */
184 function isNumberIn( $number, $integerConstraint = true ) {
185 foreach ( $this->parts as $part ) {
186 if ( is_array( $part ) ) {
187 if ( ( !$integerConstraint || floor( $number ) === (float)$number )
188 && $number >= $part[0] && $number <= $part[1]
189 ) {
190 return true;
191 }
192 } else {
193 if ( $number == $part ) {
194 return true;
195 }
196 }
197 }
198 return false;
199 }
200
201 /**
202 * Readable alias for isNumberIn( $number, false ), and the implementation
203 * of the "within" operator.
204 *
205 * @param int $number The number to check
206 * @return bool True if the number is inside the range; otherwise, false.
207 */
208 function isNumberWithin( $number ) {
209 return $this->isNumberIn( $number, false );
210 }
211
212 /**
213 * Add another part to this range.
214 *
215 * @param mixed The part to add, either a range object itself or a single number.
216 */
217 function add( $other ) {
218 if ( $other instanceof self ) {
219 $this->parts = array_merge( $this->parts, $other->parts );
220 } else {
221 $this->parts[] = $other;
222 }
223 }
224
225 /**
226 * Returns the string representation of the rule evaluator range.
227 * The purpose of this method is to help debugging.
228 *
229 * @return string The string representation of the rule evaluator range
230 */
231 function __toString() {
232 $s = 'Range(';
233 foreach ( $this->parts as $i => $part ) {
234 if ( $i ) {
235 $s .= ', ';
236 }
237 if ( is_array( $part ) ) {
238 $s .= $part[0] . '..' . $part[1];
239 } else {
240 $s .= $part;
241 }
242 }
243 $s .= ')';
244 return $s;
245 }
246
247 }
248
249 /**
250 * Helper class for converting rules to reverse polish notation (RPN).
251 */
252 class CLDRPluralRuleConverter {
253 /**
254 * The rule
255 *
256 * @var string
257 */
258 public $rule;
259
260 /**
261 * The position
262 *
263 * @var int
264 */
265 public $pos;
266
267 /**
268 * The last position possible
269 *
270 * @var int
271 */
272 public $end;
273
274 /**
275 * The operators
276 *
277 * @var array
278 */
279 public $operators = array();
280
281 /**
282 * The operands
283 *
284 * @var array
285 */
286 public $operands = array();
287
288 /**
289 * Precedence levels. Note that there's no need to worry about associativity
290 * for the level 4 operators, since they return boolean and don't accept
291 * boolean inputs.
292 */
293 static $precedence = array(
294 'or' => 2,
295 'and' => 3,
296 'is' => 4,
297 'is-not' => 4,
298 'in' => 4,
299 'not-in' => 4,
300 'within' => 4,
301 'not-within' => 4,
302 'mod' => 5,
303 ',' => 6,
304 '..' => 7,
305 );
306
307 /**
308 * A character list defining whitespace, for use in strspn() etc.
309 */
310 const WHITESPACE_CLASS = " \t\r\n";
311
312 /**
313 * Same for digits. Note that the grammar given in UTS #35 doesn't allow
314 * negative numbers or decimals.
315 */
316 const NUMBER_CLASS = '0123456789';
317
318 /**
319 * An anchored regular expression which matches a word at the current offset.
320 */
321 const WORD_REGEX = '/[a-zA-Z]+/A';
322
323 /**
324 * Convert a rule to RPN. This is the only public entry point.
325 *
326 * @param $rule The rule to convert
327 * @return string The RPN representation of the rule
328 */
329 public static function convert( $rule ) {
330 $parser = new self( $rule );
331 return $parser->doConvert();
332 }
333
334 /**
335 * Private constructor.
336 */
337 protected function __construct( $rule ) {
338 $this->rule = $rule;
339 $this->pos = 0;
340 $this->end = strlen( $rule );
341 }
342
343 /**
344 * Do the operation.
345 *
346 * @return string The RPN representation of the rule (e.g. "5 3 mod n is")
347 */
348 protected function doConvert() {
349 $expectOperator = true;
350
351 // Iterate through all tokens, saving the operators and operands to a
352 // stack per Dijkstra's shunting yard algorithm.
353 while ( false !== ( $token = $this->nextToken() ) ) {
354 // In this grammar, there are only binary operators, so every valid
355 // rule string will alternate between operator and operand tokens.
356 $expectOperator = !$expectOperator;
357
358 if ( $token instanceof CLDRPluralRuleConverter_Expression ) {
359 // Operand
360 if ( $expectOperator ) {
361 $token->error( 'unexpected operand' );
362 }
363 $this->operands[] = $token;
364 continue;
365 } else {
366 // Operator
367 if ( !$expectOperator ) {
368 $token->error( 'unexpected operator' );
369 }
370 // Resolve higher precedence levels
371 $lastOp = end( $this->operators );
372 while ( $lastOp && self::$precedence[$token->name] <= self::$precedence[$lastOp->name] ) {
373 $this->doOperation( $lastOp, $this->operands );
374 array_pop( $this->operators );
375 $lastOp = end( $this->operators );
376 }
377 $this->operators[] = $token;
378 }
379 }
380
381 // Finish off the stack
382 while ( $op = array_pop( $this->operators ) ) {
383 $this->doOperation( $op, $this->operands );
384 }
385
386 // Make sure the result is sane. The first case is possible for an empty
387 // string input, the second should be unreachable.
388 if ( !count( $this->operands ) ) {
389 $this->error( 'condition expected' );
390 } elseif ( count( $this->operands ) > 1 ) {
391 $this->error( 'missing operator or too many operands' );
392 }
393
394 $value = $this->operands[0];
395 if ( $value->type !== 'boolean' ) {
396 $this->error( 'the result must have a boolean type' );
397 }
398
399 return $this->operands[0]->rpn;
400 }
401
402 /**
403 * Fetch the next token from the input string.
404 *
405 * @return CLDRPluralRuleConverter_Fragment The next token
406 */
407 protected function nextToken() {
408 if ( $this->pos >= $this->end ) {
409 return false;
410 }
411
412 // Whitespace
413 $length = strspn( $this->rule, self::WHITESPACE_CLASS, $this->pos );
414 $this->pos += $length;
415
416 if ( $this->pos >= $this->end ) {
417 return false;
418 }
419
420 // Number
421 $length = strspn( $this->rule, self::NUMBER_CLASS, $this->pos );
422 if ( $length !== 0 ) {
423 $token = $this->newNumber( substr( $this->rule, $this->pos, $length ), $this->pos );
424 $this->pos += $length;
425 return $token;
426 }
427
428 // Comma
429 if ( $this->rule[$this->pos] === ',' ) {
430 $token = $this->newOperator( ',', $this->pos, 1 );
431 $this->pos ++;
432 return $token;
433 }
434
435 // Dot dot
436 if ( substr( $this->rule, $this->pos, 2 ) === '..' ) {
437 $token = $this->newOperator( '..', $this->pos, 2 );
438 $this->pos += 2;
439 return $token;
440 }
441
442 // Word
443 if ( !preg_match( self::WORD_REGEX, $this->rule, $m, 0, $this->pos ) ) {
444 $this->error( 'unexpected character "' . $this->rule[$this->pos] . '"' );
445 }
446 $word1 = strtolower( $m[0] );
447 $word2 = '';
448 $nextTokenPos = $this->pos + strlen( $word1 );
449 if ( $word1 === 'not' || $word1 === 'is' ) {
450 // Look ahead one word
451 $nextTokenPos += strspn( $this->rule, self::WHITESPACE_CLASS, $nextTokenPos );
452 if ( $nextTokenPos < $this->end
453 && preg_match( self::WORD_REGEX, $this->rule, $m, 0, $nextTokenPos )
454 ) {
455 $word2 = strtolower( $m[0] );
456 $nextTokenPos += strlen( $word2 );
457 }
458 }
459
460 // Two-word operators like "is not" take precedence over single-word operators like "is"
461 if ( $word2 !== '' ) {
462 $bothWords = "{$word1}-{$word2}";
463 if ( isset( self::$precedence[$bothWords] ) ) {
464 $token = $this->newOperator( $bothWords, $this->pos, $nextTokenPos - $this->pos );
465 $this->pos = $nextTokenPos;
466 return $token;
467 }
468 }
469
470 // Single-word operators
471 if ( isset( self::$precedence[$word1] ) ) {
472 $token = $this->newOperator( $word1, $this->pos, strlen( $word1 ) );
473 $this->pos += strlen( $word1 );
474 return $token;
475 }
476
477 // The special numerical keyword "n"
478 if ( $word1 === 'n' ) {
479 $token = $this->newNumber( 'n', $this->pos );
480 $this->pos ++;
481 return $token;
482 }
483
484 $this->error( 'unrecognised word' );
485 }
486
487 /**
488 * For the binary operator $op, pop its operands off the stack and push
489 * a fragment with rpn and type members describing the result of that
490 * operation.
491 */
492 protected function doOperation( $op ) {
493 if ( count( $this->operands ) < 2 ) {
494 $op->error( 'missing operand' );
495 }
496 $right = array_pop( $this->operands );
497 $left = array_pop( $this->operands );
498 $result = $op->operate( $left, $right );
499 $this->operands[] = $result;
500 }
501
502 /**
503 * Create a numerical expression object
504 *
505 * @return CLDRPluralRuleConverter_Expression The numerical expression
506 */
507 protected function newNumber( $text, $pos ) {
508 return new CLDRPluralRuleConverter_Expression( $this, 'number', $text, $pos, strlen( $text ) );
509 }
510
511 /**
512 * Create a binary operator
513 *
514 * @return CLDRPluralRuleConverter_Operator The operator
515 */
516 protected function newOperator( $type, $pos, $length ) {
517 return new CLDRPluralRuleConverter_Operator( $this, $type, $pos, $length );
518 }
519
520 /**
521 * Throw an error
522 */
523 protected function error( $message ) {
524 throw new CLDRPluralRuleError( $message );
525 }
526 }
527
528 /**
529 * Helper for CLDRPluralRuleConverter.
530 * The base class for operators and expressions, describing a region of the input string.
531 */
532 class CLDRPluralRuleConverter_Fragment {
533 public $parser, $pos, $length, $end;
534
535 function __construct( $parser, $pos, $length ) {
536 $this->parser = $parser;
537 $this->pos = $pos;
538 $this->length = $length;
539 $this->end = $pos + $length;
540 }
541
542 public function error( $message ) {
543 $text = $this->getText();
544 throw new CLDRPluralRuleError( "$message at position " . ( $this->pos + 1 ) . ": \"$text\"" );
545 }
546
547 public function getText() {
548 return substr( $this->parser->rule, $this->pos, $this->length );
549 }
550 }
551
552 /**
553 * Helper for CLDRPluralRuleConverter.
554 * An expression object, representing a region of the input string (for error
555 * messages), the RPN notation used to evaluate it, and the result type for
556 * validation.
557 */
558 class CLDRPluralRuleConverter_Expression extends CLDRPluralRuleConverter_Fragment {
559 public $type, $rpn;
560
561 function __construct( $parser, $type, $rpn, $pos, $length ) {
562 parent::__construct( $parser, $pos, $length );
563 $this->type = $type;
564 $this->rpn = $rpn;
565 }
566
567 public function isType( $type ) {
568 if ( $type === 'range' && ( $this->type === 'range' || $this->type === 'number' ) ) {
569 return true;
570 }
571 if ( $type === $this->type ) {
572 return true;
573 }
574 return false;
575 }
576 }
577
578 /**
579 * Helper for CLDRPluralRuleConverter.
580 * An operator object, representing a region of the input string (for error
581 * messages), and the binary operator at that location.
582 */
583 class CLDRPluralRuleConverter_Operator extends CLDRPluralRuleConverter_Fragment {
584 /**
585 * The name
586 *
587 * @var string
588 */
589 public $name;
590
591 /**
592 * Each op type has three characters: left operand type, right operand type and result type
593 *
594 * b = boolean
595 * n = number
596 * r = range
597 *
598 * A number is a kind of range.
599 *
600 * @var array
601 */
602 static $opTypes = array(
603 'or' => 'bbb',
604 'and' => 'bbb',
605 'is' => 'nnb',
606 'is-not' => 'nnb',
607 'in' => 'nrb',
608 'not-in' => 'nrb',
609 'within' => 'nrb',
610 'not-within' => 'nrb',
611 'mod' => 'nnn',
612 ',' => 'rrr',
613 '..' => 'nnr',
614 );
615
616 /**
617 * Map converting from the abbrevation to the full form.
618 *
619 * @var array
620 */
621 static $typeSpecMap = array(
622 'b' => 'boolean',
623 'n' => 'number',
624 'r' => 'range',
625 );
626
627 /**
628 * Initialize a new instance of a CLDRPluralRuleConverter_Operator object
629 *
630 * @param CLDRPluralRuleConverter $parser The parser
631 * @param string $name The operator name
632 * @param int $pos The position
633 * @param int $pos The length
634 */
635 function __construct( $parser, $name, $pos, $length ) {
636 parent::__construct( $parser, $pos, $length );
637 $this->name = $name;
638 }
639
640 /**
641 * Compute the operation
642 *
643 * @param CLDRPluralRuleConverter_Expression $left The left part of the expression
644 * @param CLDRPluralRuleConverter_Expression $right The right part of the expression
645 * @return CLDRPluralRuleConverter_Expression The result of the operation
646 */
647 public function operate( $left, $right ) {
648 $typeSpec = self::$opTypes[$this->name];
649
650 $leftType = self::$typeSpecMap[$typeSpec[0]];
651 $rightType = self::$typeSpecMap[$typeSpec[1]];
652 $resultType = self::$typeSpecMap[$typeSpec[2]];
653
654 $start = min( $this->pos, $left->pos, $right->pos );
655 $end = max( $this->end, $left->end, $right->end );
656 $length = $end - $start;
657
658 $newExpr = new CLDRPluralRuleConverter_Expression( $this->parser, $resultType,
659 "{$left->rpn} {$right->rpn} {$this->name}",
660 $start, $length );
661
662 if ( !$left->isType( $leftType ) ) {
663 $newExpr->error( "invalid type for left operand: expected $leftType, got {$left->type}" );
664 }
665
666 if ( !$right->isType( $rightType ) ) {
667 $newExpr->error( "invalid type for right operand: expected $rightType, got {$right->type}" );
668 }
669 return $newExpr;
670 }
671 }
672
673 /**
674 * The exception class for all the classes in this file. This will be thrown
675 * back to the caller if there is any validation error.
676 */
677 class CLDRPluralRuleError extends MWException {
678 function __construct( $message ) {
679 parent::__construct( 'CLDR plural rule error: ' . $message );
680 }
681 }