Merge "Allow ORMTable to access a foreign wiki."
[lhc/web/wiklou.git] / languages / utils / CLDRPluralRuleEvaluator.php
1 <?php
2 /**
3 * Parse and evaluate a plural rule.
4 *
5 * http://unicode.org/reports/tr35/#Language_Plural_Rules
6 *
7 * @author Niklas Laxstrom, Tim Starling
8 *
9 * @copyright Copyright © 2010-2012, Niklas Laxström
10 * @license http://www.gnu.org/copyleft/gpl.html GNU General Public License 2.0 or later
11 * @file
12 * @since 1.20
13 */
14
15 class CLDRPluralRuleEvaluator {
16 /**
17 * Evaluate a number against a set of plural rules. If a rule passes,
18 * return the index of plural rule.
19 *
20 * @param int The number to be evaluated against the rules
21 * @param array The associative array of plural rules in pluralform => rule format.
22 * @return int The index of the plural form which passed the evaluation
23 */
24 public static function evaluate( $number, array $rules ) {
25 $rules = self::compile( $rules );
26 return self::evaluateCompiled( $number, $rules );
27 }
28
29 /**
30 * Convert a set of rules to a compiled form which is optimised for
31 * fast evaluation. The result will be an array of strings, and may be cached.
32 *
33 * @param $rules The rules to compile
34 * @return An array of compile rules.
35 */
36 public static function compile( array $rules ) {
37 // We can't use array_map() for this because it generates a warning if
38 // there is an exception.
39 foreach ( $rules as &$rule ) {
40 $rule = CLDRPluralRuleConverter::convert( $rule );
41 }
42 return $rules;
43 }
44
45 /**
46 * Evaluate a compiled set of rules returned by compile(). Do not allow
47 * the user to edit the compiled form, or else PHP errors may result.
48 */
49 public static function evaluateCompiled( $number, array $rules ) {
50 // The compiled form is RPN, with tokens strictly delimited by
51 // spaces, so this is a simple RPN evaluator.
52 foreach ( $rules as $i => $rule ) {
53 $stack = array();
54 $zero = ord( '0' );
55 $nine = ord( '9' );
56 foreach ( StringUtils::explode( ' ', $rule ) as $token ) {
57 $ord = ord( $token );
58 if ( $token === 'n' ) {
59 $stack[] = $number;
60 } elseif ( $ord >= $zero && $ord <= $nine ) {
61 $stack[] = intval( $token );
62 } else {
63 $right = array_pop( $stack );
64 $left = array_pop( $stack );
65 $result = self::doOperation( $token, $left, $right );
66 $stack[] = $result;
67 }
68 }
69 if ( $stack[0] ) {
70 return $i;
71 }
72 }
73 // None of the provided rules match. The number belongs to caregory
74 // 'other' which comes last.
75 return count( $rules );
76 }
77
78 /**
79 * Do a single operation
80 *
81 * @param $token string The token string
82 * @param $left The left operand. If it is an object, its state may be destroyed.
83 * @param $right The right operand
84 * @throws CLDRPluralRuleError
85 * @return mixed
86 */
87 private static function doOperation( $token, $left, $right ) {
88 if ( in_array( $token, array( 'in', 'not-in', 'within', 'not-within' ) ) ) {
89 if ( !($right instanceof CLDRPluralRuleEvaluator_Range ) ) {
90 $right = new CLDRPluralRuleEvaluator_Range( $right );
91 }
92 }
93 switch ( $token ) {
94 case 'or':
95 return $left || $right;
96 case 'and':
97 return $left && $right;
98 case 'is':
99 return $left == $right;
100 case 'is-not':
101 return $left != $right;
102 case 'in':
103 return $right->isNumberIn( $left );
104 case 'not-in':
105 return !$right->isNumberIn( $left );
106 case 'within':
107 return $right->isNumberWithin( $left );
108 case 'not-within':
109 return !$right->isNumberWithin( $left );
110 case 'mod':
111 if ( is_int( $left ) ) {
112 return (int) fmod( $left, $right );
113 }
114 return fmod( $left, $right );
115 case ',':
116 if ( $left instanceof CLDRPluralRuleEvaluator_Range ) {
117 $range = $left;
118 } else {
119 $range = new CLDRPluralRuleEvaluator_Range( $left );
120 }
121 $range->add( $right );
122 return $range;
123 case '..':
124 return new CLDRPluralRuleEvaluator_Range( $left, $right );
125 default:
126 throw new CLDRPluralRuleError( "Invalid RPN token" );
127 }
128 }
129 }
130
131 /**
132 * Evaluator helper class representing a range list.
133 */
134 class CLDRPluralRuleEvaluator_Range {
135 public $parts = array();
136
137 function __construct( $start, $end = false ) {
138 if ( $end === false ) {
139 $this->parts[] = $start;
140 } else {
141 $this->parts[] = array( $start, $end );
142 }
143 }
144
145 /**
146 * Determine if the given number is inside the range. If $integerConstraint
147 * is true, the number must additionally be an integer if it is to match
148 * any interval part.
149 */
150 function isNumberIn( $number, $integerConstraint = true ) {
151 foreach ( $this->parts as $part ) {
152 if ( is_array( $part ) ) {
153 if ( ( !$integerConstraint || floor( $number ) === (float)$number )
154 && $number >= $part[0] && $number <= $part[1] )
155 {
156 return true;
157 }
158 } else {
159 if ( $number == $part ) {
160 return true;
161 }
162 }
163 }
164 return false;
165 }
166
167 /**
168 * Readable alias for isNumberIn( $number, false ), and the implementation
169 * of the "within" operator.
170 */
171 function isNumberWithin( $number ) {
172 return $this->isNumberIn( $number, false );
173 }
174
175 /**
176 * Add another part to this range. The supplied new part may either be a
177 * range object itself, or a single number.
178 */
179 function add( $other ) {
180 if ( $other instanceof self ) {
181 $this->parts = array_merge( $this->parts, $other->parts );
182 } else {
183 $this->parts[] = $other;
184 }
185 }
186
187 /**
188 * For debugging
189 */
190 function __toString() {
191 $s = 'Range(';
192 foreach ( $this->parts as $i => $part ) {
193 if ( $i ) {
194 $s .= ', ';
195 }
196 if ( is_array( $part ) ) {
197 $s .= $part[0] . '..' . $part[1];
198 } else {
199 $s .= $part;
200 }
201 }
202 $s .= ')';
203 return $s;
204 }
205
206 }
207
208 /**
209 * Helper class for converting rules to reverse polish notation (RPN).
210 */
211 class CLDRPluralRuleConverter {
212 public $rule, $pos, $end;
213 public $operators = array();
214 public $operands = array();
215
216 /**
217 * Precedence levels. Note that there's no need to worry about associativity
218 * for the level 4 operators, since they return boolean and don't accept
219 * boolean inputs.
220 */
221 static $precedence = array(
222 'or' => 2,
223 'and' => 3,
224 'is' => 4,
225 'is-not' => 4,
226 'in' => 4,
227 'not-in' => 4,
228 'within' => 4,
229 'not-within' => 4,
230 'mod' => 5,
231 ',' => 6,
232 '..' => 7,
233 );
234
235 /**
236 * A character list defining whitespace, for use in strspn() etc.
237 */
238 const WHITESPACE_CLASS = " \t\r\n";
239
240 /**
241 * Same for digits. Note that the grammar given in UTS #35 doesn't allow
242 * negative numbers or decimals.
243 */
244 const NUMBER_CLASS = '0123456789';
245
246 /**
247 * An anchored regular expression which matches a word at the current offset.
248 */
249 const WORD_REGEX = '/[a-zA-Z]+/A';
250
251 /**
252 * Convert a rule to RPN. This is the only public entry point.
253 */
254 public static function convert( $rule ) {
255 $parser = new self( $rule );
256 return $parser->doConvert();
257 }
258
259 /**
260 * Private constructor.
261 */
262 protected function __construct( $rule ) {
263 $this->rule = $rule;
264 $this->pos = 0;
265 $this->end = strlen( $rule );
266 }
267
268 /**
269 * Do the operation.
270 */
271 protected function doConvert() {
272 $expectOperator = true;
273
274 // Iterate through all tokens, saving the operators and operands to a
275 // stack per Dijkstra's shunting yard algorithm.
276 while ( false !== ( $token = $this->nextToken() ) ) {
277 // In this grammar, there are only binary operators, so every valid
278 // rule string will alternate between operator and operand tokens.
279 $expectOperator = !$expectOperator;
280
281 if ( $token instanceof CLDRPluralRuleConverter_Expression ) {
282 // Operand
283 if ( $expectOperator ) {
284 $token->error( 'unexpected operand' );
285 }
286 $this->operands[] = $token;
287 continue;
288 } else {
289 // Operator
290 if ( !$expectOperator ) {
291 $token->error( 'unexpected operator' );
292 }
293 // Resolve higher precedence levels
294 $lastOp = end( $this->operators );
295 while ( $lastOp && self::$precedence[$token->name] <= self::$precedence[$lastOp->name] ) {
296 $this->doOperation( $lastOp, $this->operands );
297 array_pop( $this->operators );
298 $lastOp = end( $this->operators );
299 }
300 $this->operators[] = $token;
301 }
302 }
303
304 // Finish off the stack
305 while ( $op = array_pop( $this->operators ) ) {
306 $this->doOperation( $op, $this->operands );
307 }
308
309 // Make sure the result is sane. The first case is possible for an empty
310 // string input, the second should be unreachable.
311 if ( !count( $this->operands ) ) {
312 $this->error( 'condition expected' );
313 } elseif ( count( $this->operands ) > 1 ) {
314 $this->error( 'missing operator or too many operands' );
315 }
316
317 $value = $this->operands[0];
318 if ( $value->type !== 'boolean' ) {
319 $this->error( 'the result must have a boolean type' );
320 }
321
322 return $this->operands[0]->rpn;
323 }
324
325 /**
326 * Fetch the next token from the input string. Return it as a
327 * CLDRPluralRuleConverter_Fragment object.
328 */
329 protected function nextToken() {
330 if ( $this->pos >= $this->end ) {
331 return false;
332 }
333
334 // Whitespace
335 $length = strspn( $this->rule, self::WHITESPACE_CLASS, $this->pos );
336 $this->pos += $length;
337
338 if ( $this->pos >= $this->end ) {
339 return false;
340 }
341
342 // Number
343 $length = strspn( $this->rule, self::NUMBER_CLASS, $this->pos );
344 if ( $length !== 0 ) {
345 $token = $this->newNumber( substr( $this->rule, $this->pos, $length ), $this->pos );
346 $this->pos += $length;
347 return $token;
348 }
349
350 // Comma
351 if ( $this->rule[$this->pos] === ',' ) {
352 $token = $this->newOperator( ',', $this->pos, 1 );
353 $this->pos ++;
354 return $token;
355 }
356
357 // Dot dot
358 if ( substr( $this->rule, $this->pos, 2 ) === '..' ) {
359 $token = $this->newOperator( '..', $this->pos, 2 );
360 $this->pos += 2;
361 return $token;
362 }
363
364 // Word
365 if ( !preg_match( self::WORD_REGEX, $this->rule, $m, 0, $this->pos ) ) {
366 $this->error( 'unexpected character "' . $this->rule[$this->pos] . '"' );
367 }
368 $word1 = strtolower( $m[0] );
369 $word2 = '';
370 $nextTokenPos = $this->pos + strlen( $word1 );
371 if ( $word1 === 'not' || $word1 === 'is' ) {
372 // Look ahead one word
373 $nextTokenPos += strspn( $this->rule, self::WHITESPACE_CLASS, $nextTokenPos );
374 if ( $nextTokenPos < $this->end
375 && preg_match( self::WORD_REGEX, $this->rule, $m, 0, $nextTokenPos ) )
376 {
377 $word2 = strtolower( $m[0] );
378 $nextTokenPos += strlen( $word2 );
379 }
380 }
381
382 // Two-word operators like "is not" take precedence over single-word operators like "is"
383 if ( $word2 !== '' ) {
384 $bothWords = "{$word1}-{$word2}";
385 if ( isset( self::$precedence[$bothWords] ) ) {
386 $token = $this->newOperator( $bothWords, $this->pos, $nextTokenPos - $this->pos );
387 $this->pos = $nextTokenPos;
388 return $token;
389 }
390 }
391
392 // Single-word operators
393 if ( isset( self::$precedence[$word1] ) ) {
394 $token = $this->newOperator( $word1, $this->pos, strlen( $word1 ) );
395 $this->pos += strlen( $word1 );
396 return $token;
397 }
398
399 // The special numerical keyword "n"
400 if ( $word1 === 'n' ) {
401 $token = $this->newNumber( 'n', $this->pos );
402 $this->pos ++;
403 return $token;
404 }
405
406 $this->error( 'unrecognised word' );
407 }
408
409 /**
410 * For the binary operator $op, pop its operands off the stack and push
411 * a fragment with rpn and type members describing the result of that
412 * operation.
413 */
414 protected function doOperation( $op ) {
415 if ( count( $this->operands ) < 2 ) {
416 $op->error( 'missing operand' );
417 }
418 $right = array_pop( $this->operands );
419 $left = array_pop( $this->operands );
420 $result = $op->operate( $left, $right );
421 $this->operands[] = $result;
422 }
423
424 /**
425 * Create a numerical expression object
426 */
427 protected function newNumber( $text, $pos ) {
428 return new CLDRPluralRuleConverter_Expression( $this, 'number', $text, $pos, strlen( $text ) );
429 }
430
431 /**
432 * Create a binary operator
433 */
434 protected function newOperator( $type, $pos, $length ) {
435 return new CLDRPluralRuleConverter_Operator( $this, $type, $pos, $length );
436 }
437
438 /**
439 * Throw an error
440 */
441 protected function error( $message ) {
442 throw new CLDRPluralRuleError( $message );
443 }
444 }
445
446 /**
447 * Helper for CLDRPluralRuleConverter.
448 * The base class for operators and expressions, describing a region of the input string.
449 */
450 class CLDRPluralRuleConverter_Fragment {
451 public $parser, $pos, $length, $end;
452
453 function __construct( $parser, $pos, $length ) {
454 $this->parser = $parser;
455 $this->pos = $pos;
456 $this->length = $length;
457 $this->end = $pos + $length;
458 }
459
460 public function error( $message ) {
461 $text = $this->getText();
462 throw new CLDRPluralRuleError( "$message at position " . ( $this->pos + 1 ) . ": \"$text\"" );
463 }
464
465 public function getText() {
466 return substr( $this->parser->rule, $this->pos, $this->length );
467 }
468 }
469
470 /**
471 * Helper for CLDRPluralRuleConverter.
472 * An expression object, representing a region of the input string (for error
473 * messages), the RPN notation used to evaluate it, and the result type for
474 * validation.
475 */
476 class CLDRPluralRuleConverter_Expression extends CLDRPluralRuleConverter_Fragment {
477 public $type, $rpn;
478
479 function __construct( $parser, $type, $rpn, $pos, $length ) {
480 parent::__construct( $parser, $pos, $length );
481 $this->type = $type;
482 $this->rpn = $rpn;
483 }
484
485 public function isType( $type ) {
486 if ( $type === 'range' && ( $this->type === 'range' || $this->type === 'number' ) ) {
487 return true;
488 }
489 if ( $type === $this->type ) {
490 return true;
491 }
492 return false;
493 }
494 }
495
496 /**
497 * Helper for CLDRPluralRuleConverter.
498 * An operator object, representing a region of the input string (for error
499 * messages), and the binary operator at that location.
500 */
501 class CLDRPluralRuleConverter_Operator extends CLDRPluralRuleConverter_Fragment {
502 public $name;
503
504 /**
505 * Each op type has three characters: left operand type, right operand type and result type
506 *
507 * b = boolean
508 * n = number
509 * r = range
510 *
511 * A number is a kind of range.
512 */
513 static $opTypes = array(
514 'or' => 'bbb',
515 'and' => 'bbb',
516 'is' => 'nnb',
517 'is-not' => 'nnb',
518 'in' => 'nrb',
519 'not-in' => 'nrb',
520 'within' => 'nrb',
521 'not-within' => 'nrb',
522 'mod' => 'nnn',
523 ',' => 'rrr',
524 '..' => 'nnr',
525 );
526
527 /**
528 * Map converting from the abbrevation to the full form.
529 */
530 static $typeSpecMap = array(
531 'b' => 'boolean',
532 'n' => 'number',
533 'r' => 'range',
534 );
535
536 function __construct( $parser, $name, $pos, $length ) {
537 parent::__construct( $parser, $pos, $length );
538 $this->name = $name;
539 }
540
541 public function operate( $left, $right ) {
542 $typeSpec = self::$opTypes[$this->name];
543
544 $leftType = self::$typeSpecMap[$typeSpec[0]];
545 $rightType = self::$typeSpecMap[$typeSpec[1]];
546 $resultType = self::$typeSpecMap[$typeSpec[2]];
547
548 $start = min( $this->pos, $left->pos, $right->pos );
549 $end = max( $this->end, $left->end, $right->end );
550 $length = $end - $start;
551
552 $newExpr = new CLDRPluralRuleConverter_Expression( $this->parser, $resultType,
553 "{$left->rpn} {$right->rpn} {$this->name}",
554 $start, $length );
555
556 if ( !$left->isType( $leftType ) ) {
557 $newExpr->error( "invalid type for left operand: expected $leftType, got {$left->type}" );
558 }
559
560 if ( !$right->isType( $rightType ) ) {
561 $newExpr->error( "invalid type for right operand: expected $rightType, got {$right->type}" );
562 }
563 return $newExpr;
564 }
565 }
566
567 /**
568 * The exception class for all the classes in this file. This will be thrown
569 * back to the caller if there is any validation error.
570 */
571 class CLDRPluralRuleError extends MWException {
572 function __construct( $message ) {
573 parent::__construct( 'CLDR plural rule error: ' . $message );
574 }
575 }