Merge "SpecialMyLanguage: Get content language from service"
[lhc/web/wiklou.git] / includes / libs / JavaScriptMinifier.php
1 <?php
2 /**
3 * JavaScript Minifier
4 *
5 * @file
6 * @author Paul Copperman <paul.copperman@gmail.com>
7 * @license Apache-2.0
8 * @license MIT
9 * @license GPL-2.0-or-later
10 * @license LGPL-2.1-or-later
11 */
12
/**
 * Safely minifies JavaScript code while leaving syntactically correct programs intact.
 *
 * Other libraries, such as JSMin, require a certain coding style to work correctly.
 * OTOH, libraries like jsminplus, which do parse the code correctly, are rather slow
 * because they construct a complete parse tree before outputting the minified code.
 * This class is meant to allow arbitrary (but syntactically correct) input, while being
 * fast enough to be used for on-the-fly minifying.
 *
 * This class was written with ECMA-262 Edition 3 in mind ("ECMAScript 3"). Parsing features
 * new to ECMAScript 5 or later might not be supported. However, Edition 5.1 better reflects
 * how actual JS engines worked and work and is simpler and more readable prose. As such,
 * the below code will refer to sections of the 5.1 specification.
 *
 * See <https://www.ecma-international.org/ecma-262/5.1/>.
 */
class JavaScriptMinifier {

	/* Parsing states.
	 * The state machine is only necessary to decide whether to parse a slash as division
	 * operator or as regexp literal.
	 * States are named after the next expected item. We only distinguish states when the
	 * distinction is relevant for our purpose.
	 */
	const STATEMENT = 0;
	const CONDITION = 1;
	const PROPERTY_ASSIGNMENT = 2;
	const EXPRESSION = 3;
	const EXPRESSION_NO_NL = 4; // only relevant for semicolon insertion
	const EXPRESSION_OP = 5;
	const EXPRESSION_FUNC = 6;
	const EXPRESSION_TERNARY = 7; // used to determine the role of a colon
	const EXPRESSION_TERNARY_OP = 8;
	const EXPRESSION_TERNARY_FUNC = 9;
	const PAREN_EXPRESSION = 10; // expression which is not on the top level
	const PAREN_EXPRESSION_OP = 11;
	const PAREN_EXPRESSION_FUNC = 12;
	const PROPERTY_EXPRESSION = 13; // expression which is within an object literal
	const PROPERTY_EXPRESSION_OP = 14;
	const PROPERTY_EXPRESSION_FUNC = 15;

	/* Token types */
	const TYPE_UN_OP = 101; // unary operators
	const TYPE_INCR_OP = 102; // ++ and --
	const TYPE_BIN_OP = 103; // binary operators
	const TYPE_ADD_OP = 104; // + and - which can be either unary or binary ops
	const TYPE_HOOK = 105; // ?
	const TYPE_COLON = 106; // :
	const TYPE_COMMA = 107; // ,
	const TYPE_SEMICOLON = 108; // ;
	const TYPE_BRACE_OPEN = 109; // {
	const TYPE_BRACE_CLOSE = 110; // }
	const TYPE_PAREN_OPEN = 111; // ( and [
	const TYPE_PAREN_CLOSE = 112; // ) and ]
	const TYPE_RETURN = 113; // keywords: break, continue, return, throw
	const TYPE_IF = 114; // keywords: catch, for, with, switch, while, if
	const TYPE_DO = 115; // keywords: case, var, finally, else, do, try
	const TYPE_FUNC = 116; // keywords: function
	const TYPE_LITERAL = 117; // all literals, identifiers and unrecognised tokens

	// Sanity limit on the depth of the state stack, to avoid excessive memory usage
	const STACK_LIMIT = 1000;

	/**
	 * Maximum line length
	 *
	 * This is not a strict maximum, but a guideline. Longer lines will be
	 * produced when literals (e.g. quoted strings) longer than this are
	 * encountered, or when required to guard against semicolon insertion.
	 *
	 * This is a private member (instead of constant) to allow tests to
	 * set it to 1, to verify ASI and line-breaking behaviour.
	 */
	private static $maxLineLength = 1000;

	/**
	 * Returns minified JavaScript code.
	 *
	 * The input is scanned once, token by token. Whitespace and comments are dropped,
	 * and a small state machine tracks just enough context to tell a division operator
	 * from a regexp literal and to know where a newline must be kept so that automatic
	 * semicolon insertion still behaves as in the original source.
	 *
	 * @param string $s JavaScript code to minify
	 * @return string|false Minified code, or false (the result of parseError()) when a
	 *  malformed numeric literal is encountered
	 */
	public static function minify( $s ) {
		// First we declare a few tables that contain our parsing rules

		// $opChars : Characters which can be combined without whitespace between them.
		$opChars = [
			// ECMAScript 5.1 § 7.7 Punctuators
			// Unlike the spec, these are individual symbols, not sequences.
			'{' => true,
			'}' => true,
			'(' => true,
			')' => true,
			'[' => true,
			']' => true,
			'.' => true,
			';' => true,
			',' => true,
			'<' => true,
			'>' => true,
			'=' => true,
			'!' => true,
			'+' => true,
			'-' => true,
			'*' => true,
			'%' => true,
			'&' => true,
			'|' => true,
			'^' => true,
			'~' => true,
			'?' => true,
			':' => true,
			'/' => true,
			// ECMAScript 5.1 § 7.8.4 String Literals
			'"' => true,
			"'" => true,
		];

		// $tokenTypes : Map keywords and operators to their corresponding token type
		$tokenTypes = [
			// ECMAScript 5.1 § 11.4 Unary Operators
			// ECMAScript 5.1 § 11.6 Additive Operators
			// UnaryExpression includes PostfixExpression, which includes 'new'.
			'new' => self::TYPE_UN_OP,
			'delete' => self::TYPE_UN_OP,
			'void' => self::TYPE_UN_OP,
			'typeof' => self::TYPE_UN_OP,
			'++' => self::TYPE_INCR_OP,
			'--' => self::TYPE_INCR_OP,
			'+' => self::TYPE_ADD_OP,
			'-' => self::TYPE_ADD_OP,
			'~' => self::TYPE_UN_OP,
			'!' => self::TYPE_UN_OP,
			// ECMAScript 5.1 § 11.5 Multiplicative Operators
			'*' => self::TYPE_BIN_OP,
			'/' => self::TYPE_BIN_OP,
			'%' => self::TYPE_BIN_OP,
			// ECMAScript 5.1 § 11.7 Bitwise Shift Operators
			'<<' => self::TYPE_BIN_OP,
			'>>' => self::TYPE_BIN_OP,
			'>>>' => self::TYPE_BIN_OP,
			// ECMAScript 5.1 § 11.8 Relational Operators
			'<' => self::TYPE_BIN_OP,
			'>' => self::TYPE_BIN_OP,
			'<=' => self::TYPE_BIN_OP,
			'>=' => self::TYPE_BIN_OP,
			// ECMAScript 5.1 § 11.9 Equality Operators
			'==' => self::TYPE_BIN_OP,
			'!=' => self::TYPE_BIN_OP,
			'===' => self::TYPE_BIN_OP,
			'!==' => self::TYPE_BIN_OP,
			'instanceof' => self::TYPE_BIN_OP,
			'in' => self::TYPE_BIN_OP,
			// ECMAScript 5.1 § 11.10 Binary Bitwise Operators
			'&' => self::TYPE_BIN_OP,
			'^' => self::TYPE_BIN_OP,
			'|' => self::TYPE_BIN_OP,
			// ECMAScript 5.1 § 11.11 Binary Logical Operators
			'&&' => self::TYPE_BIN_OP,
			'||' => self::TYPE_BIN_OP,
			// ECMAScript 5.1 § 11.12 Conditional Operator
			// Also known as ternary.
			'?' => self::TYPE_HOOK,
			':' => self::TYPE_COLON,
			// ECMAScript 5.1 § 11.13 Assignment Operators
			'=' => self::TYPE_BIN_OP,
			'*=' => self::TYPE_BIN_OP,
			'/=' => self::TYPE_BIN_OP,
			'%=' => self::TYPE_BIN_OP,
			'+=' => self::TYPE_BIN_OP,
			'-=' => self::TYPE_BIN_OP,
			'<<=' => self::TYPE_BIN_OP,
			'>>=' => self::TYPE_BIN_OP,
			'>>>=' => self::TYPE_BIN_OP,
			'&=' => self::TYPE_BIN_OP,
			'^=' => self::TYPE_BIN_OP,
			'|=' => self::TYPE_BIN_OP,
			// ECMAScript 5.1 § 11.14 Comma Operator
			',' => self::TYPE_COMMA,

			// The keywords that disallow LineTerminator before their
			// (sometimes optional) Expression or Identifier.
			//
			//    keyword ;
			//    keyword [no LineTerminator here] Identifier ;
			//    keyword [no LineTerminator here] Expression ;
			//
			// See also ECMAScript 5.1:
			// - § 12.7 The continue Statement
			// - § 12.8 The break Statement
			// - § 12.9 The return Statement
			// - § 12.13 The throw Statement
			'continue' => self::TYPE_RETURN,
			'break' => self::TYPE_RETURN,
			'return' => self::TYPE_RETURN,
			'throw' => self::TYPE_RETURN,

			// The keywords require a parenthesised Expression or Identifier
			// before the next Statement.
			//
			//     keyword ( Expression ) Statement
			//     keyword ( Identifier ) Statement
			//
			// See also ECMAScript 5.1:
			// - § 12.5 The if Statement
			// - § 12.6 Iteration Statements (do, while, for)
			// - § 12.10 The with Statement
			// - § 12.11 The switch Statement
			// - § 12.14 The try Statement (catch)
			'if' => self::TYPE_IF,
			'catch' => self::TYPE_IF,
			'while' => self::TYPE_IF,
			'for' => self::TYPE_IF,
			'switch' => self::TYPE_IF,
			'with' => self::TYPE_IF,

			// The keywords followed by an Identifier, Statement,
			// Expression, or Block.
			//
			//     var Identifier
			//     else Statement
			//     do Statement
			//     case Expression
			//     try Block
			//     finally Block
			//
			// See also ECMAScript 5.1:
			// - § 12.2 Variable Statement
			// - § 12.5 The if Statement (else)
			// - § 12.6 Iteration Statements (do, while, for)
			// - § 12.11 The switch Statement (case)
			// - § 12.14 The try Statement
			'var' => self::TYPE_DO,
			'else' => self::TYPE_DO,
			'do' => self::TYPE_DO,
			'case' => self::TYPE_DO,
			'try' => self::TYPE_DO,
			'finally' => self::TYPE_DO,

			// ECMAScript 5.1 § 13 Function Definition
			'function' => self::TYPE_FUNC,

			// Can be one of:
			// - DecimalLiteral (ECMAScript 5.1 § 7.8.3 Numeric Literals)
			// - MemberExpression (ECMAScript 5.1 § 11.2 Left-Hand-Side Expressions)
			'.' => self::TYPE_BIN_OP,

			// Can be one of:
			// - Block (ECMAScript 5.1 § 12.1 Block)
			// - ObjectLiteral (ECMAScript 5.1 § 11.1 Primary Expressions)
			'{' => self::TYPE_BRACE_OPEN,
			'}' => self::TYPE_BRACE_CLOSE,

			// Can be one of:
			// - Parenthesised Identifier or Expression after a
			//   TYPE_IF or TYPE_FUNC keyword.
			// - PrimaryExpression (ECMAScript 5.1 § 11.1 Primary Expressions)
			// - CallExpression (ECMAScript 5.1 § 11.2 Left-Hand-Side Expressions)
			'(' => self::TYPE_PAREN_OPEN,
			')' => self::TYPE_PAREN_CLOSE,

			// Can be one of:
			// - ArrayLiteral (ECMAScript 5.1 § 11.1 Primary Expressions)
			'[' => self::TYPE_PAREN_OPEN,
			']' => self::TYPE_PAREN_CLOSE,

			// Can be one of:
			// - End of any statement
			// - EmptyStatement (ECMAScript 5.1 § 12.3 Empty Statement)
			';' => self::TYPE_SEMICOLON,
		];

		// $goto : This is the main table for our state machine. For every state/token pair
		//         the following state is defined. When no rule exists for a given pair,
		//         the state is left unchanged.
		$goto = [
			self::STATEMENT => [
				self::TYPE_UN_OP => self::EXPRESSION,
				self::TYPE_INCR_OP => self::EXPRESSION,
				self::TYPE_ADD_OP => self::EXPRESSION,
				self::TYPE_PAREN_OPEN => self::PAREN_EXPRESSION,
				self::TYPE_RETURN => self::EXPRESSION_NO_NL,
				self::TYPE_IF => self::CONDITION,
				self::TYPE_FUNC => self::CONDITION,
				self::TYPE_LITERAL => self::EXPRESSION_OP
			],
			self::CONDITION => [
				self::TYPE_PAREN_OPEN => self::PAREN_EXPRESSION
			],
			self::PROPERTY_ASSIGNMENT => [
				self::TYPE_COLON => self::PROPERTY_EXPRESSION,
				self::TYPE_BRACE_OPEN => self::STATEMENT
			],
			self::EXPRESSION => [
				self::TYPE_SEMICOLON => self::STATEMENT,
				self::TYPE_BRACE_OPEN => self::PROPERTY_ASSIGNMENT,
				self::TYPE_PAREN_OPEN => self::PAREN_EXPRESSION,
				self::TYPE_FUNC => self::EXPRESSION_FUNC,
				self::TYPE_LITERAL => self::EXPRESSION_OP
			],
			self::EXPRESSION_NO_NL => [
				self::TYPE_SEMICOLON => self::STATEMENT,
				self::TYPE_BRACE_OPEN => self::PROPERTY_ASSIGNMENT,
				self::TYPE_PAREN_OPEN => self::PAREN_EXPRESSION,
				self::TYPE_FUNC => self::EXPRESSION_FUNC,
				self::TYPE_LITERAL => self::EXPRESSION_OP
			],
			self::EXPRESSION_OP => [
				self::TYPE_BIN_OP => self::EXPRESSION,
				self::TYPE_ADD_OP => self::EXPRESSION,
				self::TYPE_HOOK => self::EXPRESSION_TERNARY,
				self::TYPE_COLON => self::STATEMENT,
				self::TYPE_COMMA => self::EXPRESSION,
				self::TYPE_SEMICOLON => self::STATEMENT,
				self::TYPE_PAREN_OPEN => self::PAREN_EXPRESSION
			],
			self::EXPRESSION_FUNC => [
				self::TYPE_BRACE_OPEN => self::STATEMENT
			],
			self::EXPRESSION_TERNARY => [
				self::TYPE_BRACE_OPEN => self::PROPERTY_ASSIGNMENT,
				self::TYPE_PAREN_OPEN => self::PAREN_EXPRESSION,
				self::TYPE_FUNC => self::EXPRESSION_TERNARY_FUNC,
				self::TYPE_LITERAL => self::EXPRESSION_TERNARY_OP
			],
			self::EXPRESSION_TERNARY_OP => [
				self::TYPE_BIN_OP => self::EXPRESSION_TERNARY,
				self::TYPE_ADD_OP => self::EXPRESSION_TERNARY,
				self::TYPE_HOOK => self::EXPRESSION_TERNARY,
				self::TYPE_COMMA => self::EXPRESSION_TERNARY,
				self::TYPE_PAREN_OPEN => self::PAREN_EXPRESSION
			],
			self::EXPRESSION_TERNARY_FUNC => [
				self::TYPE_BRACE_OPEN => self::STATEMENT
			],
			self::PAREN_EXPRESSION => [
				self::TYPE_BRACE_OPEN => self::PROPERTY_ASSIGNMENT,
				self::TYPE_PAREN_OPEN => self::PAREN_EXPRESSION,
				self::TYPE_FUNC => self::PAREN_EXPRESSION_FUNC,
				self::TYPE_LITERAL => self::PAREN_EXPRESSION_OP
			],
			self::PAREN_EXPRESSION_OP => [
				self::TYPE_BIN_OP => self::PAREN_EXPRESSION,
				self::TYPE_ADD_OP => self::PAREN_EXPRESSION,
				self::TYPE_HOOK => self::PAREN_EXPRESSION,
				self::TYPE_COLON => self::PAREN_EXPRESSION,
				self::TYPE_COMMA => self::PAREN_EXPRESSION,
				self::TYPE_SEMICOLON => self::PAREN_EXPRESSION,
				self::TYPE_PAREN_OPEN => self::PAREN_EXPRESSION
			],
			self::PAREN_EXPRESSION_FUNC => [
				self::TYPE_BRACE_OPEN => self::STATEMENT
			],
			self::PROPERTY_EXPRESSION => [
				self::TYPE_BRACE_OPEN => self::PROPERTY_ASSIGNMENT,
				self::TYPE_PAREN_OPEN => self::PAREN_EXPRESSION,
				self::TYPE_FUNC => self::PROPERTY_EXPRESSION_FUNC,
				self::TYPE_LITERAL => self::PROPERTY_EXPRESSION_OP
			],
			self::PROPERTY_EXPRESSION_OP => [
				self::TYPE_BIN_OP => self::PROPERTY_EXPRESSION,
				self::TYPE_ADD_OP => self::PROPERTY_EXPRESSION,
				self::TYPE_HOOK => self::PROPERTY_EXPRESSION,
				self::TYPE_COMMA => self::PROPERTY_ASSIGNMENT,
				self::TYPE_PAREN_OPEN => self::PAREN_EXPRESSION
			],
			self::PROPERTY_EXPRESSION_FUNC => [
				self::TYPE_BRACE_OPEN => self::STATEMENT
			]
		];

		// $push : This table contains the rules for when to push a state onto the stack.
		//         The pushed state is the state to return to when the corresponding
		//         closing token is found
		$push = [
			self::STATEMENT => [
				self::TYPE_BRACE_OPEN => self::STATEMENT,
				self::TYPE_PAREN_OPEN => self::EXPRESSION_OP
			],
			self::CONDITION => [
				self::TYPE_PAREN_OPEN => self::STATEMENT
			],
			self::PROPERTY_ASSIGNMENT => [
				self::TYPE_BRACE_OPEN => self::PROPERTY_ASSIGNMENT
			],
			self::EXPRESSION => [
				self::TYPE_BRACE_OPEN => self::EXPRESSION_OP,
				self::TYPE_PAREN_OPEN => self::EXPRESSION_OP
			],
			self::EXPRESSION_NO_NL => [
				self::TYPE_BRACE_OPEN => self::EXPRESSION_OP,
				self::TYPE_PAREN_OPEN => self::EXPRESSION_OP
			],
			self::EXPRESSION_OP => [
				self::TYPE_HOOK => self::EXPRESSION,
				self::TYPE_PAREN_OPEN => self::EXPRESSION_OP
			],
			self::EXPRESSION_FUNC => [
				self::TYPE_BRACE_OPEN => self::EXPRESSION_OP
			],
			self::EXPRESSION_TERNARY => [
				self::TYPE_BRACE_OPEN => self::EXPRESSION_TERNARY_OP,
				self::TYPE_PAREN_OPEN => self::EXPRESSION_TERNARY_OP
			],
			self::EXPRESSION_TERNARY_OP => [
				self::TYPE_HOOK => self::EXPRESSION_TERNARY,
				self::TYPE_PAREN_OPEN => self::EXPRESSION_TERNARY_OP
			],
			self::EXPRESSION_TERNARY_FUNC => [
				self::TYPE_BRACE_OPEN => self::EXPRESSION_TERNARY_OP
			],
			self::PAREN_EXPRESSION => [
				self::TYPE_BRACE_OPEN => self::PAREN_EXPRESSION_OP,
				self::TYPE_PAREN_OPEN => self::PAREN_EXPRESSION_OP
			],
			self::PAREN_EXPRESSION_OP => [
				self::TYPE_PAREN_OPEN => self::PAREN_EXPRESSION_OP
			],
			self::PAREN_EXPRESSION_FUNC => [
				self::TYPE_BRACE_OPEN => self::PAREN_EXPRESSION_OP
			],
			self::PROPERTY_EXPRESSION => [
				self::TYPE_BRACE_OPEN => self::PROPERTY_EXPRESSION_OP,
				self::TYPE_PAREN_OPEN => self::PROPERTY_EXPRESSION_OP
			],
			self::PROPERTY_EXPRESSION_OP => [
				self::TYPE_BRACE_OPEN => self::PROPERTY_EXPRESSION_OP,
				self::TYPE_PAREN_OPEN => self::PROPERTY_EXPRESSION_OP
			],
			self::PROPERTY_EXPRESSION_FUNC => [
				self::TYPE_BRACE_OPEN => self::PROPERTY_EXPRESSION_OP
			]
		];

		// $pop : Rules for when to pop a state from the stack
		$pop = [
			self::STATEMENT => [ self::TYPE_BRACE_CLOSE => true ],
			self::PROPERTY_ASSIGNMENT => [ self::TYPE_BRACE_CLOSE => true ],
			self::EXPRESSION => [ self::TYPE_BRACE_CLOSE => true ],
			self::EXPRESSION_NO_NL => [ self::TYPE_BRACE_CLOSE => true ],
			self::EXPRESSION_OP => [ self::TYPE_BRACE_CLOSE => true ],
			self::EXPRESSION_TERNARY_OP => [ self::TYPE_COLON => true ],
			self::PAREN_EXPRESSION => [ self::TYPE_PAREN_CLOSE => true ],
			self::PAREN_EXPRESSION_OP => [ self::TYPE_PAREN_CLOSE => true ],
			self::PROPERTY_EXPRESSION => [ self::TYPE_BRACE_CLOSE => true ],
			self::PROPERTY_EXPRESSION_OP => [ self::TYPE_BRACE_CLOSE => true ]
		];

		// $semicolon : Rules for when a semicolon insertion is appropriate
		$semicolon = [
			self::EXPRESSION_NO_NL => [
				self::TYPE_UN_OP => true,
				self::TYPE_INCR_OP => true,
				self::TYPE_ADD_OP => true,
				self::TYPE_BRACE_OPEN => true,
				self::TYPE_PAREN_OPEN => true,
				self::TYPE_RETURN => true,
				self::TYPE_IF => true,
				self::TYPE_DO => true,
				self::TYPE_FUNC => true,
				self::TYPE_LITERAL => true
			],
			self::EXPRESSION_OP => [
				self::TYPE_UN_OP => true,
				self::TYPE_INCR_OP => true,
				self::TYPE_BRACE_OPEN => true,
				self::TYPE_RETURN => true,
				self::TYPE_IF => true,
				self::TYPE_DO => true,
				self::TYPE_FUNC => true,
				self::TYPE_LITERAL => true
			]
		];

		// $divStates : Contains all states that can be followed by a division operator
		$divStates = [
			self::EXPRESSION_OP => true,
			self::EXPRESSION_TERNARY_OP => true,
			self::PAREN_EXPRESSION_OP => true,
			self::PROPERTY_EXPRESSION_OP => true
		];

		// Here's where the minifying takes place: Loop through the input, looking for tokens
		// and output them to $out, taking actions to the above defined rules when appropriate.
		$out = '';
		$pos = 0;
		$length = strlen( $s );
		$lineLength = 0;
		$newlineFound = true;
		$state = self::STATEMENT;
		$stack = [];
		$last = ';'; // Pretend that we have already seen a semicolon
		while ( $pos < $length ) {
			// First, skip over any whitespace and multiline comments, recording whether we
			// found any newline character
			$skip = strspn( $s, " \t\n\r\xb\xc", $pos );
			if ( !$skip ) {
				$ch = $s[$pos];
				if ( $ch === '/' && substr( $s, $pos, 2 ) === '/*' ) {
					// Multiline comment. Search for the end token or EOT.
					$end = strpos( $s, '*/', $pos + 2 );
					$skip = $end === false ? $length - $pos : $end - $pos + 2;
				}
			}
			if ( $skip ) {
				// The semicolon insertion mechanism needs to know whether there was a newline
				// between two tokens, so record it now.
				if ( !$newlineFound && strcspn( $s, "\r\n", $pos, $skip ) !== $skip ) {
					$newlineFound = true;
				}
				$pos += $skip;
				continue;
			}
			// Handle C++-style comments and html comments, which are treated as single line
			// comments by the browser, regardless of whether the end tag is on the same line.
			// Handle --> the same way, but only if it's at the beginning of the line
			if ( ( $ch === '/' && substr( $s, $pos, 2 ) === '//' )
				|| ( $ch === '<' && substr( $s, $pos, 4 ) === '<!--' )
				|| ( $ch === '-' && $newlineFound && substr( $s, $pos, 3 ) === '-->' )
			) {
				$pos += strcspn( $s, "\r\n", $pos );
				continue;
			}

			// Find out which kind of token we're handling.
			// Note: $end must point past the end of the current token
			// so that `substr($s, $pos, $end - $pos)` would be the entire token.
			// In other words, $end will be the offset of the last relevant character
			// in the stream + 1, or simply put: The offset of the first character
			// of any next token in the stream.
			$end = $pos + 1;
			// Handle string literals
			if ( $ch === "'" || $ch === '"' ) {
				// Search to the end of the string literal, skipping over backslash escapes
				$search = $ch . '\\';
				do {
					// Speculatively add 2 to the end so that if we see a backslash,
					// the next iteration will start 2 characters further (one for the
					// backslash, one for the escaped character).
					// We'll correct this outside the loop.
					$end += strcspn( $s, $search, $end ) + 2;
					// If the last character in our search for a quote or a backslash
					// matched a backslash and we haven't reached the end, keep searching..
				} while ( $end - 2 < $length && $s[$end - 2] === '\\' );
				// Correction (1): Undo speculative add, keep only one (end of string literal)
				$end--;
				if ( $end > $length ) {
					// Correction (2): Loop wrongly assumed an end quote ended the search,
					// but search ended because we've reached the end. Clamp $end to the
					// input length; a plain decrement is not enough when the input ends
					// with a backslash, because the speculative add was then applied twice.
					// TODO: This is invalid and should throw.
					$end = $length;
				}
			// We have to distinguish between regexp literals and division operators
			// A division operator is only possible in certain states
			} elseif ( $ch === '/' && !isset( $divStates[$state] ) ) {
				// Regexp literal
				for ( ; ; ) {
					// Search until we find "/" (end of regexp), "\" (backslash escapes),
					// or "[" (start of character classes).
					do {
						// Speculatively add 2 to ensure next iteration skips
						// over backslash and escaped character.
						// We'll correct this outside the loop.
						$end += strcspn( $s, '/[\\', $end ) + 2;
						// If backslash escape, keep searching...
					} while ( $end - 2 < $length && $s[$end - 2] === '\\' );
					// Correction (1): Undo speculative add, keep only one (end of regexp)
					$end--;
					if ( $end > $length ) {
						// Correction (2): Loop wrongly assumed end slash was seen
						// String ended without end of regexp. Clamp $end to the input
						// length (also covers input ending with a backslash).
						// TODO: This is invalid and should throw.
						$end = $length;
						break;
					}
					if ( $s[$end - 1] === '/' ) {
						break;
					}
					// (Implicit else), we must've found the start of a char class,
					// skip until we find "]" (end of char class), or "\" (backslash escape)
					do {
						// Speculatively add 2 for backslash escape.
						// We'll subtract one outside the loop.
						$end += strcspn( $s, ']\\', $end ) + 2;
						// If backslash escape, keep searching...
					} while ( $end - 2 < $length && $s[$end - 2] === '\\' );
					// Correction (1): Undo speculative add, keep only one (end of char class)
					$end--;
					if ( $end > $length ) {
						// Correction (2): Loop wrongly assumed "]" was seen
						// String ended without ending char class or regexp. Clamp $end
						// to the input length (also covers input ending with a backslash).
						// TODO: This is invalid and should throw.
						$end = $length;
						break;
					}
				}
				// Search past the regexp modifiers (gi)
				while ( $end < $length && ctype_alpha( $s[$end] ) ) {
					$end++;
				}
			} elseif (
				$ch === '0'
				&& ( $pos + 1 < $length ) && ( $s[$pos + 1] === 'x' || $s[$pos + 1] === 'X' )
			) {
				// Hex numeric literal
				$end++; // x or X
				$len = strspn( $s, '0123456789ABCDEFabcdef', $end );
				if ( !$len ) {
					return self::parseError(
						$s,
						$pos,
						'Expected a hexadecimal number but found ' . substr( $s, $pos, 5 ) . '...'
					);
				}
				$end += $len;
			} elseif (
				ctype_digit( $ch )
				|| ( $ch === '.' && $pos + 1 < $length && ctype_digit( $s[$pos + 1] ) )
			) {
				// Decimal numeric literal: integer part, optional fraction, optional exponent
				$end += strspn( $s, '0123456789', $end );
				$decimal = strspn( $s, '.', $end );
				if ( $decimal ) {
					if ( $decimal > 2 ) {
						return self::parseError( $s, $end, 'The number has too many decimal points' );
					}
					$end += strspn( $s, '0123456789', $end + 1 ) + $decimal;
				}
				$exponent = strspn( $s, 'eE', $end );
				if ( $exponent ) {
					if ( $exponent > 1 ) {
						return self::parseError( $s, $end, 'Number with several E' );
					}
					$end++;

					// Skip over the sign of the exponent, if there is one.
					$end += strspn( $s, '-+', $end );
					$len = strspn( $s, '0123456789', $end );
					if ( !$len ) {
						return self::parseError(
							$s,
							$pos,
							'No decimal digits after e, how many zeroes should be added?'
						);
					}
					$end += $len;
				}
			} elseif ( isset( $opChars[$ch] ) ) {
				// Punctuation character. Search for the longest matching operator.
				while (
					$end < $length
					&& isset( $tokenTypes[substr( $s, $pos, $end - $pos + 1 )] )
				) {
					$end++;
				}
			} else {
				// Identifier or reserved word. Search for the end by excluding whitespace and
				// punctuation.
				$end += strcspn( $s, " \t\n.;,=<>+-{}()[]?:*/%'\"!&|^~\xb\xc\r", $end );
			}

			// Now get the token type from our type array
			$token = substr( $s, $pos, $end - $pos ); // so $end - $pos == strlen( $token )
			$type = $tokenTypes[$token] ?? self::TYPE_LITERAL;

			if ( $newlineFound && isset( $semicolon[$state][$type] ) ) {
				// This token triggers the semicolon insertion mechanism of javascript. While we
				// could add the ; token here ourselves, keeping the newline has a few advantages.
				$out .= "\n";
				$state = self::STATEMENT;
				$lineLength = 0;
			} elseif ( $lineLength + $end - $pos > self::$maxLineLength &&
				!isset( $semicolon[$state][$type] ) && $type !== self::TYPE_INCR_OP ) {
				// This line would get too long if we added $token, so add a newline first.
				// Only do this if it won't trigger semicolon insertion and if it won't
				// put a postfix increment operator on its own line, which is illegal in js.
				$out .= "\n";
				$lineLength = 0;
			// Check, whether we have to separate the token from the last one with whitespace
			} elseif ( !isset( $opChars[$last] ) && !isset( $opChars[$ch] ) ) {
				$out .= ' ';
				$lineLength++;
			// Don't accidentally create ++, -- or // tokens
			} elseif ( $last === $ch && ( $ch === '+' || $ch === '-' || $ch === '/' ) ) {
				$out .= ' ';
				$lineLength++;
			// Don't accidentally create <!-- (e.g. from `a < !--b`): browsers, and the
			// comment handling above, treat it as the start of a single-line comment.
			} elseif ( $token === '--' && substr( $out, -2 ) === '<!' ) {
				$out .= ' ';
				$lineLength++;
			}
			// Shorten boolean literals where an expression is expected
			if (
				$type === self::TYPE_LITERAL
				&& ( $token === 'true' || $token === 'false' )
				&& ( $state === self::EXPRESSION || $state === self::PROPERTY_EXPRESSION )
				&& $last !== '.'
			) {
				$token = ( $token === 'true' ) ? '!0' : '!1';
			}

			$out .= $token;
			// Counts the length of the token as it appeared in the source. A shortened
			// true/false is therefore counted at its original length, which only makes
			// the line length guideline slightly more conservative.
			$lineLength += $end - $pos;
			$last = $s[$end - 1];
			$pos = $end;
			$newlineFound = false;

			// Now that we have output our token, transition into the new state.
			if ( isset( $push[$state][$type] ) && count( $stack ) < self::STACK_LIMIT ) {
				$stack[] = $push[$state][$type];
			}
			if ( $stack && isset( $pop[$state][$type] ) ) {
				$state = array_pop( $stack );
			} elseif ( isset( $goto[$state][$type] ) ) {
				$state = $goto[$state][$type];
			}
		}
		return $out;
	}

	/**
	 * Called by minify() when a malformed numeric literal is found.
	 *
	 * The arguments are currently unused; the method always returns false,
	 * which minify() passes on to its caller.
	 *
	 * @param string $fullJavascript The complete input that was being minified
	 * @param int $position Byte offset in the input at which the problem was detected
	 * @param string $errorMsg Description of the problem
	 * @return bool Always false
	 */
	public static function parseError( $fullJavascript, $position, $errorMsg ) {
		// TODO: Handle the error: trigger_error, throw exception, return false...
		return false;
	}
}