Merge "Implement static public Parser::getExternalLinkRel"
[lhc/web/wiklou.git] / includes / StringUtils.php
1 <?php
2 /**
3 * Methods to play with strings.
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License along
16 * with this program; if not, write to the Free Software Foundation, Inc.,
17 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
18 * http://www.gnu.org/copyleft/gpl.html
19 *
20 * @file
21 */
22
/**
 * Static helpers for delimiter-based string replacement, markup-aware
 * splitting and related string chores.
 */
class StringUtils {
	/**
	 * Perform an operation equivalent to
	 *
	 *     preg_replace( "!$startDelim(.*?)$endDelim!", $replace, $subject );
	 *
	 * except that it's worst-case O(N) instead of O(N^2).
	 *
	 * The subject is split on the start delimiter up front, so this is fast
	 * but memory-hungry and inflexible compared to delimiterReplace(). It is
	 * only recommended for chunks of text that are guaranteed to be small.
	 *
	 * @param $startDelim String: literal start delimiter
	 * @param $endDelim String: literal end delimiter
	 * @param $replace String: literal text substituted for each delimited span
	 * @param $subject String: text to process
	 *
	 * @return string
	 */
	public static function hungryDelimiterReplace( $startDelim, $endDelim, $replace, $subject ) {
		$pieces = explode( $startDelim, $subject );
		// Everything before the first start delimiter is copied through untouched
		$result = array_shift( $pieces );
		$endLength = strlen( $endDelim );

		foreach ( $pieces as $piece ) {
			$closeAt = strpos( $piece, $endDelim );
			if ( $closeAt !== false ) {
				// Drop the delimited part, keep whatever follows the end delimiter
				$result .= $replace . substr( $piece, $closeAt + $endLength );
				continue;
			}
			// Unterminated start delimiter: restore it verbatim
			$result .= $startDelim . $piece;
		}

		return $result;
	}

	/**
	 * Perform an operation equivalent to
	 *
	 *   preg_replace_callback( "!$startDelim(.*)$endDelim!s$flags", $callback, $subject )
	 *
	 * This implementation is slower than hungryDelimiterReplace but uses far less
	 * memory. The delimiters are literal strings, not regular expressions.
	 *
	 * If the start delimiter ends with an initial substring of the end delimiter,
	 * e.g. in the case of C-style comments, the behaviour differs from the model
	 * regex. In this implementation, the end must share no characters with the
	 * start, so e.g. /*\/ is not considered to be both the start and end of a
	 * comment. /*\/xy/*\/ is considered to be a single comment with contents /xy/.
	 *
	 * The callback receives a two-element array: element 0 is the whole match
	 * including both delimiters, element 1 is the text between them.
	 *
	 * @param $startDelim String: start delimiter
	 * @param $endDelim String: end delimiter
	 * @param $callback Callback: function to call on each match
	 * @param $subject String
	 * @param $flags String: regular expression flags
	 * @throws MWException
	 * @return string
	 */
	public static function delimiterReplaceCallback( $startDelim, $endDelim, $callback, $subject, $flags = '' ) {
		// With the "i" flag the literal end-delimiter check must ignore case too
		$compare = strpos( $flags, 'i' ) === false ? 'strcmp' : 'strcasecmp';
		$pattern = '!(' . preg_quote( $startDelim, '!' ) . ')|(' . preg_quote( $endDelim, '!' ) . ')!S' . $flags;
		$endLength = strlen( $endDelim );
		$subjectLength = strlen( $subject );

		$result = '';
		$scanPos = 0;   // where the next delimiter search begins
		$copiedTo = 0;  // everything before this offset is already in $result
		$bodyPos = 0;   // start of the content of the currently open pair
		$open = false;  // true while a start delimiter awaits its end
		$hit = array();

		while ( $scanPos < $subjectLength &&
			preg_match( $pattern, $subject, $hit, PREG_OFFSET_CAPTURE, $scanPos )
		) {
			$at = $hit[0][1];
			$length = strlen( $hit[0][0] );

			// Classify the token that was found
			if ( $hit[1][0] != '' ) {
				# The regex reported a start delimiter, but if a pair is open and an
				# end delimiter is present at the same location, the end wins
				$isEnd = $open &&
					$compare( $endDelim, substr( $subject, $at, $endLength ) ) == 0;
				if ( $isEnd ) {
					$length = $endLength;
				}
			} elseif ( $hit[2][0] != '' ) {
				$isEnd = true;
			} else {
				throw new MWException( 'Invalid delimiter given to ' . __METHOD__ );
			}

			if ( !$isEnd ) {
				if ( $open ) {
					# START START END matches the outer pair, so the original start
					# is kept. Only step past the *first character* of this START,
					# to protect against missing an END that overlaps with it
					$scanPos = $at + 1;
					continue;
				}
				# First start delimiter: write out the non-matching section
				$result .= substr( $subject, $copiedTo, $at - $copiedTo );
				$copiedTo = $at;
				$scanPos = $bodyPos = $at + $length;
				$open = true;
				continue;
			}

			$tokenEnd = $at + $length;
			if ( $open ) {
				# Found a complete match, hand it to the callback
				$result .= call_user_func( $callback, array(
					substr( $subject, $copiedTo, $tokenEnd - $copiedTo ),
					substr( $subject, $bodyPos, $at - $bodyPos )
				) );
				$open = false;
			} else {
				# Non-matching end, write it out
				$result .= substr( $subject, $scanPos, $tokenEnd - $copiedTo );
			}
			$scanPos = $copiedTo = $tokenEnd;
		}

		// Copy the tail, including any unterminated start delimiter
		if ( $copiedTo < $subjectLength ) {
			$result .= substr( $subject, $copiedTo );
		}

		return $result;
	}

	/**
	 * Perform an operation equivalent to
	 *
	 *   preg_replace( "!$startDelim(.*)$endDelim!$flags", $replace, $subject )
	 *
	 * Implemented on top of delimiterReplaceCallback() with a
	 * RegexlikeReplacer as the callback.
	 *
	 * @param $startDelim String: start delimiter regular expression
	 * @param $endDelim String: end delimiter regular expression
	 * @param $replace String: replacement string. May contain $1, which will be
	 *                 replaced by the text between the delimiters
	 * @param $subject String to search
	 * @param $flags String: regular expression flags
	 * @return String: The string with the matches replaced
	 */
	public static function delimiterReplace( $startDelim, $endDelim, $replace, $subject, $flags = '' ) {
		$substituter = new RegexlikeReplacer( $replace );
		$callback = $substituter->cb();

		return self::delimiterReplaceCallback( $startDelim, $endDelim, $callback, $subject, $flags );
	}

	/**
	 * More or less "markup-safe" explode()
	 * Ignores any instances of the separator inside <...>
	 *
	 * NUL bytes are stripped from the input because NUL is used internally as
	 * a stand-in for separators that must not be split on.
	 *
	 * @param $separator String
	 * @param $text String
	 * @return array
	 */
	public static function explodeMarkup( $separator, $text ) {
		$marker = "\x00";

		// Make sure the marker cannot already occur in the input
		$text = str_replace( $marker, '', $text );

		// Hide separators that sit inside HTML-like tags behind the marker
		$hider = new DoubleReplacer( $separator, $marker );
		$masked = self::delimiterReplaceCallback( '<', '>', $hider->cb(), $text );

		// Split on the remaining separators, then restore the hidden ones
		// in every piece (str_replace maps over arrays and keeps their keys)
		return str_replace( $marker, $separator, explode( $separator, $masked ) );
	}

	/**
	 * Escape a string to make it suitable for inclusion in a preg_replace()
	 * replacement parameter.
	 *
	 * Backslashes are doubled first, then dollar signs get a backslash prefix.
	 *
	 * @param $string String
	 * @return String
	 */
	public static function escapeRegexReplacement( $string ) {
		// Array form replaces in order: all backslashes first, then dollars
		return str_replace(
			array( '\\', '$' ),
			array( '\\\\', '\\$' ),
			$string
		);
	}

	/**
	 * Workalike for explode() with limited memory usage.
	 * Returns an Iterator: an ExplodeIterator when the separator occurs more
	 * than 1000 times, otherwise an ArrayIterator over a plain explode().
	 *
	 * @param $separator
	 * @param $subject
	 * @return ArrayIterator|ExplodeIterator
	 */
	public static function explode( $separator, $subject ) {
		return substr_count( $subject, $separator ) > 1000
			? new ExplodeIterator( $separator, $subject )
			: new ArrayIterator( explode( $separator, $subject ) );
	}
}
226
/**
 * Base class for "replacers", objects used in preg_replace_callback() and
 * StringUtils::delimiterReplaceCallback()
 *
 * Subclasses supply a replace( $matches ) method; cb() packages it up as a
 * callback.
 */
class Replacer {

	/**
	 * Build a callback that invokes this object's replace() method.
	 *
	 * The object is passed by handle, not as a reference to $this: the
	 * reference was never needed for objects, and taking references to $this
	 * is problematic on newer PHP versions.
	 *
	 * @return array Callback in array( object, method name ) form
	 */
	public function cb() {
		return array( $this, 'replace' );
	}
}
240
/**
 * Replacer that fills a replacement template with the matched groups, in a
 * way similar to the replacement string of preg_replace()
 */
class RegexlikeReplacer extends Replacer {
	/** Replacement template; may contain $0, $1, ... placeholders */
	public $r;

	/**
	 * @param $r string Replacement template
	 */
	public function __construct( $r ) {
		$this->r = $r;
	}

	/**
	 * Substitute every "$<key>" placeholder in the template with the entry of
	 * $matches stored under that key.
	 *
	 * @param $matches array
	 * @return string
	 */
	public function replace( $matches ) {
		$substitutions = array();
		foreach ( $matches as $groupKey => $groupText ) {
			$substitutions['$' . $groupKey] = $groupText;
		}

		// All placeholders are translated in one strtr() pass
		return strtr( $this->r, $substitutions );
	}

}
267
/**
 * Class to perform secondary replacement within each replacement string
 */
class DoubleReplacer extends Replacer {
	/**
	 * Declared explicitly rather than created on the fly in the constructor,
	 * in line with the other replacer classes; implicit (dynamic) property
	 * creation is deprecated in recent PHP versions.
	 */
	public $from, $to, $index;

	/**
	 * @param $from Search value handed to str_replace()
	 * @param $to Replacement value handed to str_replace()
	 * @param $index int Key of the $matches element the replacement is applied to
	 */
	public function __construct( $from, $to, $index = 0 ) {
		$this->from = $from;
		$this->to = $to;
		$this->index = $index;
	}

	/**
	 * Replace $from with $to inside the selected element of $matches.
	 *
	 * @param $matches array
	 * @return mixed Result of str_replace() on $matches[$index]
	 */
	public function replace( $matches ) {
		return str_replace( $this->from, $this->to, $matches[$this->index] );
	}
}
292
/**
 * Replacer that looks each match up in a fixed table
 */
class HashtableReplacer extends Replacer {
	/** Lookup table, keyed by matched text */
	public $table;

	/** Key of the $matches element used as lookup key */
	public $index;

	/**
	 * @param $table Lookup table indexed by the matched text
	 * @param $index int Which element of $matches to look up
	 */
	public function __construct( $table, $index = 0 ) {
		$this->index = $index;
		$this->table = $table;
	}

	/**
	 * Return the table entry stored under the selected element of $matches.
	 *
	 * @param $matches array
	 * @return mixed
	 */
	public function replace( $matches ) {
		$lookupKey = $matches[$this->index];

		return $this->table[$lookupKey];
	}
}
316
/**
 * Replacement array for FSS with fallback to strtr()
 * Supports lazy initialisation of FSS resource
 *
 * The fss_* functions are used when they exist; the prepared FSS handle is
 * cached in $fss and discarded whenever the replacement pairs change.
 */
class ReplacementArray {
	/*mostly private*/ public $data = false;
	/*mostly private*/ public $fss = false;

	/**
	 * Create an object with the specified replacement array
	 * The array should have the same form as the replacement array for strtr()
	 * @param array $data
	 */
	public function __construct( $data = array() ) {
		$this->data = $data;
	}

	/**
	 * Only the replacement pairs are serialized, not the FSS handle.
	 *
	 * @return array
	 */
	public function __sleep() {
		return array( 'data' );
	}

	/**
	 * Reset the FSS handle after unserialization so it is rebuilt lazily.
	 */
	public function __wakeup() {
		$this->fss = false;
	}

	/**
	 * Set the whole replacement array at once
	 * @param $data array
	 */
	public function setArray( $data ) {
		$this->data = $data;
		$this->fss = false;
	}

	/**
	 * @return array|bool
	 */
	public function getArray() {
		return $this->data;
	}

	/**
	 * Set an element of the replacement array
	 * @param $from string
	 * @param $to string
	 */
	public function setPair( $from, $to ) {
		$this->data[$from] = $to;
		$this->fss = false;
	}

	/**
	 * Add the given pairs to the replacement array; a pair whose search
	 * string is already present overrides the existing one.
	 *
	 * Pairs are assigned key by key instead of going through array_merge(),
	 * because array_merge() renumbers integer keys and PHP stores numeric
	 * search strings such as "1" as integer keys, so merging would replace
	 * those search strings with 0, 1, 2, ...
	 *
	 * @param $data array
	 */
	public function mergeArray( $data ) {
		foreach ( $data as $from => $to ) {
			$this->data[$from] = $to;
		}
		$this->fss = false;
	}

	/**
	 * Add all pairs of another ReplacementArray; see mergeArray().
	 *
	 * @param $other ReplacementArray
	 */
	public function merge( $other ) {
		$this->mergeArray( $other->data );
	}

	/**
	 * @param $from string
	 */
	public function removePair( $from ) {
		unset( $this->data[$from] );
		$this->fss = false;
	}

	/**
	 * Remove every pair whose search string is a key of $data; the values
	 * of $data are ignored.
	 *
	 * @param $data array
	 */
	public function removeArray( $data ) {
		foreach ( array_keys( $data ) as $from ) {
			unset( $this->data[$from] );
		}
		$this->fss = false;
	}

	/**
	 * Apply the replacement pairs to a string.
	 *
	 * @param $subject string
	 * @return string
	 */
	public function replace( $subject ) {
		if ( function_exists( 'fss_prep_replace' ) ) {
			wfProfileIn( __METHOD__.'-fss' );
			// Prepare the FSS handle on first use after any change to the pairs
			if ( $this->fss === false ) {
				$this->fss = fss_prep_replace( $this->data );
			}
			$result = fss_exec_replace( $this->fss, $subject );
			wfProfileOut( __METHOD__.'-fss' );
		} else {
			wfProfileIn( __METHOD__.'-strtr' );
			$result = strtr( $subject, $this->data );
			wfProfileOut( __METHOD__.'-strtr' );
		}
		return $result;
	}
}
424
/**
 * An iterator which works exactly like:
 *
 * foreach ( explode( $delim, $s ) as $element ) {
 *    ...
 * }
 *
 * Except it doesn't use 193 bytes per element
 *
 * The key of each element is its byte offset within the subject, not a
 * sequential index.
 *
 * The Iterator methods carry #[\ReturnTypeWillChange] so that PHP versions
 * which expect return types on them do not emit deprecation notices; older
 * PHP versions treat the attribute line as a comment.
 */
class ExplodeIterator implements Iterator {
	// The subject string
	public $subject, $subjectLength;

	// The delimiter
	public $delim, $delimLength;

	// The position of the start of the line, or false once iteration is finished
	public $curPos;

	// The position of the next delimiter, or false if there is none left
	public $endPos;

	// The current token
	public $current;

	/**
	 * Construct an ExplodeIterator
	 * @param $delim string
	 * @param $s string
	 */
	public function __construct( $delim, $s ) {
		$this->subject = $s;
		$this->delim = $delim;

		// Micro-optimisation (theoretical)
		$this->subjectLength = strlen( $s );
		$this->delimLength = strlen( $delim );

		$this->rewind();
	}

	/**
	 * Go back to the first element.
	 */
	#[\ReturnTypeWillChange]
	public function rewind() {
		$this->curPos = 0;
		$this->endPos = strpos( $this->subject, $this->delim );
		$this->refreshCurrent();
	}

	/**
	 * Recompute $current from $curPos and $endPos.
	 */
	public function refreshCurrent() {
		if ( $this->curPos === false ) {
			// Iteration is finished
			$this->current = false;
		} elseif ( $this->curPos >= $this->subjectLength ) {
			// Subject is empty or ends with a delimiter: final empty element
			$this->current = '';
		} elseif ( $this->endPos === false ) {
			// Last element: everything up to the end of the subject
			$this->current = substr( $this->subject, $this->curPos );
		} else {
			$this->current = substr( $this->subject, $this->curPos, $this->endPos - $this->curPos );
		}
	}

	/**
	 * @return string|bool The current element, or false when iteration is finished
	 */
	#[\ReturnTypeWillChange]
	public function current() {
		return $this->current;
	}

	/**
	 * @return int|bool Byte offset of the current element, or false when finished
	 */
	#[\ReturnTypeWillChange]
	public function key() {
		return $this->curPos;
	}

	/**
	 * Advance to the next element.
	 *
	 * @return string|bool The new current element, or false when iteration is finished
	 */
	#[\ReturnTypeWillChange]
	public function next() {
		if ( $this->endPos === false ) {
			// No delimiter left after the current element: we are done
			$this->curPos = false;
		} else {
			$this->curPos = $this->endPos + $this->delimLength;
			if ( $this->curPos >= $this->subjectLength ) {
				$this->endPos = false;
			} else {
				$this->endPos = strpos( $this->subject, $this->delim, $this->curPos );
			}
		}
		$this->refreshCurrent();
		return $this->current;
	}

	/**
	 * @return bool
	 */
	#[\ReturnTypeWillChange]
	public function valid() {
		return $this->curPos !== false;
	}
}