includes/StringUtils.php

   1 <?php
   2 /**
   3  * A collection of static methods to play with strings.
   4  */
   5 class StringUtils {
   6         /**
   7          * Perform an operation equivalent to
   8          *
   9          *     preg_replace( "!$startDelim(.*?)$endDelim!", $replace, $subject );
  10          *
  11          * except that it's worst-case O(N) instead of O(N^2)
  12          *
  13          * Compared to delimiterReplace(), this implementation is fast but memory-
  14          * hungry and inflexible. The memory requirements are such that I don't
  15          * recommend using it on anything but guaranteed small chunks of text.
  16          */
  17         static function hungryDelimiterReplace( $startDelim, $endDelim, $replace, $subject ) {
  18                 $segments = explode( $startDelim, $subject );
  19                 $output = array_shift( $segments );
  20                 foreach ( $segments as $s ) {
  21                         $endDelimPos = strpos( $s, $endDelim );
  22                         if ( $endDelimPos === false ) {
  23                                 $output .= $startDelim . $s;
  24                         } else {
  25                                 $output .= $replace . substr( $s, $endDelimPos + strlen( $endDelim ) );
  26                         }
  27                 }
  28                 return $output;
  29         }
  30
  31         /**
  32          * Perform an operation equivalent to
  33          *
  34          *   preg_replace_callback( "!$startDelim(.*)$endDelim!s$flags", $callback, $subject )
  35          *
  36          * This implementation is slower than hungryDelimiterReplace but uses far less
  37          * memory. The delimiters are literal strings, not regular expressions.
  38          *
  39          * @param string $flags Regular expression flags
  40          */
  41         # If the start delimiter ends with an initial substring of the end delimiter,
  42         # e.g. in the case of C-style comments, the behaviour differs from the model
  43         # regex. In this implementation, the end must share no characters with the
  44         # start, so e.g. /*/ is not considered to be both the start and end of a
  45         # comment. /*/xy/*/ is considered to be a single comment with contents /xy/.
  46         static function delimiterReplaceCallback( $startDelim, $endDelim, $callback, $subject, $flags = '' ) {
  47                 $inputPos = 0;
  48                 $outputPos = 0;
  49                 $output = '';
  50                 $foundStart = false;
  51                 $encStart = preg_quote( $startDelim, '!' );
  52                 $encEnd = preg_quote( $endDelim, '!' );
  53                 $strcmp = strpos( $flags, 'i' ) === false ? 'strcmp' : 'strcasecmp';
  54                 $endLength = strlen( $endDelim );
  55                 $m = array();
  56
  57                 while ( $inputPos < strlen( $subject ) &&
  58                   preg_match( "!($encStart)|($encEnd)!S$flags", $subject, $m, PREG_OFFSET_CAPTURE, $inputPos ) )
  59                 {
  60                         $tokenOffset = $m[0][1];
  61                         if ( $m[1][0] != '' ) {
  62                                 if ( $foundStart &&
  63                                   $strcmp( $endDelim, substr( $subject, $tokenOffset, $endLength ) ) == 0 )
  64                                 {
  65                                         # An end match is present at the same location
  66                                         $tokenType = 'end';
  67                                         $tokenLength = $endLength;
  68                                 } else {
  69                                         $tokenType = 'start';
  70                                         $tokenLength = strlen( $m[0][0] );
  71                                 }
  72                         } elseif ( $m[2][0] != '' ) {
  73                                 $tokenType = 'end';
  74                                 $tokenLength = strlen( $m[0][0] );
  75                         } else {
  76                                 throw new MWException( 'Invalid delimiter given to ' . __METHOD__ );
  77                         }
  78
  79                         if ( $tokenType == 'start' ) {
  80                                 $inputPos = $tokenOffset + $tokenLength;
  81                                 # Only move the start position if we haven't already found a start
  82                                 # This means that START START END matches outer pair
  83                                 if ( !$foundStart ) {
  84                                         # Found start
  85                                         # Write out the non-matching section
  86                                         $output .= substr( $subject, $outputPos, $tokenOffset - $outputPos );
  87                                         $outputPos = $tokenOffset;
  88                                         $contentPos = $inputPos;
  89                                         $foundStart = true;
  90                                 }
  91                         } elseif ( $tokenType == 'end' ) {
  92                                 if ( $foundStart ) {
  93                                         # Found match
  94                                         $output .= call_user_func( $callback, array(
  95                                                 substr( $subject, $outputPos, $tokenOffset + $tokenLength - $outputPos ),
  96                                                 substr( $subject, $contentPos, $tokenOffset - $contentPos )
  97                                         ));
  98                                         $foundStart = false;
  99                                 } else {
 100                                         # Non-matching end, write it out
 101                                         $output .= substr( $subject, $inputPos, $tokenOffset + $tokenLength - $outputPos );
 102                                 }
 103                                 $inputPos = $outputPos = $tokenOffset + $tokenLength;
 104                         } else {
 105                                 throw new MWException( 'Invalid delimiter given to ' . __METHOD__ );
 106                         }
 107                 }
 108                 if ( $outputPos < strlen( $subject ) ) {
 109                         $output .= substr( $subject, $outputPos );
 110                 }
 111                 return $output;
 112         }
 113
        /**
         * Perform an operation equivalent to
         *
         *   preg_replace( "!$startDelim(.*)$endDelim!$flags", $replace, $subject )
         *
         * The delimiters are literal strings, not regular expressions.
         *
         * @param string $startDelim Start delimiter (literal string)
         * @param string $endDelim End delimiter (literal string)
         * @param string $replace Replacement string. May contain $1, which will be
         *               replaced by the text between the delimiters
         * @param string $subject String to search
         * @param string $flags Regular expression flags
         * @return string The string with the matches replaced
         */
 126         static function delimiterReplace( $startDelim, $endDelim, $replace, $subject, $flags = '' ) {
 127                 $replacer = new RegexlikeReplacer( $replace );
 128                 return self::delimiterReplaceCallback( $startDelim, $endDelim,
 129                         $replacer->cb(), $subject, $flags );
 130         }
 131
 132         /**
 133          * More or less "markup-safe" explode()
 134          * Ignores any instances of the separator inside <...>
 135          * @param string $separator
 136          * @param string $text
 137          * @return array
 138          */
 139         static function explodeMarkup( $separator, $text ) {
 140                 $placeholder = "\x00";
 141
 142                 // Remove placeholder instances
 143                 $text = str_replace( $placeholder, '', $text );
 144
 145                 // Replace instances of the separator inside HTML-like tags with the placeholder
 146                 $replacer = new DoubleReplacer( $separator, $placeholder );
 147                 $cleaned = StringUtils::delimiterReplaceCallback( '<', '>', $replacer->cb(), $text );
 148
 149                 // Explode, then put the replaced separators back in
 150                 $items = explode( $separator, $cleaned );
 151                 foreach( $items as $i => $str ) {
 152                         $items[$i] = str_replace( $placeholder, $separator, $str );
 153                 }
 154
 155                 return $items;
 156         }
 157
 158         /**
 159          * Escape a string to make it suitable for inclusion in a preg_replace()
 160          * replacement parameter.
 161          *
 162          * @param string $string
 163          * @return string
 164          */
 165         static function escapeRegexReplacement( $string ) {
 166                 $string = str_replace( '\\', '\\\\', $string );
 167                 $string = str_replace( '$', '\\$', $string );
 168                 return $string;
 169         }
 170
 171         /**
 172          * Workalike for explode() with limited memory usage.
 173          * Returns an Iterator
 174          */
 175         static function explode( $separator, $subject ) {
 176                 if ( substr_count( $subject, $separator ) > 1000 ) {
 177                         return new ExplodeIterator( $separator, $subject );
 178                 } else {
 179                         return new ArrayIterator( explode( $separator, $subject ) );
 180                 }
 181         }
 182
 183         /**
 184          * Clean characters that are invalid in the given character set
 185          * from a given string.
 186          *
 187          * @param $string \type{$string} String to clean
 188          * @param $charset \type{$string} Character set (if unspecified, assume $wgOutputEncoding)
 189          * @return \type{$string} Cleaned string
 190          */
 191         public static function cleanForCharset( $string, $charset='' ) {
 192                 global $wgOutputEncoding;
 193                 switch ( $charset ? $charset : $wgOutputEncoding ) {
 194                         # UTF-8 should be all we need to worry about. :)
 195                 case 'UTF-8':
 196                         return self::cleanUtf8( $string );
 197                 default:
 198                         return $string;
 199                 }
 200         }
 201
 202         /**
 203          * Clean invalid UTF-8 characters and sequences from a given string,
 204          * replacing them with U+FFFD.
 205          * Should be RFC 3629 compliant.
 206          *
 207          * @param $string \type{$string} String to clean
 208          * @return \type{$string} Cleaned string
 209          */
 210         private static function cleanUtf8( $str ) {
 211                 # HERE BE DRAGONS!
 212                 # ABANDON ALL HOPE, ALL YE WHO ENTER THE BITWISE HELLFIRE.
 213
 214                 $illegal = array( 0xD800, 0xDB7F, 0xDB80, 0xDBFF,
 215                                   0xDC00, 0xDF80, 0xDFFF, 0xFFFE, 0xFFFF );
 216                 $len = strlen( $str );
 217                 $left = $bytes = 0;
 218                 for ( $i = 0; $i < $len; $i++ ) {
 219                         $ch = ord( $str[$i] );
 220                         if ( !$left ) {
 221                                 if ( !($ch & 0x80 ) )
 222                                         continue;
 223                                 $left = (( $ch & 0xFE ) == 0xFC ? 5 :
 224                                         (( $ch & 0xFC ) == 0xF8 ? 4 :
 225                                         (( $ch & 0xF8 ) == 0xF0 ? 3 :
 226                                         (( $ch & 0xF0 ) == 0xE0 ? 2 :
 227                                         (( $ch & 0xE0 ) == 0xC0 ? 1 :
 228                                                                   0 )))));
 229                                 if ( $left ) {
 230                                         $bytes = $left + 1;
 231                                         $sum = $ch & ( 0xFF >> $bytes + 1 );
 232                                         continue;
 233                                 } else if ( $ch & 0x80 ) {
 234                                         $bytes = 1;
 235                                 }
 236                         } else if ( ( $ch & 0xC0 ) == 0x80 ) {
 237                                 $sum <<= 6;
 238                                 $sum += $ch & 0x3F;
 239                                 if ( --$left ) continue;
 240                                 if ( ( $bytes == 2 && $sum < 0x80     ) ||
 241                                      ( $bytes == 3 && $sum < 0x800    ) ||
 242                                      ( $bytes == 4 && $sum < 0x10000  ) ||
 243                                      ( $bytes >  4 || $sum > 0x10FFFF ) ||
 244                                      in_array( $sum, $illegal ) ) {
 245                                 } else continue;
 246
 247                         } else {
 248                                 $bytes -= $left;
 249                                 $i--;
 250                         }
 251
 252                         $str = ( substr( $str, 0, $i - $bytes + 1 ) .
 253                                  "\xEF\xBF\xBD" .
 254                                  substr( $str, $i + 1 ) );
 255                         $i   += 3 - $bytes;
 256                         $len += 3 - $bytes;
 257                         $left = 0;
 258                 }
 259
 260                 return $str;
 261         }
 262 }
 263
 264 /**
 265  * Base class for "replacers", objects used in preg_replace_callback() and
 266  * StringUtils::delimiterReplaceCallback()
 267  */
 268 class Replacer {
 269         function cb() {
 270                 return array( &$this, 'replace' );
 271         }
 272 }
 273
 274 /**
 275  * Class to replace regex matches with a string similar to that used in preg_replace()
 276  */
 277 class RegexlikeReplacer extends Replacer {
 278         var $r;
 279         function __construct( $r ) {
 280                 $this->r = $r;
 281         }
 282
 283         function replace( $matches ) {
 284                 $pairs = array();
 285                 foreach ( $matches as $i => $match ) {
 286                         $pairs["\$$i"] = $match;
 287                 }
 288                 return strtr( $this->r, $pairs );
 289         }
 290
 291 }
 292
 293 /**
 294  * Class to perform secondary replacement within each replacement string
 295  */
 296 class DoubleReplacer extends Replacer {
 297         function __construct( $from, $to, $index = 0 ) {
 298                 $this->from = $from;
 299                 $this->to = $to;
 300                 $this->index = $index;
 301         }
 302
 303         function replace( $matches ) {
 304                 return str_replace( $this->from, $this->to, $matches[$this->index] );
 305         }
 306 }
 307
 308 /**
 309  * Class to perform replacement based on a simple hashtable lookup
 310  */
 311 class HashtableReplacer extends Replacer {
 312         var $table, $index;
 313
 314         function __construct( $table, $index = 0 ) {
 315                 $this->table = $table;
 316                 $this->index = $index;
 317         }
 318
 319         function replace( $matches ) {
 320                 return $this->table[$matches[$this->index]];
 321         }
 322 }
 323
 324 /**
 325  * Replacement array for FSS with fallback to strtr()
 326  * Supports lazy initialisation of FSS resource
 327  */
 328 class ReplacementArray {
 329         /*mostly private*/ var $data = false;
 330         /*mostly private*/ var $fss = false;
 331
 332         /**
 333          * Create an object with the specified replacement array
 334          * The array should have the same form as the replacement array for strtr()
 335          */
 336         function __construct( $data = array() ) {
 337                 $this->data = $data;
 338         }
 339
 340         function __sleep() {
 341                 return array( 'data' );
 342         }
 343
 344         function __wakeup() {
 345                 $this->fss = false;
 346         }
 347
 348         /**
 349          * Set the whole replacement array at once
 350          */
 351         function setArray( $data ) {
 352                 $this->data = $data;
 353                 $this->fss = false;
 354         }
 355
 356         function getArray() {
 357                 return $this->data;
 358         }
 359
 360         /**
 361          * Set an element of the replacement array
 362          */
 363         function setPair( $from, $to ) {
 364                 $this->data[$from] = $to;
 365                 $this->fss = false;
 366         }
 367
 368         function mergeArray( $data ) {
 369                 $this->data = array_merge( $this->data, $data );
 370                 $this->fss = false;
 371         }
 372
 373         function merge( $other ) {
 374                 $this->data = array_merge( $this->data, $other->data );
 375                 $this->fss = false;
 376         }
 377
 378         function removePair( $from ) {
 379                 unset($this->data[$from]);
 380                 $this->fss = false;
 381         }
 382
 383         function removeArray( $data ) {
 384                 foreach( $data as $from => $to )
 385                         $this->removePair( $from );
 386                 $this->fss = false;
 387         }
 388
 389         function replace( $subject ) {
 390                 if ( function_exists( 'fss_prep_replace' ) ) {
 391                         wfProfileIn( __METHOD__.'-fss' );
 392                         if ( $this->fss === false ) {
 393                                 $this->fss = fss_prep_replace( $this->data );
 394                         }
 395                         $result = fss_exec_replace( $this->fss, $subject );
 396                         wfProfileOut( __METHOD__.'-fss' );
 397                 } else {
 398                         wfProfileIn( __METHOD__.'-strtr' );
 399                         $result = strtr( $subject, $this->data );
 400                         wfProfileOut( __METHOD__.'-strtr' );
 401                 }
 402                 return $result;
 403         }
 404 }
 405
/**
 * An iterator which works exactly like:
 *
 * foreach ( explode( $delim, $s ) as $element ) {
 *    ...
 * }
 *
 * Except it doesn't use 193 bytes per element
 */
 415 class ExplodeIterator implements Iterator {
 416         // The subject string
 417         var $subject, $subjectLength;
 418
 419         // The delimiter
 420         var $delim, $delimLength;
 421
 422         // The position of the start of the line
 423         var $curPos;
 424
 425         // The position after the end of the next delimiter
 426         var $endPos;
 427
 428         // The current token
 429         var $current;
 430
 431         /**
 432          * Construct a DelimIterator
 433          */
 434         function __construct( $delim, $s ) {
 435                 $this->subject = $s;
 436                 $this->delim = $delim;
 437
 438                 // Micro-optimisation (theoretical)
 439                 $this->subjectLength = strlen( $s );
 440                 $this->delimLength = strlen( $delim );
 441
 442                 $this->rewind();
 443         }
 444
 445         function rewind() {
 446                 $this->curPos = 0;
 447                 $this->endPos = strpos( $this->subject, $this->delim );
 448                 $this->refreshCurrent();
 449         }
 450
 451
 452         function refreshCurrent() {
 453                 if ( $this->curPos === false ) {
 454                         $this->current = false;
 455                 } elseif ( $this->curPos >= $this->subjectLength ) {
 456                         $this->current = '';
 457                 } elseif ( $this->endPos === false ) {
 458                         $this->current = substr( $this->subject, $this->curPos );
 459                 } else {
 460                         $this->current = substr( $this->subject, $this->curPos, $this->endPos - $this->curPos );
 461                 }
 462         }
 463
 464         function current() {
 465                 return $this->current;
 466         }
 467
 468         function key() {
 469                 return $this->curPos;
 470         }
 471
 472         function next() {
 473                 if ( $this->endPos === false ) {
 474                         $this->curPos = false;
 475                 } else {
 476                         $this->curPos = $this->endPos + $this->delimLength;
 477                         if ( $this->curPos >= $this->subjectLength ) {
 478                                 $this->endPos = false;
 479                         } else {
 480                                 $this->endPos = strpos( $this->subject, $this->delim, $this->curPos );
 481                         }
 482                 }
 483                 $this->refreshCurrent();
 484                 return $this->current;
 485         }
 486
 487         function valid() {
 488                 return $this->curPos !== false;
 489         }
 490 }
 491