includes/libs/StringUtils.php

   1 <?php
   2 /**
   3  * Methods to play with strings.
   4  *
   5  * This program is free software; you can redistribute it and/or modify
   6  * it under the terms of the GNU General Public License as published by
   7  * the Free Software Foundation; either version 2 of the License, or
   8  * (at your option) any later version.
   9  *
  10  * This program is distributed in the hope that it will be useful,
  11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  13  * GNU General Public License for more details.
  14  *
  15  * You should have received a copy of the GNU General Public License along
  16  * with this program; if not, write to the Free Software Foundation, Inc.,
  17  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  18  * http://www.gnu.org/copyleft/gpl.html
  19  *
  20  * @file
  21  */
  22
  23 /**
  24  * A collection of static methods to play with strings.
  25  */
  class StringUtils {
          /**
           * Test whether a string is valid UTF-8.
           *
           * The function checks for invalid byte sequences and overlong encodings,
           * but not for different normalisations.
           *
           * @note In MediaWiki 1.21, this function did not provide proper UTF-8 validation.
           * In particular, the pure PHP code path did not in fact check for overlong forms.
           * Beware of this when backporting code to that version of MediaWiki.
           *
           * @since 1.21
           * @param string $value String to check (cast to string before checking)
           * @return bool Whether the given $value is a valid UTF-8 encoded string
           */
          public static function isUtf8( $value ) {
                  $value = (string)$value;

                  // HHVM 3.4 and older come with an outdated version of libmbfl that
                  // incorrectly allows values above U+10FFFF, so we have to check
                  // for them separately. (This issue also exists in PHP 5.3 and
                  // older, which are no longer supported.)
                  // The probe runs once per process: a correct mbstring rejects the
                  // encoding of U+110000, in which case no extra check is needed.
                  static $newPHP;
                  if ( $newPHP === null ) {
                          $newPHP = !mb_check_encoding( "\xf4\x90\x80\x80", 'UTF-8' );
                  }

                  // The regex flags lead bytes that can only start a sequence above
                  // U+10FFFF: 0xF4 followed by 0x90..0xBF, or any of 0xF5..0xFF.
                  return mb_check_encoding( $value, 'UTF-8' ) &&
                          ( $newPHP || preg_match( "/\xf4[\x90-\xbf]|[\xf5-\xff]/S", $value ) === 0 );
          }

          /**
           * Explode a string, but ignore any instances of the separator inside
           * the given start and end delimiters, which may optionally nest.
           * The delimiters are literal strings, not regular expressions.
           *
           * Note that every end delimiter decrements the nesting depth, including
           * one that has no matching start delimiter. Separators are only split on
           * while the depth is exactly zero, so separators that follow an unmatched
           * end delimiter are not split on.
           *
           * @param string $startDelim Start delimiter
           * @param string $endDelim End delimiter
           * @param string $separator Separator string for the explode.
           * @param string $subject Subject string to explode.
           * @param bool $nested True iff the delimiters are allowed to nest.
           * @return ArrayIterator
           */
          public static function delimiterExplode( $startDelim, $endDelim, $separator,
                  $subject, $nested = false ) {
                  $inputPos = 0;
                  $lastPos = 0;
                  $depth = 0;
                  $encStart = preg_quote( $startDelim, '!' );
                  $encEnd = preg_quote( $endDelim, '!' );
                  $encSep = preg_quote( $separator, '!' );
                  $len = strlen( $subject );
                  $m = [];
                  $exploded = [];
                  while (
                          $inputPos < $len &&
                          preg_match(
                                  "!$encStart|$encEnd|$encSep!S", $subject, $m,
                                  PREG_OFFSET_CAPTURE, $inputPos
                          )
                  ) {
                          $match = $m[0][0];
                          $matchPos = $m[0][1];
                          // Resume scanning right after the token that was just found
                          $inputPos = $matchPos + strlen( $match );
                          if ( $match === $separator ) {
                                  if ( $depth === 0 ) {
                                          // Top-level separator: emit the piece that ends here
                                          $exploded[] = substr(
                                                  $subject, $lastPos, $matchPos - $lastPos
                                          );
                                          $lastPos = $inputPos;
                                  }
                          } elseif ( $match === $startDelim ) {
                                  // Without nesting, a start delimiter inside an open pair is ignored
                                  if ( $depth === 0 || $nested ) {
                                          $depth++;
                                  }
                          } else {
                                  // Anything else the pattern matched is the end delimiter
                                  $depth--;
                          }
                  }
                  // Whatever follows the last top-level separator is the final piece
                  $exploded[] = substr( $subject, $lastPos );
                  // This method could be rewritten in the future to avoid creating an
                  // intermediate array, since the return type is just an iterator.
                  return new ArrayIterator( $exploded );
          }

          /**
           * Perform an operation equivalent to `preg_replace()`
           *
           * Matches this code:
           *
           *     preg_replace( "!$startDelim(.*?)$endDelim!", $replace, $subject );
           *
           * ..except that it's worst-case O(N) instead of O(N^2). Compared to delimiterReplace(), this
           * implementation is fast but memory-hungry and inflexible. The memory requirements are such
           * that I don't recommend using it on anything but guaranteed small chunks of text.
           *
           * The delimiters are literal strings, and $replace is inserted verbatim.
           *
           * @param string $startDelim
           * @param string $endDelim
           * @param string $replace
           * @param string $subject
           * @return string
           */
          public static function hungryDelimiterReplace( $startDelim, $endDelim, $replace, $subject ) {
                  $segments = explode( $startDelim, $subject );
                  // Text before the first start delimiter is copied through untouched
                  $output = array_shift( $segments );
                  $endLength = strlen( $endDelim );
                  foreach ( $segments as $s ) {
                          $endDelimPos = strpos( $s, $endDelim );
                          if ( $endDelimPos === false ) {
                                  // No end delimiter in this segment: restore the start delimiter
                                  // that explode() consumed and keep the text as it was
                                  $output .= $startDelim . $s;
                          } else {
                                  // Drop everything up to and including the end delimiter
                                  $output .= $replace . substr( $s, $endDelimPos + $endLength );
                          }
                  }

                  return $output;
          }

          /**
           * Perform an operation equivalent to `preg_replace_callback()`
           *
           * Matches this code:
           *
           *     preg_replace_callback( "!$startDelim(.*)$endDelim!s$flags", $callback, $subject );
           *
           * If the start delimiter ends with an initial substring of the end delimiter,
           * e.g. in the case of C-style comments, the behavior differs from the model
           * regex. In this implementation, the end must share no characters with the
           * start, so e.g. `/*\/` is not considered to be both the start and end of a
           * comment. `/*\/xy/*\/` is considered to be a single comment with contents `/xy/`.
           *
           * The implementation of delimiterReplaceCallback() is slower than hungryDelimiterReplace()
           * but uses far less memory. The delimiters are literal strings, not regular expressions.
           *
           * @param string $startDelim Start delimiter
           * @param string $endDelim End delimiter
           * @param callable $callback Function to call on each match. It receives an array
           *  whose element 0 is the whole match including both delimiters and whose
           *  element 1 is the text between them; its return value replaces the match.
           * @param string $subject
           * @param string $flags Regular expression flags
           * @throws InvalidArgumentException If the scanner matches an empty token, e.g.
           *  because a delimiter is the empty string
           * @return string
           */
          public static function delimiterReplaceCallback( $startDelim, $endDelim, $callback,
                  $subject, $flags = ''
          ) {
                  $inputPos = 0;
                  $outputPos = 0;
                  $contentPos = 0;
                  $output = '';
                  $foundStart = false;
                  $encStart = preg_quote( $startDelim, '!' );
                  $encEnd = preg_quote( $endDelim, '!' );
                  $strcmp = strpos( $flags, 'i' ) === false ? 'strcmp' : 'strcasecmp';
                  $endLength = strlen( $endDelim );
                  $subjectLength = strlen( $subject );
                  $m = [];

                  while ( $inputPos < $subjectLength &&
                          preg_match( "!($encStart)|($encEnd)!S$flags", $subject, $m, PREG_OFFSET_CAPTURE, $inputPos )
                  ) {
                          $tokenOffset = $m[0][1];
                          if ( $m[1][0] !== '' ) {
                                  if ( $foundStart &&
                                          $strcmp( $endDelim, substr( $subject, $tokenOffset, $endLength ) ) === 0
                                  ) {
                                          # An end match is present at the same location
                                          $tokenType = 'end';
                                          $tokenLength = $endLength;
                                  } else {
                                          $tokenType = 'start';
                                          $tokenLength = strlen( $m[0][0] );
                                  }
                          } elseif ( isset( $m[2] ) && $m[2][0] !== '' ) {
                                  # PCRE omits trailing groups that did not take part in the match,
                                  # so group 2 has to be checked for existence before it is read
                                  $tokenType = 'end';
                                  $tokenLength = strlen( $m[0][0] );
                          } else {
                                  # Neither group captured any text: the token is empty
                                  throw new InvalidArgumentException( 'Invalid delimiter given to ' . __METHOD__ );
                          }

                          if ( $tokenType === 'start' ) {
                                  # Only move the start position if we haven't already found a start
                                  # This means that START START END matches outer pair
                                  if ( !$foundStart ) {
                                          # Found start
                                          $inputPos = $tokenOffset + $tokenLength;
                                          # Write out the non-matching section
                                          $output .= substr( $subject, $outputPos, $tokenOffset - $outputPos );
                                          $outputPos = $tokenOffset;
                                          $contentPos = $inputPos;
                                          $foundStart = true;
                                  } else {
                                          # Move the input position past the *first character* of START,
                                          # to protect against missing END when it overlaps with START
                                          $inputPos = $tokenOffset + 1;
                                  }
                          } else {
                                  if ( $foundStart ) {
                                          # Found match
                                          $output .= call_user_func( $callback, [
                                                  substr( $subject, $outputPos, $tokenOffset + $tokenLength - $outputPos ),
                                                  substr( $subject, $contentPos, $tokenOffset - $contentPos )
                                          ] );
                                          $foundStart = false;
                                  } else {
                                          # Non-matching end, write it out together with the text before it.
                                          # No start is pending here, so $outputPos equals $inputPos.
                                          $output .= substr( $subject, $outputPos, $tokenOffset + $tokenLength - $outputPos );
                                  }
                                  $inputPos = $outputPos = $tokenOffset + $tokenLength;
                          }
                  }
                  # Copy the remainder, including an unterminated START and what follows it
                  if ( $outputPos < $subjectLength ) {
                          $output .= substr( $subject, $outputPos );
                  }

                  return $output;
          }

          /**
           * Perform an operation equivalent to `preg_replace()` with flags.
           *
           * Matches this code:
           *
           *     preg_replace( "!$startDelim(.*)$endDelim!$flags", $replace, $subject );
           *
           * The work is delegated to delimiterReplaceCallback(), which quotes both
           * delimiters, so they are matched as literal strings.
           *
           * @param string $startDelim Start delimiter (literal string, not a regular expression)
           * @param string $endDelim End delimiter (literal string, not a regular expression)
           * @param string $replace Replacement string. May contain $1, which will be
           *  replaced by the text between the delimiters
           * @param string $subject String to search
           * @param string $flags Regular expression flags
           * @return string The string with the matches replaced
           */
          public static function delimiterReplace( $startDelim, $endDelim, $replace, $subject, $flags = '' ) {
                  $replacer = new RegexlikeReplacer( $replace );

                  return self::delimiterReplaceCallback( $startDelim, $endDelim,
                          $replacer->cb(), $subject, $flags );
          }

          /**
           * More or less "markup-safe" explode()
           * Ignores any instances of the separator inside `<...>`
           *
           * Any NUL bytes in $text are removed, because NUL is used internally as
           * a placeholder.
           *
           * @param string $separator
           * @param string $text
           * @return array
           */
          public static function explodeMarkup( $separator, $text ) {
                  $placeholder = "\x00";
                  $cleaned = self::maskInsideMarkup( $separator, $placeholder, $text );

                  // Explode, then put the replaced separators back in
                  $items = explode( $separator, $cleaned );
                  foreach ( $items as $i => $str ) {
                          $items[$i] = str_replace( $placeholder, $separator, $str );
                  }

                  return $items;
          }

          /**
           * More or less "markup-safe" str_replace()
           * Ignores any instances of the search string inside `<...>`
           *
           * Any NUL bytes in $text are removed, because NUL is used internally as
           * a placeholder.
           *
           * @param string $search
           * @param string $replace
           * @param string $text
           * @return string
           */
          public static function replaceMarkup( $search, $replace, $text ) {
                  $placeholder = "\x00";
                  $cleaned = self::maskInsideMarkup( $search, $placeholder, $text );

                  // Replace what is left outside the tags, then put the masked
                  // occurrences of the search string back in
                  $cleaned = str_replace( $search, $replace, $cleaned );
                  $text = str_replace( $placeholder, $search, $cleaned );

                  return $text;
          }

          /**
           * Shared first step of explodeMarkup() and replaceMarkup().
           *
           * @param string $needle String to hide while it appears inside `<...>`
           * @param string $placeholder Stand-in for the hidden occurrences
           * @param string $text
           * @return string $text without pre-existing placeholders and with the
           *  in-tag occurrences of $needle masked
           */
          private static function maskInsideMarkup( $needle, $placeholder, $text ) {
                  // Remove placeholder instances
                  $text = str_replace( $placeholder, '', $text );

                  // Replace instances of the needle inside HTML-like tags with the placeholder
                  $replacer = new DoubleReplacer( $needle, $placeholder );

                  return self::delimiterReplaceCallback( '<', '>', $replacer->cb(), $text );
          }

          /**
           * Escape a string to make it suitable for inclusion in a preg_replace()
           * replacement parameter.
           *
           * @param string $string
           * @return string
           */
          public static function escapeRegexReplacement( $string ) {
                  // Backslashes go first, so that the ones added for `$` are not doubled
                  $string = str_replace( '\\', '\\\\', $string );
                  $string = str_replace( '$', '\\$', $string );
                  return $string;
          }

          /**
           * Workalike for explode() with limited memory usage.
           *
           * Subjects containing more than 1000 separators are handed to
           * ExplodeIterator; anything smaller is split eagerly with explode().
           *
           * @param string $separator
           * @param string $subject
           * @return ArrayIterator|ExplodeIterator
           */
          public static function explode( $separator, $subject ) {
                  if ( substr_count( $subject, $separator ) > 1000 ) {
                          return new ExplodeIterator( $separator, $subject );
                  } else {
                          return new ArrayIterator( explode( $separator, $subject ) );
                  }
          }
  }