StringUtils: throw InvalidArgumentException and move into libs/
authorKunal Mehta <legoktm@gmail.com>
Thu, 29 Jan 2015 18:30:57 +0000 (10:30 -0800)
committerKunal Mehta <legoktm@gmail.com>
Thu, 29 Jan 2015 18:30:57 +0000 (10:30 -0800)
Bug: T87863
Change-Id: Iac5bd958c27cad834e35930d0b99adb75c60411b

autoload.php
includes/libs/StringUtils.php [new file with mode: 0644]
includes/utils/StringUtils.php [deleted file]
tests/phpunit/includes/libs/StringUtilsTest.php [new file with mode: 0644]
tests/phpunit/includes/utils/StringUtilsTest.php [deleted file]

index e6f0f89..90cd074 100644 (file)
@@ -1155,7 +1155,7 @@ $wgAutoloadLocalClasses = array(
        'StoreFileOp' => __DIR__ . '/includes/filebackend/FileOp.php',
        'StreamFile' => __DIR__ . '/includes/StreamFile.php',
        'StringPrefixSearch' => __DIR__ . '/includes/PrefixSearch.php',
-       'StringUtils' => __DIR__ . '/includes/utils/StringUtils.php',
+       'StringUtils' => __DIR__ . '/includes/libs/StringUtils.php',
        'StripState' => __DIR__ . '/includes/parser/StripState.php',
        'StubObject' => __DIR__ . '/includes/StubObject.php',
        'StubUserLang' => __DIR__ . '/includes/StubObject.php',
diff --git a/includes/libs/StringUtils.php b/includes/libs/StringUtils.php
new file mode 100644 (file)
index 0000000..11ae0b2
--- /dev/null
@@ -0,0 +1,317 @@
+<?php
+/**
+ * Methods to play with strings.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ * http://www.gnu.org/copyleft/gpl.html
+ *
+ * @file
+ */
+
+/**
+ * A collection of static methods to play with strings.
+ */
+class StringUtils {
+       /**
+        * Test whether a string is valid UTF-8.
+        *
+        * The function check for invalid byte sequences, overlong encoding but
+        * not for different normalisations.
+        *
+        * This relies internally on the mbstring function mb_check_encoding()
+        * hardcoded to check against UTF-8. Whenever the function is not available
+        * we fallback to a pure PHP implementation. Setting $disableMbstring to
+        * true will skip the use of mb_check_encoding, this is mostly intended for
+        * unit testing our internal implementation.
+        *
+        * @since 1.21
+        * @note In MediaWiki 1.21, this function did not provide proper UTF-8 validation.
+        * In particular, the pure PHP code path did not in fact check for overlong forms.
+        * Beware of this when backporting code to that version of MediaWiki.
+        *
+        * @param string $value String to check
+        * @param bool $disableMbstring Whether to use the pure PHP
+        * implementation instead of trying mb_check_encoding. Intended for unit
+        * testing. Default: false
+        *
+        * @return bool Whether the given $value is a valid UTF-8 encoded string
+        */
+       static function isUtf8( $value, $disableMbstring = false ) {
+               $value = (string)$value;
+
+               // If the mbstring extension is loaded, use it. However, before PHP 5.4, values above
+               // U+10FFFF are incorrectly allowed, so we have to check for them separately.
+               if ( !$disableMbstring && function_exists( 'mb_check_encoding' ) ) {
+                       static $newPHP;
+                       if ( $newPHP === null ) {
+                               $newPHP = !mb_check_encoding( "\xf4\x90\x80\x80", 'UTF-8' );
+                       }
+
+                       return mb_check_encoding( $value, 'UTF-8' ) &&
+                               ( $newPHP || preg_match( "/\xf4[\x90-\xbf]|[\xf5-\xff]/S", $value ) === 0 );
+               }
+
+               if ( preg_match( "/[\x80-\xff]/S", $value ) === 0 ) {
+                       // String contains only ASCII characters, has to be valid
+                       return true;
+               }
+
+               // PCRE implements repetition using recursion; to avoid a stack overflow (and segfault)
+               // for large input, we check for invalid sequences (<= 5 bytes) rather than valid
+               // sequences, which can be as long as the input string is. Multiple short regexes are
+               // used rather than a single long regex for performance.
+               static $regexes;
+               if ( $regexes === null ) {
+                       $cont = "[\x80-\xbf]";
+                       $after = "(?!$cont)"; // "(?:[^\x80-\xbf]|$)" would work here
+                       $regexes = array(
+                               // Continuation byte at the start
+                               "/^$cont/",
+
+                               // ASCII byte followed by a continuation byte
+                               "/[\\x00-\x7f]$cont/S",
+
+                               // Illegal byte
+                               "/[\xc0\xc1\xf5-\xff]/S",
+
+                               // Invalid 2-byte sequence, or valid one then an extra continuation byte
+                               "/[\xc2-\xdf](?!$cont$after)/S",
+
+                               // Invalid 3-byte sequence, or valid one then an extra continuation byte
+                               "/\xe0(?![\xa0-\xbf]$cont$after)/",
+                               "/[\xe1-\xec\xee\xef](?!$cont{2}$after)/S",
+                               "/\xed(?![\x80-\x9f]$cont$after)/",
+
+                               // Invalid 4-byte sequence, or valid one then an extra continuation byte
+                               "/\xf0(?![\x90-\xbf]$cont{2}$after)/",
+                               "/[\xf1-\xf3](?!$cont{3}$after)/S",
+                               "/\xf4(?![\x80-\x8f]$cont{2}$after)/",
+                       );
+               }
+
+               foreach ( $regexes as $regex ) {
+                       if ( preg_match( $regex, $value ) !== 0 ) {
+                               return false;
+                       }
+               }
+
+               return true;
+       }
+
+       /**
+        * Perform an operation equivalent to
+        *
+        *     preg_replace( "!$startDelim(.*?)$endDelim!", $replace, $subject );
+        *
+        * except that it's worst-case O(N) instead of O(N^2)
+        *
+        * Compared to delimiterReplace(), this implementation is fast but memory-
+        * hungry and inflexible. The memory requirements are such that I don't
+        * recommend using it on anything but guaranteed small chunks of text.
+        *
+        * @param string $startDelim
+        * @param string $endDelim
+        * @param string $replace
+        * @param string $subject
+        *
+        * @return string
+        */
+       static function hungryDelimiterReplace( $startDelim, $endDelim, $replace, $subject ) {
+               $segments = explode( $startDelim, $subject );
+               $output = array_shift( $segments );
+               foreach ( $segments as $s ) {
+                       $endDelimPos = strpos( $s, $endDelim );
+                       if ( $endDelimPos === false ) {
+                               $output .= $startDelim . $s;
+                       } else {
+                               $output .= $replace . substr( $s, $endDelimPos + strlen( $endDelim ) );
+                       }
+               }
+
+               return $output;
+       }
+
+       /**
+        * Perform an operation equivalent to
+        *
+        *   preg_replace_callback( "!$startDelim(.*)$endDelim!s$flags", $callback, $subject )
+        *
+        * This implementation is slower than hungryDelimiterReplace but uses far less
+        * memory. The delimiters are literal strings, not regular expressions.
+        *
+        * If the start delimiter ends with an initial substring of the end delimiter,
+        * e.g. in the case of C-style comments, the behavior differs from the model
+        * regex. In this implementation, the end must share no characters with the
+        * start, so e.g. /*\/ is not considered to be both the start and end of a
+        * comment. /*\/xy/*\/ is considered to be a single comment with contents /xy/.
+        *
+        * @param string $startDelim Start delimiter
+        * @param string $endDelim End delimiter
+        * @param callable $callback Function to call on each match
+        * @param string $subject
+        * @param string $flags Regular expression flags
+        * @throws InvalidArgumentException
+        * @return string
+        */
+       static function delimiterReplaceCallback( $startDelim, $endDelim, $callback,
+               $subject, $flags = ''
+       ) {
+               $inputPos = 0;
+               $outputPos = 0;
+               $output = '';
+               $foundStart = false;
+               $encStart = preg_quote( $startDelim, '!' );
+               $encEnd = preg_quote( $endDelim, '!' );
+               $strcmp = strpos( $flags, 'i' ) === false ? 'strcmp' : 'strcasecmp';
+               $endLength = strlen( $endDelim );
+               $m = array();
+
+               while ( $inputPos < strlen( $subject ) &&
+                       preg_match( "!($encStart)|($encEnd)!S$flags", $subject, $m, PREG_OFFSET_CAPTURE, $inputPos )
+               ) {
+                       $tokenOffset = $m[0][1];
+                       if ( $m[1][0] != '' ) {
+                               if ( $foundStart &&
+                                       $strcmp( $endDelim, substr( $subject, $tokenOffset, $endLength ) ) == 0
+                               ) {
+                                       # An end match is present at the same location
+                                       $tokenType = 'end';
+                                       $tokenLength = $endLength;
+                               } else {
+                                       $tokenType = 'start';
+                                       $tokenLength = strlen( $m[0][0] );
+                               }
+                       } elseif ( $m[2][0] != '' ) {
+                               $tokenType = 'end';
+                               $tokenLength = strlen( $m[0][0] );
+                       } else {
+                               throw new InvalidArgumentException( 'Invalid delimiter given to ' . __METHOD__ );
+                       }
+
+                       if ( $tokenType == 'start' ) {
+                               # Only move the start position if we haven't already found a start
+                               # This means that START START END matches outer pair
+                               if ( !$foundStart ) {
+                                       # Found start
+                                       $inputPos = $tokenOffset + $tokenLength;
+                                       # Write out the non-matching section
+                                       $output .= substr( $subject, $outputPos, $tokenOffset - $outputPos );
+                                       $outputPos = $tokenOffset;
+                                       $contentPos = $inputPos;
+                                       $foundStart = true;
+                               } else {
+                                       # Move the input position past the *first character* of START,
+                                       # to protect against missing END when it overlaps with START
+                                       $inputPos = $tokenOffset + 1;
+                               }
+                       } elseif ( $tokenType == 'end' ) {
+                               if ( $foundStart ) {
+                                       # Found match
+                                       $output .= call_user_func( $callback, array(
+                                               substr( $subject, $outputPos, $tokenOffset + $tokenLength - $outputPos ),
+                                               substr( $subject, $contentPos, $tokenOffset - $contentPos )
+                                       ) );
+                                       $foundStart = false;
+                               } else {
+                                       # Non-matching end, write it out
+                                       $output .= substr( $subject, $inputPos, $tokenOffset + $tokenLength - $outputPos );
+                               }
+                               $inputPos = $outputPos = $tokenOffset + $tokenLength;
+                       } else {
+                               throw new InvalidArgumentException( 'Invalid delimiter given to ' . __METHOD__ );
+                       }
+               }
+               if ( $outputPos < strlen( $subject ) ) {
+                       $output .= substr( $subject, $outputPos );
+               }
+
+               return $output;
+       }
+
+       /**
+        * Perform an operation equivalent to
+        *
+        *   preg_replace( "!$startDelim(.*)$endDelim!$flags", $replace, $subject )
+        *
+        * @param string $startDelim Start delimiter regular expression
+        * @param string $endDelim End delimiter regular expression
+        * @param string $replace Replacement string. May contain $1, which will be
+        *                 replaced by the text between the delimiters
+        * @param string $subject String to search
+        * @param string $flags Regular expression flags
+        * @return string The string with the matches replaced
+        */
+       static function delimiterReplace( $startDelim, $endDelim, $replace, $subject, $flags = '' ) {
+               $replacer = new RegexlikeReplacer( $replace );
+
+               return self::delimiterReplaceCallback( $startDelim, $endDelim,
+                       $replacer->cb(), $subject, $flags );
+       }
+
+       /**
+        * More or less "markup-safe" explode()
+        * Ignores any instances of the separator inside <...>
+        * @param string $separator
+        * @param string $text
+        * @return array
+        */
+       static function explodeMarkup( $separator, $text ) {
+               $placeholder = "\x00";
+
+               // Remove placeholder instances
+               $text = str_replace( $placeholder, '', $text );
+
+               // Replace instances of the separator inside HTML-like tags with the placeholder
+               $replacer = new DoubleReplacer( $separator, $placeholder );
+               $cleaned = StringUtils::delimiterReplaceCallback( '<', '>', $replacer->cb(), $text );
+
+               // Explode, then put the replaced separators back in
+               $items = explode( $separator, $cleaned );
+               foreach ( $items as $i => $str ) {
+                       $items[$i] = str_replace( $placeholder, $separator, $str );
+               }
+
+               return $items;
+       }
+
+       /**
+        * Escape a string to make it suitable for inclusion in a preg_replace()
+        * replacement parameter.
+        *
+        * @param string $string
+        * @return string
+        */
+       static function escapeRegexReplacement( $string ) {
+               $string = str_replace( '\\', '\\\\', $string );
+               $string = str_replace( '$', '\\$', $string );
+
+               return $string;
+       }
+
+       /**
+        * Workalike for explode() with limited memory usage.
+        * Returns an Iterator
+        * @param string $separator
+        * @param string $subject
+        * @return ArrayIterator|ExplodeIterator
+        */
+       static function explode( $separator, $subject ) {
+               if ( substr_count( $subject, $separator ) > 1000 ) {
+                       return new ExplodeIterator( $separator, $subject );
+               } else {
+                       return new ArrayIterator( explode( $separator, $subject ) );
+               }
+       }
+}
diff --git a/includes/utils/StringUtils.php b/includes/utils/StringUtils.php
deleted file mode 100644 (file)
index 77fff59..0000000
+++ /dev/null
@@ -1,317 +0,0 @@
-<?php
-/**
- * Methods to play with strings.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License along
- * with this program; if not, write to the Free Software Foundation, Inc.,
- * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
- * http://www.gnu.org/copyleft/gpl.html
- *
- * @file
- */
-
-/**
- * A collection of static methods to play with strings.
- */
-class StringUtils {
-       /**
-        * Test whether a string is valid UTF-8.
-        *
-        * The function check for invalid byte sequences, overlong encoding but
-        * not for different normalisations.
-        *
-        * This relies internally on the mbstring function mb_check_encoding()
-        * hardcoded to check against UTF-8. Whenever the function is not available
-        * we fallback to a pure PHP implementation. Setting $disableMbstring to
-        * true will skip the use of mb_check_encoding, this is mostly intended for
-        * unit testing our internal implementation.
-        *
-        * @since 1.21
-        * @note In MediaWiki 1.21, this function did not provide proper UTF-8 validation.
-        * In particular, the pure PHP code path did not in fact check for overlong forms.
-        * Beware of this when backporting code to that version of MediaWiki.
-        *
-        * @param string $value String to check
-        * @param bool $disableMbstring Whether to use the pure PHP
-        * implementation instead of trying mb_check_encoding. Intended for unit
-        * testing. Default: false
-        *
-        * @return bool Whether the given $value is a valid UTF-8 encoded string
-        */
-       static function isUtf8( $value, $disableMbstring = false ) {
-               $value = (string)$value;
-
-               // If the mbstring extension is loaded, use it. However, before PHP 5.4, values above
-               // U+10FFFF are incorrectly allowed, so we have to check for them separately.
-               if ( !$disableMbstring && function_exists( 'mb_check_encoding' ) ) {
-                       static $newPHP;
-                       if ( $newPHP === null ) {
-                               $newPHP = !mb_check_encoding( "\xf4\x90\x80\x80", 'UTF-8' );
-                       }
-
-                       return mb_check_encoding( $value, 'UTF-8' ) &&
-                               ( $newPHP || preg_match( "/\xf4[\x90-\xbf]|[\xf5-\xff]/S", $value ) === 0 );
-               }
-
-               if ( preg_match( "/[\x80-\xff]/S", $value ) === 0 ) {
-                       // String contains only ASCII characters, has to be valid
-                       return true;
-               }
-
-               // PCRE implements repetition using recursion; to avoid a stack overflow (and segfault)
-               // for large input, we check for invalid sequences (<= 5 bytes) rather than valid
-               // sequences, which can be as long as the input string is. Multiple short regexes are
-               // used rather than a single long regex for performance.
-               static $regexes;
-               if ( $regexes === null ) {
-                       $cont = "[\x80-\xbf]";
-                       $after = "(?!$cont)"; // "(?:[^\x80-\xbf]|$)" would work here
-                       $regexes = array(
-                               // Continuation byte at the start
-                               "/^$cont/",
-
-                               // ASCII byte followed by a continuation byte
-                               "/[\\x00-\x7f]$cont/S",
-
-                               // Illegal byte
-                               "/[\xc0\xc1\xf5-\xff]/S",
-
-                               // Invalid 2-byte sequence, or valid one then an extra continuation byte
-                               "/[\xc2-\xdf](?!$cont$after)/S",
-
-                               // Invalid 3-byte sequence, or valid one then an extra continuation byte
-                               "/\xe0(?![\xa0-\xbf]$cont$after)/",
-                               "/[\xe1-\xec\xee\xef](?!$cont{2}$after)/S",
-                               "/\xed(?![\x80-\x9f]$cont$after)/",
-
-                               // Invalid 4-byte sequence, or valid one then an extra continuation byte
-                               "/\xf0(?![\x90-\xbf]$cont{2}$after)/",
-                               "/[\xf1-\xf3](?!$cont{3}$after)/S",
-                               "/\xf4(?![\x80-\x8f]$cont{2}$after)/",
-                       );
-               }
-
-               foreach ( $regexes as $regex ) {
-                       if ( preg_match( $regex, $value ) !== 0 ) {
-                               return false;
-                       }
-               }
-
-               return true;
-       }
-
-       /**
-        * Perform an operation equivalent to
-        *
-        *     preg_replace( "!$startDelim(.*?)$endDelim!", $replace, $subject );
-        *
-        * except that it's worst-case O(N) instead of O(N^2)
-        *
-        * Compared to delimiterReplace(), this implementation is fast but memory-
-        * hungry and inflexible. The memory requirements are such that I don't
-        * recommend using it on anything but guaranteed small chunks of text.
-        *
-        * @param string $startDelim
-        * @param string $endDelim
-        * @param string $replace
-        * @param string $subject
-        *
-        * @return string
-        */
-       static function hungryDelimiterReplace( $startDelim, $endDelim, $replace, $subject ) {
-               $segments = explode( $startDelim, $subject );
-               $output = array_shift( $segments );
-               foreach ( $segments as $s ) {
-                       $endDelimPos = strpos( $s, $endDelim );
-                       if ( $endDelimPos === false ) {
-                               $output .= $startDelim . $s;
-                       } else {
-                               $output .= $replace . substr( $s, $endDelimPos + strlen( $endDelim ) );
-                       }
-               }
-
-               return $output;
-       }
-
-       /**
-        * Perform an operation equivalent to
-        *
-        *   preg_replace_callback( "!$startDelim(.*)$endDelim!s$flags", $callback, $subject )
-        *
-        * This implementation is slower than hungryDelimiterReplace but uses far less
-        * memory. The delimiters are literal strings, not regular expressions.
-        *
-        * If the start delimiter ends with an initial substring of the end delimiter,
-        * e.g. in the case of C-style comments, the behavior differs from the model
-        * regex. In this implementation, the end must share no characters with the
-        * start, so e.g. /*\/ is not considered to be both the start and end of a
-        * comment. /*\/xy/*\/ is considered to be a single comment with contents /xy/.
-        *
-        * @param string $startDelim Start delimiter
-        * @param string $endDelim End delimiter
-        * @param callable $callback Function to call on each match
-        * @param string $subject
-        * @param string $flags Regular expression flags
-        * @throws MWException
-        * @return string
-        */
-       static function delimiterReplaceCallback( $startDelim, $endDelim, $callback,
-               $subject, $flags = ''
-       ) {
-               $inputPos = 0;
-               $outputPos = 0;
-               $output = '';
-               $foundStart = false;
-               $encStart = preg_quote( $startDelim, '!' );
-               $encEnd = preg_quote( $endDelim, '!' );
-               $strcmp = strpos( $flags, 'i' ) === false ? 'strcmp' : 'strcasecmp';
-               $endLength = strlen( $endDelim );
-               $m = array();
-
-               while ( $inputPos < strlen( $subject ) &&
-                       preg_match( "!($encStart)|($encEnd)!S$flags", $subject, $m, PREG_OFFSET_CAPTURE, $inputPos )
-               ) {
-                       $tokenOffset = $m[0][1];
-                       if ( $m[1][0] != '' ) {
-                               if ( $foundStart &&
-                                       $strcmp( $endDelim, substr( $subject, $tokenOffset, $endLength ) ) == 0
-                               ) {
-                                       # An end match is present at the same location
-                                       $tokenType = 'end';
-                                       $tokenLength = $endLength;
-                               } else {
-                                       $tokenType = 'start';
-                                       $tokenLength = strlen( $m[0][0] );
-                               }
-                       } elseif ( $m[2][0] != '' ) {
-                               $tokenType = 'end';
-                               $tokenLength = strlen( $m[0][0] );
-                       } else {
-                               throw new MWException( 'Invalid delimiter given to ' . __METHOD__ );
-                       }
-
-                       if ( $tokenType == 'start' ) {
-                               # Only move the start position if we haven't already found a start
-                               # This means that START START END matches outer pair
-                               if ( !$foundStart ) {
-                                       # Found start
-                                       $inputPos = $tokenOffset + $tokenLength;
-                                       # Write out the non-matching section
-                                       $output .= substr( $subject, $outputPos, $tokenOffset - $outputPos );
-                                       $outputPos = $tokenOffset;
-                                       $contentPos = $inputPos;
-                                       $foundStart = true;
-                               } else {
-                                       # Move the input position past the *first character* of START,
-                                       # to protect against missing END when it overlaps with START
-                                       $inputPos = $tokenOffset + 1;
-                               }
-                       } elseif ( $tokenType == 'end' ) {
-                               if ( $foundStart ) {
-                                       # Found match
-                                       $output .= call_user_func( $callback, array(
-                                               substr( $subject, $outputPos, $tokenOffset + $tokenLength - $outputPos ),
-                                               substr( $subject, $contentPos, $tokenOffset - $contentPos )
-                                       ) );
-                                       $foundStart = false;
-                               } else {
-                                       # Non-matching end, write it out
-                                       $output .= substr( $subject, $inputPos, $tokenOffset + $tokenLength - $outputPos );
-                               }
-                               $inputPos = $outputPos = $tokenOffset + $tokenLength;
-                       } else {
-                               throw new MWException( 'Invalid delimiter given to ' . __METHOD__ );
-                       }
-               }
-               if ( $outputPos < strlen( $subject ) ) {
-                       $output .= substr( $subject, $outputPos );
-               }
-
-               return $output;
-       }
-
-       /**
-        * Perform an operation equivalent to
-        *
-        *   preg_replace( "!$startDelim(.*)$endDelim!$flags", $replace, $subject )
-        *
-        * @param string $startDelim Start delimiter regular expression
-        * @param string $endDelim End delimiter regular expression
-        * @param string $replace Replacement string. May contain $1, which will be
-        *                 replaced by the text between the delimiters
-        * @param string $subject String to search
-        * @param string $flags Regular expression flags
-        * @return string The string with the matches replaced
-        */
-       static function delimiterReplace( $startDelim, $endDelim, $replace, $subject, $flags = '' ) {
-               $replacer = new RegexlikeReplacer( $replace );
-
-               return self::delimiterReplaceCallback( $startDelim, $endDelim,
-                       $replacer->cb(), $subject, $flags );
-       }
-
-       /**
-        * More or less "markup-safe" explode()
-        * Ignores any instances of the separator inside <...>
-        * @param string $separator
-        * @param string $text
-        * @return array
-        */
-       static function explodeMarkup( $separator, $text ) {
-               $placeholder = "\x00";
-
-               // Remove placeholder instances
-               $text = str_replace( $placeholder, '', $text );
-
-               // Replace instances of the separator inside HTML-like tags with the placeholder
-               $replacer = new DoubleReplacer( $separator, $placeholder );
-               $cleaned = StringUtils::delimiterReplaceCallback( '<', '>', $replacer->cb(), $text );
-
-               // Explode, then put the replaced separators back in
-               $items = explode( $separator, $cleaned );
-               foreach ( $items as $i => $str ) {
-                       $items[$i] = str_replace( $placeholder, $separator, $str );
-               }
-
-               return $items;
-       }
-
-       /**
-        * Escape a string to make it suitable for inclusion in a preg_replace()
-        * replacement parameter.
-        *
-        * @param string $string
-        * @return string
-        */
-       static function escapeRegexReplacement( $string ) {
-               $string = str_replace( '\\', '\\\\', $string );
-               $string = str_replace( '$', '\\$', $string );
-
-               return $string;
-       }
-
-       /**
-        * Workalike for explode() with limited memory usage.
-        * Returns an Iterator
-        * @param string $separator
-        * @param string $subject
-        * @return ArrayIterator|ExplodeIterator
-        */
-       static function explode( $separator, $subject ) {
-               if ( substr_count( $subject, $separator ) > 1000 ) {
-                       return new ExplodeIterator( $separator, $subject );
-               } else {
-                       return new ArrayIterator( explode( $separator, $subject ) );
-               }
-       }
-}
diff --git a/tests/phpunit/includes/libs/StringUtilsTest.php b/tests/phpunit/includes/libs/StringUtilsTest.php
new file mode 100644 (file)
index 0000000..7c24fae
--- /dev/null
@@ -0,0 +1,149 @@
+<?php
+
+class StringUtilsTest extends PHPUnit_Framework_TestCase {
+
+       /**
+        * This tests StringUtils::isUtf8 whenever we have the mbstring extension
+        * loaded.
+        *
+        * @covers StringUtils::isUtf8
+        * @dataProvider provideStringsForIsUtf8Check
+        */
+       public function testIsUtf8WithMbstring( $expected, $string ) {
+               if ( !function_exists( 'mb_check_encoding' ) ) {
+                       $this->markTestSkipped( 'Test requires the mbstring PHP extension' );
+               }
+               $this->assertEquals( $expected,
+                       StringUtils::isUtf8( $string ),
+                       'Testing string "' . $this->escaped( $string ) . '" with mb_check_encoding'
+               );
+       }
+
+       /**
+        * This tests StringUtils::isUtf8 making sure we use the pure PHP
+        * implementation used as a fallback when mb_check_encoding() is
+        * not available.
+        *
+        * @covers StringUtils::isUtf8
+        * @dataProvider provideStringsForIsUtf8Check
+        */
+       public function testIsUtf8WithPhpFallbackImplementation( $expected, $string ) {
+               $this->assertEquals( $expected,
+                       StringUtils::isUtf8( $string, /** disable mbstring: */true ),
+                       'Testing string "' . $this->escaped( $string ) . '" with pure PHP implementation'
+               );
+       }
+
+       /**
+        * Print high range characters as a hexadecimal
+        * @param string $string
+        * @return string
+        */
+       function escaped( $string ) {
+               $escaped = '';
+               $length = strlen( $string );
+               for ( $i = 0; $i < $length; $i++ ) {
+                       $char = $string[$i];
+                       $val = ord( $char );
+                       if ( $val > 127 ) {
+                               $escaped .= '\x' . dechex( $val );
+                       } else {
+                               $escaped .= $char;
+                       }
+               }
+
+               return $escaped;
+       }
+
+       /**
+        * See also "UTF-8 decoder capability and stress test" by
+        * Markus Kuhn:
+        * http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt
+        */
+       public static function provideStringsForIsUtf8Check() {
+               // Expected return values for StringUtils::isUtf8()
+               $PASS = true;
+               $FAIL = false;
+
+               return array(
+                       'some ASCII' => array( $PASS, 'Some ASCII' ),
+                       'euro sign' => array( $PASS, "Euro sign €" ),
+
+                       'first possible sequence 1 byte' => array( $PASS, "\x00" ),
+                       'first possible sequence 2 bytes' => array( $PASS, "\xc2\x80" ),
+                       'first possible sequence 3 bytes' => array( $PASS, "\xe0\xa0\x80" ),
+                       'first possible sequence 4 bytes' => array( $PASS, "\xf0\x90\x80\x80" ),
+                       'first possible sequence 5 bytes' => array( $FAIL, "\xf8\x88\x80\x80\x80" ),
+                       'first possible sequence 6 bytes' => array( $FAIL, "\xfc\x84\x80\x80\x80\x80" ),
+
+                       'last possible sequence 1 byte' => array( $PASS, "\x7f" ),
+                       'last possible sequence 2 bytes' => array( $PASS, "\xdf\xbf" ),
+                       'last possible sequence 3 bytes' => array( $PASS, "\xef\xbf\xbf" ),
+                       'last possible sequence 4 bytes (U+1FFFFF)' => array( $FAIL, "\xf7\xbf\xbf\xbf" ),
+                       'last possible sequence 5 bytes' => array( $FAIL, "\xfb\xbf\xbf\xbf\xbf" ),
+                       'last possible sequence 6 bytes' => array( $FAIL, "\xfd\xbf\xbf\xbf\xbf\xbf" ),
+
+                       'boundary 1' => array( $PASS, "\xed\x9f\xbf" ),
+                       'boundary 2' => array( $PASS, "\xee\x80\x80" ),
+                       'boundary 3' => array( $PASS, "\xef\xbf\xbd" ),
+                       'boundary 4' => array( $PASS, "\xf2\x80\x80\x80" ),
+                       'boundary 5 (U+FFFFF)' => array( $PASS, "\xf3\xbf\xbf\xbf" ),
+                       'boundary 6 (U+100000)' => array( $PASS, "\xf4\x80\x80\x80" ),
+                       'boundary 7 (U+10FFFF)' => array( $PASS, "\xf4\x8f\xbf\xbf" ),
+                       'boundary 8 (U+110000)' => array( $FAIL, "\xf4\x90\x80\x80" ),
+
+                       'malformed 1' => array( $FAIL, "\x80" ),
+                       'malformed 2' => array( $FAIL, "\xbf" ),
+                       'malformed 3' => array( $FAIL, "\x80\xbf" ),
+                       'malformed 4' => array( $FAIL, "\x80\xbf\x80" ),
+                       'malformed 5' => array( $FAIL, "\x80\xbf\x80\xbf" ),
+                       'malformed 6' => array( $FAIL, "\x80\xbf\x80\xbf\x80" ),
+                       'malformed 7' => array( $FAIL, "\x80\xbf\x80\xbf\x80\xbf" ),
+                       'malformed 8' => array( $FAIL, "\x80\xbf\x80\xbf\x80\xbf\x80" ),
+
+                       'last byte missing 1' => array( $FAIL, "\xc0" ),
+                       'last byte missing 2' => array( $FAIL, "\xe0\x80" ),
+                       'last byte missing 3' => array( $FAIL, "\xf0\x80\x80" ),
+                       'last byte missing 4' => array( $FAIL, "\xf8\x80\x80\x80" ),
+                       'last byte missing 5' => array( $FAIL, "\xfc\x80\x80\x80\x80" ),
+                       'last byte missing 6' => array( $FAIL, "\xdf" ),
+                       'last byte missing 7' => array( $FAIL, "\xef\xbf" ),
+                       'last byte missing 8' => array( $FAIL, "\xf7\xbf\xbf" ),
+                       'last byte missing 9' => array( $FAIL, "\xfb\xbf\xbf\xbf" ),
+                       'last byte missing 10' => array( $FAIL, "\xfd\xbf\xbf\xbf\xbf" ),
+
+                       'extra continuation byte 1' => array( $FAIL, "e\xaf" ),
+                       'extra continuation byte 2' => array( $FAIL, "\xc3\x89\xaf" ),
+                       'extra continuation byte 3' => array( $FAIL, "\xef\xbc\xa5\xaf" ),
+                       'extra continuation byte 4' => array( $FAIL, "\xf0\x9d\x99\xb4\xaf" ),
+
+                       'impossible bytes 1' => array( $FAIL, "\xfe" ),
+                       'impossible bytes 2' => array( $FAIL, "\xff" ),
+                       'impossible bytes 3' => array( $FAIL, "\xfe\xfe\xff\xff" ),
+
+                       'overlong sequences 1' => array( $FAIL, "\xc0\xaf" ),
+                       'overlong sequences 2' => array( $FAIL, "\xc1\xaf" ),
+                       'overlong sequences 3' => array( $FAIL, "\xe0\x80\xaf" ),
+                       'overlong sequences 4' => array( $FAIL, "\xf0\x80\x80\xaf" ),
+                       'overlong sequences 5' => array( $FAIL, "\xf8\x80\x80\x80\xaf" ),
+                       'overlong sequences 6' => array( $FAIL, "\xfc\x80\x80\x80\x80\xaf" ),
+
+                       'maximum overlong sequences 1' => array( $FAIL, "\xc1\xbf" ),
+                       'maximum overlong sequences 2' => array( $FAIL, "\xe0\x9f\xbf" ),
+                       'maximum overlong sequences 3' => array( $FAIL, "\xf0\x8f\xbf\xbf" ),
+                       'maximum overlong sequences 4' => array( $FAIL, "\xf8\x87\xbf\xbf" ),
+                       'maximum overlong sequences 5' => array( $FAIL, "\xfc\x83\xbf\xbf\xbf\xbf" ),
+
+                       'surrogates 1 (U+D799)' => array( $PASS, "\xed\x9f\xbf" ),
+                       'surrogates 2 (U+E000)' => array( $PASS, "\xee\x80\x80" ),
+                       'surrogates 3 (U+D800)' => array( $FAIL, "\xed\xa0\x80" ),
+                       'surrogates 4 (U+DBFF)' => array( $FAIL, "\xed\xaf\xbf" ),
+                       'surrogates 5 (U+DC00)' => array( $FAIL, "\xed\xb0\x80" ),
+                       'surrogates 6 (U+DFFF)' => array( $FAIL, "\xed\xbf\xbf" ),
+                       'surrogates 7 (U+D800 U+DC00)' => array( $FAIL, "\xed\xa0\x80\xed\xb0\x80" ),
+
+                       'noncharacters 1' => array( $PASS, "\xef\xbf\xbe" ),
+                       'noncharacters 2' => array( $PASS, "\xef\xbf\xbf" ),
+               );
+       }
+}
diff --git a/tests/phpunit/includes/utils/StringUtilsTest.php b/tests/phpunit/includes/utils/StringUtilsTest.php
deleted file mode 100644 (file)
index 7c24fae..0000000
+++ /dev/null
@@ -1,149 +0,0 @@
-<?php
-
-class StringUtilsTest extends PHPUnit_Framework_TestCase {
-
-       /**
-        * This tests StringUtils::isUtf8 whenever we have the mbstring extension
-        * loaded.
-        *
-        * @covers StringUtils::isUtf8
-        * @dataProvider provideStringsForIsUtf8Check
-        */
-       public function testIsUtf8WithMbstring( $expected, $string ) {
-               if ( !function_exists( 'mb_check_encoding' ) ) {
-                       $this->markTestSkipped( 'Test requires the mbstring PHP extension' );
-               }
-               $this->assertEquals( $expected,
-                       StringUtils::isUtf8( $string ),
-                       'Testing string "' . $this->escaped( $string ) . '" with mb_check_encoding'
-               );
-       }
-
-       /**
-        * This tests StringUtils::isUtf8 making sure we use the pure PHP
-        * implementation used as a fallback when mb_check_encoding() is
-        * not available.
-        *
-        * @covers StringUtils::isUtf8
-        * @dataProvider provideStringsForIsUtf8Check
-        */
-       public function testIsUtf8WithPhpFallbackImplementation( $expected, $string ) {
-               $this->assertEquals( $expected,
-                       StringUtils::isUtf8( $string, /** disable mbstring: */true ),
-                       'Testing string "' . $this->escaped( $string ) . '" with pure PHP implementation'
-               );
-       }
-
-       /**
-        * Print high range characters as a hexadecimal
-        * @param string $string
-        * @return string
-        */
-       function escaped( $string ) {
-               $escaped = '';
-               $length = strlen( $string );
-               for ( $i = 0; $i < $length; $i++ ) {
-                       $char = $string[$i];
-                       $val = ord( $char );
-                       if ( $val > 127 ) {
-                               $escaped .= '\x' . dechex( $val );
-                       } else {
-                               $escaped .= $char;
-                       }
-               }
-
-               return $escaped;
-       }
-
-       /**
-        * See also "UTF-8 decoder capability and stress test" by
-        * Markus Kuhn:
-        * http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt
-        */
-       public static function provideStringsForIsUtf8Check() {
-               // Expected return values for StringUtils::isUtf8()
-               $PASS = true;
-               $FAIL = false;
-
-               return array(
-                       'some ASCII' => array( $PASS, 'Some ASCII' ),
-                       'euro sign' => array( $PASS, "Euro sign €" ),
-
-                       'first possible sequence 1 byte' => array( $PASS, "\x00" ),
-                       'first possible sequence 2 bytes' => array( $PASS, "\xc2\x80" ),
-                       'first possible sequence 3 bytes' => array( $PASS, "\xe0\xa0\x80" ),
-                       'first possible sequence 4 bytes' => array( $PASS, "\xf0\x90\x80\x80" ),
-                       'first possible sequence 5 bytes' => array( $FAIL, "\xf8\x88\x80\x80\x80" ),
-                       'first possible sequence 6 bytes' => array( $FAIL, "\xfc\x84\x80\x80\x80\x80" ),
-
-                       'last possible sequence 1 byte' => array( $PASS, "\x7f" ),
-                       'last possible sequence 2 bytes' => array( $PASS, "\xdf\xbf" ),
-                       'last possible sequence 3 bytes' => array( $PASS, "\xef\xbf\xbf" ),
-                       'last possible sequence 4 bytes (U+1FFFFF)' => array( $FAIL, "\xf7\xbf\xbf\xbf" ),
-                       'last possible sequence 5 bytes' => array( $FAIL, "\xfb\xbf\xbf\xbf\xbf" ),
-                       'last possible sequence 6 bytes' => array( $FAIL, "\xfd\xbf\xbf\xbf\xbf\xbf" ),
-
-                       'boundary 1' => array( $PASS, "\xed\x9f\xbf" ),
-                       'boundary 2' => array( $PASS, "\xee\x80\x80" ),
-                       'boundary 3' => array( $PASS, "\xef\xbf\xbd" ),
-                       'boundary 4' => array( $PASS, "\xf2\x80\x80\x80" ),
-                       'boundary 5 (U+FFFFF)' => array( $PASS, "\xf3\xbf\xbf\xbf" ),
-                       'boundary 6 (U+100000)' => array( $PASS, "\xf4\x80\x80\x80" ),
-                       'boundary 7 (U+10FFFF)' => array( $PASS, "\xf4\x8f\xbf\xbf" ),
-                       'boundary 8 (U+110000)' => array( $FAIL, "\xf4\x90\x80\x80" ),
-
-                       'malformed 1' => array( $FAIL, "\x80" ),
-                       'malformed 2' => array( $FAIL, "\xbf" ),
-                       'malformed 3' => array( $FAIL, "\x80\xbf" ),
-                       'malformed 4' => array( $FAIL, "\x80\xbf\x80" ),
-                       'malformed 5' => array( $FAIL, "\x80\xbf\x80\xbf" ),
-                       'malformed 6' => array( $FAIL, "\x80\xbf\x80\xbf\x80" ),
-                       'malformed 7' => array( $FAIL, "\x80\xbf\x80\xbf\x80\xbf" ),
-                       'malformed 8' => array( $FAIL, "\x80\xbf\x80\xbf\x80\xbf\x80" ),
-
-                       'last byte missing 1' => array( $FAIL, "\xc0" ),
-                       'last byte missing 2' => array( $FAIL, "\xe0\x80" ),
-                       'last byte missing 3' => array( $FAIL, "\xf0\x80\x80" ),
-                       'last byte missing 4' => array( $FAIL, "\xf8\x80\x80\x80" ),
-                       'last byte missing 5' => array( $FAIL, "\xfc\x80\x80\x80\x80" ),
-                       'last byte missing 6' => array( $FAIL, "\xdf" ),
-                       'last byte missing 7' => array( $FAIL, "\xef\xbf" ),
-                       'last byte missing 8' => array( $FAIL, "\xf7\xbf\xbf" ),
-                       'last byte missing 9' => array( $FAIL, "\xfb\xbf\xbf\xbf" ),
-                       'last byte missing 10' => array( $FAIL, "\xfd\xbf\xbf\xbf\xbf" ),
-
-                       'extra continuation byte 1' => array( $FAIL, "e\xaf" ),
-                       'extra continuation byte 2' => array( $FAIL, "\xc3\x89\xaf" ),
-                       'extra continuation byte 3' => array( $FAIL, "\xef\xbc\xa5\xaf" ),
-                       'extra continuation byte 4' => array( $FAIL, "\xf0\x9d\x99\xb4\xaf" ),
-
-                       'impossible bytes 1' => array( $FAIL, "\xfe" ),
-                       'impossible bytes 2' => array( $FAIL, "\xff" ),
-                       'impossible bytes 3' => array( $FAIL, "\xfe\xfe\xff\xff" ),
-
-                       'overlong sequences 1' => array( $FAIL, "\xc0\xaf" ),
-                       'overlong sequences 2' => array( $FAIL, "\xc1\xaf" ),
-                       'overlong sequences 3' => array( $FAIL, "\xe0\x80\xaf" ),
-                       'overlong sequences 4' => array( $FAIL, "\xf0\x80\x80\xaf" ),
-                       'overlong sequences 5' => array( $FAIL, "\xf8\x80\x80\x80\xaf" ),
-                       'overlong sequences 6' => array( $FAIL, "\xfc\x80\x80\x80\x80\xaf" ),
-
-                       'maximum overlong sequences 1' => array( $FAIL, "\xc1\xbf" ),
-                       'maximum overlong sequences 2' => array( $FAIL, "\xe0\x9f\xbf" ),
-                       'maximum overlong sequences 3' => array( $FAIL, "\xf0\x8f\xbf\xbf" ),
-                       'maximum overlong sequences 4' => array( $FAIL, "\xf8\x87\xbf\xbf" ),
-                       'maximum overlong sequences 5' => array( $FAIL, "\xfc\x83\xbf\xbf\xbf\xbf" ),
-
-                       'surrogates 1 (U+D799)' => array( $PASS, "\xed\x9f\xbf" ),
-                       'surrogates 2 (U+E000)' => array( $PASS, "\xee\x80\x80" ),
-                       'surrogates 3 (U+D800)' => array( $FAIL, "\xed\xa0\x80" ),
-                       'surrogates 4 (U+DBFF)' => array( $FAIL, "\xed\xaf\xbf" ),
-                       'surrogates 5 (U+DC00)' => array( $FAIL, "\xed\xb0\x80" ),
-                       'surrogates 6 (U+DFFF)' => array( $FAIL, "\xed\xbf\xbf" ),
-                       'surrogates 7 (U+D800 U+DC00)' => array( $FAIL, "\xed\xa0\x80\xed\xb0\x80" ),
-
-                       'noncharacters 1' => array( $PASS, "\xef\xbf\xbe" ),
-                       'noncharacters 2' => array( $PASS, "\xef\xbf\xbf" ),
-               );
-       }
-}