* The function check for invalid byte sequences, overlong encoding but
* not for different normalisations.
*
- * This relies internally on the mbstring function mb_check_encoding()
- * hardcoded to check against UTF-8. Whenever the function is not available
- * we fallback to a pure PHP implementation. Setting $disableMbstring to
- * true will skip the use of mb_check_encoding, this is mostly intended for
- * unit testing our internal implementation.
- *
- * @since 1.21
* @note In MediaWiki 1.21, this function did not provide proper UTF-8 validation.
* In particular, the pure PHP code path did not in fact check for overlong forms.
* Beware of this when backporting code to that version of MediaWiki.
*
+ * @since 1.21
* @param string $value String to check
- * @param bool $disableMbstring Whether to use the pure PHP
- * implementation instead of trying mb_check_encoding. Intended for unit
- * testing. Default: false
- *
* @return bool Whether the given $value is a valid UTF-8 encoded string
*/
- static function isUtf8( $value, $disableMbstring = false ) {
+ static function isUtf8( $value ) {
$value = (string)$value;
- // If the mbstring extension is loaded, use it. However, before PHP 5.4, values above
- // U+10FFFF are incorrectly allowed, so we have to check for them separately.
- if ( !$disableMbstring && function_exists( 'mb_check_encoding' ) ) {
- static $newPHP;
- if ( $newPHP === null ) {
- $newPHP = !mb_check_encoding( "\xf4\x90\x80\x80", 'UTF-8' );
- }
-
- return mb_check_encoding( $value, 'UTF-8' ) &&
- ( $newPHP || preg_match( "/\xf4[\x90-\xbf]|[\xf5-\xff]/S", $value ) === 0 );
+ // HHVM 3.4 and older come with an outdated version of libmbfl that
+ // incorrectly allows values above U+10FFFF, so we have to check
+ // for them separately. (This issue also exists in PHP 5.3 and
+ // older, which are no longer supported.)
+ static $newPHP;
+ if ( $newPHP === null ) {
+ $newPHP = !mb_check_encoding( "\xf4\x90\x80\x80", 'UTF-8' );
}
- if ( preg_match( "/[\x80-\xff]/S", $value ) === 0 ) {
- // String contains only ASCII characters, has to be valid
- return true;
- }
-
- // PCRE implements repetition using recursion; to avoid a stack overflow (and segfault)
- // for large input, we check for invalid sequences (<= 5 bytes) rather than valid
- // sequences, which can be as long as the input string is. Multiple short regexes are
- // used rather than a single long regex for performance.
- static $regexes;
- if ( $regexes === null ) {
- $cont = "[\x80-\xbf]";
- $after = "(?!$cont)"; // "(?:[^\x80-\xbf]|$)" would work here
- $regexes = array(
- // Continuation byte at the start
- "/^$cont/",
-
- // ASCII byte followed by a continuation byte
- "/[\\x00-\x7f]$cont/S",
-
- // Illegal byte
- "/[\xc0\xc1\xf5-\xff]/S",
-
- // Invalid 2-byte sequence, or valid one then an extra continuation byte
- "/[\xc2-\xdf](?!$cont$after)/S",
-
- // Invalid 3-byte sequence, or valid one then an extra continuation byte
- "/\xe0(?![\xa0-\xbf]$cont$after)/",
- "/[\xe1-\xec\xee\xef](?!$cont{2}$after)/S",
- "/\xed(?![\x80-\x9f]$cont$after)/",
-
- // Invalid 4-byte sequence, or valid one then an extra continuation byte
- "/\xf0(?![\x90-\xbf]$cont{2}$after)/",
- "/[\xf1-\xf3](?!$cont{3}$after)/S",
- "/\xf4(?![\x80-\x8f]$cont{2}$after)/",
- );
- }
+ return mb_check_encoding( $value, 'UTF-8' ) &&
+ ( $newPHP || preg_match( "/\xf4[\x90-\xbf]|[\xf5-\xff]/S", $value ) === 0 );
+ }
- foreach ( $regexes as $regex ) {
- if ( preg_match( $regex, $value ) !== 0 ) {
- return false;
+ /**
+ * Explode a string, but ignore any instances of the separator inside
+ * the given start and end delimiters, which may optionally nest.
+ * The delimiters are literal strings, not regular expressions.
+ * @param string $startDelim Start delimiter
+ * @param string $endDelim End delimiter
+ * @param string $separator Separator string for the explode.
+ * @param string $subject Subject string to explode.
+ * @param bool $nested True iff the delimiters are allowed to nest.
+ * @return ArrayIterator
+ */
+ static function delimiterExplode( $startDelim, $endDelim, $separator,
+ $subject, $nested = false ) {
+ $inputPos = 0;
+ $lastPos = 0;
+ $depth = 0;
+ $encStart = preg_quote( $startDelim, '!' );
+ $encEnd = preg_quote( $endDelim, '!' );
+ $encSep = preg_quote( $separator, '!' );
+ $len = strlen( $subject );
+ $m = [];
+ $exploded = [];
+ while (
+ $inputPos < $len &&
+ preg_match(
+ "!$encStart|$encEnd|$encSep!S", $subject, $m,
+ PREG_OFFSET_CAPTURE, $inputPos
+ )
+ ) {
+ $match = $m[0][0];
+ $matchPos = $m[0][1];
+ $inputPos = $matchPos + strlen( $match );
+ if ( $match === $separator ) {
+ if ( $depth === 0 ) {
+ $exploded[] = substr(
+ $subject, $lastPos, $matchPos - $lastPos
+ );
+ $lastPos = $inputPos;
+ }
+ } elseif ( $match === $startDelim ) {
+ if ( $depth === 0 || $nested ) {
+ $depth++;
+ }
+ } else {
+ $depth--;
}
}
-
- return true;
+ $exploded[] = substr( $subject, $lastPos );
+ // This method could be rewritten in the future to avoid creating an
+ // intermediate array, since the return type is just an iterator.
+ return new ArrayIterator( $exploded );
}
/**
- * Perform an operation equivalent to
+ * Perform an operation equivalent to `preg_replace()`
*
- * preg_replace( "!$startDelim(.*?)$endDelim!", $replace, $subject );
+ * Matches this code:
*
- * except that it's worst-case O(N) instead of O(N^2)
+ * preg_replace( "!$startDelim(.*?)$endDelim!", $replace, $subject );
*
- * Compared to delimiterReplace(), this implementation is fast but memory-
- * hungry and inflexible. The memory requirements are such that I don't
- * recommend using it on anything but guaranteed small chunks of text.
+ * ..except that it's worst-case O(N) instead of O(N^2). Compared to delimiterReplace(), this
+ * implementation is fast but memory-hungry and inflexible. The memory requirements are such
+ * that I don't recommend using it on anything but guaranteed small chunks of text.
*
* @param string $startDelim
* @param string $endDelim
* @param string $replace
* @param string $subject
- *
* @return string
*/
static function hungryDelimiterReplace( $startDelim, $endDelim, $replace, $subject ) {
}
/**
- * Perform an operation equivalent to
+ * Perform an operation equivalent to `preg_replace_callback()`
*
- * preg_replace_callback( "!$startDelim(.*)$endDelim!s$flags", $callback, $subject )
+ * Matches this code:
*
- * This implementation is slower than hungryDelimiterReplace but uses far less
- * memory. The delimiters are literal strings, not regular expressions.
+ * preg_replace_callback( "!$startDelim(.*)$endDelim!s$flags", $callback, $subject );
*
* If the start delimiter ends with an initial substring of the end delimiter,
* e.g. in the case of C-style comments, the behavior differs from the model
* regex. In this implementation, the end must share no characters with the
- * start, so e.g. /*\/ is not considered to be both the start and end of a
- * comment. /*\/xy/*\/ is considered to be a single comment with contents /xy/.
+ * start, so e.g. `/*\/` is not considered to be both the start and end of a
+ * comment. `/*\/xy/*\/` is considered to be a single comment with contents `/xy/`.
+ *
+ * The implementation of delimiterReplaceCallback() is slower than hungryDelimiterReplace()
+ * but uses far less memory. The delimiters are literal strings, not regular expressions.
*
* @param string $startDelim Start delimiter
* @param string $endDelim End delimiter
$encEnd = preg_quote( $endDelim, '!' );
$strcmp = strpos( $flags, 'i' ) === false ? 'strcmp' : 'strcasecmp';
$endLength = strlen( $endDelim );
- $m = array();
+ $m = [];
while ( $inputPos < strlen( $subject ) &&
preg_match( "!($encStart)|($encEnd)!S$flags", $subject, $m, PREG_OFFSET_CAPTURE, $inputPos )
} elseif ( $tokenType == 'end' ) {
if ( $foundStart ) {
# Found match
- $output .= call_user_func( $callback, array(
+ $output .= call_user_func( $callback, [
substr( $subject, $outputPos, $tokenOffset + $tokenLength - $outputPos ),
substr( $subject, $contentPos, $tokenOffset - $contentPos )
- ) );
+ ] );
$foundStart = false;
} else {
# Non-matching end, write it out
}
/**
- * Perform an operation equivalent to
+ * Perform an operation equivalent to `preg_replace()` with flags.
+ *
+ * Matches this code:
*
- * preg_replace( "!$startDelim(.*)$endDelim!$flags", $replace, $subject )
+ * preg_replace( "!$startDelim(.*)$endDelim!$flags", $replace, $subject );
*
* @param string $startDelim Start delimiter regular expression
* @param string $endDelim End delimiter regular expression
* @param string $replace Replacement string. May contain $1, which will be
- * replaced by the text between the delimiters
+ * replaced by the text between the delimiters
* @param string $subject String to search
* @param string $flags Regular expression flags
* @return string The string with the matches replaced
/**
* More or less "markup-safe" explode()
- * Ignores any instances of the separator inside <...>
+ * Ignores any instances of the separator inside `<...>`
* @param string $separator
* @param string $text
* @return array
return $items;
}
+ /**
+ * More or less "markup-safe" str_replace()
+ * Ignores any instances of the separator inside `<...>`
+ * @param string $search
+ * @param string $replace
+ * @param string $text
+ * @return string
+ */
+ static function replaceMarkup( $search, $replace, $text ) {
+ $placeholder = "\x00";
+
+ // Remove placeholder instances
+ $text = str_replace( $placeholder, '', $text );
+
+ // Replace instances of the separator inside HTML-like tags with the placeholder
+ $replacer = new DoubleReplacer( $search, $placeholder );
+ $cleaned = StringUtils::delimiterReplaceCallback( '<', '>', $replacer->cb(), $text );
+
+ // Explode, then put the replaced separators back in
+ $cleaned = str_replace( $search, $replace, $cleaned );
+ $text = str_replace( $placeholder, $search, $cleaned );
+
+ return $text;
+ }
+
/**
* Escape a string to make it suitable for inclusion in a preg_replace()
* replacement parameter.
static function escapeRegexReplacement( $string ) {
$string = str_replace( '\\', '\\\\', $string );
$string = str_replace( '$', '\\$', $string );
-
return $string;
}
/**
* Workalike for explode() with limited memory usage.
- * Returns an Iterator
+ *
* @param string $separator
* @param string $subject
* @return ArrayIterator|ExplodeIterator