- // PCRE implements repetition using recursion; to avoid a stack overflow (and segfault)
- // for large input, we check for invalid sequences (<= 5 bytes) rather than valid
- // sequences, which can be as long as the input string is. Multiple short regexes are
- // used rather than a single long regex for performance.
- static $regexes;
- if ( $regexes === null ) {
- $cont = "[\x80-\xbf]";
- $after = "(?!$cont)"; // "(?:[^\x80-\xbf]|$)" would work here
- $regexes = array(
- // Continuation byte at the start
- "/^$cont/",
-
- // ASCII byte followed by a continuation byte
- "/[\\x00-\x7f]$cont/S",
-
- // Illegal byte
- "/[\xc0\xc1\xf5-\xff]/S",
-
- // Invalid 2-byte sequence, or valid one then an extra continuation byte
- "/[\xc2-\xdf](?!$cont$after)/S",
-
- // Invalid 3-byte sequence, or valid one then an extra continuation byte
- "/\xe0(?![\xa0-\xbf]$cont$after)/",
- "/[\xe1-\xec\xee\xef](?!$cont{2}$after)/S",
- "/\xed(?![\x80-\x9f]$cont$after)/",
-
- // Invalid 4-byte sequence, or valid one then an extra continuation byte
- "/\xf0(?![\x90-\xbf]$cont{2}$after)/",
- "/[\xf1-\xf3](?!$cont{3}$after)/S",
- "/\xf4(?![\x80-\x8f]$cont{2}$after)/",
- );
- }
-
- foreach ( $regexes as $regex ) {
- if ( preg_match( $regex, $value ) !== 0 ) {
- return false;
- }
- }
-
- return true;