+ /**
+ * Utility method for converting a character sequence from bytes to Unicode.
+ *
+ * Primary usecase being converting $wgLegalTitleChars to a sequence usable in
+ * javascript, as PHP uses UTF-8 bytes where javascript uses Unicode code units.
+ *
+ * @param string $byteClass
+ * @return string
+ */
+ public static function convertByteClassToUnicodeClass( $byteClass ) {
+ $length = strlen( $byteClass );
+ // Input token queue
+ $x0 = $x1 = $x2 = '';
+ // Decoded queue
+ $d0 = $d1 = $d2 = '';
+ // Decoded integer codepoints
+ $ord0 = $ord1 = $ord2 = 0;
+ // Re-encoded queue
+ $r0 = $r1 = $r2 = '';
+ // Output
+ $out = '';
+ // Flags
+ $allowUnicode = false;
+ for ( $pos = 0; $pos < $length; $pos++ ) {
+ // Shift the queues down
+ $x2 = $x1;
+ $x1 = $x0;
+ $d2 = $d1;
+ $d1 = $d0;
+ $ord2 = $ord1;
+ $ord1 = $ord0;
+ $r2 = $r1;
+ $r1 = $r0;
+ // Load the current input token and decoded values
+ $inChar = $byteClass[$pos];
+ if ( $inChar == '\\' ) {
+ if ( preg_match( '/x([0-9a-fA-F]{2})/A', $byteClass, $m, 0, $pos + 1 ) ) {
+ $x0 = $inChar . $m[0];
+ $d0 = chr( hexdec( $m[1] ) );
+ $pos += strlen( $m[0] );
+ } elseif ( preg_match( '/[0-7]{3}/A', $byteClass, $m, 0, $pos + 1 ) ) {
+ $x0 = $inChar . $m[0];
+ $d0 = chr( octdec( $m[0] ) );
+ $pos += strlen( $m[0] );
+ } elseif ( $pos + 1 >= $length ) {
+ $x0 = $d0 = '\\';
+ } else {
+ $d0 = $byteClass[$pos + 1];
+ $x0 = $inChar . $d0;
+ $pos += 1;
+ }
+ } else {
+ $x0 = $d0 = $inChar;
+ }
+ $ord0 = ord( $d0 );
+ // Load the current re-encoded value
+ if ( $ord0 < 32 || $ord0 == 0x7f ) {
+ $r0 = sprintf( '\x%02x', $ord0 );
+ } elseif ( $ord0 >= 0x80 ) {
+ // Allow unicode if a single high-bit character appears
+ $r0 = sprintf( '\x%02x', $ord0 );
+ $allowUnicode = true;
+ } elseif ( strpos( '-\\[]^', $d0 ) !== false ) {
+ $r0 = '\\' . $d0;
+ } else {
+ $r0 = $d0;
+ }
+ // Do the output
+ if ( $x0 !== '' && $x1 === '-' && $x2 !== '' ) {
+ // Range
+ if ( $ord2 > $ord0 ) {
+ // Empty range
+ } elseif ( $ord0 >= 0x80 ) {
+ // Unicode range
+ $allowUnicode = true;
+ if ( $ord2 < 0x80 ) {
+ // Keep the non-unicode section of the range
+ $out .= "$r2-\\x7F";
+ }
+ } else {
+ // Normal range
+ $out .= "$r2-$r0";
+ }
+ // Reset state to the initial value
+ $x0 = $x1 = $d0 = $d1 = $r0 = $r1 = '';
+ } elseif ( $ord2 < 0x80 ) {
+ // ASCII character
+ $out .= $r2;
+ }
+ }
+ if ( $ord1 < 0x80 ) {
+ $out .= $r1;
+ }
+ if ( $ord0 < 0x80 ) {
+ $out .= $r0;
+ }
+ if ( $allowUnicode ) {
+ $out .= '\u0080-\uFFFF';
+ }
+ return $out;
+ }
+