X-Git-Url: https://git.heureux-cyclage.org/?a=blobdiff_plain;f=includes%2Fnormal%2FUtfNormal.php;h=2b400e5a857c89e2fca6b1a29bbe4b58b026f685;hb=e0bc12d1770d81d38a8529687e15fd0a6bcd6f87;hp=4f8b12935dad05e72ad1f9431dc760dbe706efa2;hpb=3ad85f1a20277e2e71f471fa693e73ad1f2a9adc;p=lhc%2Fweb%2Fwiklou.git diff --git a/includes/normal/UtfNormal.php b/includes/normal/UtfNormal.php index 4f8b12935d..2b400e5a85 100644 --- a/includes/normal/UtfNormal.php +++ b/includes/normal/UtfNormal.php @@ -1,50 +1,35 @@ -# http://www.mediawiki.org/ -# -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation; either version 2 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License along -# with this program; if not, write to the Free Software Foundation, Inc., -# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. -# http://www.gnu.org/copyleft/gpl.html - /** - * @defgroup UtfNormal UtfNormal + * Unicode normalization routines + * + * Copyright © 2004 Brion Vibber + * http://www.mediawiki.org/ + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * http://www.gnu.org/copyleft/gpl.html + * + * @file + * @ingroup UtfNormal */ -/** */ -require_once dirname(__FILE__).'/UtfNormalUtil.php'; - -global $utfCombiningClass, $utfCanonicalComp, $utfCanonicalDecomp; -$utfCombiningClass = NULL; -$utfCanonicalComp = NULL; -$utfCanonicalDecomp = NULL; - -# Load compatibility decompositions on demand if they are needed. -global $utfCompatibilityDecomp; -$utfCompatibilityDecomp = NULL; - /** - * For using the ICU wrapper + * @defgroup UtfNormal UtfNormal */ -define( 'UNORM_NONE', 1 ); -define( 'UNORM_NFD', 2 ); -define( 'UNORM_NFKD', 3 ); -define( 'UNORM_NFC', 4 ); -define( 'UNORM_DEFAULT', UNORM_NFC ); -define( 'UNORM_NFKC', 5 ); -define( 'UNORM_FCD', 6 ); define( 'NORMALIZE_ICU', function_exists( 'utf8_normalize' ) ); +define( 'NORMALIZE_INTL', function_exists( 'normalizer_normalize' ) ); /** * Unicode normalization routines for working with UTF-8 strings. @@ -61,6 +46,26 @@ define( 'NORMALIZE_ICU', function_exists( 'utf8_normalize' ) ); * @ingroup UtfNormal */ class UtfNormal { + /** + * For using the ICU wrapper + */ + const UNORM_NONE = 1; + const UNORM_NFD = 2; + const UNORM_NFKD = 3; + const UNORM_NFC = 4; + const UNORM_NFKC = 5; + const UNORM_FCD = 6; + const UNORM_DEFAULT = self::UNORM_NFC; + + static $utfCombiningClass = null; + static $utfCanonicalComp = null; + static $utfCanonicalDecomp = null; + + # Load compatibility decompositions on demand if they are needed. + static $utfCompatibilityDecomp = null; + + static $utfCheckNFC; + /** * The ultimate convenience function! Clean up invalid UTF-8 sequences, * and convert to normal form C, canonical composition. @@ -73,17 +78,29 @@ class UtfNormal { */ static function cleanUp( $string ) { if( NORMALIZE_ICU ) { - # We exclude a few chars that ICU would not. - $string = preg_replace( - '/[\x00-\x08\x0b\x0c\x0e-\x1f]/', - UTF8_REPLACEMENT, - $string ); - $string = str_replace( UTF8_FFFE, UTF8_REPLACEMENT, $string ); - $string = str_replace( UTF8_FFFF, UTF8_REPLACEMENT, $string ); + $string = self::replaceForNativeNormalize( $string ); # UnicodeString constructor fails if the string ends with a # head byte. Add a junk char at the end, we'll strip it off. - return rtrim( utf8_normalize( $string . "\x01", UNORM_NFC ), "\x01" ); + return rtrim( utf8_normalize( $string . "\x01", self::UNORM_NFC ), "\x01" ); + } elseif( NORMALIZE_INTL ) { + $string = self::replaceForNativeNormalize( $string ); + $norm = normalizer_normalize( $string, Normalizer::FORM_C ); + if( $norm === null || $norm === false ) { + # normalizer_normalize will either return false or null + # (depending on which doc you read) if invalid utf8 string. + # quickIsNFCVerify cleans up invalid sequences. + + if( UtfNormal::quickIsNFCVerify( $string ) ) { + # if that's true, the string is actually already normal. + return $string; + } else { + # Now we are valid but non-normal + return normalizer_normalize( $string, Normalizer::FORM_C ); + } + } else { + return $norm; + } } elseif( UtfNormal::quickIsNFCVerify( $string ) ) { # Side effect -- $string has had UTF-8 errors cleaned up. return $string; @@ -101,8 +118,10 @@ class UtfNormal { * @return string a UTF-8 string in normal form C */ static function toNFC( $string ) { - if( NORMALIZE_ICU ) - return utf8_normalize( $string, UNORM_NFC ); + if( NORMALIZE_INTL ) + return normalizer_normalize( $string, Normalizer::FORM_C ); + elseif( NORMALIZE_ICU ) + return utf8_normalize( $string, self::UNORM_NFC ); elseif( UtfNormal::quickIsNFC( $string ) ) return $string; else @@ -117,8 +136,10 @@ class UtfNormal { * @return string a UTF-8 string in normal form D */ static function toNFD( $string ) { - if( NORMALIZE_ICU ) - return utf8_normalize( $string, UNORM_NFD ); + if( NORMALIZE_INTL ) + return normalizer_normalize( $string, Normalizer::FORM_D ); + elseif( NORMALIZE_ICU ) + return utf8_normalize( $string, self::UNORM_NFD ); elseif( preg_match( '/[\x80-\xff]/', $string ) ) return UtfNormal::NFD( $string ); else @@ -134,8 +155,10 @@ class UtfNormal { * @return string a UTF-8 string in normal form KC */ static function toNFKC( $string ) { - if( NORMALIZE_ICU ) - return utf8_normalize( $string, UNORM_NFKC ); + if( NORMALIZE_INTL ) + return normalizer_normalize( $string, Normalizer::FORM_KC ); + elseif( NORMALIZE_ICU ) + return utf8_normalize( $string, self::UNORM_NFKC ); elseif( preg_match( '/[\x80-\xff]/', $string ) ) return UtfNormal::NFKC( $string ); else @@ -151,8 +174,10 @@ class UtfNormal { * @return string a UTF-8 string in normal form KD */ static function toNFKD( $string ) { - if( NORMALIZE_ICU ) - return utf8_normalize( $string, UNORM_NFKD ); + if( NORMALIZE_INTL ) + return normalizer_normalize( $string, Normalizer::FORM_KD ); + elseif( NORMALIZE_ICU ) + return utf8_normalize( $string, self::UNORM_NFKD ); elseif( preg_match( '/[\x80-\xff]/', $string ) ) return UtfNormal::NFKD( $string ); else @@ -164,8 +189,7 @@ class UtfNormal { * @private */ static function loadData() { - global $utfCombiningClass; - if( !isset( $utfCombiningClass ) ) { + if( !isset( self::$utfCombiningClass ) ) { require_once( dirname(__FILE__) . '/UtfNormalData.inc' ); } } @@ -182,10 +206,9 @@ class UtfNormal { if( !preg_match( '/[\x80-\xff]/', $string ) ) return true; UtfNormal::loadData(); - global $utfCheckNFC, $utfCombiningClass; $len = strlen( $string ); for( $i = 0; $i < $len; $i++ ) { - $c = $string{$i}; + $c = $string[$i]; $n = ord( $c ); if( $n < 0x80 ) { continue; @@ -199,11 +222,11 @@ class UtfNormal { $c = substr( $string, $i, 2 ); $i++; } - if( isset( $utfCheckNFC[$c] ) ) { + if( isset( self::$utfCheckNFC[$c] ) ) { # If it's NO or MAYBE, bail and do the slow check. return false; } - if( isset( $utfCombiningClass[$c] ) ) { + if( isset( self::$utfCombiningClass[$c] ) ) { # Combining character? We might have to do sorting, at least. return false; } @@ -215,6 +238,7 @@ class UtfNormal { * Returns true if the string is _definitely_ in NFC. * Returns false if not or uncertain. * @param $string String: a UTF-8 string, altered on output to be valid UTF-8 safe for XML. + * @return bool */ static function quickIsNFCVerify( &$string ) { # Screen out some characters that eg won't be allowed in XML @@ -229,9 +253,8 @@ class UtfNormal { if( !isset( $checkit ) ) { # Load/build some scary lookup tables... UtfNormal::loadData(); - global $utfCheckNFC, $utfCombiningClass; - $utfCheckOrCombining = array_merge( $utfCheckNFC, $utfCombiningClass ); + $utfCheckOrCombining = array_merge( self::$utfCheckNFC, self::$utfCombiningClass ); # Head bytes for sequences which we should do further validity checks $checkit = array_flip( array_map( 'chr', @@ -277,7 +300,7 @@ class UtfNormal { foreach( $matches[1] as $str ) { $chunk = strlen( $str ); - if( $str{0} < "\x80" ) { + if( $str[0] < "\x80" ) { # ASCII chunk: guaranteed to be valid UTF-8 # and in normal form C, so skip over it. $base += $chunk; @@ -295,12 +318,13 @@ class UtfNormal { $len = $chunk + 1; # Counting down is faster. I'm *so* sorry. for( $i = -1; --$len; ) { - if( $remaining = $tailBytes[$c = $str{++$i}] ) { + $remaining = $tailBytes[$c = $str[++$i]]; + if( $remaining ) { # UTF-8 head byte! $sequence = $head = $c; do { # Look for the defined number of tail bytes... - if( --$len && ( $c = $str{++$i} ) >= "\x80" && $c < "\xc0" ) { + if( --$len && ( $c = $str[++$i] ) >= "\x80" && $c < "\xc0" ) { # Legal tail bytes are nice. $sequence .= $c; } else { @@ -446,9 +470,9 @@ class UtfNormal { */ static function NFD( $string ) { UtfNormal::loadData(); - global $utfCanonicalDecomp; + return UtfNormal::fastCombiningSort( - UtfNormal::fastDecompose( $string, $utfCanonicalDecomp ) ); + UtfNormal::fastDecompose( $string, self::$utfCanonicalDecomp ) ); } /** @@ -466,12 +490,11 @@ class UtfNormal { * @private */ static function NFKD( $string ) { - global $utfCompatibilityDecomp; - if( !isset( $utfCompatibilityDecomp ) ) { + if( !isset( self::$utfCompatibilityDecomp ) ) { require_once( 'UtfNormalDataK.inc' ); } - return UtfNormal::fastCombiningSort( - UtfNormal::fastDecompose( $string, $utfCompatibilityDecomp ) ); + return self::fastCombiningSort( + self::fastDecompose( $string, self::$utfCompatibilityDecomp ) ); } @@ -489,7 +512,7 @@ class UtfNormal { $len = strlen( $string ); $out = ''; for( $i = 0; $i < $len; $i++ ) { - $c = $string{$i}; + $c = $string[$i]; $n = ord( $c ); if( $n < 0x80 ) { # ASCII chars never decompose @@ -516,9 +539,9 @@ class UtfNormal { # A lookup table would be slightly faster, # but adds a lot of memory & disk needs. # - $index = ( (ord( $c{0} ) & 0x0f) << 12 - | (ord( $c{1} ) & 0x3f) << 6 - | (ord( $c{2} ) & 0x3f) ) + $index = ( (ord( $c[0] ) & 0x0f) << 12 + | (ord( $c[1] ) & 0x3f) << 6 + | (ord( $c[2] ) & 0x3f) ) - UNICODE_HANGUL_FIRST; $l = intval( $index / UNICODE_HANGUL_NCOUNT ); $v = intval( ($index % UNICODE_HANGUL_NCOUNT) / UNICODE_HANGUL_TCOUNT); @@ -546,13 +569,12 @@ class UtfNormal { */ static function fastCombiningSort( $string ) { UtfNormal::loadData(); - global $utfCombiningClass; $len = strlen( $string ); $out = ''; $combiners = array(); $lastClass = -1; for( $i = 0; $i < $len; $i++ ) { - $c = $string{$i}; + $c = $string[$i]; $n = ord( $c ); if( $n >= 0x80 ) { if( $n >= 0xf0 ) { @@ -565,8 +587,8 @@ class UtfNormal { $c = substr( $string, $i, 2 ); $i++; } - if( isset( $utfCombiningClass[$c] ) ) { - $lastClass = $utfCombiningClass[$c]; + if( isset( self::$utfCombiningClass[$c] ) ) { + $lastClass = self::$utfCombiningClass[$c]; if( isset( $combiners[$lastClass] ) ) { $combiners[$lastClass] .= $c; } else { @@ -599,7 +621,6 @@ class UtfNormal { */ static function fastCompose( $string ) { UtfNormal::loadData(); - global $utfCanonicalComp, $utfCombiningClass; $len = strlen( $string ); $out = ''; $lastClass = -1; @@ -609,7 +630,7 @@ class UtfNormal { $x1 = ord(substr(UTF8_HANGUL_VBASE,0,1)); $x2 = ord(substr(UTF8_HANGUL_TEND,0,1)); for( $i = 0; $i < $len; $i++ ) { - $c = $string{$i}; + $c = $string[$i]; $n = ord( $c ); if( $n < 0x80 ) { # No combining characters here... @@ -631,14 +652,14 @@ class UtfNormal { } $pair = $startChar . $c; if( $n > 0x80 ) { - if( isset( $utfCombiningClass[$c] ) ) { + if( isset( self::$utfCombiningClass[$c] ) ) { # A combining char; see what we can do with it - $class = $utfCombiningClass[$c]; + $class = self::$utfCombiningClass[$c]; if( !empty( $startChar ) && $lastClass < $class && $class > 0 && - isset( $utfCanonicalComp[$pair] ) ) { - $startChar = $utfCanonicalComp[$pair]; + isset( self::$utfCanonicalComp[$pair] ) ) { + $startChar = self::$utfCanonicalComp[$pair]; $class = 0; } else { $combining .= $c; @@ -650,8 +671,8 @@ class UtfNormal { } # New start char if( $lastClass == 0 ) { - if( isset( $utfCanonicalComp[$pair] ) ) { - $startChar = $utfCanonicalComp[$pair]; + if( isset( self::$utfCanonicalComp[$pair] ) ) { + $startChar = self::$utfCanonicalComp[$pair]; $lastHangul = 0; continue; } @@ -669,8 +690,8 @@ class UtfNormal { # #$lIndex = utf8ToCodepoint( $startChar ) - UNICODE_HANGUL_LBASE; #$vIndex = utf8ToCodepoint( $c ) - UNICODE_HANGUL_VBASE; - $lIndex = ord( $startChar{2} ) - 0x80; - $vIndex = ord( $c{2} ) - 0xa1; + $lIndex = ord( $startChar[2] ) - 0x80; + $vIndex = ord( $c[2] ) - 0xa1; $hangulPoint = UNICODE_HANGUL_FIRST + UNICODE_HANGUL_TCOUNT * @@ -688,23 +709,23 @@ class UtfNormal { $startChar <= UTF8_HANGUL_LAST && !$lastHangul ) { # $tIndex = utf8ToCodepoint( $c ) - UNICODE_HANGUL_TBASE; - $tIndex = ord( $c{2} ) - 0xa7; - if( $tIndex < 0 ) $tIndex = ord( $c{2} ) - 0x80 + (0x11c0 - 0x11a7); + $tIndex = ord( $c[2] ) - 0xa7; + if( $tIndex < 0 ) $tIndex = ord( $c[2] ) - 0x80 + (0x11c0 - 0x11a7); # Increment the code point by $tIndex, without # the function overhead of decoding and recoding UTF-8 # - $tail = ord( $startChar{2} ) + $tIndex; + $tail = ord( $startChar[2] ) + $tIndex; if( $tail > 0xbf ) { $tail -= 0x40; - $mid = ord( $startChar{1} ) + 1; + $mid = ord( $startChar[1] ) + 1; if( $mid > 0xbf ) { - $startChar{0} = chr( ord( $startChar{0} ) + 1 ); + $startChar[0] = chr( ord( $startChar[0] ) + 1 ); $mid -= 0x40; } - $startChar{1} = chr( $mid ); + $startChar[1] = chr( $mid ); } - $startChar{2} = chr( $tail ); + $startChar[2] = chr( $tail ); # If there's another jamo char after this, *don't* try to merge it. $lastHangul = 1; @@ -733,8 +754,24 @@ class UtfNormal { $len = strlen( $string ); $out = ''; for( $i = 0; $i < $len; $i++ ) { - $out .= $string{$i}; + $out .= $string[$i]; } return $out; } + /** + * Function to replace some characters that we don't want + * but most of the native normalize functions keep. + * + * @param $string String The string + * @return String String with the character codes replaced. + */ + private static function replaceForNativeNormalize( $string ) { + $string = preg_replace( + '/[\x00-\x08\x0b\x0c\x0e-\x1f]/', + UTF8_REPLACEMENT, + $string ); + $string = str_replace( UTF8_FFFE, UTF8_REPLACEMENT, $string ); + $string = str_replace( UTF8_FFFF, UTF8_REPLACEMENT, $string ); + return $string; + } }