(follow-up r69626) Make it so the intl normalizer_normalize function is not
authorBrian Wolff <bawolff@users.mediawiki.org>
Fri, 15 Apr 2011 18:39:43 +0000 (18:39 +0000)
committerBrian Wolff <bawolff@users.mediawiki.org>
Fri, 15 Apr 2011 18:39:43 +0000 (18:39 +0000)
fed an invalid sequence in UtfNormal::cleanUp

normalizer_normalize seems to return false if fed an invalid Unicode sequence (which is quite different
from what our built-in normalization functions do). So use quickIsNFCVerify to clean up the string if normalizer_normalize returns false.
(Noticed when investigating bug 28541).

RELEASE-NOTES
includes/normal/UtfNormal.php

index 4fb62c3..9af339d 100644 (file)
@@ -237,6 +237,7 @@ PHP if you have not done so prior to upgrading MediaWiki.
 * (bug 27473) Fix regression: bold, italic no longer interfere with linktrail for ca, kaa
 * (bug 28444) Fix regression: edit-on-doubleclick retains revision id again
 * &apos; character entity is now allowed in wikitext
+* UtfNormal::cleanUp on an invalid UTF-8 sequence no longer returns false if intl is installed.
 
 === API changes in 1.18 ===
 * (bug 26339) Throw warning when truncating an overlarge API result
index 84ebc76..75e3a08 100644 (file)
@@ -79,7 +79,7 @@ class UtfNormal {
         * @return string a clean, shiny, normalized UTF-8 string
         */
        static function cleanUp( $string ) {
-               if( NORMALIZE_ICU || NORMALIZE_INTL ) {
+               if( NORMALIZE_ICU ) {
                        # We exclude a few chars that ICU would not.
                        $string = preg_replace(
                                '/[\x00-\x08\x0b\x0c\x0e-\x1f]/',
@@ -90,8 +90,24 @@ class UtfNormal {
 
                        # UnicodeString constructor fails if the string ends with a
                        # head byte. Add a junk char at the end, we'll strip it off.
-                       if ( NORMALIZE_ICU ) return rtrim( utf8_normalize( $string . "\x01", UNORM_NFC ), "\x01" );
-                       if ( NORMALIZE_INTL ) return normalizer_normalize( $string, Normalizer::FORM_C );
+                       return rtrim( utf8_normalize( $string . "\x01", UNORM_NFC ), "\x01" );
+               } elseif( NORMALIZE_INTL ) {
+                       $norm = normalizer_normalize( $string, Normalizer::FORM_C );
+                       if( $norm === null || $norm === false ) {
+                               # normalizer_normalize will either return false or null
+                               # (depending on which doc you read) if invalid utf8 string.
+                               # quickIsNFCVerify cleans up invalid sequences.
+
+                               if( UtfNormal::quickIsNFCVerify( $string ) ) {
+                                       # if that's true, the string is actually already normal.
+                                       return $string;
+                               } else {
+                                       # Now we are valid but non-normal
+                                       return normalizer_normalize( $string, Normalizer::FORM_C );
+                               }
+                       } else {
+                               return $norm;
+                       }
                } elseif( UtfNormal::quickIsNFCVerify( $string ) ) {
                        # Side effect -- $string has had UTF-8 errors cleaned up.
                        return $string;