(follow-up r69626) Make it so the intl normalizer_normalize function is not

author Brian Wolff <bawolff@users.mediawiki.org>

Fri, 15 Apr 2011 18:39:43 +0000 (18:39 +0000)

committer Brian Wolff <bawolff@users.mediawiki.org>

Fri, 15 Apr 2011 18:39:43 +0000 (18:39 +0000)
author Brian Wolff <bawolff@users.mediawiki.org>
Fri, 15 Apr 2011 18:39:43 +0000 (18:39 +0000)
committer Brian Wolff <bawolff@users.mediawiki.org>
Fri, 15 Apr 2011 18:39:43 +0000 (18:39 +0000)
diff --git a/RELEASE-NOTES b/RELEASE-NOTES

index 4fb62c3..9af339d 100644 (file)
--- a/RELEASE-NOTES
+++ b/RELEASE-NOTES
@@ -237,6 +237,7 @@ PHP if you have not done so prior to upgrading MediaWiki.
  * (bug 27473) Fix regression: bold, italic no longer interfere with linktrail for ca, kaa
  * (bug 28444) Fix regression: edit-on-doubleclick retains revision id again
  * &apos; character entity is now allowed in wikitext
+* UtfNormal::cleanUp on an invalid UTF-8 sequence no longer returns false if intl is installed.
  
  === API changes in 1.18 ===
  * (bug 26339) Throw warning when truncating an overlarge API result
diff --git a/includes/normal/UtfNormal.php b/includes/normal/UtfNormal.php

index 84ebc76..75e3a08 100644 (file)
--- a/includes/normal/UtfNormal.php
+++ b/includes/normal/UtfNormal.php
@@ -79,7 +79,7 @@ class UtfNormal {
          * @return string a clean, shiny, normalized UTF-8 string
          */
         static function cleanUp( $string ) {
-               if( NORMALIZE_ICU || NORMALIZE_INTL ) {
+               if( NORMALIZE_ICU ) {
                         # We exclude a few chars that ICU would not.
                         $string = preg_replace(
                                 '/[\x00-\x08\x0b\x0c\x0e-\x1f]/',
@@ -90,8 +90,24 @@ class UtfNormal {
  
                         # UnicodeString constructor fails if the string ends with a
                         # head byte. Add a junk char at the end, we'll strip it off.
-                       if ( NORMALIZE_ICU ) return rtrim( utf8_normalize( $string . "\x01", UNORM_NFC ), "\x01" );
-                       if ( NORMALIZE_INTL ) return normalizer_normalize( $string, Normalizer::FORM_C );
+                       return rtrim( utf8_normalize( $string . "\x01", UNORM_NFC ), "\x01" );
+               } elseif( NORMALIZE_INTL ) {
+                       $norm = normalizer_normalize( $string, Normalizer::FORM_C );
+                       if( $norm === null || $norm === false ) {
+                               # normalizer_normalize will either return false or null
+                               # (depending on which doc you read) if given an invalid UTF-8 string.
+                               # quickIsNFCVerify cleans up invalid sequences.
+
+                               if( UtfNormal::quickIsNFCVerify( $string ) ) {
+                                       # if that's true, the string is actually already normal.
+                                       return $string;
+                               } else {
+                                       # Now we are valid but non-normal
+                                       return normalizer_normalize( $string, Normalizer::FORM_C );
+                               }
+                       } else {
+                               return $norm;
+                       }
                 } elseif( UtfNormal::quickIsNFCVerify( $string ) ) {
                         # Side effect -- $string has had UTF-8 errors cleaned up.
                         return $string;
author	Brian Wolff <bawolff@users.mediawiki.org>
	Fri, 15 Apr 2011 18:39:43 +0000 (18:39 +0000)
committer	Brian Wolff <bawolff@users.mediawiki.org>
	Fri, 15 Apr 2011 18:39:43 +0000 (18:39 +0000)
RELEASE-NOTES		patch | blob | history
includes/normal/UtfNormal.php		patch | blob | history