Language::truncate(): don't chop up multibyte characters when input contains newlines
author: Roan Kattouw <roan.kattouw@gmail.com>
Tue, 27 Oct 2015 03:17:37 +0000 (20:17 -0700)
committer: Roan Kattouw <roan.kattouw@gmail.com>
Tue, 27 Oct 2015 03:17:37 +0000 (20:17 -0700)
To detect whether the truncation had chopped up a multibyte
character after the first byte, a regex was used. But in this
regex, the dot (.) didn't match newlines, so it failed to
detect chopped multibyte characters (after the first byte)
if there was a newline preceding the chopped character.

Bug: T116693
Change-Id: I66e4fd451acac0a1019da7060d5a37d70963a15a

languages/Language.php
tests/phpunit/languages/LanguageTest.php

index 50ed513..3ea2693 100644 (file)
@@ -3691,8 +3691,9 @@ class Language {
                                # We got the first byte only of a multibyte char; remove it.
                                $string = substr( $string, 0, -1 );
                        } elseif ( $char >= 0x80 &&
+                               // Use the /s modifier (PCRE_DOTALL) so (.*) also matches newlines
                                preg_match( '/^(.*)(?:[\xe0-\xef][\x80-\xbf]|' .
-                                       '[\xf0-\xf7][\x80-\xbf]{1,2})$/', $string, $m )
+                                       '[\xf0-\xf7][\x80-\xbf]{1,2})$/s', $string, $m )
                        ) {
                                # We chopped in the middle of a character; remove it
                                $string = $m[1];
index 4fca002..77c3c02 100644 (file)
@@ -261,6 +261,16 @@ class LanguageTest extends LanguageClassesTestCase {
                        $this->getLang()->truncate( "1234567890", 5, 'XXX', false ),
                        'truncate without adjustment'
                );
+               $this->assertEquals(
+                       "泰乐菌...",
+                       $this->getLang()->truncate( "泰乐菌素123456789", 11, '...', false ),
+                       'truncate does not chop Unicode characters in half'
+               );
+               $this->assertEquals(
+                       "\n泰乐菌...",
+                       $this->getLang()->truncate( "\n泰乐菌素123456789", 12, '...', false ),
+                       'truncate does not chop Unicode characters in half if there is a preceding newline'
+               );
        }
 
        /**