Truncate tag filter descriptions
authorpetarpetkovic <ppetkovic@wikimedia.org>
Wed, 6 Dec 2017 15:59:30 +0000 (16:59 +0100)
committerpetarpetkovic <ppetkovic@wikimedia.org>
Fri, 9 Feb 2018 21:45:20 +0000 (22:45 +0100)
Introduce truncateInternal() method in Language class, based on
existing truncate() method. New method abstracts string truncation,
allowing users to specify callable functions for text length measurement
and string truncation.

New method, truncateInternal(), is used to provide two options for
text truncation:
* For DB usage: truncateForDatabase() method is truncating text by
number of bytes.
* For UI usage: truncateForVisual() method is truncating text by number
of characters, using multibyte string PHP methods.

Old truncate() method is deprecated and just returns the results of
truncateForDatabase() method.

Newly introduced truncateForVisual() method is used for
truncation of long tag descriptions in RCFilters menu.

Bug: T179626
Change-Id: Ib01a8c303304064dde3ce983b817d93a88a5affd

includes/changetags/ChangeTags.php
includes/specialpage/ChangesListSpecialPage.php
languages/Language.php
tests/phpunit/languages/LanguageTest.php

index 7e4dd00..b30b82d 100644 (file)
@@ -181,6 +181,28 @@ class ChangeTags {
                return $msg;
        }
 
+       /**
+        * Get truncated message for the tag's long description.
+        *
+        * @param string $tag Tag name.
+        * @param int $length Maximum length of truncated message, including ellipsis.
+        * @param IContextSource $context
+        *
+        * @return string Truncated long tag description.
+        */
+       public static function truncateTagDescription( $tag, $length, IContextSource $context ) {
+               $originalDesc = self::tagLongDescriptionMessage( $tag, $context );
+               // If there is no tag description, return empty string
+               if ( !$originalDesc ) {
+                       return '';
+               }
+
+               $taglessDesc = Sanitizer::stripAllTags( $originalDesc->parse() );
+               $escapedDesc = Sanitizer::escapeHtmlAllowEntities( $taglessDesc );
+
+               return $context->getLanguage()->truncateForVisual( $escapedDesc, $length );
+       }
+
        /**
         * Add tags to a change given its rc_id, rev_id and/or log_id
         *
index cf990c2..5aa1c6b 100644 (file)
@@ -33,6 +33,12 @@ use Wikimedia\Rdbms\IDatabase;
  * @ingroup SpecialPage
  */
 abstract class ChangesListSpecialPage extends SpecialPage {
+       /**
+        * Maximum length of a tag description in UTF-8 characters.
+        * Longer descriptions will be truncated.
+        */
+       const TAG_DESC_CHARACTER_LIMIT = 120;
+
        /**
         * Preference name for saved queries. Subclasses that use saved queries should override this.
         * @var string
@@ -794,15 +800,15 @@ abstract class ChangesListSpecialPage extends SpecialPage {
                                                isset( $explicitlyDefinedTags[ $tagName ] ) ||
                                                isset( $softwareActivatedTags[ $tagName ] )
                                        ) {
-                                               // Parse description
-                                               $desc = ChangeTags::tagLongDescriptionMessage( $tagName, $context );
-
                                                $result[] = [
                                                        'name' => $tagName,
                                                        'label' => Sanitizer::stripAllTags(
                                                                ChangeTags::tagDescription( $tagName, $context )
                                                        ),
-                                                       'description' => $desc ? Sanitizer::stripAllTags( $desc->parse() ) : '',
+                                                       'description' =>
+                                                               ChangeTags::truncateTagDescription(
+                                                                       $tagName, self::TAG_DESC_CHARACTER_LIMIT, $context
+                                                               ),
                                                        'cssClass' => Sanitizer::escapeClass( 'mw-tag-' . $tagName ),
                                                        'hits' => $hits,
                                                ];
index 27c9faf..252ce47 100644 (file)
@@ -3472,27 +3472,103 @@ class Language {
        }
 
        /**
-        * Truncate a string to a specified length in bytes, appending an optional
-        * string (e.g. for ellipses)
+        * This method is deprecated since 1.31 and kept as alias for truncateForDatabase, which
+        * has replaced it. This method provides truncation suitable for DB.
         *
         * The database offers limited byte lengths for some columns in the database;
         * multi-byte character sets mean we need to ensure that only whole characters
-        * are included, otherwise broken characters can be passed to the user
+        * are included, otherwise broken characters can be passed to the user.
         *
-        * If $length is negative, the string will be truncated from the beginning
+        * @deprecated since 1.31, use truncateForDatabase or truncateForVisual as appropriate.
         *
         * @param string $string String to truncate
-        * @param int $length Maximum length (including ellipses)
+        * @param int $length Maximum length (including ellipsis)
         * @param string $ellipsis String to append to the truncated text
         * @param bool $adjustLength Subtract length of ellipsis from $length.
         *      $adjustLength was introduced in 1.18, before that behaved as if false.
         * @return string
         */
        function truncate( $string, $length, $ellipsis = '...', $adjustLength = true ) {
+               return $this->truncateForDatabase( $string, $length, $ellipsis, $adjustLength );
+       }
+
+       /**
+        * Truncate a string to a specified length in bytes, appending an optional
+        * string (e.g. for ellipsis)
+        *
+        * If $length is negative, the string will be truncated from the beginning
+        *
+        * @since 1.31
+        *
+        * @param string $string String to truncate
+        * @param int $length Maximum length in bytes
+        * @param string $ellipsis String to append to the end of truncated text
+        * @param bool $adjustLength Subtract length of ellipsis from $length
+        *
+        * @return string
+        */
+       function truncateForDatabase( $string, $length, $ellipsis = '...', $adjustLength = true ) {
+               return $this->truncateInternal(
+                       $string, $length, $ellipsis, $adjustLength, 'strlen', 'substr'
+               );
+       }
+
+       /**
+        * Truncate a string to a specified number of characters, appending an optional
+        * string (e.g. for ellipsis).
+        *
+        * This provides multibyte version of truncate() method of this class, suitable for truncation
+        * based on number of characters, instead of number of bytes.
+        *
+        * If $length is negative, the string will be truncated from the beginning.
+        *
+        * @since 1.31
+        *
+        * @param string $string String to truncate
+        * @param int $length Maximum number of characters
+        * @param string $ellipsis String to append to the end of truncated text
+        * @param bool $adjustLength Subtract length of ellipsis from $length
+        *
+        * @return string
+        */
+       function truncateForVisual( $string, $length, $ellipsis = '...', $adjustLength = true ) {
+               // Passing encoding to mb_strlen and mb_substr is optional.
+               // Encoding defaults to mb_internal_encoding(), which is set to UTF-8 in Setup.php, so
+               // explicit specification of encoding is skipped.
+               // Note: Both multibyte methods are callables invoked in truncateInternal.
+               return $this->truncateInternal(
+                       $string, $length, $ellipsis, $adjustLength, 'mb_strlen', 'mb_substr'
+               );
+       }
+
+       /**
+        * Internal method used for truncation. This method abstracts text truncation into
+        * one common method, allowing users to provide length measurement function and
+        * function for finding substring.
+        *
+        * For usages, see truncateForDatabase and truncateForVisual.
+        *
+        * @param string $string String to truncate
+        * @param int $length Maximum length of final text
+        * @param string $ellipsis String to append to the end of truncated text
+        * @param bool $adjustLength Subtract length of ellipsis from $length
+        * @param callable $measureLength Callable function used for determining the length of text
+        * @param callable $getSubstring Callable function used for getting the substrings
+        *
+        * @return string
+        */
+       private function truncateInternal(
+               $string, $length, $ellipsis = '...', $adjustLength = true, $measureLength, $getSubstring
+       ) {
+               if ( !is_callable( $measureLength ) || !is_callable( $getSubstring ) ) {
+                       throw new InvalidArgumentException( 'Invalid callback provided' );
+               }
+
                # Check if there is no need to truncate
-               if ( strlen( $string ) <= abs( $length ) ) {
+               if ( $measureLength( $string ) <= abs( $length ) ) {
                        return $string; // no need to truncate
                }
+
                # Use the localized ellipsis character
                if ( $ellipsis == '...' ) {
                        $ellipsis = wfMessage( 'ellipsis' )->inLanguage( $this )->escaped();
@@ -3500,31 +3576,33 @@ class Language {
                if ( $length == 0 ) {
                        return $ellipsis; // convention
                }
+
                $stringOriginal = $string;
                # If ellipsis length is >= $length then we can't apply $adjustLength
-               if ( $adjustLength && strlen( $ellipsis ) >= abs( $length ) ) {
+               if ( $adjustLength && $measureLength( $ellipsis ) >= abs( $length ) ) {
                        $string = $ellipsis; // this can be slightly unexpected
                # Otherwise, truncate and add ellipsis...
                } else {
-                       $eLength = $adjustLength ? strlen( $ellipsis ) : 0;
+                       $ellipsisLength = $adjustLength ? $measureLength( $ellipsis ) : 0;
                        if ( $length > 0 ) {
-                               $length -= $eLength;
-                               $string = substr( $string, 0, $length ); // xyz...
+                               $length -= $ellipsisLength;
+                               $string = $getSubstring( $string, 0, $length ); // xyz...
                                $string = $this->removeBadCharLast( $string );
                                $string = rtrim( $string );
                                $string = $string . $ellipsis;
                        } else {
-                               $length += $eLength;
-                               $string = substr( $string, $length ); // ...xyz
+                               $length += $ellipsisLength;
+                               $string = $getSubstring( $string, $length ); // ...xyz
                                $string = $this->removeBadCharFirst( $string );
                                $string = ltrim( $string );
                                $string = $ellipsis . $string;
                        }
                }
+
                # Do not truncate if the ellipsis makes the string longer/equal (T24181).
                # This check is *not* redundant if $adjustLength, due to the single case where
                # LEN($ellipsis) > ABS($limit arg); $stringOriginal could be shorter than $string.
-               if ( strlen( $string ) < strlen( $stringOriginal ) ) {
+               if ( $measureLength( $string ) < $measureLength( $stringOriginal ) ) {
                        return $string;
                } else {
                        return $stringOriginal;
index 5cb5602..050ed83 100644 (file)
@@ -209,70 +209,104 @@ class LanguageTest extends LanguageClassesTestCase {
        }
 
        /**
-        * @covers Language::truncate
+        * @covers Language::truncateForDatabase
+        * @covers Language::truncateInternal
         */
-       public function testTruncate() {
+       public function testTruncateForDatabase() {
                $this->assertEquals(
                        "XXX",
-                       $this->getLang()->truncate( "1234567890", 0, 'XXX' ),
+                       $this->getLang()->truncateForDatabase( "1234567890", 0, 'XXX' ),
                        'truncate prefix, len 0, small ellipsis'
                );
 
                $this->assertEquals(
                        "12345XXX",
-                       $this->getLang()->truncate( "1234567890", 8, 'XXX' ),
+                       $this->getLang()->truncateForDatabase( "1234567890", 8, 'XXX' ),
                        'truncate prefix, small ellipsis'
                );
 
                $this->assertEquals(
                        "123456789",
-                       $this->getLang()->truncate( "123456789", 5, 'XXXXXXXXXXXXXXX' ),
+                       $this->getLang()->truncateForDatabase( "123456789", 5, 'XXXXXXXXXXXXXXX' ),
                        'truncate prefix, large ellipsis'
                );
 
                $this->assertEquals(
                        "XXX67890",
-                       $this->getLang()->truncate( "1234567890", -8, 'XXX' ),
+                       $this->getLang()->truncateForDatabase( "1234567890", -8, 'XXX' ),
                        'truncate suffix, small ellipsis'
                );
 
                $this->assertEquals(
                        "123456789",
-                       $this->getLang()->truncate( "123456789", -5, 'XXXXXXXXXXXXXXX' ),
+                       $this->getLang()->truncateForDatabase( "123456789", -5, 'XXXXXXXXXXXXXXX' ),
                        'truncate suffix, large ellipsis'
                );
                $this->assertEquals(
                        "123XXX",
-                       $this->getLang()->truncate( "123                ", 9, 'XXX' ),
+                       $this->getLang()->truncateForDatabase( "123                ", 9, 'XXX' ),
                        'truncate prefix, with spaces'
                );
                $this->assertEquals(
                        "12345XXX",
-                       $this->getLang()->truncate( "12345            8", 11, 'XXX' ),
+                       $this->getLang()->truncateForDatabase( "12345            8", 11, 'XXX' ),
                        'truncate prefix, with spaces and non-space ending'
                );
                $this->assertEquals(
                        "XXX234",
-                       $this->getLang()->truncate( "1              234", -8, 'XXX' ),
+                       $this->getLang()->truncateForDatabase( "1              234", -8, 'XXX' ),
                        'truncate suffix, with spaces'
                );
                $this->assertEquals(
                        "12345XXX",
-                       $this->getLang()->truncate( "1234567890", 5, 'XXX', false ),
+                       $this->getLang()->truncateForDatabase( "1234567890", 5, 'XXX', false ),
                        'truncate without adjustment'
                );
                $this->assertEquals(
                        "泰乐菌...",
-                       $this->getLang()->truncate( "泰乐菌素123456789", 11, '...', false ),
+                       $this->getLang()->truncateForDatabase( "泰乐菌素123456789", 11, '...', false ),
                        'truncate does not chop Unicode characters in half'
                );
                $this->assertEquals(
                        "\n泰乐菌...",
-                       $this->getLang()->truncate( "\n泰乐菌素123456789", 12, '...', false ),
+                       $this->getLang()->truncateForDatabase( "\n泰乐菌素123456789", 12, '...', false ),
                        'truncate does not chop Unicode characters in half if there is a preceding newline'
                );
        }
 
+       /**
+        * @dataProvider provideTruncateData
+        * @covers Language::truncateForVisual
+        * @covers Language::truncateInternal
+        */
+       public function testTruncateForVisual(
+               $expected, $string, $length, $ellipsis = '...', $adjustLength = true
+       ) {
+               $this->assertEquals(
+                       $expected,
+                       $this->getLang()->truncateForVisual( $string, $length, $ellipsis, $adjustLength )
+               );
+       }
+
+       /**
+        * @return array Format is ($expected, $string, $length, $ellipsis, $adjustLength)
+        */
+       public static function provideTruncateData() {
+               return [
+                       [ "XXX", "тестирам да ли ради", 0, "XXX" ],
+                       [ "testnXXX", "testni scenarij", 8, "XXX" ],
+                       [ "حالة اختبار", "حالة اختبار", 5, "XXXXXXXXXXXXXXX" ],
+                       [ "XXXедент", "прецедент", -8, "XXX" ],
+                       [ "XXപിൾ", "ആപ്പിൾ", -5, "XX" ],
+                       [ "神秘XXX", "神秘                ", 9, "XXX" ],
+                       [ "ΔημιουργXXX", "Δημιουργία           Σύμπαντος", 11, "XXX" ],
+                       [ "XXXの家です", "地球は私たちの唯               の家です", -8, "XXX" ],
+                       [ "زندگیXXX", "زندگی زیباست", 6, "XXX", false ],
+                       [ "ცხოვრება...", "ცხოვრება არის საოცარი", 8, "...", false ],
+                       [ "\nທ່ານ...", "\nທ່ານບໍ່ຮູ້ຫນັງສື", 5, "...", false ],
+               ];
+       }
+
        /**
         * @dataProvider provideHTMLTruncateData
         * @covers Language::truncateHTML