jquery.byteLimit: Handle characters outside BMP (surrogate pairs) when trimming
authorBartosz Dziewoński <matma.rex@gmail.com>
Mon, 5 Feb 2018 18:38:10 +0000 (19:38 +0100)
committerBartosz Dziewoński <matma.rex@gmail.com>
Mon, 19 Feb 2018 21:56:45 +0000 (22:56 +0100)
Bug: T186364
Change-Id: I6282d97bcd637ae8e86d70996adb468582c8f02f

resources/src/jquery/jquery.byteLimit.js
tests/qunit/suites/resources/jquery/jquery.byteLimit.test.js

index c75246c..3ce6e7f 100644 (file)
                'blur.byteLimit'
        ].join( ' ' );
 
+       // Like String#charAt, but return the pair of UTF-16 surrogates for characters outside of BMP.
+       function codePointAt( string, offset, backwards ) {
+               // We don't need to check for offsets at the beginning or end of string,
+               // String#slice will simply return a shorter (or empty) substring.
+               var maybePair = backwards ?
+                       string.slice( offset - 1, offset + 1 ) :
+                       string.slice( offset, offset + 2 );
+               if ( /^[\uD800-\uDBFF][\uDC00-\uDFFF]$/.test( maybePair ) ) {
+                       return maybePair;
+               } else {
+                       return string.charAt( offset );
+               }
+       }
+
        /**
         * Utility function to trim down a string, based on byteLimit
         * and given a safe start position. It supports insertion anywhere
@@ -32,7 +46,7 @@
         * @return {boolean} return.trimmed
         */
        $.trimByteLength = function ( safeVal, newVal, byteLimit, fn ) {
-               var startMatches, endMatches, matchesLen, inpParts,
+               var startMatches, endMatches, matchesLen, inpParts, chopOff, oldChar, newChar,
                        oldVal = safeVal;
 
                // Run the hook if one was provided, but only on the length
 
                // Count same characters from the left, first.
                // (if "foo" -> "foofoo", assume addition was at the end).
-               while (
-                       startMatches < matchesLen &&
-                       oldVal.charAt( startMatches ) === newVal.charAt( startMatches )
-               ) {
-                       startMatches += 1;
+               while ( startMatches < matchesLen ) {
+                       oldChar = codePointAt( oldVal, startMatches, false );
+                       newChar = codePointAt( newVal, startMatches, false );
+                       if ( oldChar !== newChar ) {
+                               break;
+                       }
+                       startMatches += oldChar.length;
                }
 
-               while (
-                       endMatches < ( matchesLen - startMatches ) &&
-                       oldVal.charAt( oldVal.length - 1 - endMatches ) === newVal.charAt( newVal.length - 1 - endMatches )
-               ) {
-                       endMatches += 1;
+               while ( endMatches < ( matchesLen - startMatches ) ) {
+                       oldChar = codePointAt( oldVal, oldVal.length - 1 - endMatches, true );
+                       newChar = codePointAt( newVal, newVal.length - 1 - endMatches, true );
+                       if ( oldChar !== newChar ) {
+                               break;
+                       }
+                       endMatches += oldChar.length;
                }
 
                inpParts = [
                if ( fn ) {
                        // stop, when there is nothing to slice - T43450
                        while ( $.byteLength( fn( inpParts.join( '' ) ) ) > byteLimit && inpParts[ 1 ].length > 0 ) {
-                               inpParts[ 1 ] = inpParts[ 1 ].slice( 0, -1 );
+                               // Do not chop off halves of surrogate pairs
+                               chopOff = /[\uD800-\uDBFF][\uDC00-\uDFFF]$/.test( inpParts[ 1 ] ) ? 2 : 1;
+                               inpParts[ 1 ] = inpParts[ 1 ].slice( 0, -chopOff );
                        }
                } else {
                        while ( $.byteLength( inpParts.join( '' ) ) > byteLimit ) {
-                               inpParts[ 1 ] = inpParts[ 1 ].slice( 0, -1 );
+                               // Do not chop off halves of surrogate pairs
+                               chopOff = /[\uD800-\uDBFF][\uDC00-\uDFFF]$/.test( inpParts[ 1 ] ) ? 2 : 1;
+                               inpParts[ 1 ] = inpParts[ 1 ].slice( 0, -chopOff );
                        }
                }
 
index 8555a7e..1a660cf 100644 (file)
@@ -1,5 +1,5 @@
 ( function ( $, mw ) {
-       var simpleSample, U_20AC, mbSample;
+       var simpleSample, U_20AC, poop, mbSample;
 
        QUnit.module( 'jquery.byteLimit', QUnit.newMwEnvironment() );
 
@@ -9,6 +9,9 @@
        // 3 bytes (euro-symbol)
        U_20AC = '\u20AC';
 
+       // Outside of the BMP (pile of poo emoji)
+       poop = '\uD83D\uDCA9'; // "💩"
+
        // Multi-byte sample (22 chars, 26 bytes)
        mbSample = '1234567890' + U_20AC + '1234567890' + U_20AC;
 
                expected: '1234567890' + U_20AC + '1'
        } );
 
+       byteLimitTest( {
+               description: 'Limit using a custom value (multibyte, outside BMP)',
+               $input: $( '<input>' ).attr( 'type', 'text' )
+                       .byteLimit( 3 ),
+               sample: poop,
+               expected: ''
+       } );
+
        byteLimitTest( {
                description: 'Limit using a custom value (multibyte) overlapping a byte',
                $input: $( '<input>' ).attr( 'type', 'text' )
 
                assert.strictEqual( $el.val(), 'abc', 'Trim from the insertion point (at 1), not the end' );
        } );
+
+       QUnit.test( 'Do not cut up false matching substrings in emoji insertions', function ( assert ) {
+               var $el,
+                       oldVal = '\uD83D\uDCA9\uD83D\uDCA9', // "💩💩"
+                       newVal = '\uD83D\uDCA9\uD83D\uDCB9\uD83E\uDCA9\uD83D\uDCA9', // "💩💹🢩💩"
+                       expected = '\uD83D\uDCA9\uD83D\uDCB9\uD83D\uDCA9'; // "💩💹💩"
+
+               // Possible bad results:
+               // * With no surrogate support:
+               //   '\uD83D\uDCA9\uD83D\uDCB9\uD83E\uDCA9' "💩💹🢩"
+               // * With correct trimming but bad detection of inserted text:
+               //   '\uD83D\uDCA9\uD83D\uDCB9\uDCA9' "💩💹�"
+
+               $el = $( '<input>' ).attr( 'type', 'text' )
+                       .appendTo( '#qunit-fixture' )
+                       .byteLimit( 12 )
+                       .val( oldVal ).trigger( 'change' )
+                       .val( newVal ).trigger( 'change' );
+
+               assert.strictEqual( $el.val(), expected, 'Pasted emoji correctly trimmed at the end' );
+       } );
+
+       byteLimitTest( {
+               description: 'Unpaired surrogates do not crash',
+               $input: $( '<input>' ).attr( 'type', 'text' ).byteLimit( 4 ),
+               sample: '\uD800\uD800\uDFFF',
+               expected: '\uD800'
+       } );
+
 }( jQuery, mediaWiki ) );