jquery.byteLimit: Handle characters outside BMP (surrogate pairs) when trimming

author Bartosz Dziewoński <matma.rex@gmail.com>

Mon, 5 Feb 2018 18:38:10 +0000 (19:38 +0100)

committer Bartosz Dziewoński <matma.rex@gmail.com>

Mon, 19 Feb 2018 21:56:45 +0000 (22:56 +0100)
author Bartosz Dziewoński <matma.rex@gmail.com>
Mon, 5 Feb 2018 18:38:10 +0000 (19:38 +0100)
committer Bartosz Dziewoński <matma.rex@gmail.com>
Mon, 19 Feb 2018 21:56:45 +0000 (22:56 +0100)
diff --git a/resources/src/jquery/jquery.byteLimit.js b/resources/src/jquery/jquery.byteLimit.js

index c75246c..3ce6e7f 100644 (file)
--- a/resources/src/jquery/jquery.byteLimit.js
+++ b/resources/src/jquery/jquery.byteLimit.js
@@ -14,6 +14,20 @@
                 'blur.byteLimit'
         ].join( ' ' );
  
+       // Like String#charAt, but return the pair of UTF-16 surrogates for characters outside of BMP.
+       function codePointAt( string, offset, backwards ) {
+               // We don't need to check for offsets at the beginning or end of string,
+               // String#slice will simply return a shorter (or empty) substring.
+               var maybePair = backwards ?
+                       string.slice( offset - 1, offset + 1 ) :
+                       string.slice( offset, offset + 2 );
+               if ( /^[\uD800-\uDBFF][\uDC00-\uDFFF]$/.test( maybePair ) ) {
+                       return maybePair;
+               } else {
+                       return string.charAt( offset );
+               }
+       }
+
         /**
          * Utility function to trim down a string, based on byteLimit
          * and given a safe start position. It supports insertion anywhere
@@ -32,7 +46,7 @@
          * @return {boolean} return.trimmed
          */
         $.trimByteLength = function ( safeVal, newVal, byteLimit, fn ) {
-               var startMatches, endMatches, matchesLen, inpParts,
+               var startMatches, endMatches, matchesLen, inpParts, chopOff, oldChar, newChar,
                         oldVal = safeVal;
  
                 // Run the hook if one was provided, but only on the length
@@ -61,18 +75,22 @@
  
                 // Count same characters from the left, first.
                 // (if "foo" -> "foofoo", assume addition was at the end).
-               while (
-                       startMatches < matchesLen &&
-                       oldVal.charAt( startMatches ) === newVal.charAt( startMatches )
-               ) {
-                       startMatches += 1;
+               while ( startMatches < matchesLen ) {
+                       oldChar = codePointAt( oldVal, startMatches, false );
+                       newChar = codePointAt( newVal, startMatches, false );
+                       if ( oldChar !== newChar ) {
+                               break;
+                       }
+                       startMatches += oldChar.length;
                 }
  
-               while (
-                       endMatches < ( matchesLen - startMatches ) &&
-                       oldVal.charAt( oldVal.length - 1 - endMatches ) === newVal.charAt( newVal.length - 1 - endMatches )
-               ) {
-                       endMatches += 1;
+               while ( endMatches < ( matchesLen - startMatches ) ) {
+                       oldChar = codePointAt( oldVal, oldVal.length - 1 - endMatches, true );
+                       newChar = codePointAt( newVal, newVal.length - 1 - endMatches, true );
+                       if ( oldChar !== newChar ) {
+                               break;
+                       }
+                       endMatches += oldChar.length;
                 }
  
                 inpParts = [
@@ -89,11 +107,15 @@
                 if ( fn ) {
                         // stop, when there is nothing to slice - T43450
                         while ( $.byteLength( fn( inpParts.join( '' ) ) ) > byteLimit && inpParts[ 1 ].length > 0 ) {
-                               inpParts[ 1 ] = inpParts[ 1 ].slice( 0, -1 );
+                               // Do not chop off halves of surrogate pairs
+                               chopOff = /[\uD800-\uDBFF][\uDC00-\uDFFF]$/.test( inpParts[ 1 ] ) ? 2 : 1;
+                               inpParts[ 1 ] = inpParts[ 1 ].slice( 0, -chopOff );
                         }
                 } else {
                         while ( $.byteLength( inpParts.join( '' ) ) > byteLimit ) {
-                               inpParts[ 1 ] = inpParts[ 1 ].slice( 0, -1 );
+                               // Do not chop off halves of surrogate pairs
+                               chopOff = /[\uD800-\uDBFF][\uDC00-\uDFFF]$/.test( inpParts[ 1 ] ) ? 2 : 1;
+                               inpParts[ 1 ] = inpParts[ 1 ].slice( 0, -chopOff );
                         }
                 }
  
diff --git a/tests/qunit/suites/resources/jquery/jquery.byteLimit.test.js b/tests/qunit/suites/resources/jquery/jquery.byteLimit.test.js

index 8555a7e..1a660cf 100644 (file)
--- a/tests/qunit/suites/resources/jquery/jquery.byteLimit.test.js
+++ b/tests/qunit/suites/resources/jquery/jquery.byteLimit.test.js
@@ -1,5 +1,5 @@
  ( function ( $, mw ) {
-       var simpleSample, U_20AC, mbSample;
+       var simpleSample, U_20AC, poop, mbSample;
  
         QUnit.module( 'jquery.byteLimit', QUnit.newMwEnvironment() );
  
@@ -9,6 +9,9 @@
         // 3 bytes (euro-symbol)
         U_20AC = '\u20AC';
  
+       // Outside of the BMP (pile of poo emoji)
+       poop = '\uD83D\uDCA9'; // "💩"
+
         // Multi-byte sample (22 chars, 26 bytes)
         mbSample = '1234567890' + U_20AC + '1234567890' + U_20AC;
  
@@ -109,6 +112,14 @@
                 expected: '1234567890' + U_20AC + '1'
         } );
  
+       byteLimitTest( {
+               description: 'Limit using a custom value (multibyte, outside BMP)',
+               $input: $( '<input>' ).attr( 'type', 'text' )
+                       .byteLimit( 3 ),
+               sample: poop,
+               expected: ''
+       } );
+
         byteLimitTest( {
                 description: 'Limit using a custom value (multibyte) overlapping a byte',
                 $input: $( '<input>' ).attr( 'type', 'text' )
@@ -245,4 +256,33 @@
  
                 assert.strictEqual( $el.val(), 'abc', 'Trim from the insertion point (at 1), not the end' );
         } );
+
+       QUnit.test( 'Do not cut up false matching substrings in emoji insertions', function ( assert ) {
+               var $el,
+                       oldVal = '\uD83D\uDCA9\uD83D\uDCA9', // "💩💩"
+                       newVal = '\uD83D\uDCA9\uD83D\uDCB9\uD83E\uDCA9\uD83D\uDCA9', // "💩💹🢩💩"
+                       expected = '\uD83D\uDCA9\uD83D\uDCB9\uD83D\uDCA9'; // "💩💹💩"
+
+               // Possible bad results:
+               // * With no surrogate support:
+               //   '\uD83D\uDCA9\uD83D\uDCB9\uD83E\uDCA9' "💩💹🢩"
+               // * With correct trimming but bad detection of inserted text:
+               //   '\uD83D\uDCA9\uD83D\uDCB9\uDCA9' "💩💹�"
+
+               $el = $( '<input>' ).attr( 'type', 'text' )
+                       .appendTo( '#qunit-fixture' )
+                       .byteLimit( 12 )
+                       .val( oldVal ).trigger( 'change' )
+                       .val( newVal ).trigger( 'change' );
+
+               assert.strictEqual( $el.val(), expected, 'Pasted emoji correctly trimmed at the end' );
+       } );
+
+       byteLimitTest( {
+               description: 'Unpaired surrogates do not crash',
+               $input: $( '<input>' ).attr( 'type', 'text' ).byteLimit( 4 ),
+               sample: '\uD800\uD800\uDFFF',
+               expected: '\uD800'
+       } );
+
  }( jQuery, mediaWiki ) );
author	Bartosz Dziewoński <matma.rex@gmail.com>
	Mon, 5 Feb 2018 18:38:10 +0000 (19:38 +0100)
committer	Bartosz Dziewoński <matma.rex@gmail.com>
	Mon, 19 Feb 2018 21:56:45 +0000 (22:56 +0100)
resources/src/jquery/jquery.byteLimit.js		patch | blob | history
tests/qunit/suites/resources/jquery/jquery.byteLimit.test.js		patch | blob | history