From e1ed8a7b8992ca27e9b435fdcdfaa97bf429c32d Mon Sep 17 00:00:00 2001 From: David Chan Date: Thu, 5 Sep 2013 19:08:36 -0700 Subject: [PATCH] jquery.byteLength: Improve documentation and tests Change-Id: I6793487b7cd9f58b23554bc29c853bd3f02da49c --- resources/jquery/jquery.byteLength.js | 12 ++++++++++ .../jquery/jquery.byteLength.test.js | 24 ++++++++++--------- 2 files changed, 25 insertions(+), 11 deletions(-) diff --git a/resources/jquery/jquery.byteLength.js b/resources/jquery/jquery.byteLength.js index 3d5b720677..398937e6c0 100644 --- a/resources/jquery/jquery.byteLength.js +++ b/resources/jquery/jquery.byteLength.js @@ -4,6 +4,8 @@ * Calculate the byte length of a string (accounting for UTF-8). * * @author Jan Paul Posma, 2011 + * @author Timo Tijhof, 2012 + * @author David Chan, 2013 */ jQuery.byteLength = function ( str ) { @@ -12,8 +14,18 @@ jQuery.byteLength = function ( str ) { // Note, surrogate (\uD800-\uDFFF) characters are counted as 2 bytes, since there's two of them // and the actual character takes 4 bytes in UTF-8 (2*2=4). Might not work perfectly in // edge cases such as illegal sequences, but that should never happen. 
+ + // https://en.wikipedia.org/wiki/UTF-8#Description + // The mapping from UTF-16 code units to UTF-8 bytes is as follows: + // > Range 0000-007F: codepoints that become 1 byte of UTF-8 + // > Range 0080-07FF: codepoints that become 2 bytes of UTF-8 + // > Range 0800-D7FF: codepoints that become 3 bytes of UTF-8 + // > Range D800-DFFF: Surrogates (each pair becomes 4 bytes of UTF-8) + // > Range E000-FFFF: codepoints that become 3 bytes of UTF-8 (continued) + return str .replace( /[\u0080-\u07FF\uD800-\uDFFF]/g, '**' ) .replace( /[\u0800-\uD7FF\uE000-\uFFFF]/g, '***' ) .length; + }; diff --git a/tests/qunit/suites/resources/jquery/jquery.byteLength.test.js b/tests/qunit/suites/resources/jquery/jquery.byteLength.test.js index e4e579b063..e6aa3aa8e0 100644 --- a/tests/qunit/suites/resources/jquery/jquery.byteLength.test.js +++ b/tests/qunit/suites/resources/jquery/jquery.byteLength.test.js @@ -16,20 +16,22 @@ } ); - QUnit.test( 'Special text', 5, function ( assert ) { - // http://en.wikipedia.org/wiki/UTF-8 + QUnit.test( 'Special text', 4, function ( assert ) { + // https://en.wikipedia.org/wiki/UTF-8 var u0024 = '$', + // Cent symbol u00A2 = '\u00A2', + // Euro symbol u20AC = '\u20AC', - u024B62 = '\u024B62', - // The normal one doesn't display properly, try the below which is the same - // according to http://www.fileformat.info/info/unicode/char/24B62/index.htm - u024B62alt = '\uD852\uDF62'; + // Character \U00024B62 (Han script) can't be represented in JavaScript as a single + // UTF-16 code unit; instead it is encoded as a surrogate pair of two separate code units. + // http://codepoints.net/U+24B62 + // http://www.fileformat.info/info/unicode/char/24B62/index.htm + u024B62 = '\uD852\uDF62'; - assert.strictEqual( $.byteLength( u0024 ), 1, 'U+0024: 1 byte. $ (dollar sign)' ); - assert.strictEqual( $.byteLength( u00A2 ), 2, 'U+00A2: 2 bytes. \u00A2 (cent sign)' ); - assert.strictEqual( $.byteLength( u20AC ), 3, 'U+20AC: 3 bytes. 
\u20AC (euro sign)' ); - assert.strictEqual( $.byteLength( u024B62 ), 4, 'U+024B62: 4 bytes. \uD852\uDF62 (a Han character)' ); - assert.strictEqual( $.byteLength( u024B62alt ), 4, 'U+024B62: 4 bytes. \uD852\uDF62 (a Han character) - alternative method' ); + assert.strictEqual( $.byteLength( u0024 ), 1, 'U+0024' ); + assert.strictEqual( $.byteLength( u00A2 ), 2, 'U+00A2' ); + assert.strictEqual( $.byteLength( u20AC ), 3, 'U+20AC' ); + assert.strictEqual( $.byteLength( u024B62 ), 4, 'U+024B62 (surrogate pair: \\uD852\\uDF62)' ); } ); }( jQuery ) ); -- 2.20.1