jquery.byteLength: Improve documentation and tests
author David Chan <david@sheetmusic.org.uk>
Fri, 6 Sep 2013 02:08:36 +0000 (19:08 -0700)
committer David Chan <david@sheetmusic.org.uk>
Sat, 7 Sep 2013 01:23:35 +0000 (18:23 -0700)
Change-Id: I6793487b7cd9f58b23554bc29c853bd3f02da49c

resources/jquery/jquery.byteLength.js
tests/qunit/suites/resources/jquery/jquery.byteLength.test.js

index 3d5b720..398937e 100644 (file)
@@ -4,6 +4,8 @@
  * Calculate the byte length of a string (accounting for UTF-8).
  *
  * @author Jan Paul Posma, 2011
+ * @author Timo Tijhof, 2012
+ * @author David Chan, 2013
  */
 jQuery.byteLength = function ( str ) {
 
@@ -12,8 +14,18 @@ jQuery.byteLength = function ( str ) {
        // Note, surrogate (\uD800-\uDFFF) characters are counted as 2 bytes, since there are two of them
        // and the actual character takes 4 bytes in UTF-8 (2*2=4). Might not work perfectly in
        // edge cases such as illegal sequences, but that should never happen.
+
+       // https://en.wikipedia.org/wiki/UTF-8#Description
+       // The mapping from UTF-16 code units to UTF-8 bytes is as follows:
+       // > Range 0000-007F: codepoints that become 1 byte of UTF-8
+       // > Range 0080-07FF: codepoints that become 2 bytes of UTF-8
+       // > Range 0800-D7FF: codepoints that become 3 bytes of UTF-8
+       // > Range D800-DFFF: Surrogates (each pair becomes 4 bytes of UTF-8)
+       // > Range E000-FFFF: codepoints that become 3 bytes of UTF-8 (continued)
+
        return str
                .replace( /[\u0080-\u07FF\uD800-\uDFFF]/g, '**' )
                .replace( /[\u0800-\uD7FF\uE000-\uFFFF]/g, '***' )
                .length;
+
 };
index e4e579b..e6aa3aa 100644 (file)
 
        } );
 
-       QUnit.test( 'Special text', 5, function ( assert ) {
-               // http://en.wikipedia.org/wiki/UTF-8
+       QUnit.test( 'Special text', 4, function ( assert ) {
+               // https://en.wikipedia.org/wiki/UTF-8
                var u0024 = '$',
+                       // Cent symbol
                        u00A2 = '\u00A2',
+                       // Euro symbol
                        u20AC = '\u20AC',
-                       u024B62 = '\u024B62',
-                       // The normal one doesn't display properly, try the below which is the same
-                       // according to http://www.fileformat.info/info/unicode/char/24B62/index.htm
-                       u024B62alt = '\uD852\uDF62';
+                       // Character \U00024B62 (Han script) can't be represented in JavaScript as a single
+                       // UTF-16 code unit; instead it is encoded as a surrogate pair of two code units.
+                       // http://codepoints.net/U+24B62
+                       // http://www.fileformat.info/info/unicode/char/24B62/index.htm
+                       u024B62 = '\uD852\uDF62';
 
-               assert.strictEqual( $.byteLength( u0024 ), 1, 'U+0024: 1 byte. $ (dollar sign)' );
-               assert.strictEqual( $.byteLength( u00A2 ), 2, 'U+00A2: 2 bytes. \u00A2 (cent sign)' );
-               assert.strictEqual( $.byteLength( u20AC ), 3, 'U+20AC: 3 bytes. \u20AC (euro sign)' );
-               assert.strictEqual( $.byteLength( u024B62 ), 4, 'U+024B62: 4 bytes. \uD852\uDF62 (a Han character)' );
-               assert.strictEqual( $.byteLength( u024B62alt ), 4, 'U+024B62: 4 bytes. \uD852\uDF62 (a Han character) - alternative method' );
+               assert.strictEqual( $.byteLength( u0024 ), 1, 'U+0024' );
+               assert.strictEqual( $.byteLength( u00A2 ), 2, 'U+00A2' );
+               assert.strictEqual( $.byteLength( u20AC ), 3, 'U+20AC' );
+               assert.strictEqual( $.byteLength( u024B62 ), 4, 'U+024B62 (surrogate pair: \\uD852\\uDF62)' );
        } );
 }( jQuery ) );