From e1ed8a7b8992ca27e9b435fdcdfaa97bf429c32d Mon Sep 17 00:00:00 2001
From: David Chan <david@sheetmusic.org.uk>
Date: Thu, 5 Sep 2013 19:08:36 -0700
Subject: [PATCH] jquery.byteLength: Improve documentation and tests

Change-Id: I6793487b7cd9f58b23554bc29c853bd3f02da49c
---
 resources/jquery/jquery.byteLength.js         | 12 ++++++++++
 .../jquery/jquery.byteLength.test.js          | 24 ++++++++++---------
 2 files changed, 25 insertions(+), 11 deletions(-)

diff --git a/resources/jquery/jquery.byteLength.js b/resources/jquery/jquery.byteLength.js
index 3d5b720677..398937e6c0 100644
--- a/resources/jquery/jquery.byteLength.js
+++ b/resources/jquery/jquery.byteLength.js
@@ -4,6 +4,8 @@
  * Calculate the byte length of a string (accounting for UTF-8).
  *
  * @author Jan Paul Posma, 2011
+ * @author Timo Tijhof, 2012
+ * @author David Chan, 2013
  */
 jQuery.byteLength = function ( str ) {
 
@@ -12,8 +14,18 @@ jQuery.byteLength = function ( str ) {
 	// Note, surrogate (\uD800-\uDFFF) characters are counted as 2 bytes, since there's two of them
 	// and the actual character takes 4 bytes in UTF-8 (2*2=4). Might not work perfectly in
 	// edge cases such as illegal sequences, but that should never happen.
+
+	// https://en.wikipedia.org/wiki/UTF-8#Description
+	// The mapping from UTF-16 code units to UTF-8 bytes is as follows:
+	// > Range 0000-007F: codepoints that become 1 byte of UTF-8
+	// > Range 0080-07FF: codepoints that become 2 bytes of UTF-8
+	// > Range 0800-D7FF: codepoints that become 3 bytes of UTF-8
+	// > Range D800-DFFF: Surrogates (each pair becomes 4 bytes of UTF-8)
+	// > Range E000-FFFF: codepoints that become 3 bytes of UTF-8 (continued)
+
 	return str
 		.replace( /[\u0080-\u07FF\uD800-\uDFFF]/g, '**' )
 		.replace( /[\u0800-\uD7FF\uE000-\uFFFF]/g, '***' )
 		.length;
+
 };
diff --git a/tests/qunit/suites/resources/jquery/jquery.byteLength.test.js b/tests/qunit/suites/resources/jquery/jquery.byteLength.test.js
index e4e579b063..e6aa3aa8e0 100644
--- a/tests/qunit/suites/resources/jquery/jquery.byteLength.test.js
+++ b/tests/qunit/suites/resources/jquery/jquery.byteLength.test.js
@@ -16,20 +16,22 @@
 
 	} );
 
-	QUnit.test( 'Special text', 5, function ( assert ) {
-		// http://en.wikipedia.org/wiki/UTF-8
+	QUnit.test( 'Special text', 4, function ( assert ) {
+		// https://en.wikipedia.org/wiki/UTF-8
 		var u0024 = '$',
+			// Cent symbol
 			u00A2 = '\u00A2',
+			// Euro symbol
 			u20AC = '\u20AC',
-			u024B62 = '\u024B62',
-			// The normal one doesn't display properly, try the below which is the same
-			// according to http://www.fileformat.info/info/unicode/char/24B62/index.htm
-			u024B62alt = '\uD852\uDF62';
+			// Character U+24B62 (Han script) can't be represented in JavaScript as a single
+			// UTF-16 code unit; instead it is encoded as a surrogate pair of two code units.
+			// http://codepoints.net/U+24B62
+			// http://www.fileformat.info/info/unicode/char/24B62/index.htm
+			u024B62 = '\uD852\uDF62';
 
-		assert.strictEqual( $.byteLength( u0024 ), 1, 'U+0024: 1 byte. $ (dollar sign)' );
-		assert.strictEqual( $.byteLength( u00A2 ), 2, 'U+00A2: 2 bytes. \u00A2 (cent sign)' );
-		assert.strictEqual( $.byteLength( u20AC ), 3, 'U+20AC: 3 bytes. \u20AC (euro sign)' );
-		assert.strictEqual( $.byteLength( u024B62 ), 4, 'U+024B62: 4 bytes. \uD852\uDF62 (a Han character)' );
-		assert.strictEqual( $.byteLength( u024B62alt ), 4, 'U+024B62: 4 bytes. \uD852\uDF62 (a Han character) - alternative method' );
+		assert.strictEqual( $.byteLength( u0024 ), 1, 'U+0024' );
+		assert.strictEqual( $.byteLength( u00A2 ), 2, 'U+00A2' );
+		assert.strictEqual( $.byteLength( u20AC ), 3, 'U+20AC' );
+		assert.strictEqual( $.byteLength( u024B62 ), 4, 'U+024B62 (surrogate pair: \\uD852\\uDF62)' );
 	} );
 }( jQuery ) );
-- 
2.20.1