JS: Fixed UTF-8 string encoder/decoder for high codepoints.

author: Wojciech Mandrysz <tetek1@gmail.com> 2016-08-02 17:12:27 +0200
committer: Wojciech Mandrysz <tetek1@gmail.com> 2016-10-03 01:44:15 +0200
commit: 23f108d471ec6488efbef1a5b96d7a2abbf785c2 (patch)
tree: 39d72c7c5cd46a488c450152c54aed8a2ec87ba7 /js/binary/decoder.js
parent: 8d8115bf524044f2ab5928d7b4050d04f64d3f1c (diff)
1 files changed, 28 insertions, 13 deletions
diff --git a/js/binary/decoder.js b/js/binary/decoder.js
index 41094a36..62f7b8b6 100644
--- a/js/binary/decoder.js
+++ b/js/binary/decoder.js
@@ -895,11 +895,9 @@ jspb.BinaryDecoder.prototype.readEnum = function() {
 
 /**
  * Reads and parses a UTF-8 encoded unicode string from the stream.
- * The code is inspired by maps.vectortown.parse.StreamedDataViewReader, with
- * the exception that the implementation here does not get confused if it
- * encounters characters longer than three bytes. These characters are ignored
- * though, as they are extremely rare: three UTF-8 bytes cover virtually all
- * characters in common use (http://en.wikipedia.org/wiki/UTF-8).
+ * The code is inspired by maps.vectortown.parse.StreamedDataViewReader.
+ * Supports codepoints from U+0000 up to U+10FFFF
+ * (http://en.wikipedia.org/wiki/UTF-8).
  * @param {number} length The length of the string to read.
  * @return {string} The decoded string.
  */
@@ -907,30 +905,47 @@ jspb.BinaryDecoder.prototype.readString = function(length) {
   var bytes = this.bytes_;
   var cursor = this.cursor_;
   var end = cursor + length;
-  var chars = [];
+  var codepoints = [];
 
   while (cursor < end) {
     var c = bytes[cursor++];
     if (c < 128) { // Regular 7-bit ASCII.
-      chars.push(c);
+      codepoints.push(c);
     } else if (c < 192) {
       // UTF-8 continuation mark. We are out of sync. This
       // might happen if we attempted to read a character
-      // with more than three bytes.
+      // with more than four bytes.
       continue;
     } else if (c < 224) { // UTF-8 with two bytes.
       var c2 = bytes[cursor++];
-      chars.push(((c & 31) << 6) | (c2 & 63));
+      codepoints.push(((c & 31) << 6) | (c2 & 63));
     } else if (c < 240) { // UTF-8 with three bytes.
       var c2 = bytes[cursor++];
       var c3 = bytes[cursor++];
-      chars.push(((c & 15) << 12) | ((c2 & 63) << 6) | (c3 & 63));
+      codepoints.push(((c & 15) << 12) | ((c2 & 63) << 6) | (c3 & 63));
+    } else if (c < 248) { // UTF-8 with 4 bytes.
+      var c2 = bytes[cursor++];
+      var c3 = bytes[cursor++];
+      var c4 = bytes[cursor++];
+      // A 4-byte sequence carries a 21-bit codepoint. That does not fit in a
+      // single 16-bit UTF-16 code unit, so we emit a surrogate pair.
+      var codepoint = ((c & 7) << 18) | ((c2 & 63) << 12) | ((c3 & 63) << 6) | (c4 & 63);
+      // Surrogates formula from wikipedia.
+      // 1. Subtract 0x10000 from codepoint
+      codepoint -= 65536;
+      // 2. Split this into the high 10-bit value and the low 10-bit value
+      var low = codepoint & 1023;
+      var high = (codepoint >> 10) & 1023;
+      // 3. Add 0xD800 to the high value to form the high surrogate
+      high += 55296;
+      // 4. Add 0xDC00 to the low value to form the low surrogate:
+      low += 56320;
+      codepoints.push(high);
+      codepoints.push(low);
     }
   }
 
-  // String.fromCharCode.apply is faster than manually appending characters on
-  // Chrome 25+, and generates no additional cons string garbage.
-  var result = String.fromCharCode.apply(null, chars);
+  var result = String.fromCodePoint.apply(null, codepoints);
   this.cursor_ = cursor;
   return result;
 };
author	Wojciech Mandrysz <tetek1@gmail.com>	2016-08-02 17:12:27 +0200
committer	Wojciech Mandrysz <tetek1@gmail.com>	2016-10-03 01:44:15 +0200
commit	23f108d471ec6488efbef1a5b96d7a2abbf785c2 (patch)
tree	39d72c7c5cd46a488c450152c54aed8a2ec87ba7 /js/binary/decoder.js
parent	8d8115bf524044f2ab5928d7b4050d04f64d3f1c (diff)