From 23f108d471ec6488efbef1a5b96d7a2abbf785c2 Mon Sep 17 00:00:00 2001 From: Wojciech Mandrysz Date: Tue, 2 Aug 2016 17:12:27 +0200 Subject: JS: Fixed UTF-8 string encoder/decoder for high codepoints. --- js/binary/decoder.js | 41 ++++++++++++++++++++++++++++------------- js/binary/encoder.js | 10 ++++++++-- 2 files changed, 36 insertions(+), 15 deletions(-) (limited to 'js') diff --git a/js/binary/decoder.js b/js/binary/decoder.js index 41094a36..62f7b8b6 100644 --- a/js/binary/decoder.js +++ b/js/binary/decoder.js @@ -895,11 +895,9 @@ jspb.BinaryDecoder.prototype.readEnum = function() { /** * Reads and parses a UTF-8 encoded unicode string from the stream. - * The code is inspired by maps.vectortown.parse.StreamedDataViewReader, with - * the exception that the implementation here does not get confused if it - * encounters characters longer than three bytes. These characters are ignored - * though, as they are extremely rare: three UTF-8 bytes cover virtually all - * characters in common use (http://en.wikipedia.org/wiki/UTF-8). + * The code is inspired by maps.vectortown.parse.StreamedDataViewReader. + * Supports codepoints from U+0000 up to U+10FFFF. + * (http://en.wikipedia.org/wiki/UTF-8). * @param {number} length The length of the string to read. * @return {string} The decoded string. */ @@ -907,30 +905,47 @@ jspb.BinaryDecoder.prototype.readString = function(length) { var bytes = this.bytes_; var cursor = this.cursor_; var end = cursor + length; - var chars = []; + var codepoints = []; while (cursor < end) { var c = bytes[cursor++]; if (c < 128) { // Regular 7-bit ASCII. - chars.push(c); + codepoints.push(c); } else if (c < 192) { // UTF-8 continuation mark. We are out of sync. This // might happen if we attempted to read a character - // with more than three bytes. + // with more than four bytes. continue; } else if (c < 224) { // UTF-8 with two bytes. var c2 = bytes[cursor++]; - chars.push(((c & 31) << 6) | (c2 & 63)); + codepoints.push(((c & 31) << 6) | (c2 & 63)); } else if (c < 240) { // UTF-8 with three bytes. var c2 = bytes[cursor++]; var c3 = bytes[cursor++]; - chars.push(((c & 15) << 12) | ((c2 & 63) << 6) | (c3 & 63)); + codepoints.push(((c & 15) << 12) | ((c2 & 63) << 6) | (c3 & 63)); + } else if (c < 248) { // UTF-8 with 4 bytes. + var c2 = bytes[cursor++]; + var c3 = bytes[cursor++]; + var c4 = bytes[cursor++]; + // Characters written on 4 bytes have 21 bits for a codepoint. + // We can't fit that on 16bit characters, so we use surrogates. + var codepoint = ((c & 7) << 18) | ((c2 & 63) << 12) | ((c3 & 63) << 6) | (c4 & 63); + // Surrogates formula from wikipedia. + // 1. Subtract 0x10000 from codepoint + codepoint -= 65536; + // 2. Split this into the high 10-bit value and the low 10-bit value + var low = codepoint & 1023; + var high = (codepoint >> 10) & 1023; + // 3. Add 0xD800 to the high value to form the high surrogate + high += 55296; + // 4. Add 0xDC00 to the low value to form the low surrogate: + low += 56320; + codepoints.push(high); + codepoints.push(low); } } - // String.fromCharCode.apply is faster than manually appending characters on - // Chrome 25+, and generates no additional cons string garbage. - var result = String.fromCharCode.apply(null, chars); + var result = String.fromCodePoint.apply(null, codepoints); this.cursor_ = cursor; return result; }; diff --git a/js/binary/encoder.js b/js/binary/encoder.js index c9b0c2ae..59c4ccb9 100644 --- a/js/binary/encoder.js +++ b/js/binary/encoder.js @@ -412,16 +412,22 @@ jspb.BinaryEncoder.prototype.writeString = function(value) { // UTF16 to UTF8 conversion loop swiped from goog.crypt.stringToUtf8ByteArray. for (var i = 0; i < value.length; i++) { - var c = value.charCodeAt(i); + var c = value.codePointAt(i); if (c < 128) { this.buffer_.push(c); } else if (c < 2048) { this.buffer_.push((c >> 6) | 192); this.buffer_.push((c & 63) | 128); - } else { + } else if (c < 65536) { this.buffer_.push((c >> 12) | 224); this.buffer_.push(((c >> 6) & 63) | 128); this.buffer_.push((c & 63) | 128); + } else { + this.buffer_.push((c >> 18) | 240); + this.buffer_.push(((c >> 12) & 63 ) | 128); + this.buffer_.push(((c >> 6) & 63) | 128); + this.buffer_.push((c & 63) | 128); + i++; } } -- cgit v1.2.3 From fe1d0a1f5a32fe144ee125eb0058927d7a359926 Mon Sep 17 00:00:00 2001 From: Wojciech Mandrysz Date: Mon, 3 Oct 2016 01:42:58 +0200 Subject: JS: Added string encoding/decoding tests for UTF-8 --- js/binary/decoder_test.js | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'js') diff --git a/js/binary/decoder_test.js b/js/binary/decoder_test.js index ac312648..12da72a7 100644 --- a/js/binary/decoder_test.js +++ b/js/binary/decoder_test.js @@ -209,7 +209,30 @@ describe('binaryDecoderTest', function() { assertEquals(hashC, decoder.readFixedHash64()); assertEquals(hashD, decoder.readFixedHash64()); }); + + /** + * Test encoding and decoding utf-8. + */ + it('testUtf8', function() { + var encoder = new jspb.BinaryEncoder(); + var ascii = "ASCII should work in 3, 2, 1..." + var utf8_two_bytes = "©"; + var utf8_tree_bytes = "❄"; + var utf8_four_bytes = "😁"; + + encoder.writeString(ascii); + encoder.writeString(utf8_two_bytes); + encoder.writeString(utf8_tree_bytes); + encoder.writeString(utf8_four_bytes); + + var decoder = jspb.BinaryDecoder.alloc(encoder.end()); + + assertEquals(ascii, decoder.readString(ascii.length)); + assertEquals(utf8_two_bytes, decoder.readString(utf8_two_bytes.length)); + assertEquals(utf8_tree_bytes, decoder.readString(utf8_tree_bytes.length)); + assertEquals(utf8_four_bytes, decoder.readString(utf8_four_bytes.length)); + }); /** * Verifies that misuse of the decoder class triggers assertions. -- cgit v1.2.3 From 7332ffb1f08c9414119aa0a59ec8334c7599bfd8 Mon Sep 17 00:00:00 2001 From: Wojciech Mandrysz Date: Mon, 3 Oct 2016 18:59:34 +0200 Subject: JS: Replaced fromCodePoint/codePointAt with fromCharCode/charCodeAt because of functions limited availability, fixed typo in tests. --- js/binary/decoder.js | 21 +++++++++------------ js/binary/decoder_test.js | 6 +++--- js/binary/encoder.js | 15 ++++++++++++--- 3 files changed, 24 insertions(+), 18 deletions(-) (limited to 'js') diff --git a/js/binary/decoder.js b/js/binary/decoder.js index 62f7b8b6..e4fb9148 100644 --- a/js/binary/decoder.js +++ b/js/binary/decoder.js @@ -905,12 +905,12 @@ jspb.BinaryDecoder.prototype.readString = function(length) { var bytes = this.bytes_; var cursor = this.cursor_; var end = cursor + length; - var codepoints = []; + var codeUnits = []; while (cursor < end) { var c = bytes[cursor++]; if (c < 128) { // Regular 7-bit ASCII. - codepoints.push(c); + codeUnits.push(c); } else if (c < 192) { // UTF-8 continuation mark. We are out of sync. This // might happen if we attempted to read a character @@ -918,11 +918,11 @@ jspb.BinaryDecoder.prototype.readString = function(length) { continue; } else if (c < 224) { // UTF-8 with two bytes. var c2 = bytes[cursor++]; - codepoints.push(((c & 31) << 6) | (c2 & 63)); + codeUnits.push(((c & 31) << 6) | (c2 & 63)); } else if (c < 240) { // UTF-8 with three bytes. var c2 = bytes[cursor++]; var c3 = bytes[cursor++]; - codepoints.push(((c & 15) << 12) | ((c2 & 63) << 6) | (c3 & 63)); + codeUnits.push(((c & 15) << 12) | ((c2 & 63) << 6) | (c3 & 63)); } else if (c < 248) { // UTF-8 with 4 bytes. var c2 = bytes[cursor++]; var c3 = bytes[cursor++]; @@ -932,20 +932,17 @@ jspb.BinaryDecoder.prototype.readString = function(length) { var codepoint = ((c & 7) << 18) | ((c2 & 63) << 12) | ((c3 & 63) << 6) | (c4 & 63); // Surrogates formula from wikipedia. // 1. Subtract 0x10000 from codepoint - codepoint -= 65536; + codepoint -= 0x10000; // 2. Split this into the high 10-bit value and the low 10-bit value - var low = codepoint & 1023; - var high = (codepoint >> 10) & 1023; // 3. Add 0xD800 to the high value to form the high surrogate - high += 55296; // 4. Add 0xDC00 to the low value to form the low surrogate: - low += 56320; - codepoints.push(high); - codepoints.push(low); + var low = (codepoint & 1023) + 0xDC00; + var high = ((codepoint >> 10) & 1023) + 0xD800; + codeUnits.push(high, low) } } - var result = String.fromCodePoint.apply(null, codepoints); + var result = String.fromCharCode.apply(null, codeUnits); this.cursor_ = cursor; return result; }; diff --git a/js/binary/decoder_test.js b/js/binary/decoder_test.js index 12da72a7..9f947b99 100644 --- a/js/binary/decoder_test.js +++ b/js/binary/decoder_test.js @@ -218,19 +218,19 @@ describe('binaryDecoderTest', function() { var ascii = "ASCII should work in 3, 2, 1..." var utf8_two_bytes = "©"; - var utf8_tree_bytes = "❄"; + var utf8_three_bytes = "❄"; var utf8_four_bytes = "😁"; encoder.writeString(ascii); encoder.writeString(utf8_two_bytes); - encoder.writeString(utf8_tree_bytes); + encoder.writeString(utf8_three_bytes); encoder.writeString(utf8_four_bytes); var decoder = jspb.BinaryDecoder.alloc(encoder.end()); assertEquals(ascii, decoder.readString(ascii.length)); assertEquals(utf8_two_bytes, decoder.readString(utf8_two_bytes.length)); - assertEquals(utf8_tree_bytes, decoder.readString(utf8_tree_bytes.length)); + assertEquals(utf8_three_bytes, decoder.readString(utf8_three_bytes.length)); assertEquals(utf8_four_bytes, decoder.readString(utf8_four_bytes.length)); }); diff --git a/js/binary/encoder.js b/js/binary/encoder.js index 59c4ccb9..fe5e34e9 100644 --- a/js/binary/encoder.js +++ b/js/binary/encoder.js @@ -409,10 +409,19 @@ jspb.BinaryEncoder.prototype.writeFixedHash64 = function(hash) { */ jspb.BinaryEncoder.prototype.writeString = function(value) { var oldLength = this.buffer_.length; - - // UTF16 to UTF8 conversion loop swiped from goog.crypt.stringToUtf8ByteArray. + for (var i = 0; i < value.length; i++) { - var c = value.codePointAt(i); + + var c = value.charCodeAt(i); + // Look for surrogates + if (c >= 0xD800 && c <= 0xDBFF && i + 1 < value.length) { + var second = value.charCodeAt(i + 1); + if (second >= 0xDC00 && second <= 0xDFFF) { // low surrogate + // http://mathiasbynens.be/notes/javascript-encoding#surrogate-formulae + c = (c - 0xD800) * 0x400 + second - 0xDC00 + 0x10000; + } + } + if (c < 128) { this.buffer_.push(c); } else if (c < 2048) { -- cgit v1.2.3 From 292c2c91cfc16eda5dc8f835ef6073febef118e5 Mon Sep 17 00:00:00 2001 From: Wojciech Mandrysz Date: Tue, 15 Nov 2016 12:44:15 +0100 Subject: JS: Re-added comment, moved surrogates code to the right place --- js/binary/decoder.js | 3 ++- js/binary/encoder.js | 16 ++++++++-------- 2 files changed, 10 insertions(+), 9 deletions(-) (limited to 'js') diff --git a/js/binary/decoder.js b/js/binary/decoder.js index e4fb9148..040cf715 100644 --- a/js/binary/decoder.js +++ b/js/binary/decoder.js @@ -941,7 +941,8 @@ jspb.BinaryDecoder.prototype.readString = function(length) { codeUnits.push(high, low) } } - + // String.fromCharCode.apply is faster than manually appending characters on + // Chrome 25+, and generates no additional cons string garbage. var result = String.fromCharCode.apply(null, codeUnits); this.cursor_ = cursor; return result; diff --git a/js/binary/encoder.js b/js/binary/encoder.js index fe5e34e9..a9d09d72 100644 --- a/js/binary/encoder.js +++ b/js/binary/encoder.js @@ -413,14 +413,6 @@ jspb.BinaryEncoder.prototype.writeString = function(value) { for (var i = 0; i < value.length; i++) { var c = value.charCodeAt(i); - // Look for surrogates - if (c >= 0xD800 && c <= 0xDBFF && i + 1 < value.length) { - var second = value.charCodeAt(i + 1); - if (second >= 0xDC00 && second <= 0xDFFF) { // low surrogate - // http://mathiasbynens.be/notes/javascript-encoding#surrogate-formulae - c = (c - 0xD800) * 0x400 + second - 0xDC00 + 0x10000; - } - } if (c < 128) { this.buffer_.push(c); @@ -428,6 +420,14 @@ jspb.BinaryEncoder.prototype.writeString = function(value) { this.buffer_.push((c >> 6) | 192); this.buffer_.push((c & 63) | 128); } else if (c < 65536) { + // Look for surrogates + if (c >= 0xD800 && c <= 0xDBFF && i + 1 < value.length) { + var second = value.charCodeAt(i + 1); + if (second >= 0xDC00 && second <= 0xDFFF) { // low surrogate + // http://mathiasbynens.be/notes/javascript-encoding#surrogate-formulae + c = (c - 0xD800) * 0x400 + second - 0xDC00 + 0x10000; + } + } this.buffer_.push((c >> 12) | 224); this.buffer_.push(((c >> 6) & 63) | 128); this.buffer_.push((c & 63) | 128); -- cgit v1.2.3 From bd850a25f51dfb662a761473c151c016c815bcb5 Mon Sep 17 00:00:00 2001 From: Wojciech Mandrysz Date: Tue, 15 Nov 2016 14:08:49 +0100 Subject: JS: Well, this is the right place for surrogates. --- js/binary/encoder.js | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) (limited to 'js') diff --git a/js/binary/encoder.js b/js/binary/encoder.js index a9d09d72..d952d714 100644 --- a/js/binary/encoder.js +++ b/js/binary/encoder.js @@ -426,17 +426,19 @@ jspb.BinaryEncoder.prototype.writeString = function(value) { if (second >= 0xDC00 && second <= 0xDFFF) { // low surrogate // http://mathiasbynens.be/notes/javascript-encoding#surrogate-formulae c = (c - 0xD800) * 0x400 + second - 0xDC00 + 0x10000; + + this.buffer_.push((c >> 18) | 240); + this.buffer_.push(((c >> 12) & 63 ) | 128); + this.buffer_.push(((c >> 6) & 63) | 128); + this.buffer_.push((c & 63) | 128); + i++; } } - this.buffer_.push((c >> 12) | 224); - this.buffer_.push(((c >> 6) & 63) | 128); - this.buffer_.push((c & 63) | 128); - } else { - this.buffer_.push((c >> 18) | 240); - this.buffer_.push(((c >> 12) & 63 ) | 128); - this.buffer_.push(((c >> 6) & 63) | 128); - this.buffer_.push((c & 63) | 128); - i++; + else { + this.buffer_.push((c >> 12) | 224); + this.buffer_.push(((c >> 6) & 63) | 128); + this.buffer_.push((c & 63) | 128); + } } } -- cgit v1.2.3