1 files changed, 105 insertions, 41 deletions
diff --git a/js/binary/decoder.js b/js/binary/decoder.js
index 9004eff0..e33bf1be 100644
--- a/js/binary/decoder.js
+++ b/js/binary/decoder.js
@@ -47,6 +47,7 @@ goog.provide('jspb.BinaryDecoder');
 goog.provide('jspb.BinaryIterator');
 
 goog.require('goog.asserts');
+goog.require('goog.crypt');
 goog.require('jspb.utils');
 
 
@@ -57,7 +58,7 @@ goog.require('jspb.utils');
  * @param {?jspb.BinaryDecoder=} opt_decoder
  * @param {?function(this:jspb.BinaryDecoder):(number|boolean|string)=}
  *     opt_next The decoder method to use for next().
- * @param {?Array.<number|boolean|string>=} opt_elements
+ * @param {?Array<number|boolean|string>=} opt_elements
  * @constructor
  * @struct
  */
@@ -71,7 +72,7 @@ jspb.BinaryIterator = function(opt_decoder, opt_next, opt_elements) {
    */
   this.nextMethod_ = null;
 
-  /** @private {Array.<number>} */
+  /** @private {?Array<number|boolean|string>} */
   this.elements_ = null;
 
   /** @private {number} */
@@ -91,7 +92,7 @@ jspb.BinaryIterator = function(opt_decoder, opt_next, opt_elements) {
  * @param {?jspb.BinaryDecoder=} opt_decoder
  * @param {?function(this:jspb.BinaryDecoder):(number|boolean|string)=}
  *     opt_next The decoder method to use for next().
- * @param {?Array.<number|boolean|string>=} opt_elements
+ * @param {?Array<number|boolean|string>=} opt_elements
  * @private
  */
 jspb.BinaryIterator.prototype.init_ =
@@ -100,7 +101,7 @@ jspb.BinaryIterator.prototype.init_ =
     this.decoder_ = opt_decoder;
     this.nextMethod_ = opt_next;
   }
-  this.elements_ = opt_elements ? opt_elements : null;
+  this.elements_ = opt_elements || null;
   this.cursor_ = 0;
   this.nextValue_ = null;
   this.atEnd_ = !this.decoder_ && !this.elements_;
@@ -111,7 +112,7 @@ jspb.BinaryIterator.prototype.init_ =
 
 /**
  * Global pool of BinaryIterator instances.
- * @private {!Array.<!jspb.BinaryIterator>}
+ * @private {!Array<!jspb.BinaryIterator>}
  */
 jspb.BinaryIterator.instanceCache_ = [];
 
@@ -122,7 +123,7 @@ jspb.BinaryIterator.instanceCache_ = [];
  * @param {?jspb.BinaryDecoder=} opt_decoder
  * @param {?function(this:jspb.BinaryDecoder):(number|boolean|string)=}
  *     opt_next The decoder method to use for next().
- * @param {?Array.<number|boolean|string>=} opt_elements
+ * @param {?Array<number|boolean|string>=} opt_elements
  * @return {!jspb.BinaryIterator}
  */
 jspb.BinaryIterator.alloc = function(opt_decoder, opt_next, opt_elements) {
@@ -223,7 +224,7 @@ jspb.BinaryIterator.prototype.next = function() {
 jspb.BinaryDecoder = function(opt_bytes, opt_start, opt_length) {
   /**
    * Typed byte-wise view of the source buffer.
-   * @private {Uint8Array}
+   * @private {?Uint8Array}
    */
   this.bytes_ = null;
 
@@ -273,7 +274,7 @@ jspb.BinaryDecoder = function(opt_bytes, opt_start, opt_length) {
 
 /**
  * Global pool of BinaryDecoder instances.
- * @private {!Array.<!jspb.BinaryDecoder>}
+ * @private {!Array<!jspb.BinaryDecoder>}
  */
 jspb.BinaryDecoder.instanceCache_ = [];
 
@@ -335,7 +336,7 @@ jspb.BinaryDecoder.prototype.clear = function() {
 
 /**
  * Returns the raw buffer.
- * @return {Uint8Array} The raw buffer.
+ * @return {?Uint8Array} The raw buffer.
  */
 jspb.BinaryDecoder.prototype.getBuffer = function() {
   return this.bytes_;
@@ -582,27 +583,24 @@ jspb.BinaryDecoder.prototype.readUnsignedVarint32 = function() {
   x |= (temp & 0x0F) << 28;
   if (temp < 128) {
     // We're reading the high bits of an unsigned varint. The byte we just read
-    // also contains bits 33 through 35, which we're going to discard. Those
-    // bits _must_ be zero, or the encoding is invalid.
-    goog.asserts.assert((temp & 0xF0) == 0);
+    // also contains bits 33 through 35, which we're going to discard.
     this.cursor_ += 5;
     goog.asserts.assert(this.cursor_ <= this.end_);
     return x >>> 0;
   }
 
-  // If we get here, we're reading the sign extension of a negative 32-bit int.
-  // We can skip these bytes, as we know in advance that they have to be all
-  // 1's if the varint is correctly encoded. Since we also know the value is
-  // negative, we don't have to coerce it to unsigned before we return it.
-
-  goog.asserts.assert((temp & 0xF0) == 0xF0);
-  goog.asserts.assert(bytes[this.cursor_ + 5] == 0xFF);
-  goog.asserts.assert(bytes[this.cursor_ + 6] == 0xFF);
-  goog.asserts.assert(bytes[this.cursor_ + 7] == 0xFF);
-  goog.asserts.assert(bytes[this.cursor_ + 8] == 0xFF);
-  goog.asserts.assert(bytes[this.cursor_ + 9] == 0x01);
+  // If we get here, we need to truncate coming bytes. However we need to make
+  // sure cursor place is correct.
+  this.cursor_ += 5;
+  if (bytes[this.cursor_++] >= 128 &&
+      bytes[this.cursor_++] >= 128 &&
+      bytes[this.cursor_++] >= 128 &&
+      bytes[this.cursor_++] >= 128 &&
+      bytes[this.cursor_++] >= 128) {
+    // If we get here, the varint is too long.
+    goog.asserts.assert(false);
+  }
 
-  this.cursor_ += 10;
   goog.asserts.assert(this.cursor_ <= this.end_);
   return x;
 };
@@ -631,6 +629,7 @@ jspb.BinaryDecoder.prototype.readUnsignedVarint32String = function() {
   return value.toString();
 };
 
+
 /**
  * Reads a 32-bit signed variant and returns its value as a string.
  *
@@ -732,6 +731,24 @@ jspb.BinaryDecoder.prototype.readZigzagVarint64 = function() {
 
 
 /**
+ * Reads a signed, zigzag-encoded 64-bit varint from the binary stream and
+ * returns its valud as a string.
+ *
+ * Zigzag encoding is a modification of varint encoding that reduces the
+ * storage overhead for small negative integers - for more details on the
+ * format, see https://developers.google.com/protocol-buffers/docs/encoding
+ *
+ * @return {string} The decoded signed, zigzag-encoded 64-bit varint as a
+ * string.
+ */
+jspb.BinaryDecoder.prototype.readZigzagVarint64String = function() {
+  // TODO(haberman): write lossless 64-bit zig-zag math.
+  var value = this.readZigzagVarint64();
+  return value.toString();
+};
+
+
+/**
  * Reads a raw unsigned 8-bit integer from the binary stream.
  *
  * @return {number} The unsigned 8-bit integer read from the binary stream.
@@ -790,6 +807,20 @@ jspb.BinaryDecoder.prototype.readUint64 = function() {
 
 
 /**
+ * Reads a raw unsigned 64-bit integer from the binary stream. Note that since
+ * Javascript represents all numbers as double-precision floats, there will be
+ * precision lost if the absolute value of the integer is larger than 2^53.
+ *
+ * @return {string} The unsigned 64-bit integer read from the binary stream.
+ */
+jspb.BinaryDecoder.prototype.readUint64String = function() {
+  var bitsLow = this.readUint32();
+  var bitsHigh = this.readUint32();
+  return jspb.utils.joinUnsignedDecimalString(bitsLow, bitsHigh);
+};
+
+
+/**
  * Reads a raw signed 8-bit integer from the binary stream.
  *
  * @return {number} The signed 8-bit integer read from the binary stream.
@@ -848,6 +879,20 @@ jspb.BinaryDecoder.prototype.readInt64 = function() {
 
 
 /**
+ * Reads a raw signed 64-bit integer from the binary stream and returns it as a
+ * string.
+ *
+ * @return {string} The signed 64-bit integer read from the binary stream.
+ *     Precision will be lost if the integer exceeds 2^53.
+ */
+jspb.BinaryDecoder.prototype.readInt64String = function() {
+  var bitsLow = this.readUint32();
+  var bitsHigh = this.readUint32();
+  return jspb.utils.joinSignedDecimalString(bitsLow, bitsHigh);
+};
+
+
+/**
  * Reads a 32-bit floating-point number from the binary stream, using the
  * temporary buffer to realign the data.
  *
@@ -894,11 +939,9 @@ jspb.BinaryDecoder.prototype.readEnum = function() {
 
 /**
  * Reads and parses a UTF-8 encoded unicode string from the stream.
- * The code is inspired by maps.vectortown.parse.StreamedDataViewReader, with
- * the exception that the implementation here does not get confused if it
- * encounters characters longer than three bytes. These characters are ignored
- * though, as they are extremely rare: three UTF-8 bytes cover virtually all
- * characters in common use (http://en.wikipedia.org/wiki/UTF-8).
+ * The code is inspired by maps.vectortown.parse.StreamedDataViewReader.
+ * Supports codepoints from U+0000 up to U+10FFFF.
+ * (http://en.wikipedia.org/wiki/UTF-8).
  * @param {number} length The length of the string to read.
  * @return {string} The decoded string.
  */
@@ -906,30 +949,50 @@ jspb.BinaryDecoder.prototype.readString = function(length) {
   var bytes = this.bytes_;
   var cursor = this.cursor_;
   var end = cursor + length;
-  var chars = [];
+  var codeUnits = [];
 
+  var result = '';
   while (cursor < end) {
     var c = bytes[cursor++];
     if (c < 128) { // Regular 7-bit ASCII.
-      chars.push(c);
+      codeUnits.push(c);
     } else if (c < 192) {
       // UTF-8 continuation mark. We are out of sync. This
       // might happen if we attempted to read a character
-      // with more than three bytes.
+      // with more than four bytes.
       continue;
     } else if (c < 224) { // UTF-8 with two bytes.
       var c2 = bytes[cursor++];
-      chars.push(((c & 31) << 6) | (c2 & 63));
+      codeUnits.push(((c & 31) << 6) | (c2 & 63));
     } else if (c < 240) { // UTF-8 with three bytes.
       var c2 = bytes[cursor++];
       var c3 = bytes[cursor++];
-      chars.push(((c & 15) << 12) | ((c2 & 63) << 6) | (c3 & 63));
+      codeUnits.push(((c & 15) << 12) | ((c2 & 63) << 6) | (c3 & 63));
+    } else if (c < 248) { // UTF-8 with 4 bytes.
+      var c2 = bytes[cursor++];
+      var c3 = bytes[cursor++];
+      var c4 = bytes[cursor++];
+      // Characters written on 4 bytes have 21 bits for a codepoint.
+      // We can't fit that on 16bit characters, so we use surrogates.
+      var codepoint = ((c & 7) << 18) | ((c2 & 63) << 12) | ((c3 & 63) << 6) | (c4 & 63);
+      // Surrogates formula from wikipedia.
+      // 1. Subtract 0x10000 from codepoint
+      codepoint -= 0x10000;
+      // 2. Split this into the high 10-bit value and the low 10-bit value
+      // 3. Add 0xD800 to the high value to form the high surrogate
+      // 4. Add 0xDC00 to the low value to form the low surrogate:
+      var low = (codepoint & 1023) + 0xDC00;
+      var high = ((codepoint >> 10) & 1023) + 0xD800;
+      codeUnits.push(high, low);
     }
-  }
 
-  // String.fromCharCode.apply is faster than manually appending characters on
-  // Chrome 25+, and generates no additional cons string garbage.
-  var result = String.fromCharCode.apply(null, chars);
+    // Avoid exceeding the maximum stack size when calling `apply`.
+    if (codeUnits.length >= 8192) {
+      result += String.fromCharCode.apply(null, codeUnits);
+      codeUnits.length = 0;
+    }
+  }
+  result += goog.crypt.byteArrayToString(codeUnits);
   this.cursor_ = cursor;
   return result;
 };
@@ -950,14 +1013,15 @@ jspb.BinaryDecoder.prototype.readStringWithLength = function() {
  * Reads a block of raw bytes from the binary stream.
  *
  * @param {number} length The number of bytes to read.
- * @return {Uint8Array} The decoded block of bytes, or null if the length was
- *     invalid.
+ * @return {!Uint8Array} The decoded block of bytes, or an empty block if the
+ *     length was invalid.
  */
 jspb.BinaryDecoder.prototype.readBytes = function(length) {
   if (length < 0 ||
       this.cursor_ + length > this.bytes_.length) {
     this.error_ = true;
-    return null;
+    goog.asserts.fail('Invalid byte length!');
+    return new Uint8Array(0);
   }
 
   var result = this.bytes_.subarray(this.cursor_, this.cursor_ + length);