diff options
Diffstat (limited to 'src/google/protobuf/util/internal/json_stream_parser.cc')
-rw-r--r-- | src/google/protobuf/util/internal/json_stream_parser.cc | 740 |
1 files changed, 740 insertions, 0 deletions
diff --git a/src/google/protobuf/util/internal/json_stream_parser.cc b/src/google/protobuf/util/internal/json_stream_parser.cc new file mode 100644 index 00000000..d439a221 --- /dev/null +++ b/src/google/protobuf/util/internal/json_stream_parser.cc @@ -0,0 +1,740 @@ +// Protocol Buffers - Google's data interchange format +// Copyright 2008 Google Inc. All rights reserved. +// https://developers.google.com/protocol-buffers/ +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#include <google/protobuf/util/internal/json_stream_parser.h> + +#include <algorithm> +#include <cctype> +#include <cerrno> +#include <cstdlib> +#include <cstring> +#include <memory> +#ifndef _SHARED_PTR_H +#include <google/protobuf/stubs/shared_ptr.h> +#endif + +#include <google/protobuf/stubs/common.h> +#include <google/protobuf/stubs/strutil.h> +#include <google/protobuf/util/internal/object_writer.h> + +namespace google { +namespace protobuf { +namespace util { + +// Allow these symbols to be referenced as util::Status, util::error::* in +// this file. +using util::Status; +namespace error { +using util::error::INTERNAL; +using util::error::INVALID_ARGUMENT; +} // namespace error + +namespace converter { + +// Number of digits in a unicode escape sequence (/uXXXX) +static const int kUnicodeEscapedLength = 6; + +// Length of the true, false, and null literals. +static const int true_len = strlen("true"); +static const int false_len = strlen("false"); +static const int null_len = strlen("null"); + +inline bool IsLetter(char c) { + return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || (c == '_') || + (c == '$'); +} + +inline bool IsAlphanumeric(char c) { + return IsLetter(c) || ('0' <= c && c <= '9'); +} + +static bool ConsumeKey(StringPiece* input, StringPiece* key) { + if (input->empty() || !IsLetter((*input)[0])) return false; + int len = 1; + for (; len < input->size(); ++len) { + if (!IsAlphanumeric((*input)[len])) { + break; + } + } + *key = StringPiece(input->data(), len); + *input = StringPiece(input->data() + len, input->size() - len); + return true; +} + +static bool MatchKey(StringPiece input) { + return !input.empty() && IsLetter(input[0]); +} + +JsonStreamParser::JsonStreamParser(ObjectWriter* ow) + : ow_(ow), + stack_(), + leftover_(), + json_(), + p_(), + key_(), + key_storage_(), + finishing_(false), + parsed_(), + parsed_storage_(), + string_open_(0), + utf8_storage_(), + utf8_length_(0) { + // Initialize the stack with a single value to be parsed. + stack_.push(VALUE); +} + +JsonStreamParser::~JsonStreamParser() {} + +util::Status JsonStreamParser::Parse(StringPiece json) { + return ParseChunk(json); +} + +util::Status JsonStreamParser::FinishParse() { + // If we do not expect anything and there is nothing left to parse we're all + // done. + if (stack_.empty() && leftover_.empty()) { + return util::Status::OK; + } + // Parse the remainder in finishing mode, which reports errors for things like + // unterminated strings or unknown tokens that would normally be retried. + p_ = json_ = StringPiece(leftover_); + finishing_ = true; + util::Status result = RunParser(); + if (result.ok()) { + SkipWhitespace(); + if (!p_.empty()) { + result = ReportFailure("Parsing terminated before end of input."); + } + } + return result; +} + +util::Status JsonStreamParser::ParseChunk(StringPiece chunk) { + // If we have leftovers from a previous chunk, append the new chunk to it and + // create a new StringPiece pointing at the string's data. This could be + // large but we rely on the chunks to be small, assuming they are fragments + // of a Cord. + if (!leftover_.empty()) { + chunk.AppendToString(&leftover_); + p_ = json_ = StringPiece(leftover_); + } else { + p_ = json_ = chunk; + } + + finishing_ = false; + util::Status result = RunParser(); + if (!result.ok()) return result; + + SkipWhitespace(); + if (p_.empty()) { + // If we parsed everything we had, clear the leftover. + leftover_.clear(); + } else { + // If we do not expect anything i.e. stack is empty, and we have non-empty + // string left to parse, we report an error. + if (stack_.empty()) { + return ReportFailure("Parsing terminated before end of input."); + } + // If we expect future data i.e. stack is non-empty, and we have some + // unparsed data left, we save it for later parse. + leftover_ = p_.ToString(); + } + return util::Status::OK; +} + +util::Status JsonStreamParser::RunParser() { + while (!stack_.empty()) { + ParseType type = stack_.top(); + TokenType t = (string_open_ == 0) ? GetNextTokenType() : BEGIN_STRING; + stack_.pop(); + util::Status result; + switch (type) { + case VALUE: + result = ParseValue(t); + break; + + case OBJ_MID: + result = ParseObjectMid(t); + break; + + case ENTRY: + result = ParseEntry(t); + break; + + case ENTRY_MID: + result = ParseEntryMid(t); + break; + + case ARRAY_VALUE: + result = ParseArrayValue(t); + break; + + case ARRAY_MID: + result = ParseArrayMid(t); + break; + + default: + result = util::Status(util::error::INTERNAL, + StrCat("Unknown parse type: ", type)); + break; + } + if (!result.ok()) { + // If we were cancelled, save our state and try again later. + if (!finishing_ && result == util::Status::CANCELLED) { + stack_.push(type); + // If we have a key we still need to render, make sure to save off the + // contents in our own storage. + if (!key_.empty() && key_storage_.empty()) { + key_.AppendToString(&key_storage_); + key_ = StringPiece(key_storage_); + } + result = util::Status::OK; + } + return result; + } + } + return util::Status::OK; +} + +util::Status JsonStreamParser::ParseValue(TokenType type) { + switch (type) { + case BEGIN_OBJECT: + return HandleBeginObject(); + case BEGIN_ARRAY: + return HandleBeginArray(); + case BEGIN_STRING: + return ParseString(); + case BEGIN_NUMBER: + return ParseNumber(); + case BEGIN_TRUE: + return ParseTrue(); + case BEGIN_FALSE: + return ParseFalse(); + case BEGIN_NULL: + return ParseNull(); + case UNKNOWN: + return ReportUnknown("Expected a value."); + default: { + // Special case for having been cut off while parsing, wait for more data. + // This handles things like 'fals' being at the end of the string, we + // don't know if the next char would be e, completing it, or something + // else, making it invalid. + if (!finishing_ && p_.length() < false_len) { + return util::Status::CANCELLED; + } + return ReportFailure("Unexpected token."); + } + } +} + +util::Status JsonStreamParser::ParseString() { + util::Status result = ParseStringHelper(); + if (result.ok()) { + ow_->RenderString(key_, parsed_); + key_.clear(); + parsed_.clear(); + parsed_storage_.clear(); + } + return result; +} + +util::Status JsonStreamParser::ParseStringHelper() { + // If we haven't seen the start quote, grab it and remember it for later. + if (string_open_ == 0) { + string_open_ = *p_.data(); + GOOGLE_DCHECK(string_open_ == '\"' || string_open_ == '\''); + Advance(); + } + // Track where we last copied data from so we can minimize copying. + const char* last = p_.data(); + while (!p_.empty()) { + const char* data = p_.data(); + if (*data == '\\') { + // We're about to handle an escape, copy all bytes from last to data. + if (last < data) { + parsed_storage_.append(last, data - last); + last = data; + } + // If we ran out of string after the \, cancel or report an error + // depending on if we expect more data later. + if (p_.length() == 1) { + if (!finishing_) { + return util::Status::CANCELLED; + } + return ReportFailure("Closing quote expected in string."); + } + // Parse a unicode escape if we found \u in the string. + if (data[1] == 'u') { + util::Status result = ParseUnicodeEscape(); + if (!result.ok()) { + return result; + } + // Move last pointer past the unicode escape and continue. + last = p_.data(); + continue; + } + // Handle the standard set of backslash-escaped characters. + switch (data[1]) { + case 'b': + parsed_storage_.push_back('\b'); + break; + case 'f': + parsed_storage_.push_back('\f'); + break; + case 'n': + parsed_storage_.push_back('\n'); + break; + case 'r': + parsed_storage_.push_back('\r'); + break; + case 't': + parsed_storage_.push_back('\t'); + break; + case 'v': + parsed_storage_.push_back('\v'); + break; + default: + parsed_storage_.push_back(data[1]); + } + // We handled two characters, so advance past them and continue. + p_.remove_prefix(2); + last = p_.data(); + continue; + } + // If we found the closing quote note it, advance past it, and return. + if (*data == string_open_) { + // If we didn't copy anything, reuse the input buffer. + if (parsed_storage_.empty()) { + parsed_ = StringPiece(last, data - last); + } else { + if (last < data) { + parsed_storage_.append(last, data - last); + last = data; + } + parsed_ = StringPiece(parsed_storage_); + } + // Clear the quote char so next time we try to parse a string we'll + // start fresh. + string_open_ = 0; + Advance(); + return util::Status::OK; + } + // Normal character, just advance past it. + Advance(); + } + // If we ran out of characters, copy over what we have so far. + if (last < p_.data()) { + parsed_storage_.append(last, p_.data() - last); + } + // If we didn't find the closing quote but we expect more data, cancel for now + if (!finishing_) { + return util::Status::CANCELLED; + } + // End of string reached without a closing quote, report an error. + string_open_ = 0; + return ReportFailure("Closing quote expected in string."); +} + +// Converts a unicode escaped character to a decimal value stored in a char32 +// for use in UTF8 encoding utility. We assume that str begins with \uhhhh and +// convert that from the hex number to a decimal value. +// +// There are some security exploits with UTF-8 that we should be careful of: +// - http://www.unicode.org/reports/tr36/#UTF-8_Exploit +// - http://sites/intl-eng/design-guide/core-application +util::Status JsonStreamParser::ParseUnicodeEscape() { + if (p_.length() < kUnicodeEscapedLength) { + if (!finishing_) { + return util::Status::CANCELLED; + } + return ReportFailure("Illegal hex string."); + } + GOOGLE_DCHECK_EQ('\\', p_.data()[0]); + GOOGLE_DCHECK_EQ('u', p_.data()[1]); + uint32 code = 0; + for (int i = 2; i < kUnicodeEscapedLength; ++i) { + if (!isxdigit(p_.data()[i])) { + return ReportFailure("Invalid escape sequence."); + } + code = (code << 4) + hex_digit_to_int(p_.data()[i]); + } + char buf[UTFmax]; + int len = EncodeAsUTF8Char(code, buf); + // Advance past the unicode escape. + p_.remove_prefix(kUnicodeEscapedLength); + parsed_storage_.append(buf, len); + return util::Status::OK; +} + +util::Status JsonStreamParser::ParseNumber() { + NumberResult number; + util::Status result = ParseNumberHelper(&number); + if (result.ok()) { + switch (number.type) { + case NumberResult::DOUBLE: + ow_->RenderDouble(key_, number.double_val); + key_.clear(); + break; + + case NumberResult::INT: + ow_->RenderInt64(key_, number.int_val); + key_.clear(); + break; + + case NumberResult::UINT: + ow_->RenderUint64(key_, number.uint_val); + key_.clear(); + break; + + default: + return ReportFailure("Unable to parse number."); + } + } + return result; +} + +util::Status JsonStreamParser::ParseNumberHelper(NumberResult* result) { + const char* data = p_.data(); + int length = p_.length(); + + // Look for the first non-numeric character, or the end of the string. + int index = 0; + bool floating = false; + bool negative = data[index] == '-'; + // Find the first character that cannot be part of the number. Along the way + // detect if the number needs to be parsed as a double. + // Note that this restricts numbers to the JSON specification, so for example + // we do not support hex or octal notations. + for (; index < length; ++index) { + char c = data[index]; + if (isdigit(c)) continue; + if (c == '.' || c == 'e' || c == 'E') { + floating = true; + continue; + } + if (c == '+' || c == '-') continue; + // Not a valid number character, break out. + break; + } + + // If the entire input is a valid number, and we may have more content in the + // future, we abort for now and resume when we know more. + if (index == length && !finishing_) { + return util::Status::CANCELLED; + } + + // Create a string containing just the number, so we can use safe_strtoX + string number = p_.substr(0, index).ToString(); + + // Floating point number, parse as a double. + if (floating) { + if (!safe_strtod(number, &result->double_val)) { + return ReportFailure("Unable to parse number."); + } + result->type = NumberResult::DOUBLE; + p_.remove_prefix(index); + return util::Status::OK; + } + + // Positive non-floating point number, parse as a uint64. + if (!negative) { + if (!safe_strtou64(number, &result->uint_val)) { + return ReportFailure("Unable to parse number."); + } + result->type = NumberResult::UINT; + p_.remove_prefix(index); + return util::Status::OK; + } + + // Negative non-floating point number, parse as an int64. + if (!safe_strto64(number, &result->int_val)) { + return ReportFailure("Unable to parse number."); + } + result->type = NumberResult::INT; + p_.remove_prefix(index); + return util::Status::OK; +} + +util::Status JsonStreamParser::HandleBeginObject() { + GOOGLE_DCHECK_EQ('{', *p_.data()); + Advance(); + ow_->StartObject(key_); + key_.clear(); + stack_.push(ENTRY); + return util::Status::OK; +} + +util::Status JsonStreamParser::ParseObjectMid(TokenType type) { + if (type == UNKNOWN) { + return ReportUnknown("Expected , or } after key:value pair."); + } + + // Object is complete, advance past the comma and render the EndObject. + if (type == END_OBJECT) { + Advance(); + ow_->EndObject(); + return util::Status::OK; + } + // Found a comma, advance past it and get ready for an entry. + if (type == VALUE_SEPARATOR) { + Advance(); + stack_.push(ENTRY); + return util::Status::OK; + } + // Illegal token after key:value pair. + return ReportFailure("Expected , or } after key:value pair."); +} + +util::Status JsonStreamParser::ParseEntry(TokenType type) { + if (type == UNKNOWN) { + return ReportUnknown("Expected an object key or }."); + } + + // Close the object and return. This allows for trailing commas. + if (type == END_OBJECT) { + ow_->EndObject(); + Advance(); + return util::Status::OK; + } + + util::Status result; + if (type == BEGIN_STRING) { + // Key is a string (standard JSON), parse it and store the string. + result = ParseStringHelper(); + if (result.ok()) { + key_storage_.clear(); + if (!parsed_storage_.empty()) { + parsed_storage_.swap(key_storage_); + key_ = StringPiece(key_storage_); + } else { + key_ = parsed_; + } + parsed_.clear(); + } + } else if (type == BEGIN_KEY) { + // Key is a bare key (back compat), create a StringPiece pointing to it. + result = ParseKey(); + } else { + // Unknown key type, report an error. + result = ReportFailure("Expected an object key or }."); + } + // On success we next expect an entry mid ':' then an object mid ',' or '}' + if (result.ok()) { + stack_.push(OBJ_MID); + stack_.push(ENTRY_MID); + } + return result; +} + +util::Status JsonStreamParser::ParseEntryMid(TokenType type) { + if (type == UNKNOWN) { + return ReportUnknown("Expected : between key:value pair."); + } + if (type == ENTRY_SEPARATOR) { + Advance(); + stack_.push(VALUE); + return util::Status::OK; + } + return ReportFailure("Expected : between key:value pair."); +} + +util::Status JsonStreamParser::HandleBeginArray() { + GOOGLE_DCHECK_EQ('[', *p_.data()); + Advance(); + ow_->StartList(key_); + key_.clear(); + stack_.push(ARRAY_VALUE); + return util::Status::OK; +} + +util::Status JsonStreamParser::ParseArrayValue(TokenType type) { + if (type == UNKNOWN) { + return ReportUnknown("Expected a value or ] within an array."); + } + + if (type == END_ARRAY) { + ow_->EndList(); + Advance(); + return util::Status::OK; + } + + // The ParseValue call may push something onto the stack so we need to make + // sure an ARRAY_MID is after it, so we push it on now. + stack_.push(ARRAY_MID); + util::Status result = ParseValue(type); + if (result == util::Status::CANCELLED) { + // If we were cancelled, pop back off the ARRAY_MID so we don't try to + // push it on again when we try over. + stack_.pop(); + } + return result; +} + +util::Status JsonStreamParser::ParseArrayMid(TokenType type) { + if (type == UNKNOWN) { + return ReportUnknown("Expected , or ] after array value."); + } + + if (type == END_ARRAY) { + ow_->EndList(); + Advance(); + return util::Status::OK; + } + + // Found a comma, advance past it and expect an array value next. + if (type == VALUE_SEPARATOR) { + Advance(); + stack_.push(ARRAY_VALUE); + return util::Status::OK; + } + // Illegal token after array value. + return ReportFailure("Expected , or ] after array value."); +} + +util::Status JsonStreamParser::ParseTrue() { + ow_->RenderBool(key_, true); + key_.clear(); + p_.remove_prefix(true_len); + return util::Status::OK; +} + +util::Status JsonStreamParser::ParseFalse() { + ow_->RenderBool(key_, false); + key_.clear(); + p_.remove_prefix(false_len); + return util::Status::OK; +} + +util::Status JsonStreamParser::ParseNull() { + ow_->RenderNull(key_); + key_.clear(); + p_.remove_prefix(null_len); + return util::Status::OK; +} + +util::Status JsonStreamParser::ReportFailure(StringPiece message) { + static const int kContextLength = 20; + const char* p_start = p_.data(); + const char* json_start = json_.data(); + const char* begin = max(p_start - kContextLength, json_start); + const char* end = min(p_start + kContextLength, json_start + json_.size()); + StringPiece segment(begin, end - begin); + string location(p_start - begin, ' '); + location.push_back('^'); + return util::Status(util::error::INVALID_ARGUMENT, + StrCat(message, "\n", segment, "\n", location)); +} + +util::Status JsonStreamParser::ReportUnknown(StringPiece message) { + // If we aren't finishing the parse, cancel parsing and try later. + if (!finishing_) { + return util::Status::CANCELLED; + } + if (p_.empty()) { + return ReportFailure(StrCat("Unexpected end of string. ", message)); + } + return ReportFailure(message); +} + +void JsonStreamParser::SkipWhitespace() { + while (!p_.empty() && ascii_isspace(*p_.data())) { + Advance(); + } +} + +void JsonStreamParser::Advance() { + // Advance by moving one UTF8 character while making sure we don't go beyond + // the length of StringPiece. + p_.remove_prefix( + min<int>(p_.length(), UTF8FirstLetterNumBytes(p_.data(), p_.length()))); +} + +util::Status JsonStreamParser::ParseKey() { + StringPiece original = p_; + if (!ConsumeKey(&p_, &key_)) { + return ReportFailure("Invalid key or variable name."); + } + // If we consumed everything but expect more data, reset p_ and cancel since + // we can't know if the key was complete or not. + if (!finishing_ && p_.empty()) { + p_ = original; + return util::Status::CANCELLED; + } + // Since we aren't using the key storage, clear it out. + key_storage_.clear(); + return util::Status::OK; +} + +JsonStreamParser::TokenType JsonStreamParser::GetNextTokenType() { + SkipWhitespace(); + + int size = p_.size(); + if (size == 0) { + // If we ran out of data, report unknown and we'll place the previous parse + // type onto the stack and try again when we have more data. + return UNKNOWN; + } + // TODO(sven): Split this method based on context since different contexts + // support different tokens. Would slightly speed up processing? + const char* data = p_.data(); + if (*data == '\"' || *data == '\'') return BEGIN_STRING; + if (*data == '-' || ('0' <= *data && *data <= '9')) { + return BEGIN_NUMBER; + } + if (size >= true_len && !strncmp(data, "true", true_len)) { + return BEGIN_TRUE; + } + if (size >= false_len && !strncmp(data, "false", false_len)) { + return BEGIN_FALSE; + } + if (size >= null_len && !strncmp(data, "null", null_len)) { + return BEGIN_NULL; + } + if (*data == '{') return BEGIN_OBJECT; + if (*data == '}') return END_OBJECT; + if (*data == '[') return BEGIN_ARRAY; + if (*data == ']') return END_ARRAY; + if (*data == ':') return ENTRY_SEPARATOR; + if (*data == ',') return VALUE_SEPARATOR; + if (MatchKey(p_)) { + return BEGIN_KEY; + } + + // We don't know that we necessarily have an invalid token here, just that we + // can't parse what we have so far. So we don't report an error and just + // return UNKNOWN so we can try again later when we have more data, or if we + // finish and we have leftovers. + return UNKNOWN; +} + +} // namespace converter +} // namespace util +} // namespace protobuf +} // namespace google |