author    xiaofeng@google.com <xiaofeng@google.com@630680e5-0e50-0410-840e-4b1c322b438d>    2012-09-22 02:40:50 +0000
committer xiaofeng@google.com <xiaofeng@google.com@630680e5-0e50-0410-840e-4b1c322b438d>    2012-09-22 02:40:50 +0000
commit    b55a20fa2c669b181f47ea9219b8e74d1263da19 (patch)
tree      3936a0e7c22196587a6d8397372de41434fe2129 /src/google/protobuf/io
parent    9ced30caf94bb4e7e9629c199679ff44e8ca7389 (diff)
Down-integrate from internal branch
Diffstat (limited to 'src/google/protobuf/io')
-rw-r--r--  src/google/protobuf/io/coded_stream.cc                 48
-rw-r--r--  src/google/protobuf/io/coded_stream.h                  47
-rw-r--r--  src/google/protobuf/io/coded_stream_inl.h               2
-rw-r--r--  src/google/protobuf/io/coded_stream_unittest.cc        62
-rw-r--r--  src/google/protobuf/io/gzip_stream.cc                  19
-rw-r--r--  src/google/protobuf/io/gzip_stream.h                   14
-rw-r--r--  src/google/protobuf/io/printer.cc                       7
-rw-r--r--  src/google/protobuf/io/printer_unittest.cc             26
-rw-r--r--  src/google/protobuf/io/tokenizer.cc                    497
-rw-r--r--  src/google/protobuf/io/tokenizer.h                      85
-rw-r--r--  src/google/protobuf/io/tokenizer_unittest.cc           241
-rw-r--r--  src/google/protobuf/io/zero_copy_stream_impl.cc          3
-rw-r--r--  src/google/protobuf/io/zero_copy_stream_impl_lite.cc     4
-rw-r--r--  src/google/protobuf/io/zero_copy_stream_unittest.cc     94
14 files changed, 1030 insertions, 119 deletions
diff --git a/src/google/protobuf/io/coded_stream.cc b/src/google/protobuf/io/coded_stream.cc
index 402a3ad3..36add8c3 100644
--- a/src/google/protobuf/io/coded_stream.cc
+++ b/src/google/protobuf/io/coded_stream.cc
@@ -43,7 +43,7 @@
#include <limits.h>
#include <google/protobuf/io/zero_copy_stream.h>
#include <google/protobuf/stubs/common.h>
-#include <google/protobuf/stubs/stl_util-inl.h>
+#include <google/protobuf/stubs/stl_util.h>
namespace google {
@@ -69,6 +69,19 @@ inline bool NextNonEmpty(ZeroCopyInputStream* input,
// CodedInputStream ==================================================
+CodedInputStream::~CodedInputStream() {
+ if (input_ != NULL) {
+ BackUpInputToCurrentPosition();
+ }
+
+ if (total_bytes_warning_threshold_ == -2) {
+ GOOGLE_LOG(WARNING) << "The total number of bytes read was " << total_bytes_read_;
+ }
+}
+
+// Static.
+int CodedInputStream::default_recursion_limit_ = 100;
+
void CodedInputStream::BackUpInputToCurrentPosition() {
int backup_bytes = BufferSize() + buffer_size_after_limit_ + overflow_bytes_;
@@ -98,8 +111,7 @@ inline void CodedInputStream::RecomputeBufferLimits() {
CodedInputStream::Limit CodedInputStream::PushLimit(int byte_limit) {
// Current position relative to the beginning of the stream.
- int current_position = total_bytes_read_ -
- (BufferSize() + buffer_size_after_limit_);
+ int current_position = CurrentPosition();
Limit old_limit = current_limit_;
@@ -133,10 +145,9 @@ void CodedInputStream::PopLimit(Limit limit) {
legitimate_message_end_ = false;
}
-int CodedInputStream::BytesUntilLimit() {
+int CodedInputStream::BytesUntilLimit() const {
if (current_limit_ == INT_MAX) return -1;
- int current_position = total_bytes_read_ -
- (BufferSize() + buffer_size_after_limit_);
+ int current_position = CurrentPosition();
return current_limit_ - current_position;
}
@@ -145,10 +156,14 @@ void CodedInputStream::SetTotalBytesLimit(
int total_bytes_limit, int warning_threshold) {
// Make sure the limit isn't already past, since this could confuse other
// code.
- int current_position = total_bytes_read_ -
- (BufferSize() + buffer_size_after_limit_);
+ int current_position = CurrentPosition();
total_bytes_limit_ = max(current_position, total_bytes_limit);
- total_bytes_warning_threshold_ = warning_threshold;
+ if (warning_threshold >= 0) {
+ total_bytes_warning_threshold_ = warning_threshold;
+ } else {
+ // warning_threshold is negative
+ total_bytes_warning_threshold_ = -1;
+ }
RecomputeBufferLimits();
}
@@ -368,16 +383,17 @@ uint32 CodedInputStream::ReadTagSlow() {
// For the slow path, just do a 64-bit read. Try to optimize for one-byte tags
// again, since we have now refreshed the buffer.
- uint64 result;
+ uint64 result = 0;
if (!ReadVarint64(&result)) return 0;
return static_cast<uint32>(result);
}
uint32 CodedInputStream::ReadTagFallback() {
- if (BufferSize() >= kMaxVarintBytes ||
+ const int buf_size = BufferSize();
+ if (buf_size >= kMaxVarintBytes ||
// Optimization: If the varint ends at exactly the end of the buffer,
// we can detect that and still use the fast path.
- (buffer_end_ > buffer_ && !(buffer_end_[-1] & 0x80))) {
+ (buf_size > 0 && !(buffer_end_[-1] & 0x80))) {
uint32 tag;
const uint8* end = ReadVarint32FromArray(buffer_, &tag);
if (end == NULL) {
@@ -388,7 +404,9 @@ uint32 CodedInputStream::ReadTagFallback() {
} else {
// We are commonly at a limit when attempting to read tags. Try to quickly
// detect this case without making another function call.
- if (buffer_ == buffer_end_ && buffer_size_after_limit_ > 0 &&
+ if ((buf_size == 0) &&
+ ((buffer_size_after_limit_ > 0) ||
+ (total_bytes_read_ == current_limit_)) &&
// Make sure that the limit we hit is not total_bytes_limit_, since
// in that case we still need to call Refresh() so that it prints an
// error.
@@ -492,8 +510,8 @@ bool CodedInputStream::Refresh() {
"CodedInputStream::SetTotalBytesLimit() in "
"google/protobuf/io/coded_stream.h.";
- // Don't warn again for this stream.
- total_bytes_warning_threshold_ = -1;
+ // Don't warn again for this stream, and print total size at the end.
+ total_bytes_warning_threshold_ = -2;
}
const void* void_buffer;
diff --git a/src/google/protobuf/io/coded_stream.h b/src/google/protobuf/io/coded_stream.h
index 97ac5079..66cbee00 100644
--- a/src/google/protobuf/io/coded_stream.h
+++ b/src/google/protobuf/io/coded_stream.h
@@ -170,6 +170,9 @@ class LIBPROTOBUF_EXPORT CodedInputStream {
// successfully and the stream's byte limit.
~CodedInputStream();
+ // Return true if this CodedInputStream reads from a flat array instead of
+ // a ZeroCopyInputStream.
+ inline bool IsFlat() const;
// Skips a number of bytes. Returns false if an underlying read error
// occurs.
@@ -311,7 +314,10 @@ class LIBPROTOBUF_EXPORT CodedInputStream {
// Returns the number of bytes left until the nearest limit on the
// stack is hit, or -1 if no limits are in place.
- int BytesUntilLimit();
+ int BytesUntilLimit() const;
+
+ // Returns current position relative to the beginning of the input stream.
+ int CurrentPosition() const;
// Total Bytes Limit -----------------------------------------------
// To prevent malicious users from sending excessively large messages
@@ -327,8 +333,9 @@ class LIBPROTOBUF_EXPORT CodedInputStream {
// cause integer overflows is 512MB. The default limit is 64MB. Apps
// should set shorter limits if possible. If warning_threshold is not -1,
// a warning will be printed to stderr after warning_threshold bytes are
- // read. An error will always be printed to stderr if the limit is
- // reached.
- // read. For backwards compatibility all negative values get squashed to -1,
+ // as other negative values might have special internal meanings.
+ // An error will always be printed to stderr if the limit is reached.
//
// This is unrelated to PushLimit()/PopLimit().
//
@@ -355,9 +362,10 @@ class LIBPROTOBUF_EXPORT CodedInputStream {
// messages and groups. CodedInputStream keeps track of this because it
// is the only object that is passed down the stack during parsing.
- // Sets the maximum recursion depth. The default is 64.
+ // Sets the maximum recursion depth. The default is 100.
void SetRecursionLimit(int limit);
+
// Increments the current recursion depth. Returns true if the depth is
// under the limit, false if it has gone over.
bool IncrementRecursionDepth();
@@ -433,7 +441,8 @@ class LIBPROTOBUF_EXPORT CodedInputStream {
//
// Note that this feature is ignored when parsing "lite" messages as they do
// not have descriptors.
- void SetExtensionRegistry(DescriptorPool* pool, MessageFactory* factory);
+ void SetExtensionRegistry(const DescriptorPool* pool,
+ MessageFactory* factory);
// Get the DescriptorPool set via SetExtensionRegistry(), or NULL if no pool
// has been provided.
@@ -482,6 +491,11 @@ class LIBPROTOBUF_EXPORT CodedInputStream {
// Maximum number of bytes to read, period. This is unrelated to
// current_limit_. Set using SetTotalBytesLimit().
int total_bytes_limit_;
+
+ // If positive/0: Limit for bytes read after which a warning due to size
+ // should be logged.
+ // If -1: Printing of warning disabled. Can be set by client.
+ // If -2: Internal: Limit has been reached, print full size when destructing.
int total_bytes_warning_threshold_;
// Current recursion depth, controlled by IncrementRecursionDepth() and
@@ -539,7 +553,8 @@ class LIBPROTOBUF_EXPORT CodedInputStream {
static const int kDefaultTotalBytesLimit = 64 << 20; // 64MB
static const int kDefaultTotalBytesWarningThreshold = 32 << 20; // 32MB
- static const int kDefaultRecursionLimit = 64;
+
+ static int default_recursion_limit_; // 100 by default.
};
// Class which encodes and writes binary data which is composed of varint-
@@ -891,7 +906,9 @@ inline bool CodedInputStream::ExpectAtEnd() {
// If we are at a limit we know no more bytes can be read. Otherwise, it's
// hard to say without calling Refresh(), and we'd rather not do that.
- if (buffer_ == buffer_end_ && buffer_size_after_limit_ != 0) {
+ if (buffer_ == buffer_end_ &&
+ ((buffer_size_after_limit_ != 0) ||
+ (total_bytes_read_ == current_limit_))) {
last_tag_ = 0; // Pretend we called ReadTag()...
legitimate_message_end_ = true; // ... and it hit EOF.
return true;
@@ -900,6 +917,10 @@ inline bool CodedInputStream::ExpectAtEnd() {
}
}
+inline int CodedInputStream::CurrentPosition() const {
+ return total_bytes_read_ - (BufferSize() + buffer_size_after_limit_);
+}
+
inline uint8* CodedOutputStream::GetDirectBufferForNBytesAndAdvance(int size) {
if (buffer_size_ < size) {
return NULL;
@@ -1039,7 +1060,7 @@ inline void CodedInputStream::DecrementRecursionDepth() {
if (recursion_depth_ > 0) --recursion_depth_;
}
-inline void CodedInputStream::SetExtensionRegistry(DescriptorPool* pool,
+inline void CodedInputStream::SetExtensionRegistry(const DescriptorPool* pool,
MessageFactory* factory) {
extension_pool_ = pool;
extension_factory_ = factory;
@@ -1071,7 +1092,7 @@ inline CodedInputStream::CodedInputStream(ZeroCopyInputStream* input)
total_bytes_limit_(kDefaultTotalBytesLimit),
total_bytes_warning_threshold_(kDefaultTotalBytesWarningThreshold),
recursion_depth_(0),
- recursion_limit_(kDefaultRecursionLimit),
+ recursion_limit_(default_recursion_limit_),
extension_pool_(NULL),
extension_factory_(NULL) {
// Eagerly Refresh() so buffer space is immediately available.
@@ -1092,17 +1113,15 @@ inline CodedInputStream::CodedInputStream(const uint8* buffer, int size)
total_bytes_limit_(kDefaultTotalBytesLimit),
total_bytes_warning_threshold_(kDefaultTotalBytesWarningThreshold),
recursion_depth_(0),
- recursion_limit_(kDefaultRecursionLimit),
+ recursion_limit_(default_recursion_limit_),
extension_pool_(NULL),
extension_factory_(NULL) {
// Note that setting current_limit_ == size is important to prevent some
// code paths from trying to access input_ and segfaulting.
}
-inline CodedInputStream::~CodedInputStream() {
- if (input_ != NULL) {
- BackUpInputToCurrentPosition();
- }
+inline bool CodedInputStream::IsFlat() const {
+ return input_ == NULL;
}
} // namespace io
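
The coded_stream changes above add CurrentPosition() and IsFlat(), and make SetTotalBytesLimit() squash negative warning thresholds to -1 (with -2 reserved internally to mean "limit already reported, print the total on destruction"). A minimal caller-side sketch of how those pieces fit together; it assumes only the public CodedInputStream API shown in this diff, and the function name is illustrative:

    #include <google/protobuf/io/coded_stream.h>

    using google::protobuf::io::CodedInputStream;
    using google::protobuf::uint8;
    using google::protobuf::uint32;

    void ReadWithLimits(const uint8* data, int size) {
      CodedInputStream input(data, size);  // flat-array stream, so IsFlat() is true
      // Warn after 32 MB, stop reading after 64 MB. Passing a negative
      // warning_threshold (e.g. -1) disables the warning entirely.
      input.SetTotalBytesLimit(64 << 20, 32 << 20);

      uint32 tag = input.ReadTag();
      // CurrentPosition() reports bytes consumed relative to the start of the
      // stream, the same value PushLimit()/BytesUntilLimit() now compute.
      int consumed = input.CurrentPosition();
      (void)tag;
      (void)consumed;
    }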
diff --git a/src/google/protobuf/io/coded_stream_inl.h b/src/google/protobuf/io/coded_stream_inl.h
index e9799d47..94495fb8 100644
--- a/src/google/protobuf/io/coded_stream_inl.h
+++ b/src/google/protobuf/io/coded_stream_inl.h
@@ -38,7 +38,7 @@
#include <google/protobuf/io/coded_stream.h>
#include <string>
-#include <google/protobuf/stubs/stl_util-inl.h>
+#include <google/protobuf/stubs/stl_util.h>
namespace google {
namespace protobuf {
diff --git a/src/google/protobuf/io/coded_stream_unittest.cc b/src/google/protobuf/io/coded_stream_unittest.cc
index ff268ab9..2daab194 100644
--- a/src/google/protobuf/io/coded_stream_unittest.cc
+++ b/src/google/protobuf/io/coded_stream_unittest.cc
@@ -44,7 +44,6 @@
#include <google/protobuf/testing/googletest.h>
#include <gtest/gtest.h>
#include <google/protobuf/io/zero_copy_stream_impl.h>
-#include <google/protobuf/stubs/strutil.h>
// This declares an unsigned long long integer literal in a portable way.
@@ -125,6 +124,13 @@ namespace {
class CodedStreamTest : public testing::Test {
protected:
+ // Helper method used by tests for bytes warning. See implementation comment
+ // for further information.
+ static void SetupTotalBytesLimitWarningTest(
+ int total_bytes_limit, int warning_threshold,
+ vector<string>* out_errors, vector<string>* out_warnings);
+
+ // Buffer used during most of the tests. This assumes tests run sequentially.
static const int kBufferSize = 1024 * 64;
static uint8 buffer_[kBufferSize];
};
@@ -1022,6 +1028,59 @@ TEST_F(CodedStreamTest, TotalBytesLimitNotValidMessageEnd) {
EXPECT_FALSE(coded_input.ConsumedEntireMessage());
}
+// This method is used by the tests below.
+// It constructs a CodedInputStream with the given limits and tries to read 2KiB
+// of data from it. Then it returns the logged errors and warnings in the given
+// vectors.
+void CodedStreamTest::SetupTotalBytesLimitWarningTest(
+ int total_bytes_limit, int warning_threshold,
+ vector<string>* out_errors, vector<string>* out_warnings) {
+ ArrayInputStream raw_input(buffer_, sizeof(buffer_), 128);
+
+ ScopedMemoryLog scoped_log;
+ {
+ CodedInputStream input(&raw_input);
+ input.SetTotalBytesLimit(total_bytes_limit, warning_threshold);
+ string str;
+ EXPECT_TRUE(input.ReadString(&str, 2048));
+ }
+
+ *out_errors = scoped_log.GetMessages(ERROR);
+ *out_warnings = scoped_log.GetMessages(WARNING);
+}
+
+TEST_F(CodedStreamTest, TotalBytesLimitWarning) {
+ vector<string> errors;
+ vector<string> warnings;
+ SetupTotalBytesLimitWarningTest(10240, 1024, &errors, &warnings);
+
+ EXPECT_EQ(0, errors.size());
+
+ ASSERT_EQ(2, warnings.size());
+ EXPECT_PRED_FORMAT2(testing::IsSubstring,
+ "Reading dangerously large protocol message. If the message turns out to "
+ "be larger than 10240 bytes, parsing will be halted for security reasons.",
+ warnings[0]);
+ EXPECT_PRED_FORMAT2(testing::IsSubstring,
+ "The total number of bytes read was 2048",
+ warnings[1]);
+}
+
+TEST_F(CodedStreamTest, TotalBytesLimitWarningDisabled) {
+ vector<string> errors;
+ vector<string> warnings;
+
+ // Test with -1
+ SetupTotalBytesLimitWarningTest(10240, -1, &errors, &warnings);
+ EXPECT_EQ(0, errors.size());
+ EXPECT_EQ(0, warnings.size());
+
+ // Test again with -2, expecting the same result
+ SetupTotalBytesLimitWarningTest(10240, -2, &errors, &warnings);
+ EXPECT_EQ(0, errors.size());
+ EXPECT_EQ(0, warnings.size());
+}
+
TEST_F(CodedStreamTest, RecursionLimit) {
ArrayInputStream input(buffer_, sizeof(buffer_));
@@ -1060,6 +1119,7 @@ TEST_F(CodedStreamTest, RecursionLimit) {
EXPECT_FALSE(coded_input.IncrementRecursionDepth()); // 7
}
+
class ReallyBigInputStream : public ZeroCopyInputStream {
public:
ReallyBigInputStream() : backup_amount_(0), buffer_count_(0) {}
diff --git a/src/google/protobuf/io/gzip_stream.cc b/src/google/protobuf/io/gzip_stream.cc
index 0f1ff872..fe1f3319 100644
--- a/src/google/protobuf/io/gzip_stream.cc
+++ b/src/google/protobuf/io/gzip_stream.cc
@@ -199,16 +199,6 @@ GzipOutputStream::GzipOutputStream(ZeroCopyOutputStream* sub_stream,
Init(sub_stream, options);
}
-GzipOutputStream::GzipOutputStream(
- ZeroCopyOutputStream* sub_stream, Format format, int buffer_size) {
- Options options;
- options.format = format;
- if (buffer_size != -1) {
- options.buffer_size = buffer_size;
- }
- Init(sub_stream, options);
-}
-
void GzipOutputStream::Init(ZeroCopyOutputStream* sub_stream,
const Options& options) {
sub_stream_ = sub_stream;
@@ -309,10 +299,11 @@ int64 GzipOutputStream::ByteCount() const {
}
bool GzipOutputStream::Flush() {
- do {
- zerror_ = Deflate(Z_FULL_FLUSH);
- } while (zerror_ == Z_OK);
- return zerror_ == Z_OK;
+ zerror_ = Deflate(Z_FULL_FLUSH);
+ // Return true if the flush succeeded or if it was a no-op.
+ return (zerror_ == Z_OK) ||
+ (zerror_ == Z_BUF_ERROR && zcontext_.avail_in == 0 &&
+ zcontext_.avail_out != 0);
}
bool GzipOutputStream::Close() {
diff --git a/src/google/protobuf/io/gzip_stream.h b/src/google/protobuf/io/gzip_stream.h
index 65dbc5b5..7ee24bc3 100644
--- a/src/google/protobuf/io/gzip_stream.h
+++ b/src/google/protobuf/io/gzip_stream.h
@@ -45,6 +45,7 @@
#include <zlib.h>
+#include <google/protobuf/stubs/common.h>
#include <google/protobuf/io/zero_copy_stream.h>
namespace google {
@@ -144,12 +145,6 @@ class LIBPROTOBUF_EXPORT GzipOutputStream : public ZeroCopyOutputStream {
ZeroCopyOutputStream* sub_stream,
const Options& options);
- // DEPRECATED: Use one of the above constructors instead.
- GzipOutputStream(
- ZeroCopyOutputStream* sub_stream,
- Format format,
- int buffer_size = -1) GOOGLE_ATTRIBUTE_DEPRECATED;
-
virtual ~GzipOutputStream();
// Return last error message or NULL if no error.
@@ -165,6 +160,13 @@ class LIBPROTOBUF_EXPORT GzipOutputStream : public ZeroCopyOutputStream {
// necessary.
// Compression may be less efficient stopping and starting around flushes.
// Returns true if no error.
+ //
+ // Please ensure that block size is > 6. Here is an excerpt from the zlib
+ // doc that explains why:
+ //
+ // In the case of a Z_FULL_FLUSH or Z_SYNC_FLUSH, make sure that avail_out
+ // is greater than six to avoid repeated flush markers due to
+ // avail_out == 0 on return.
bool Flush();
// Writes out all data and closes the gzip stream.
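
GzipOutputStream::Flush() now issues a single Z_FULL_FLUSH and treats the "nothing buffered, output space available" case as success, so it can be called repeatedly mid-stream. A sketch of flushing between two writes, assuming only the GzipOutputStream, CodedOutputStream, and StringOutputStream APIs already in this tree (buffer sizes left at defaults, which satisfy the "> 6" note above); the function name is illustrative:

    #include <string>
    #include <google/protobuf/io/coded_stream.h>
    #include <google/protobuf/io/gzip_stream.h>
    #include <google/protobuf/io/zero_copy_stream_impl_lite.h>

    using namespace google::protobuf::io;

    bool WriteTwoFlushedChunks(const std::string& a, const std::string& b,
                               std::string* compressed) {
      StringOutputStream raw(compressed);
      GzipOutputStream::Options options;
      options.format = GzipOutputStream::GZIP;
      GzipOutputStream gzout(&raw, options);

      {
        CodedOutputStream coded(&gzout);
        coded.WriteRaw(a.data(), a.size());
      }  // CodedOutputStream returns its unused buffer space on destruction.
      if (!gzout.Flush()) return false;  // Emits a full-flush point mid-stream.
      {
        CodedOutputStream coded(&gzout);
        coded.WriteRaw(b.data(), b.size());
      }
      return gzout.Close();
    }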
diff --git a/src/google/protobuf/io/printer.cc b/src/google/protobuf/io/printer.cc
index 9ab90dee..d2bf3f54 100644
--- a/src/google/protobuf/io/printer.cc
+++ b/src/google/protobuf/io/printer.cc
@@ -35,7 +35,6 @@
#include <google/protobuf/io/printer.h>
#include <google/protobuf/io/zero_copy_stream.h>
#include <google/protobuf/stubs/common.h>
-#include <google/protobuf/stubs/strutil.h>
namespace google {
namespace protobuf {
@@ -51,8 +50,8 @@ Printer::Printer(ZeroCopyOutputStream* output, char variable_delimiter)
}
Printer::~Printer() {
- // Only BackUp() if we're sure we've successfully called Next() at least once.
- if (buffer_size_ > 0) {
+ // Only BackUp() if we have called Next() at least once and never failed.
+ if (buffer_size_ > 0 && !failed_) {
output_->BackUp(buffer_size_);
}
}
@@ -169,7 +168,7 @@ void Printer::WriteRaw(const char* data, int size) {
if (failed_) return;
if (size == 0) return;
- if (at_start_of_line_) {
+ if (at_start_of_line_ && (size > 0) && (data[0] != '\n')) {
// Insert an indent.
at_start_of_line_ = false;
WriteRaw(indent_.data(), indent_.size());
diff --git a/src/google/protobuf/io/printer_unittest.cc b/src/google/protobuf/io/printer_unittest.cc
index 580a53da..399395c8 100644
--- a/src/google/protobuf/io/printer_unittest.cc
+++ b/src/google/protobuf/io/printer_unittest.cc
@@ -233,7 +233,31 @@ TEST(Printer, Death) {
}
#endif // GTEST_HAS_DEATH_TEST
-TEST(Printer, WriteFailure) {
+TEST(Printer, WriteFailurePartial) {
+ char buffer[17];
+
+ ArrayOutputStream output(buffer, sizeof(buffer));
+ Printer printer(&output, '$');
+
+ // Print 16 bytes to almost fill the buffer (should not fail).
+ printer.Print("0123456789abcdef");
+ EXPECT_FALSE(printer.failed());
+
+ // Try to print 2 chars. Only one fits.
+ printer.Print("<>");
+ EXPECT_TRUE(printer.failed());
+
+ // Anything else should fail too.
+ printer.Print(" ");
+ EXPECT_TRUE(printer.failed());
+ printer.Print("blah");
+ EXPECT_TRUE(printer.failed());
+
+ // Buffer should contain the first 17 bytes written.
+ EXPECT_EQ("0123456789abcdef<", string(buffer, sizeof(buffer)));
+}
+
+TEST(Printer, WriteFailureExact) {
char buffer[16];
ArrayOutputStream output(buffer, sizeof(buffer));
diff --git a/src/google/protobuf/io/tokenizer.cc b/src/google/protobuf/io/tokenizer.cc
index 513831d5..a022b71d 100644
--- a/src/google/protobuf/io/tokenizer.cc
+++ b/src/google/protobuf/io/tokenizer.cc
@@ -89,8 +89,11 @@
// exactly pretty.
#include <google/protobuf/io/tokenizer.h>
+#include <google/protobuf/stubs/common.h>
+#include <google/protobuf/stubs/stringprintf.h>
#include <google/protobuf/io/zero_copy_stream.h>
#include <google/protobuf/stubs/strutil.h>
+#include <google/protobuf/stubs/stl_util.h>
namespace google {
namespace protobuf {
@@ -118,6 +121,8 @@ namespace {
CHARACTER_CLASS(Whitespace, c == ' ' || c == '\n' || c == '\t' ||
c == '\r' || c == '\v' || c == '\f');
+CHARACTER_CLASS(WhitespaceNoNewline, c == ' ' || c == '\t' ||
+ c == '\r' || c == '\v' || c == '\f');
CHARACTER_CLASS(Unprintable, c < ' ' && c > '\0');
@@ -187,7 +192,8 @@ Tokenizer::Tokenizer(ZeroCopyInputStream* input,
read_error_(false),
line_(0),
column_(0),
- token_start_(-1),
+ record_target_(NULL),
+ record_start_(-1),
allow_f_after_float_(false),
comment_style_(CPP_COMMENT_STYLE) {
@@ -238,9 +244,9 @@ void Tokenizer::Refresh() {
}
// If we're in a token, append the rest of the buffer to it.
- if (token_start_ >= 0 && token_start_ < buffer_size_) {
- current_.text.append(buffer_ + token_start_, buffer_size_ - token_start_);
- token_start_ = 0;
+ if (record_target_ != NULL && record_start_ < buffer_size_) {
+ record_target_->append(buffer_ + record_start_, buffer_size_ - record_start_);
+ record_start_ = 0;
}
const void* data = NULL;
@@ -261,23 +267,33 @@ void Tokenizer::Refresh() {
current_char_ = buffer_[0];
}
+inline void Tokenizer::RecordTo(string* target) {
+ record_target_ = target;
+ record_start_ = buffer_pos_;
+}
+
+inline void Tokenizer::StopRecording() {
+ // Note: The if() is necessary because some STL implementations crash when
+ // you call string::append(NULL, 0), presumably because they are trying to
+ // be helpful by detecting the NULL pointer, even though there's nothing
+ // wrong with reading zero bytes from NULL.
+ if (buffer_pos_ != record_start_) {
+ record_target_->append(buffer_ + record_start_, buffer_pos_ - record_start_);
+ }
+ record_target_ = NULL;
+ record_start_ = -1;
+}
+
inline void Tokenizer::StartToken() {
- token_start_ = buffer_pos_;
current_.type = TYPE_START; // Just for the sake of initializing it.
current_.text.clear();
current_.line = line_;
current_.column = column_;
+ RecordTo(&current_.text);
}
inline void Tokenizer::EndToken() {
- // Note: The if() is necessary because some STL implementations crash when
- // you call string::append(NULL, 0), presumably because they are trying to
- // be helpful by detecting the NULL pointer, even though there's nothing
- // wrong with reading zero bytes from NULL.
- if (buffer_pos_ != token_start_) {
- current_.text.append(buffer_ + token_start_, buffer_pos_ - token_start_);
- }
- token_start_ = -1;
+ StopRecording();
current_.end_column = column_;
}
@@ -353,6 +369,27 @@ void Tokenizer::ConsumeString(char delimiter) {
AddError("Expected hex digits for escape sequence.");
}
// Possibly followed by another hex digit, but again we don't care.
+ } else if (TryConsume('u')) {
+ if (!TryConsumeOne<HexDigit>() ||
+ !TryConsumeOne<HexDigit>() ||
+ !TryConsumeOne<HexDigit>() ||
+ !TryConsumeOne<HexDigit>()) {
+ AddError("Expected four hex digits for \\u escape sequence.");
+ }
+ } else if (TryConsume('U')) {
+ // We expect 8 hex digits; but only the range up to 0x10ffff is
+ // legal.
+ if (!TryConsume('0') ||
+ !TryConsume('0') ||
+ !(TryConsume('0') || TryConsume('1')) ||
+ !TryConsumeOne<HexDigit>() ||
+ !TryConsumeOne<HexDigit>() ||
+ !TryConsumeOne<HexDigit>() ||
+ !TryConsumeOne<HexDigit>() ||
+ !TryConsumeOne<HexDigit>()) {
+ AddError("Expected eight hex digits up to 10ffff for \\U escape "
+ "sequence");
+ }
} else {
AddError("Invalid escape sequence in string literal.");
}
@@ -426,26 +463,51 @@ Tokenizer::TokenType Tokenizer::ConsumeNumber(bool started_with_zero,
return is_float ? TYPE_FLOAT : TYPE_INTEGER;
}
-void Tokenizer::ConsumeLineComment() {
+void Tokenizer::ConsumeLineComment(string* content) {
+ if (content != NULL) RecordTo(content);
+
while (current_char_ != '\0' && current_char_ != '\n') {
NextChar();
}
TryConsume('\n');
+
+ if (content != NULL) StopRecording();
}
-void Tokenizer::ConsumeBlockComment() {
+void Tokenizer::ConsumeBlockComment(string* content) {
int start_line = line_;
int start_column = column_ - 2;
+ if (content != NULL) RecordTo(content);
+
while (true) {
while (current_char_ != '\0' &&
current_char_ != '*' &&
- current_char_ != '/') {
+ current_char_ != '/' &&
+ current_char_ != '\n') {
NextChar();
}
- if (TryConsume('*') && TryConsume('/')) {
+ if (TryConsume('\n')) {
+ if (content != NULL) StopRecording();
+
+ // Consume leading whitespace and asterisk;
+ ConsumeZeroOrMore<WhitespaceNoNewline>();
+ if (TryConsume('*')) {
+ if (TryConsume('/')) {
+ // End of comment.
+ break;
+ }
+ }
+
+ if (content != NULL) RecordTo(content);
+ } else if (TryConsume('*') && TryConsume('/')) {
// End of comment.
+ if (content != NULL) {
+ StopRecording();
+ // Strip trailing "*/".
+ content->erase(content->size() - 2);
+ }
break;
} else if (TryConsume('/') && current_char_ == '*') {
// Note: We didn't consume the '*' because if there is a '/' after it
@@ -456,42 +518,59 @@ void Tokenizer::ConsumeBlockComment() {
AddError("End-of-file inside block comment.");
error_collector_->AddError(
start_line, start_column, " Comment started here.");
+ if (content != NULL) StopRecording();
break;
}
}
}
+Tokenizer::NextCommentStatus Tokenizer::TryConsumeCommentStart() {
+ if (comment_style_ == CPP_COMMENT_STYLE && TryConsume('/')) {
+ if (TryConsume('/')) {
+ return LINE_COMMENT;
+ } else if (TryConsume('*')) {
+ return BLOCK_COMMENT;
+ } else {
+ // Oops, it was just a slash. Return it.
+ current_.type = TYPE_SYMBOL;
+ current_.text = "/";
+ current_.line = line_;
+ current_.column = column_ - 1;
+ current_.end_column = column_;
+ return SLASH_NOT_COMMENT;
+ }
+ } else if (comment_style_ == SH_COMMENT_STYLE && TryConsume('#')) {
+ return LINE_COMMENT;
+ } else {
+ return NO_COMMENT;
+ }
+}
+
// -------------------------------------------------------------------
bool Tokenizer::Next() {
previous_ = current_;
- // Did we skip any characters after the last token?
- bool skipped_stuff = false;
-
while (!read_error_) {
- if (TryConsumeOne<Whitespace>()) {
- ConsumeZeroOrMore<Whitespace>();
-
- } else if (comment_style_ == CPP_COMMENT_STYLE && TryConsume('/')) {
- // Starting a comment?
- if (TryConsume('/')) {
- ConsumeLineComment();
- } else if (TryConsume('*')) {
- ConsumeBlockComment();
- } else {
- // Oops, it was just a slash. Return it.
- current_.type = TYPE_SYMBOL;
- current_.text = "/";
- current_.line = line_;
- current_.column = column_ - 1;
+ ConsumeZeroOrMore<Whitespace>();
+
+ switch (TryConsumeCommentStart()) {
+ case LINE_COMMENT:
+ ConsumeLineComment(NULL);
+ continue;
+ case BLOCK_COMMENT:
+ ConsumeBlockComment(NULL);
+ continue;
+ case SLASH_NOT_COMMENT:
return true;
- }
+ case NO_COMMENT:
+ break;
+ }
- } else if (comment_style_ == SH_COMMENT_STYLE && TryConsume('#')) {
- ConsumeLineComment();
+ // Check for EOF before continuing.
+ if (read_error_) break;
- } else if (LookingAt<Unprintable>() || current_char_ == '\0') {
+ if (LookingAt<Unprintable>() || current_char_ == '\0') {
AddError("Invalid control characters encountered in text.");
NextChar();
// Skip more unprintable characters, too. But, remember that '\0' is
@@ -519,7 +598,9 @@ bool Tokenizer::Next() {
if (TryConsumeOne<Digit>()) {
// It's a floating-point number.
- if (previous_.type == TYPE_IDENTIFIER && !skipped_stuff) {
+ if (previous_.type == TYPE_IDENTIFIER &&
+ current_.line == previous_.line &&
+ current_.column == previous_.end_column) {
// We don't accept syntax like "blah.123".
error_collector_->AddError(line_, column_ - 2,
"Need space between identifier and decimal point.");
@@ -544,8 +625,6 @@ bool Tokenizer::Next() {
EndToken();
return true;
}
-
- skipped_stuff = true;
}
// EOF
@@ -557,6 +636,195 @@ bool Tokenizer::Next() {
return false;
}
+namespace {
+
+// Helper class for collecting comments and putting them in the right places.
+//
+// This basically just buffers the most recent comment until it can be decided
+// exactly where that comment should be placed. When Flush() is called, the
+// current comment goes into either prev_trailing_comments or detached_comments.
+// When the CommentCollector is destroyed, the last buffered comment goes into
+// next_leading_comments.
+class CommentCollector {
+ public:
+ CommentCollector(string* prev_trailing_comments,
+ vector<string>* detached_comments,
+ string* next_leading_comments)
+ : prev_trailing_comments_(prev_trailing_comments),
+ detached_comments_(detached_comments),
+ next_leading_comments_(next_leading_comments),
+ has_comment_(false),
+ is_line_comment_(false),
+ can_attach_to_prev_(true) {
+ if (prev_trailing_comments != NULL) prev_trailing_comments->clear();
+ if (detached_comments != NULL) detached_comments->clear();
+ if (next_leading_comments != NULL) next_leading_comments->clear();
+ }
+
+ ~CommentCollector() {
+ // Whatever is in the buffer is a leading comment.
+ if (next_leading_comments_ != NULL && has_comment_) {
+ comment_buffer_.swap(*next_leading_comments_);
+ }
+ }
+
+ // About to read a line comment. Get the comment buffer pointer in order to
+ // read into it.
+ string* GetBufferForLineComment() {
+ // We want to combine with previous line comments, but not block comments.
+ if (has_comment_ && !is_line_comment_) {
+ Flush();
+ }
+ has_comment_ = true;
+ is_line_comment_ = true;
+ return &comment_buffer_;
+ }
+
+ // About to read a block comment. Get the comment buffer pointer in order to
+ // read into it.
+ string* GetBufferForBlockComment() {
+ if (has_comment_) {
+ Flush();
+ }
+ has_comment_ = true;
+ is_line_comment_ = false;
+ return &comment_buffer_;
+ }
+
+ void ClearBuffer() {
+ comment_buffer_.clear();
+ has_comment_ = false;
+ }
+
+ // Called once we know that the comment buffer is complete and is *not*
+ // connected to the next token.
+ void Flush() {
+ if (has_comment_) {
+ if (can_attach_to_prev_) {
+ if (prev_trailing_comments_ != NULL) {
+ prev_trailing_comments_->append(comment_buffer_);
+ }
+ can_attach_to_prev_ = false;
+ } else {
+ if (detached_comments_ != NULL) {
+ detached_comments_->push_back(comment_buffer_);
+ }
+ }
+ ClearBuffer();
+ }
+ }
+
+ void DetachFromPrev() {
+ can_attach_to_prev_ = false;
+ }
+
+ private:
+ string* prev_trailing_comments_;
+ vector<string>* detached_comments_;
+ string* next_leading_comments_;
+
+ string comment_buffer_;
+
+ // True if any comments were read into comment_buffer_. This can be true even
+ // if comment_buffer_ is empty, namely if the comment was "/**/".
+ bool has_comment_;
+
+ // Is the comment in the comment buffer a line comment?
+ bool is_line_comment_;
+
+ // Is it still possible that we could be reading a comment attached to the
+ // previous token?
+ bool can_attach_to_prev_;
+};
+
+} // namespace
+
+bool Tokenizer::NextWithComments(string* prev_trailing_comments,
+ vector<string>* detached_comments,
+ string* next_leading_comments) {
+ CommentCollector collector(prev_trailing_comments, detached_comments,
+ next_leading_comments);
+
+ if (current_.type == TYPE_START) {
+ collector.DetachFromPrev();
+ } else {
+ // A comment appearing on the same line must be attached to the previous
+ // declaration.
+ ConsumeZeroOrMore<WhitespaceNoNewline>();
+ switch (TryConsumeCommentStart()) {
+ case LINE_COMMENT:
+ ConsumeLineComment(collector.GetBufferForLineComment());
+
+ // Don't allow comments on subsequent lines to be attached to a trailing
+ // comment.
+ collector.Flush();
+ break;
+ case BLOCK_COMMENT:
+ ConsumeBlockComment(collector.GetBufferForBlockComment());
+
+ ConsumeZeroOrMore<WhitespaceNoNewline>();
+ if (!TryConsume('\n')) {
+ // Oops, the next token is on the same line. If we recorded a comment
+ // we really have no idea which token it should be attached to.
+ collector.ClearBuffer();
+ return Next();
+ }
+
+ // Don't allow comments on subsequent lines to be attached to a trailing
+ // comment.
+ collector.Flush();
+ break;
+ case SLASH_NOT_COMMENT:
+ return true;
+ case NO_COMMENT:
+ if (!TryConsume('\n')) {
+ // The next token is on the same line. There are no comments.
+ return Next();
+ }
+ break;
+ }
+ }
+
+ // OK, we are now on the line *after* the previous token.
+ while (true) {
+ ConsumeZeroOrMore<WhitespaceNoNewline>();
+
+ switch (TryConsumeCommentStart()) {
+ case LINE_COMMENT:
+ ConsumeLineComment(collector.GetBufferForLineComment());
+ break;
+ case BLOCK_COMMENT:
+ ConsumeBlockComment(collector.GetBufferForBlockComment());
+
+ // Consume the rest of the line so that we don't interpret it as a
+ // blank line the next time around the loop.
+ ConsumeZeroOrMore<WhitespaceNoNewline>();
+ TryConsume('\n');
+ break;
+ case SLASH_NOT_COMMENT:
+ return true;
+ case NO_COMMENT:
+ if (TryConsume('\n')) {
+ // Completely blank line.
+ collector.Flush();
+ collector.DetachFromPrev();
+ } else {
+ bool result = Next();
+ if (!result ||
+ current_.text == "}" ||
+ current_.text == "]" ||
+ current_.text == ")") {
+ // It looks like we're at the end of a scope. In this case it
+ // makes no sense to attach a comment to the following token.
+ collector.Flush();
+ }
+ return result;
+ }
+ break;
+ }
+ }
+}
+
// -------------------------------------------------------------------
// Token-parsing helpers. Remember that these don't need to report
// errors since any errors should already have been reported while
@@ -626,17 +894,138 @@ double Tokenizer::ParseFloat(const string& text) {
return result;
}
+// Helper to append a Unicode code point to a string as UTF8, without bringing
+// in any external dependencies.
+static void AppendUTF8(uint32 code_point, string* output) {
+ uint32 tmp = 0;
+ int len = 0;
+ if (code_point <= 0x7f) {
+ tmp = code_point;
+ len = 1;
+ } else if (code_point <= 0x07ff) {
+ tmp = 0x0000c080 |
+ ((code_point & 0x07c0) << 2) |
+ (code_point & 0x003f);
+ len = 2;
+ } else if (code_point <= 0xffff) {
+ tmp = 0x00e08080 |
+ ((code_point & 0xf000) << 4) |
+ ((code_point & 0x0fc0) << 2) |
+ (code_point & 0x003f);
+ len = 3;
+ } else if (code_point <= 0x1fffff) {
+ tmp = 0xf0808080 |
+ ((code_point & 0x1c0000) << 6) |
+ ((code_point & 0x03f000) << 4) |
+ ((code_point & 0x000fc0) << 2) |
+ (code_point & 0x003f);
+ len = 4;
+ } else {
+ // UTF-16 is only defined for code points up to 0x10FFFF, and UTF-8 is
+ // normally only defined up to there as well.
+ StringAppendF(output, "\\U%08x", code_point);
+ return;
+ }
+ tmp = ghtonl(tmp);
+ output->append(reinterpret_cast<const char*>(&tmp) + sizeof(tmp) - len, len);
+}
+
+// Try to read <len> hex digits from ptr, and stuff the numeric result into
+// *result. Returns true if that many digits were successfully consumed.
+static bool ReadHexDigits(const char* ptr, int len, uint32* result) {
+ *result = 0;
+ if (len == 0) return false;
+ for (const char* end = ptr + len; ptr < end; ++ptr) {
+ if (*ptr == '\0') return false;
+ *result = (*result << 4) + DigitValue(*ptr);
+ }
+ return true;
+}
+
+// Handling UTF-16 surrogate pairs. UTF-16 encodes code points in the range
+// 0x10000...0x10ffff as a pair of numbers, a head surrogate followed by a trail
+// surrogate. These numbers are in a reserved range of Unicode code points, so
+// if we encounter such a pair we know how to parse it and convert it into a
+// single code point.
+static const uint32 kMinHeadSurrogate = 0xd800;
+static const uint32 kMaxHeadSurrogate = 0xdc00;
+static const uint32 kMinTrailSurrogate = 0xdc00;
+static const uint32 kMaxTrailSurrogate = 0xe000;
+
+static inline bool IsHeadSurrogate(uint32 code_point) {
+ return (code_point >= kMinHeadSurrogate) && (code_point < kMaxHeadSurrogate);
+}
+
+static inline bool IsTrailSurrogate(uint32 code_point) {
+ return (code_point >= kMinTrailSurrogate) &&
+ (code_point < kMaxTrailSurrogate);
+}
+
+// Combine a head and trail surrogate into a single Unicode code point.
+static uint32 AssembleUTF16(uint32 head_surrogate, uint32 trail_surrogate) {
+ GOOGLE_DCHECK(IsHeadSurrogate(head_surrogate));
+ GOOGLE_DCHECK(IsTrailSurrogate(trail_surrogate));
+ return 0x10000 + (((head_surrogate - kMinHeadSurrogate) << 10) |
+ (trail_surrogate - kMinTrailSurrogate));
+}
+
+// Convert the escape sequence parameter to a number of expected hex digits.
+static inline int UnicodeLength(char key) {
+ if (key == 'u') return 4;
+ if (key == 'U') return 8;
+ return 0;
+}
+
+// Given a pointer to the 'u' or 'U' starting a Unicode escape sequence, attempt
+// to parse that sequence. On success, returns a pointer to the first char
+// beyond that sequence, and fills in *code_point. On failure, returns ptr
+// itself.
+static const char* FetchUnicodePoint(const char* ptr, uint32* code_point) {
+ const char* p = ptr;
+ // Fetch the code point.
+ const int len = UnicodeLength(*p++);
+ if (!ReadHexDigits(p, len, code_point))
+ return ptr;
+ p += len;
+
+ // Check if the code point we read is a "head surrogate." If so, then we
+ // expect it to be immediately followed by another code point which is a valid
+ // "trail surrogate," and together they form a UTF-16 pair which decodes into
+ // a single Unicode point. Trail surrogates may only use \u, not \U.
+ if (IsHeadSurrogate(*code_point) && *p == '\\' && *(p + 1) == 'u') {
+ uint32 trail_surrogate;
+ if (ReadHexDigits(p + 2, 4, &trail_surrogate) &&
+ IsTrailSurrogate(trail_surrogate)) {
+ *code_point = AssembleUTF16(*code_point, trail_surrogate);
+ p += 6;
+ }
+ // If this failed, then we just emit the head surrogate as a code point.
+ // It's bogus, but so is the string.
+ }
+
+ return p;
+}
+
+// The text string must begin and end with single or double quote
+// characters.
void Tokenizer::ParseStringAppend(const string& text, string* output) {
- // Reminder: text[0] is always the quote character. (If text is
- // empty, it's invalid, so we'll just return.)
- if (text.empty()) {
+ // Reminder: text[0] is always a quote character. (If text is
+ // empty, it's invalid, so we'll just return).
+ const size_t text_size = text.size();
+ if (text_size == 0) {
GOOGLE_LOG(DFATAL)
<< " Tokenizer::ParseStringAppend() passed text that could not"
" have been tokenized as a string: " << CEscape(text);
return;
}
- output->reserve(output->size() + text.size());
+ // Reserve room for new string. The branch is necessary because if
+ // there is already space available the reserve() call might
+ // downsize the output.
+ const size_t new_len = text_size + output->size();
+ if (new_len > output->capacity()) {
+ output->reserve(new_len);
+ }
// Loop through the string copying characters to "output" and
// interpreting escape sequences. Note that any invalid escape
@@ -674,19 +1063,27 @@ void Tokenizer::ParseStringAppend(const string& text, string* output) {
}
output->push_back(static_cast<char>(code));
+ } else if (*ptr == 'u' || *ptr == 'U') {
+ uint32 unicode;
+ const char* end = FetchUnicodePoint(ptr, &unicode);
+ if (end == ptr) {
+ // Failure: Just dump out what we saw, don't try to parse it.
+ output->push_back(*ptr);
+ } else {
+ AppendUTF8(unicode, output);
+ ptr = end - 1; // Because we're about to ++ptr.
+ }
} else {
// Some other escape code.
output->push_back(TranslateEscape(*ptr));
}
- } else if (*ptr == text[0]) {
- // Ignore quote matching the starting quote.
+ } else if (*ptr == text[0] && ptr[1] == '\0') {
+ // Ignore final quote matching the starting quote.
} else {
output->push_back(*ptr);
}
}
-
- return;
}
} // namespace io
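
The surrogate-pair handling above is plain arithmetic: a head surrogate contributes the high 10 bits and a trail surrogate the low 10 bits of a code point above U+FFFF. A self-contained restatement of the AssembleUTF16() math, checked against the value used in the new unit test; purely illustrative, not part of the library:

    #include <cstdint>
    #include <cstdio>

    // Same math as AssembleUTF16() in the diff: head in [0xD800, 0xDC00),
    // trail in [0xDC00, 0xE000).
    static uint32_t AssemblePair(uint32_t head, uint32_t trail) {
      return 0x10000 + (((head - 0xD800) << 10) | (trail - 0xDC00));
    }

    int main() {
      // "\ud852\udf62" from the tokenizer unit test decodes to U+24B62.
      std::printf("U+%05X\n", AssemblePair(0xD852, 0xDF62));
      return 0;
    }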
diff --git a/src/google/protobuf/io/tokenizer.h b/src/google/protobuf/io/tokenizer.h
index 8f759abb..d85b82f9 100644
--- a/src/google/protobuf/io/tokenizer.h
+++ b/src/google/protobuf/io/tokenizer.h
@@ -38,6 +38,7 @@
#define GOOGLE_PROTOBUF_IO_TOKENIZER_H__
#include <string>
+#include <vector>
#include <google/protobuf/stubs/common.h>
namespace google {
@@ -137,6 +138,53 @@ class LIBPROTOBUF_EXPORT Tokenizer {
// reached.
bool Next();
+ // Like Next(), but also collects comments which appear between the previous
+ // and next tokens.
+ //
+ // Comments which appear to be attached to the previous token are stored
+ // in *prev_trailing_comments. Comments which appear to be attached to the
+ // next token are stored in *next_leading_comments. Comments appearing in
+ // between which do not appear to be attached to either will be added to
+ // detached_comments. Any of these parameters can be NULL to simply discard
+ // the comments.
+ //
+ // A series of line comments appearing on consecutive lines, with no other
+ // tokens appearing on those lines, will be treated as a single comment.
+ //
+ // Only the comment content is returned; comment markers (e.g. //) are
+ // stripped out. For block comments, leading whitespace and an asterisk will
+ // be stripped from the beginning of each line other than the first. Newlines
+ // are included in the output.
+ //
+ // Examples:
+ //
+ // optional int32 foo = 1; // Comment attached to foo.
+ // // Comment attached to bar.
+ // optional int32 bar = 2;
+ //
+ // optional string baz = 3;
+ // // Comment attached to baz.
+ // // Another line attached to baz.
+ //
+ // // Comment attached to qux.
+ // //
+ // // Another line attached to qux.
+ // optional double qux = 4;
+ //
+ // // Detached comment. This is not attached to qux or corge
+ // // because there are blank lines separating it from both.
+ //
+ // optional string corge = 5;
+ // /* Block comment attached
+ // * to corge. Leading asterisks
+ // * will be removed. */
+ // /* Block comment attached to
+ // * grault. */
+ // optional int32 grault = 6;
+ bool NextWithComments(string* prev_trailing_comments,
+ vector<string>* detached_comments,
+ string* next_leading_comments);
+
// Parse helpers ---------------------------------------------------
// Parses a TYPE_FLOAT token. This never fails, so long as the text actually
@@ -200,11 +248,12 @@ class LIBPROTOBUF_EXPORT Tokenizer {
int line_;
int column_;
- // Position in buffer_ where StartToken() was called. If the token
- // started in the previous buffer, this is zero, and current_.text already
- // contains the part of the token from the previous buffer. If not
- // currently parsing a token, this is -1.
- int token_start_;
+ // String to which text should be appended as we advance through it.
+ // Call RecordTo(&str) to start recording and StopRecording() to stop.
+ // E.g. StartToken() calls RecordTo(&current_.text). record_start_ is the
+ // position within the current buffer where recording started.
+ string* record_target_;
+ int record_start_;
// Options.
bool allow_f_after_float_;
@@ -223,6 +272,9 @@ class LIBPROTOBUF_EXPORT Tokenizer {
// Read a new buffer from the input.
void Refresh();
+ inline void RecordTo(string* target);
+ inline void StopRecording();
+
// Called when the current character is the first character of a new
// token (not including whitespace or comments).
inline void StartToken();
@@ -255,9 +307,28 @@ class LIBPROTOBUF_EXPORT Tokenizer {
TokenType ConsumeNumber(bool started_with_zero, bool started_with_dot);
// Consume the rest of a line.
- void ConsumeLineComment();
+ void ConsumeLineComment(string* content);
// Consume until "*/".
- void ConsumeBlockComment();
+ void ConsumeBlockComment(string* content);
+
+ enum NextCommentStatus {
+ // Started a line comment.
+ LINE_COMMENT,
+
+ // Started a block comment.
+ BLOCK_COMMENT,
+
+ // Consumed a slash, then realized it wasn't a comment. current_ has
+ // been filled in with a slash token. The caller should return it.
+ SLASH_NOT_COMMENT,
+
+ // We do not appear to be starting a comment here.
+ NO_COMMENT
+ };
+
+ // If we're at the start of a new comment, consume it and return what kind
+ // of comment it is.
+ NextCommentStatus TryConsumeCommentStart();
// -----------------------------------------------------------------
// These helper methods make the parsing code more readable. The
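
NextWithComments() is the new public entry point for the comment routing documented above. A minimal sketch of driving it, assuming only the Tokenizer, ErrorCollector, and ArrayInputStream APIs that already exist in this tree; StderrCollector and CollectComments are illustrative helpers, not part of the library:

    #include <cstdio>
    #include <string>
    #include <vector>
    #include <google/protobuf/io/tokenizer.h>
    #include <google/protobuf/io/zero_copy_stream_impl_lite.h>

    using namespace google::protobuf::io;

    // Tokenizer requires an ErrorCollector; this one just prints to stderr.
    class StderrCollector : public ErrorCollector {
     public:
      virtual void AddError(int line, int column, const std::string& message) {
        std::fprintf(stderr, "%d:%d: %s\n", line, column, message.c_str());
      }
    };

    void CollectComments(const std::string& text) {
      ArrayInputStream raw(text.data(), text.size());
      StderrCollector errors;
      Tokenizer tokenizer(&raw, &errors);

      tokenizer.Next();  // e.g. consumes "prev" in the unit-test inputs

      std::string trailing, leading;
      std::vector<std::string> detached;
      // Comments between the two tokens are split into trailing / detached /
      // leading buckets; any pointer may be NULL to discard that bucket.
      tokenizer.NextWithComments(&trailing, &detached, &leading);
    }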
diff --git a/src/google/protobuf/io/tokenizer_unittest.cc b/src/google/protobuf/io/tokenizer_unittest.cc
index 106d080f..8de43939 100644
--- a/src/google/protobuf/io/tokenizer_unittest.cc
+++ b/src/google/protobuf/io/tokenizer_unittest.cc
@@ -32,9 +32,10 @@
// Based on original Protocol Buffers design by
// Sanjay Ghemawat, Jeff Dean, and others.
-#include <vector>
-#include <math.h>
#include <limits.h>
+#include <math.h>
+
+#include <vector>
#include <google/protobuf/io/tokenizer.h>
#include <google/protobuf/io/zero_copy_stream_impl.h>
@@ -514,6 +515,217 @@ TEST_1D(TokenizerTest, ShCommentStyle, kBlockSizes) {
// -------------------------------------------------------------------
+// In each case, the input is expected to have two tokens named "prev" and
+// "next" with comments in between.
+struct DocCommentCase {
+ string input;
+
+ const char* prev_trailing_comments;
+ const char* detached_comments[10];
+ const char* next_leading_comments;
+};
+
+inline ostream& operator<<(ostream& out,
+ const DocCommentCase& test_case) {
+ return out << CEscape(test_case.input);
+}
+
+DocCommentCase kDocCommentCases[] = {
+ {
+ "prev next",
+
+ "",
+ {},
+ ""
+ },
+
+ {
+ "prev /* ignored */ next",
+
+ "",
+ {},
+ ""
+ },
+
+ {
+ "prev // trailing comment\n"
+ "next",
+
+ " trailing comment\n",
+ {},
+ ""
+ },
+
+ {
+ "prev\n"
+ "// leading comment\n"
+ "// line 2\n"
+ "next",
+
+ "",
+ {},
+ " leading comment\n"
+ " line 2\n"
+ },
+
+ {
+ "prev\n"
+ "// trailing comment\n"
+ "// line 2\n"
+ "\n"
+ "next",
+
+ " trailing comment\n"
+ " line 2\n",
+ {},
+ ""
+ },
+
+ {
+ "prev // trailing comment\n"
+ "// leading comment\n"
+ "// line 2\n"
+ "next",
+
+ " trailing comment\n",
+ {},
+ " leading comment\n"
+ " line 2\n"
+ },
+
+ {
+ "prev /* trailing block comment */\n"
+ "/* leading block comment\n"
+ " * line 2\n"
+ " * line 3 */"
+ "next",
+
+ " trailing block comment ",
+ {},
+ " leading block comment\n"
+ " line 2\n"
+ " line 3 "
+ },
+
+ {
+ "prev\n"
+ "/* trailing block comment\n"
+ " * line 2\n"
+ " * line 3\n"
+ " */\n"
+ "/* leading block comment\n"
+ " * line 2\n"
+ " * line 3 */"
+ "next",
+
+ " trailing block comment\n"
+ " line 2\n"
+ " line 3\n",
+ {},
+ " leading block comment\n"
+ " line 2\n"
+ " line 3 "
+ },
+
+ {
+ "prev\n"
+ "// trailing comment\n"
+ "\n"
+ "// detached comment\n"
+ "// line 2\n"
+ "\n"
+ "// second detached comment\n"
+ "/* third detached comment\n"
+ " * line 2 */\n"
+ "// leading comment\n"
+ "next",
+
+ " trailing comment\n",
+ {
+ " detached comment\n"
+ " line 2\n",
+ " second detached comment\n",
+ " third detached comment\n"
+ " line 2 "
+ },
+ " leading comment\n"
+ },
+
+ {
+ "prev /**/\n"
+ "\n"
+ "// detached comment\n"
+ "\n"
+ "// leading comment\n"
+ "next",
+
+ "",
+ {
+ " detached comment\n"
+ },
+ " leading comment\n"
+ },
+
+ {
+ "prev /**/\n"
+ "// leading comment\n"
+ "next",
+
+ "",
+ {},
+ " leading comment\n"
+ },
+ };
+
+TEST_2D(TokenizerTest, DocComments, kDocCommentCases, kBlockSizes) {
+ // Set up the tokenizer.
+ TestInputStream input(kDocCommentCases_case.input.data(),
+ kDocCommentCases_case.input.size(),
+ kBlockSizes_case);
+ TestErrorCollector error_collector;
+ Tokenizer tokenizer(&input, &error_collector);
+
+ // Set up a second tokenizer where we'll pass all NULLs to NextWithComments().
+ TestInputStream input2(kDocCommentCases_case.input.data(),
+ kDocCommentCases_case.input.size(),
+ kBlockSizes_case);
+ Tokenizer tokenizer2(&input2, &error_collector);
+
+ tokenizer.Next();
+ tokenizer2.Next();
+
+ EXPECT_EQ("prev", tokenizer.current().text);
+ EXPECT_EQ("prev", tokenizer2.current().text);
+
+ string prev_trailing_comments;
+ vector<string> detached_comments;
+ string next_leading_comments;
+ tokenizer.NextWithComments(&prev_trailing_comments, &detached_comments,
+ &next_leading_comments);
+ tokenizer2.NextWithComments(NULL, NULL, NULL);
+ EXPECT_EQ("next", tokenizer.current().text);
+ EXPECT_EQ("next", tokenizer2.current().text);
+
+ EXPECT_EQ(kDocCommentCases_case.prev_trailing_comments,
+ prev_trailing_comments);
+
+ for (int i = 0; i < detached_comments.size(); i++) {
+ ASSERT_LT(i, GOOGLE_ARRAYSIZE(kDocCommentCases));
+ ASSERT_TRUE(kDocCommentCases_case.detached_comments[i] != NULL);
+ EXPECT_EQ(kDocCommentCases_case.detached_comments[i],
+ detached_comments[i]);
+ }
+
+ // Verify that we matched all the detached comments.
+ EXPECT_EQ(NULL,
+ kDocCommentCases_case.detached_comments[detached_comments.size()]);
+
+ EXPECT_EQ(kDocCommentCases_case.next_leading_comments,
+ next_leading_comments);
+}
+
+// -------------------------------------------------------------------
+
// Test parse helpers. It's not really worth setting up a full data-driven
// test here.
TEST_F(TokenizerTest, ParseInteger) {
@@ -614,6 +826,22 @@ TEST_F(TokenizerTest, ParseString) {
Tokenizer::ParseString("'\\", &output);
EXPECT_EQ("\\", output);
+ // Experiment with Unicode escapes. Here are one-, two- and three-byte Unicode
+ // characters.
+ Tokenizer::ParseString("'\\u0024\\u00a2\\u20ac\\U00024b62XX'", &output);
+ EXPECT_EQ("$¢€𤭢XX", output);
+ // Same thing encoded using UTF16.
+ Tokenizer::ParseString("'\\u0024\\u00a2\\u20ac\\ud852\\udf62XX'", &output);
+ EXPECT_EQ("$¢€𤭢XX", output);
+ // Here's some broken UTF16; there's a head surrogate with no tail surrogate.
+ // We just output this as if it were UTF8; it's not a defined code point, but
+ // it has a defined encoding.
+ Tokenizer::ParseString("'\\ud852XX'", &output);
+ EXPECT_EQ("\xed\xa1\x92XX", output);
+ // Malformed escape: Demons may fly out of the nose.
+ Tokenizer::ParseString("\\u0", &output);
+ EXPECT_EQ("u0", output);
+
// Test invalid strings that will never be tokenized as strings.
#ifdef GTEST_HAS_DEATH_TEST // death tests do not work on Windows yet
EXPECT_DEBUG_DEATH(Tokenizer::ParseString("", &output),
@@ -658,6 +886,12 @@ ErrorCase kErrorCases[] = {
"0:4: String literals cannot cross line boundaries.\n" },
{ "'bar\nfoo", true,
"0:4: String literals cannot cross line boundaries.\n" },
+ { "'\\u01' foo", true,
+ "0:5: Expected four hex digits for \\u escape sequence.\n" },
+ { "'\\u01' foo", true,
+ "0:5: Expected four hex digits for \\u escape sequence.\n" },
+ { "'\\uXYZ' foo", true,
+ "0:3: Expected four hex digits for \\u escape sequence.\n" },
// Integer errors.
{ "123foo", true,
@@ -734,7 +968,7 @@ TEST_2D(TokenizerTest, Errors, kErrorCases, kBlockSizes) {
}
// Check that the errors match what was expected.
- EXPECT_EQ(error_collector.text_, kErrorCases_case.errors);
+ EXPECT_EQ(kErrorCases_case.errors, error_collector.text_);
// If the error was recoverable, make sure we saw "foo" after it.
if (kErrorCases_case.recoverable) {
@@ -760,6 +994,7 @@ TEST_1D(TokenizerTest, BackUpOnDestruction, kBlockSizes) {
EXPECT_EQ(strlen("foo"), input.ByteCount());
}
+
} // namespace
} // namespace io
} // namespace protobuf
diff --git a/src/google/protobuf/io/zero_copy_stream_impl.cc b/src/google/protobuf/io/zero_copy_stream_impl.cc
index 1384c746..9fcbb622 100644
--- a/src/google/protobuf/io/zero_copy_stream_impl.cc
+++ b/src/google/protobuf/io/zero_copy_stream_impl.cc
@@ -46,7 +46,8 @@
#include <google/protobuf/io/zero_copy_stream_impl.h>
#include <google/protobuf/stubs/common.h>
-#include <google/protobuf/stubs/stl_util-inl.h>
+#include <google/protobuf/stubs/stl_util.h>
+
namespace google {
namespace protobuf {
diff --git a/src/google/protobuf/io/zero_copy_stream_impl_lite.cc b/src/google/protobuf/io/zero_copy_stream_impl_lite.cc
index e8012510..f552e1f8 100644
--- a/src/google/protobuf/io/zero_copy_stream_impl_lite.cc
+++ b/src/google/protobuf/io/zero_copy_stream_impl_lite.cc
@@ -32,9 +32,9 @@
// Based on original Protocol Buffers design by
// Sanjay Ghemawat, Jeff Dean, and others.
-#include <google/protobuf/io/zero_copy_stream_impl.h>
+#include <google/protobuf/io/zero_copy_stream_impl_lite.h>
#include <google/protobuf/stubs/common.h>
-#include <google/protobuf/stubs/stl_util-inl.h>
+#include <google/protobuf/stubs/stl_util.h>
namespace google {
namespace protobuf {
diff --git a/src/google/protobuf/io/zero_copy_stream_unittest.cc b/src/google/protobuf/io/zero_copy_stream_unittest.cc
index 5196d905..6f155df7 100644
--- a/src/google/protobuf/io/zero_copy_stream_unittest.cc
+++ b/src/google/protobuf/io/zero_copy_stream_unittest.cc
@@ -370,6 +370,100 @@ TEST_F(IoTest, GzipIo) {
delete [] buffer;
}
+TEST_F(IoTest, GzipIoWithFlush) {
+ const int kBufferSize = 2*1024;
+ uint8* buffer = new uint8[kBufferSize];
+ // We start with i = 4 as we want a block size > 6. With block size <= 6
+ // Flush() fills up the entire 2K buffer with flush markers and the test
+ // fails. See documentation for Flush() for more detail.
+ for (int i = 4; i < kBlockSizeCount; i++) {
+ for (int j = 0; j < kBlockSizeCount; j++) {
+ for (int z = 0; z < kBlockSizeCount; z++) {
+ int gzip_buffer_size = kBlockSizes[z];
+ int size;
+ {
+ ArrayOutputStream output(buffer, kBufferSize, kBlockSizes[i]);
+ GzipOutputStream::Options options;
+ options.format = GzipOutputStream::GZIP;
+ if (gzip_buffer_size != -1) {
+ options.buffer_size = gzip_buffer_size;
+ }
+ GzipOutputStream gzout(&output, options);
+ WriteStuff(&gzout);
+ EXPECT_TRUE(gzout.Flush());
+ gzout.Close();
+ size = output.ByteCount();
+ }
+ {
+ ArrayInputStream input(buffer, size, kBlockSizes[j]);
+ GzipInputStream gzin(
+ &input, GzipInputStream::GZIP, gzip_buffer_size);
+ ReadStuff(&gzin);
+ }
+ }
+ }
+ }
+ delete [] buffer;
+}
+
+TEST_F(IoTest, GzipIoContiguousFlushes) {
+ const int kBufferSize = 2*1024;
+ uint8* buffer = new uint8[kBufferSize];
+
+ int block_size = kBlockSizes[4];
+ int gzip_buffer_size = block_size;
+ int size;
+
+ ArrayOutputStream output(buffer, kBufferSize, block_size);
+ GzipOutputStream::Options options;
+ options.format = GzipOutputStream::GZIP;
+ if (gzip_buffer_size != -1) {
+ options.buffer_size = gzip_buffer_size;
+ }
+ GzipOutputStream gzout(&output, options);
+ WriteStuff(&gzout);
+ EXPECT_TRUE(gzout.Flush());
+ EXPECT_TRUE(gzout.Flush());
+ gzout.Close();
+ size = output.ByteCount();
+
+ ArrayInputStream input(buffer, size, block_size);
+ GzipInputStream gzin(
+ &input, GzipInputStream::GZIP, gzip_buffer_size);
+ ReadStuff(&gzin);
+
+ delete [] buffer;
+}
+
+TEST_F(IoTest, GzipIoReadAfterFlush) {
+ const int kBufferSize = 2*1024;
+ uint8* buffer = new uint8[kBufferSize];
+
+ int block_size = kBlockSizes[4];
+ int gzip_buffer_size = block_size;
+ int size;
+ ArrayOutputStream output(buffer, kBufferSize, block_size);
+ GzipOutputStream::Options options;
+ options.format = GzipOutputStream::GZIP;
+ if (gzip_buffer_size != -1) {
+ options.buffer_size = gzip_buffer_size;
+ }
+
+ GzipOutputStream gzout(&output, options);
+ WriteStuff(&gzout);
+ EXPECT_TRUE(gzout.Flush());
+ size = output.ByteCount();
+
+ ArrayInputStream input(buffer, size, block_size);
+ GzipInputStream gzin(
+ &input, GzipInputStream::GZIP, gzip_buffer_size);
+ ReadStuff(&gzin);
+
+ gzout.Close();
+
+ delete [] buffer;
+}
+
TEST_F(IoTest, ZlibIo) {
const int kBufferSize = 2*1024;
uint8* buffer = new uint8[kBufferSize];