From 40ee551715c3a784ea6132dbf604b0e665ca2def Mon Sep 17 00:00:00 2001 From: temporal Date: Thu, 10 Jul 2008 02:12:20 +0000 Subject: Initial checkin. --- src/google/protobuf/io/coded_stream.h | 592 ++++++++++++++++++++++++++++++++++ 1 file changed, 592 insertions(+) create mode 100644 src/google/protobuf/io/coded_stream.h (limited to 'src/google/protobuf/io/coded_stream.h') diff --git a/src/google/protobuf/io/coded_stream.h b/src/google/protobuf/io/coded_stream.h new file mode 100644 index 00000000..91e5c56a --- /dev/null +++ b/src/google/protobuf/io/coded_stream.h @@ -0,0 +1,592 @@ +// Protocol Buffers - Google's data interchange format +// Copyright 2008 Google Inc. +// http://code.google.com/p/protobuf/ +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Author: kenton@google.com (Kenton Varda) +// Based on original Protocol Buffers design by +// Sanjay Ghemawat, Jeff Dean, and others. +// +// This file contains the CodedInputStream and CodedOutputStream classes, +// which wrap a ZeroCopyInputStream or ZeroCopyOutputStream, respectively, +// and allow you to read or write individual pieces of data in various +// formats. In particular, these implement the varint encoding for +// integers, a simple variable-length encoding in which smaller numbers +// take fewer bytes. +// +// Typically these classes will only be used internally by the protocol +// buffer library in order to encode and decode protocol buffers. Clients +// of the library only need to know about this class if they wish to write +// custom message parsing or serialization procedures. +// +// CodedOutputStream example: +// // Write some data to "myfile". First we write a 4-byte "magic number" +// // to identify the file type, then write a length-delimited string. The +// // string is composed of a varint giving the length followed by the raw +// // bytes. +// int fd = open("myfile", O_WRONLY); +// ZeroCopyOutputStream* raw_output = new FileOutputStream(fd); +// CodedOutputStream* coded_output = new CodedOutputStream(raw_output); +// +// int magic_number = 1234; +// char text[] = "Hello world!"; +// coded_output->WriteLittleEndian32(magic_number); +// coded_output->WriteVarint32(strlen(text)); +// coded_output->WriteRaw(text, strlen(text)); +// +// delete coded_output; +// delete raw_output; +// close(fd); +// +// CodedInputStream example: +// // Read a file created by the above code. +// int fd = open("myfile", O_RDONLY); +// ZeroCopyInputStream* raw_input = new FileInputStream(fd); +// CodedInputStream coded_input = new CodedInputStream(raw_input); +// +// coded_input->ReadLittleEndian32(&magic_number); +// if (magic_number != 1234) { +// cerr << "File not in expected format." << endl; +// return; +// } +// +// uint32 size; +// coded_input->ReadVarint32(&size); +// +// char* text = new char[size + 1]; +// coded_input->ReadRaw(buffer, size); +// text[size] = '\0'; +// +// delete coded_input; +// delete raw_input; +// close(fd); +// +// cout << "Text is: " << text << endl; +// delete [] text; +// +// For those who are interested, varint encoding is defined as follows: +// +// The encoding operates on unsigned integers of up to 64 bits in length. +// Each byte of the encoded value has the format: +// * bits 0-6: Seven bits of the number being encoded. +// * bit 7: Zero if this is the last byte in the encoding (in which +// case all remaining bits of the number are zero) or 1 if +// more bytes follow. +// The first byte contains the least-significant 7 bits of the number, the +// second byte (if present) contains the next-least-significant 7 bits, +// and so on. So, the binary number 1011000101011 would be encoded in two +// bytes as "10101011 00101100". +// +// In theory, varint could be used to encode integers of any length. +// However, for practicality we set a limit at 64 bits. The maximum encoded +// length of a number is thus 10 bytes. + +#ifndef GOOGLE_PROTOBUF_IO_CODED_STREAM_H__ +#define GOOGLE_PROTOBUF_IO_CODED_STREAM_H__ + +#include +#include + +namespace google { + +namespace protobuf { +namespace io { + +// Defined in this file. +class CodedInputStream; +class CodedOutputStream; + +// Defined in other files. +class ZeroCopyInputStream; // zero_copy_stream.h +class ZeroCopyOutputStream; // zero_copy_stream.h + +// Class which reads and decodes binary data which is composed of varint- +// encoded integers and fixed-width pieces. Wraps a ZeroCopyInputStream. +// Most users will not need to deal with CodedInputStream. +// +// Most methods of CodedInputStream that return a bool return false if an +// underlying I/O error occurs or if the data is malformed. Once such a +// failure occurs, the CodedInputStream is broken and is no longer useful. +class LIBPROTOBUF_EXPORT CodedInputStream { + public: + // Create a CodedInputStream that reads from the given ZeroCopyInputStream. + explicit CodedInputStream(ZeroCopyInputStream* input); + + // Destroy the CodedInputStream and position the underlying + // ZeroCopyInputStream at the first unread byte. If an error occurred while + // reading (causing a method to return false), then the exact position of + // the input stream may be anywhere between the last value that was read + // successfully and the stream's byte limit. + ~CodedInputStream(); + + + // Skips a number of bytes. Returns false if an underlying read error + // occurs. + bool Skip(int count); + + // Read raw bytes, copying them into the given buffer. + bool ReadRaw(void* buffer, int size); + + // Like ReadRaw, but reads into a string. + // + // Implementation Note: ReadString() grows the string gradually as it + // reads in the data, rather than allocating the entire requested size + // upfront. This prevents denial-of-service attacks in which a client + // could claim that a string is going to be MAX_INT bytes long in order to + // crash the server because it can't allocate this much space at once. + bool ReadString(string* buffer, int size); + + + // Read a 32-bit little-endian integer. + bool ReadLittleEndian32(uint32* value); + // Read a 64-bit little-endian integer. + bool ReadLittleEndian64(uint64* value); + + // Read an unsigned integer with Varint encoding, truncating to 32 bits. + // Reading a 32-bit value is equivalent to reading a 64-bit one and casting + // it to uint32, but may be more efficient. + bool ReadVarint32(uint32* value); + // Read an unsigned integer with Varint encoding. + bool ReadVarint64(uint64* value); + + // Read a tag. This calls ReadVarint32() and returns the result, or returns + // zero (which is not a valid tag) if ReadVarint32() fails. Also, it updates + // the last tag value, which can be checked with LastTagWas(). + // Always inline because this is only called in once place per parse loop + // but it is called for every iteration of said loop, so it should be fast. + // GCC doesn't want to inline this by default. + uint32 ReadTag() GOOGLE_ATTRIBUTE_ALWAYS_INLINE; + + // Usually returns true if calling ReadVarint32() now would produce the given + // value. Will always return false if ReadVarint32() would not return the + // given value. If ExpectTag() returns true, it also advances past + // the varint. For best performance, use a compile-time constant as the + // parameter. + // Always inline because this collapses to a small number of instructions + // when given a constant parameter, but GCC doesn't want to inline by default. + bool ExpectTag(uint32 expected) GOOGLE_ATTRIBUTE_ALWAYS_INLINE; + + // Usually returns true if no more bytes can be read. Always returns false + // if more bytes can be read. If ExpectAtEnd() returns true, a subsequent + // call to LastTagWas() will act as if ReadTag() had been called and returned + // zero, and ConsumedEntireMessage() will return true. + bool ExpectAtEnd(); + + // If the last call to ReadTag() returned the given value, returns true. + // Otherwise, returns false; + // + // This is needed because parsers for some types of embedded messages + // (with field type TYPE_GROUP) don't actually know that they've reached the + // end of a message until they see an ENDGROUP tag, which was actually part + // of the enclosing message. The enclosing message would like to check that + // tag to make sure it had the right number, so it calls LastTagWas() on + // return from the embedded parser to check. + bool LastTagWas(uint32 expected); + + // When parsing message (but NOT a group), this method must be called + // immediately after MergeFromCodedStream() returns (if it returns true) + // to further verify that the message ended in a legitimate way. For + // example, this verifies that parsing did not end on an end-group tag. + // It also checks for some cases where, due to optimizations, + // MergeFromCodedStream() can incorrectly return true. + bool ConsumedEntireMessage(); + + // Limits ---------------------------------------------------------- + // Limits are used when parsing length-delimited embedded messages. + // After the message's length is read, PushLimit() is used to prevent + // the CodedInputStream from reading beyond that length. Once the + // embedded message has been parsed, PopLimit() is called to undo the + // limit. + + // Opaque type used with PushLimit() and PopLimit(). Do not modify + // values of this type yourself. The only reason that this isn't a + // struct with private internals is for efficiency. + typedef int Limit; + + // Places a limit on the number of bytes that the stream may read, + // starting from the current position. Once the stream hits this limit, + // it will act like the end of the input has been reached until PopLimit() + // is called. + // + // As the names imply, the stream conceptually has a stack of limits. The + // shortest limit on the stack is always enforced, even if it is not the + // top limit. + // + // The value returned by PushLimit() is opaque to the caller, and must + // be passed unchanged to the corresponding call to PopLimit(). + Limit PushLimit(int byte_limit); + + // Pops the last limit pushed by PushLimit(). The input must be the value + // returned by that call to PushLimit(). + void PopLimit(Limit limit); + + // Returns the number of bytes left until the nearest limit on the + // stack is hit, or -1 if no limits are in place. + int BytesUntilLimit(); + + // Total Bytes Limit ----------------------------------------------- + // To prevent malicious users from sending excessively large messages + // and causing integer overflows or memory exhaustion, CodedInputStream + // imposes a hard limit on the total number of bytes it will read. + + // Sets the maximum number of bytes that this CodedInputStream will read + // before refusing to continue. To prevent integer overflows in the + // protocol buffers implementation, as well as to prevent servers from + // allocating enormous amounts of memory to hold parsed messages, the + // maximum message length should be limited to the shortest length that + // will not harm usability. The theoretical shortest message that could + // cause integer overflows is 512MB. The default limit is 64MB. Apps + // should set shorter limits if possible. If warning_threshold is not -1, + // a warning will be printed to stderr after warning_threshold bytes are + // read. An error will always be printed to stderr if the limit is + // reached. + // + // This is unrelated to PushLimit()/PopLimit(). + // + // Hint: If you are reading this because your program is printing a + // warning about dangerously large protocol messages, you may be + // confused about what to do next. The best option is to change your + // design such that excessively large messages are not necessary. + // For example, try to design file formats to consist of many small + // messages rather than a single large one. If this is infeasible, + // you will need to increase the limit. Chances are, though, that + // your code never constructs a CodedInputStream on which the limit + // can be set. You probably parse messages by calling things like + // Message::ParseFromString(). In this case, you will need to change + // your code to instead construct some sort of ZeroCopyInputStream + // (e.g. an ArrayInputStream), construct a CodedInputStream around + // that, then call Message::ParseFromCodedStream() instead. Then + // you can adjust the limit. Yes, it's more work, but you're doing + // something unusual. + void SetTotalBytesLimit(int total_bytes_limit, int warning_threshold); + + // Recursion Limit ------------------------------------------------- + // To prevent corrupt or malicious messages from causing stack overflows, + // we must keep track of the depth of recursion when parsing embedded + // messages and groups. CodedInputStream keeps track of this because it + // is the only object that is passed down the stack during parsing. + + // Sets the maximum recursion depth. The default is 64. + void SetRecursionLimit(int limit); + + // Increments the current recursion depth. Returns true if the depth is + // under the limit, false if it has gone over. + bool IncrementRecursionDepth(); + + // Decrements the recursion depth. + void DecrementRecursionDepth(); + + private: + GOOGLE_DISALLOW_EVIL_CONSTRUCTORS(CodedInputStream); + + ZeroCopyInputStream* input_; + const uint8* buffer_; + int buffer_size_; // size of current buffer + int total_bytes_read_; // total bytes read from input_, including + // the current buffer + + // If total_bytes_read_ surpasses INT_MAX, we record the extra bytes here + // so that we can BackUp() on destruction. + int overflow_bytes_; + + // LastTagWas() stuff. + uint32 last_tag_; // result of last ReadTag(). + + // This is set true by ReadVarint32Fallback() if it is called when exactly + // at EOF, or by ExpectAtEnd() when it returns true. This happens when we + // reach the end of a message and attempt to read another tag. + bool legitimate_message_end_; + + // See EnableAliasing(). + bool aliasing_enabled_; + + // Limits + Limit current_limit_; // if position = -1, no limit is applied + + // For simplicity, if the current buffer crosses a limit (either a normal + // limit created by PushLimit() or the total bytes limit), buffer_size_ + // only tracks the number of bytes before that limit. This field + // contains the number of bytes after it. Note that this implies that if + // buffer_size_ == 0 and buffer_size_after_limit_ > 0, we know we've + // hit a limit. However, if both are zero, it doesn't necessarily mean + // we aren't at a limit -- the buffer may have ended exactly at the limit. + int buffer_size_after_limit_; + + // Maximum number of bytes to read, period. This is unrelated to + // current_limit_. Set using SetTotalBytesLimit(). + int total_bytes_limit_; + int total_bytes_warning_threshold_; + + // Current recursion depth, controlled by IncrementRecursionDepth() and + // DecrementRecursionDepth(). + int recursion_depth_; + // Recursion depth limit, set by SetRecursionLimit(). + int recursion_limit_; + + // Advance the buffer by a given number of bytes. + void Advance(int amount); + + // Recomputes the value of buffer_size_after_limit_. Must be called after + // current_limit_ or total_bytes_limit_ changes. + void RecomputeBufferLimits(); + + // Writes an error message saying that we hit total_bytes_limit_. + void PrintTotalBytesLimitError(); + + // Called when the buffer runs out to request more data. Implies an + // Advance(buffer_size_). + bool Refresh(); + + bool ReadVarint32Fallback(uint32* value); +}; + +// Class which encodes and writes binary data which is composed of varint- +// encoded integers and fixed-width pieces. Wraps a ZeroCopyOutputStream. +// Most users will not need to deal with CodedOutputStream. +// +// Most methods of CodedOutputStream which return a bool return false if an +// underlying I/O error occurs. Once such a failure occurs, the +// CodedOutputStream is broken and is no longer useful. +class LIBPROTOBUF_EXPORT CodedOutputStream { + public: + // Create an CodedOutputStream that writes to the given ZeroCopyOutputStream. + explicit CodedOutputStream(ZeroCopyOutputStream* output); + + // Destroy the CodedOutputStream and position the underlying + // ZeroCopyOutputStream immediately after the last byte written. + ~CodedOutputStream(); + + // Write raw bytes, copying them from the given buffer. + bool WriteRaw(const void* buffer, int size); + + // Equivalent to WriteRaw(str.data(), str.size()). + bool WriteString(const string& str); + + + // Write a 32-bit little-endian integer. + bool WriteLittleEndian32(uint32 value); + // Write a 64-bit little-endian integer. + bool WriteLittleEndian64(uint64 value); + + // Write an unsigned integer with Varint encoding. Writing a 32-bit value + // is equivalent to casting it to uint64 and writing it as a 64-bit value, + // but may be more efficient. + bool WriteVarint32(uint32 value); + // Write an unsigned integer with Varint encoding. + bool WriteVarint64(uint64 value); + + // Equivalent to WriteVarint32() except when the value is negative, + // in which case it must be sign-extended to a full 10 bytes. + bool WriteVarint32SignExtended(int32 value); + + // This is identical to WriteVarint32(), but optimized for writing tags. + // In particular, if the input is a compile-time constant, this method + // compiles down to a couple instructions. + // Always inline because otherwise the aformentioned optimization can't work, + // but GCC by default doesn't want to inline this. + bool WriteTag(uint32 value) GOOGLE_ATTRIBUTE_ALWAYS_INLINE; + + // Returns the number of bytes needed to encode the given value as a varint. + static int VarintSize32(uint32 value); + // Returns the number of bytes needed to encode the given value as a varint. + static int VarintSize64(uint64 value); + + // If negative, 10 bytes. Otheriwse, same as VarintSize32(). + static int VarintSize32SignExtended(int32 value); + + // Returns the total number of bytes written since this object was created. + inline int ByteCount() const; + + private: + GOOGLE_DISALLOW_EVIL_CONSTRUCTORS(CodedOutputStream); + + ZeroCopyOutputStream* output_; + uint8* buffer_; + int buffer_size_; + int total_bytes_; // Sum of sizes of all buffers seen so far. + + // Advance the buffer by a given number of bytes. + void Advance(int amount); + + // Called when the buffer runs out to request more data. Implies an + // Advance(buffer_size_). + bool Refresh(); + + bool WriteVarint32Fallback(uint32 value); + static int VarintSize32Fallback(uint32 value); +}; + +// inline methods ==================================================== +// The vast majority of varints are only one byte. These inline +// methods optimize for that case. + +inline bool CodedInputStream::ReadVarint32(uint32* value) { + if (buffer_size_ != 0 && *buffer_ < 0x80) { + *value = *buffer_; + Advance(1); + return true; + } else { + return ReadVarint32Fallback(value); + } +} + +inline uint32 CodedInputStream::ReadTag() { + if (buffer_size_ != 0 && buffer_[0] < 0x80) { + last_tag_ = buffer_[0]; + Advance(1); + return last_tag_; + } else if (buffer_size_ >= 2 && buffer_[1] < 0x80) { + last_tag_ = (buffer_[0] & 0x7f) + (buffer_[1] << 7); + Advance(2); + return last_tag_; + } else if (ReadVarint32Fallback(&last_tag_)) { + return last_tag_; + } else { + last_tag_ = 0; + return 0; + } +} + +inline bool CodedInputStream::LastTagWas(uint32 expected) { + return last_tag_ == expected; +} + +inline bool CodedInputStream::ConsumedEntireMessage() { + return legitimate_message_end_; +} + +inline bool CodedInputStream::ExpectTag(uint32 expected) { + if (expected < (1 << 7)) { + if (buffer_size_ != 0 && buffer_[0] == expected) { + Advance(1); + return true; + } else { + return false; + } + } else if (expected < (1 << 14)) { + if (buffer_size_ >= 2 && + buffer_[0] == static_cast(expected | 0x80) && + buffer_[1] == static_cast(expected >> 7)) { + Advance(2); + return true; + } else { + return false; + } + } else { + // Don't bother optimizing for larger values. + return false; + } +} + +inline bool CodedInputStream::ExpectAtEnd() { + // If we are at a limit we know no more bytes can be read. Otherwise, it's + // hard to say without calling Refresh(), and we'd rather not do that. + + if (buffer_size_ == 0 && buffer_size_after_limit_ != 0) { + last_tag_ = 0; // Pretend we called ReadTag()... + legitimate_message_end_ = true; // ... and it hit EOF. + return true; + } else { + return false; + } +} + +inline bool CodedOutputStream::WriteVarint32(uint32 value) { + if (value < 0x80 && buffer_size_ > 0) { + *buffer_ = value; + Advance(1); + return true; + } else { + return WriteVarint32Fallback(value); + } +} + +inline bool CodedOutputStream::WriteVarint32SignExtended(int32 value) { + if (value < 0) { + return WriteVarint64(static_cast(value)); + } else { + return WriteVarint32(static_cast(value)); + } +} + +inline bool CodedOutputStream::WriteTag(uint32 value) { + if (value < (1 << 7)) { + if (buffer_size_ != 0) { + buffer_[0] = value; + Advance(1); + return true; + } + } else if (value < (1 << 14)) { + if (buffer_size_ >= 2) { + buffer_[0] = static_cast(value | 0x80); + buffer_[1] = static_cast(value >> 7); + Advance(2); + return true; + } + } + return WriteVarint32Fallback(value); +} + +inline int CodedOutputStream::VarintSize32(uint32 value) { + if (value < (1 << 7)) { + return 1; + } else { + return VarintSize32Fallback(value); + } +} + +inline int CodedOutputStream::VarintSize32SignExtended(int32 value) { + if (value < 0) { + return 10; // TODO(kenton): Make this a symbolic constant. + } else { + return VarintSize32(static_cast(value)); + } +} + +inline bool CodedOutputStream::WriteString(const string& str) { + return WriteRaw(str.data(), str.size()); +} + +inline int CodedOutputStream::ByteCount() const { + return total_bytes_ - buffer_size_; +} + +inline void CodedInputStream::Advance(int amount) { + buffer_ += amount; + buffer_size_ -= amount; +} + +inline void CodedOutputStream::Advance(int amount) { + buffer_ += amount; + buffer_size_ -= amount; +} + +inline void CodedInputStream::SetRecursionLimit(int limit) { + recursion_limit_ = limit; +} + +inline bool CodedInputStream::IncrementRecursionDepth() { + ++recursion_depth_; + return recursion_depth_ <= recursion_limit_; +} + +inline void CodedInputStream::DecrementRecursionDepth() { + if (recursion_depth_ > 0) --recursion_depth_; +} + +} // namespace io +} // namespace protobuf + +} // namespace google +#endif // GOOGLE_PROTOBUF_IO_CODED_STREAM_H__ -- cgit v1.2.3