Down-integrate from internal branch

author: xiaofeng@google.com <xiaofeng@google.com@630680e5-0e50-0410-840e-4b1c322b438d> 2012-09-22 02:40:50 +0000
committer: xiaofeng@google.com <xiaofeng@google.com@630680e5-0e50-0410-840e-4b1c322b438d> 2012-09-22 02:40:50 +0000
commit: b55a20fa2c669b181f47ea9219b8e74d1263da19 (patch)
tree: 3936a0e7c22196587a6d8397372de41434fe2129 /src/google/protobuf/io/tokenizer.h
parent: 9ced30caf94bb4e7e9629c199679ff44e8ca7389 (diff)
1 files changed, 78 insertions, 7 deletions
diff --git a/src/google/protobuf/io/tokenizer.h b/src/google/protobuf/io/tokenizer.h
index 8f759abb..d85b82f9 100644
--- a/src/google/protobuf/io/tokenizer.h
+++ b/src/google/protobuf/io/tokenizer.h
@@ -38,6 +38,7 @@
 #define GOOGLE_PROTOBUF_IO_TOKENIZER_H__
 
 #include <string>
+#include <vector>
 #include <google/protobuf/stubs/common.h>
 
 namespace google {
@@ -137,6 +138,53 @@ class LIBPROTOBUF_EXPORT Tokenizer {
   // reached.
   bool Next();
 
+  // Like Next(), but also collects comments which appear between the previous
+  // and next tokens.
+  //
+  // Comments which appear to be attached to the previous token are stored
+  // in *prev_trailing_comments.  Comments which appear to be attached to the
+  // next token are stored in *next_leading_comments.  Comments appearing in
+  // between which do not appear to be attached to either will be added to
+  // *detached_comments.  Any of these parameters can be NULL to simply discard
+  // the comments.
+  //
+  // A series of line comments appearing on consecutive lines, with no other
+  // tokens appearing on those lines, will be treated as a single comment.
+  //
+  // Only the comment content is returned; comment markers (e.g. //) are
+  // stripped out.  For block comments, leading whitespace and an asterisk will
+  // be stripped from the beginning of each line other than the first.  Newlines
+  // are included in the output.
+  //
+  // Examples:
+  //
+  //   optional int32 foo = 1;  // Comment attached to foo.
+  //   // Comment attached to bar.
+  //   optional int32 bar = 2;
+  //
+  //   optional string baz = 3;
+  //   // Comment attached to baz.
+  //   // Another line attached to baz.
+  //
+  //   // Comment attached to qux.
+  //   //
+  //   // Another line attached to qux.
+  //   optional double qux = 4;
+  //
+  //   // Detached comment.  This is not attached to qux or corge
+  //   // because there are blank lines separating it from both.
+  //
+  //   optional string corge = 5;
+  //   /* Block comment attached
+  //    * to corge.  Leading asterisks
+  //    * will be removed. */
+  //   /* Block comment attached to
+  //    * grault. */
+  //   optional int32 grault = 6;
+  bool NextWithComments(string* prev_trailing_comments,
+                        vector<string>* detached_comments,
+                        string* next_leading_comments);
+
   // Parse helpers ---------------------------------------------------
 
   // Parses a TYPE_FLOAT token.  This never fails, so long as the text actually
@@ -200,11 +248,12 @@ class LIBPROTOBUF_EXPORT Tokenizer {
   int line_;
   int column_;
 
-  // Position in buffer_ where StartToken() was called.  If the token
-  // started in the previous buffer, this is zero, and current_.text already
-  // contains the part of the token from the previous buffer.  If not
-  // currently parsing a token, this is -1.
-  int token_start_;
+  // String to which text should be appended as we advance through it.
+  // Call RecordTo(&str) to start recording and StopRecording() to stop.
+  // E.g. StartToken() calls RecordTo(&current_.text).  record_start_ is the
+  // position within the current buffer where recording started.
+  string* record_target_;
+  int record_start_;
 
   // Options.
   bool allow_f_after_float_;
@@ -223,6 +272,9 @@ class LIBPROTOBUF_EXPORT Tokenizer {
   // Read a new buffer from the input.
   void Refresh();
 
+  inline void RecordTo(string* target);
+  inline void StopRecording();
+
   // Called when the current character is the first character of a new
   // token (not including whitespace or comments).
   inline void StartToken();
@@ -255,9 +307,28 @@ class LIBPROTOBUF_EXPORT Tokenizer {
   TokenType ConsumeNumber(bool started_with_zero, bool started_with_dot);
 
   // Consume the rest of a line.
-  void ConsumeLineComment();
+  void ConsumeLineComment(string* content);
   // Consume until "*/".
-  void ConsumeBlockComment();
+  void ConsumeBlockComment(string* content);
+
+  enum NextCommentStatus {
+    // Started a line comment.
+    LINE_COMMENT,
+
+    // Started a block comment.
+    BLOCK_COMMENT,
+
+    // Consumed a slash, then realized it wasn't a comment.  current_ has
+    // been filled in with a slash token.  The caller should return it.
+    SLASH_NOT_COMMENT,
+
+    // We do not appear to be starting a comment here.
+    NO_COMMENT
+  };
+
+  // If we're at the start of a new comment, consume the comment start and
+  // return what kind of comment it is.
+  NextCommentStatus TryConsumeCommentStart();
 
   // -----------------------------------------------------------------
   // These helper methods make the parsing code more readable.  The
author	xiaofeng@google.com <xiaofeng@google.com@630680e5-0e50-0410-840e-4b1c322b438d>	2012-09-22 02:40:50 +0000
committer	xiaofeng@google.com <xiaofeng@google.com@630680e5-0e50-0410-840e-4b1c322b438d>	2012-09-22 02:40:50 +0000
commit	b55a20fa2c669b181f47ea9219b8e74d1263da19 (patch)
tree	3936a0e7c22196587a6d8397372de41434fe2129 /src/google/protobuf/io/tokenizer.h
parent	9ced30caf94bb4e7e9629c199679ff44e8ca7389 (diff)