aboutsummaryrefslogtreecommitdiffhomepage
path: root/src
diff options
context:
space:
mode:
authorGravatar Jie Luo <anandolee@gmail.com>2015-07-17 13:23:29 -0700
committerGravatar Jie Luo <anandolee@gmail.com>2015-07-17 13:23:29 -0700
commit7648852550c6e971e1fdfbfe3892bf12a616539e (patch)
treecf01bf90ff926072295121e4c520f40d6d674a7c /src
parent2f4fb642a36560e6b586f55cabf0d9caf7b55afe (diff)
parentb2d2cf8b48c2235e048ea76368e0eda75c7c28d5 (diff)
Merge pull request #601 from anandolee/master
ignore UTF-8 BOM
Diffstat (limited to 'src')
-rw-r--r--src/google/protobuf/compiler/parser_unittest.cc26
-rw-r--r--src/google/protobuf/io/tokenizer.cc9
2 files changed, 35 insertions, 0 deletions
diff --git a/src/google/protobuf/compiler/parser_unittest.cc b/src/google/protobuf/compiler/parser_unittest.cc
index ddf34bfa..cc6f1efb 100644
--- a/src/google/protobuf/compiler/parser_unittest.cc
+++ b/src/google/protobuf/compiler/parser_unittest.cc
@@ -229,6 +229,32 @@ TEST_F(ParserTest, WarnIfSyntaxIdentifierOmmitted) {
typedef ParserTest ParseMessageTest;
+TEST_F(ParseMessageTest, IgnoreBOM) {
+ // A leading UTF-8 BOM must be skipped so the file parses as if it were absent.
+ char input[] = "   message TestMessage {\n"
+ " required int32 foo = 1;\n}\n";
+ // Overwrite the three leading placeholder spaces with the BOM (EF BB BF).
+ input[0] = (char)0xEF;
+ input[1] = (char)0xBB;
+ input[2] = (char)0xBF;
+ ExpectParsesTo(input,
+ "message_type {"
+ " name: \"TestMessage\""
+ " field { name:\"foo\" label:LABEL_REQUIRED type:TYPE_INT32 number:1 }"
+ "}");
+}
+
+TEST_F(ParseMessageTest, BOMError) {
+ // A lone 0xEF lead byte is not a complete UTF-8 BOM and must be reported.
+ char proto_text[] = " message TestMessage {\n"
+ " required int32 foo = 1;\n}\n";
+ proto_text[0] = static_cast<char>(0xEF);
+ ExpectHasErrors(proto_text,
+ "0:1: Proto file starts with 0xEF but not UTF-8 BOM. "
+ "Only UTF-8 is accepted for proto file.\n"
+ "0:0: Expected top-level statement (e.g. \"message\").\n");
+}
+
TEST_F(ParseMessageTest, SimpleMessage) {
ExpectParsesTo(
"message TestMessage {\n"
diff --git a/src/google/protobuf/io/tokenizer.cc b/src/google/protobuf/io/tokenizer.cc
index ef2de300..60bd7957 100644
--- a/src/google/protobuf/io/tokenizer.cc
+++ b/src/google/protobuf/io/tokenizer.cc
@@ -762,6 +762,15 @@ bool Tokenizer::NextWithComments(string* prev_trailing_comments,
next_leading_comments);
if (current_.type == TYPE_START) {
+ // Ignore unicode byte order mark(BOM) if it appears at the file
+ // beginning. Only UTF-8 BOM (0xEF 0xBB 0xBF) is accepted.
+ if (TryConsume((char)0xEF)) {
+ if (!TryConsume((char)0xBB) || !TryConsume((char)0xBF)) {
+ AddError("Proto file starts with 0xEF but not UTF-8 BOM. "
+ "Only UTF-8 is accepted for proto file.");
+ return false;
+ }
+ }
collector.DetachFromPrev();
} else {
// A comment appearing on the same line must be attached to the previous