From b2d2cf8b48c2235e048ea76368e0eda75c7c28d5 Mon Sep 17 00:00:00 2001
From: Jie Luo <jieluo@google.com>
Date: Wed, 15 Jul 2015 14:31:19 -0700
Subject: ignore UTF-8 BOM if it is at the beginning of a proto file

---
 src/google/protobuf/io/tokenizer.cc | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'src/google/protobuf/io')

diff --git a/src/google/protobuf/io/tokenizer.cc b/src/google/protobuf/io/tokenizer.cc
index ef2de300..60bd7957 100644
--- a/src/google/protobuf/io/tokenizer.cc
+++ b/src/google/protobuf/io/tokenizer.cc
@@ -762,6 +762,15 @@ bool Tokenizer::NextWithComments(string* prev_trailing_comments,
                              next_leading_comments);
 
   if (current_.type == TYPE_START) {
+    // Ignore a Unicode byte order mark (BOM) if it appears at the beginning
+    // of the file. Only the UTF-8 BOM (0xEF 0xBB 0xBF) is accepted.
+    if (TryConsume((char)0xEF)) {
+      if (!TryConsume((char)0xBB) || !TryConsume((char)0xBF)) {
+        AddError("Proto file starts with 0xEF but not UTF-8 BOM. "
+                 "Only UTF-8 is accepted for proto file.");
+        return false;
+      }
+    }
     collector.DetachFromPrev();
   } else {
     // A comment appearing on the same line must be attached to the previous
-- 
cgit v1.2.3