about | summary | refs | log | tree | commit | diff | homepage
path: root/python/google/protobuf/text_format.py
diff options
context:
space:
mode:
author liujisi@google.com <liujisi@google.com@630680e5-0e50-0410-840e-4b1c322b438d> 2010-11-02 13:14:58 +0000
committer liujisi@google.com <liujisi@google.com@630680e5-0e50-0410-840e-4b1c322b438d> 2010-11-02 13:14:58 +0000
commit 33165fe0d5c265c92f2a67fc2b437b567c24e294 (patch)
tree 52def0850ddd2e976da238d1a437fbda79c96e44 /python/google/protobuf/text_format.py
parent 80aa23df6c63750e8cdfdcf3996fbc37d63cac61 (diff)
Submit recent changes from internal branch. See CHANGES.txt for more details.
Diffstat (limited to 'python/google/protobuf/text_format.py')
-rwxr-xr-x python/google/protobuf/text_format.py 92
1 files changed, 55 insertions, 37 deletions
diff --git a/python/google/protobuf/text_format.py b/python/google/protobuf/text_format.py
index cc6ac902..6d77b543 100755
--- a/python/google/protobuf/text_format.py
+++ b/python/google/protobuf/text_format.py
@@ -53,24 +53,26 @@ class ParseError(Exception):
"""Thrown in case of ASCII parsing error."""
-def MessageToString(message):
+def MessageToString(message, as_utf8=False, as_one_line=False):
out = cStringIO.StringIO()
- PrintMessage(message, out)
+ PrintMessage(message, out, as_utf8=as_utf8, as_one_line=as_one_line)
result = out.getvalue()
out.close()
+ if as_one_line:
+ return result.rstrip()
return result
-def PrintMessage(message, out, indent = 0):
+def PrintMessage(message, out, indent=0, as_utf8=False, as_one_line=False):
for field, value in message.ListFields():
if field.label == descriptor.FieldDescriptor.LABEL_REPEATED:
for element in value:
- PrintField(field, element, out, indent)
+ PrintField(field, element, out, indent, as_utf8, as_one_line)
else:
- PrintField(field, value, out, indent)
+ PrintField(field, value, out, indent, as_utf8, as_one_line)
-def PrintField(field, value, out, indent = 0):
+def PrintField(field, value, out, indent=0, as_utf8=False, as_one_line=False):
"""Print a single field name/value pair. For repeated fields, the value
should be a single element."""
@@ -96,23 +98,35 @@ def PrintField(field, value, out, indent = 0):
# don't include it.
out.write(': ')
- PrintFieldValue(field, value, out, indent)
- out.write('\n')
+ PrintFieldValue(field, value, out, indent, as_utf8, as_one_line)
+ if as_one_line:
+ out.write(' ')
+ else:
+ out.write('\n')
-def PrintFieldValue(field, value, out, indent = 0):
+def PrintFieldValue(field, value, out, indent=0,
+ as_utf8=False, as_one_line=False):
"""Print a single field value (not including name). For repeated fields,
the value should be a single element."""
if field.cpp_type == descriptor.FieldDescriptor.CPPTYPE_MESSAGE:
- out.write(' {\n')
- PrintMessage(value, out, indent + 2)
- out.write(' ' * indent + '}')
+ if as_one_line:
+ out.write(' { ')
+ PrintMessage(value, out, indent, as_utf8, as_one_line)
+ out.write('}')
+ else:
+ out.write(' {\n')
+ PrintMessage(value, out, indent + 2, as_utf8, as_one_line)
+ out.write(' ' * indent + '}')
elif field.cpp_type == descriptor.FieldDescriptor.CPPTYPE_ENUM:
out.write(field.enum_type.values_by_number[value].name)
elif field.cpp_type == descriptor.FieldDescriptor.CPPTYPE_STRING:
out.write('\"')
- out.write(_CEscape(value))
+ if type(value) is unicode:
+ out.write(_CEscape(value.encode('utf-8'), as_utf8))
+ else:
+ out.write(_CEscape(value, as_utf8))
out.write('\"')
elif field.cpp_type == descriptor.FieldDescriptor.CPPTYPE_BOOL:
if value:
@@ -334,10 +348,10 @@ class _Tokenizer(object):
Returns:
True iff the end was reached.
"""
- return not self._lines and not self._current_line
+ return self.token == ''
def _PopLine(self):
- while not self._current_line:
+ while len(self._current_line) <= self._column:
if not self._lines:
self._current_line = ''
return
@@ -348,11 +362,10 @@ class _Tokenizer(object):
def _SkipWhitespace(self):
while True:
self._PopLine()
- match = re.match(self._WHITESPACE, self._current_line)
+ match = self._WHITESPACE.match(self._current_line, self._column)
if not match:
break
length = len(match.group(0))
- self._current_line = self._current_line[length:]
self._column += length
def TryConsume(self, token):
@@ -402,7 +415,7 @@ class _Tokenizer(object):
ParseError: If an identifier couldn't be consumed.
"""
result = self.token
- if not re.match(self._IDENTIFIER, result):
+ if not self._IDENTIFIER.match(result):
raise self._ParseError('Expected identifier.')
self.NextToken()
return result
@@ -481,13 +494,13 @@ class _Tokenizer(object):
ParseError: If a floating point number couldn't be consumed.
"""
text = self.token
- if re.match(self._FLOAT_INFINITY, text):
+ if self._FLOAT_INFINITY.match(text):
self.NextToken()
if text.startswith('-'):
return -_INFINITY
return _INFINITY
- if re.match(self._FLOAT_NAN, text):
+ if self._FLOAT_NAN.match(text):
self.NextToken()
return _NAN
@@ -507,10 +520,10 @@ class _Tokenizer(object):
Raises:
ParseError: If a boolean value couldn't be consumed.
"""
- if self.token == 'true':
+ if self.token in ('true', 't', '1'):
self.NextToken()
return True
- elif self.token == 'false':
+ elif self.token in ('false', 'f', '0'):
self.NextToken()
return False
else:
@@ -525,7 +538,11 @@ class _Tokenizer(object):
Raises:
ParseError: If a string value couldn't be consumed.
"""
- return unicode(self.ConsumeByteString(), 'utf-8')
+ bytes = self.ConsumeByteString()
+ try:
+ return unicode(bytes, 'utf-8')
+ except UnicodeDecodeError, e:
+ raise self._StringParseError(e)
def ConsumeByteString(self):
"""Consumes a byte array value.
@@ -609,7 +626,7 @@ class _Tokenizer(object):
def _ParseError(self, message):
"""Creates and *returns* a ParseError for the current token."""
return ParseError('%d:%d : %s' % (
- self._line + 1, self._column + 1, message))
+ self._line + 1, self._column - len(self.token) + 1, message))
def _IntegerParseError(self, e):
return self._ParseError('Couldn\'t parse integer: ' + str(e))
@@ -617,27 +634,27 @@ class _Tokenizer(object):
def _FloatParseError(self, e):
return self._ParseError('Couldn\'t parse number: ' + str(e))
+ def _StringParseError(self, e):
+ return self._ParseError('Couldn\'t parse string: ' + str(e))
+
def NextToken(self):
"""Reads the next meaningful token."""
self._previous_line = self._line
self._previous_column = self._column
- if self.AtEnd():
- self.token = ''
- return
+
self._column += len(self.token)
+ self._SkipWhitespace()
- # Make sure there is data to work on.
- self._PopLine()
+ if not self._lines and len(self._current_line) <= self._column:
+ self.token = ''
+ return
- match = re.match(self._TOKEN, self._current_line)
+ match = self._TOKEN.match(self._current_line, self._column)
if match:
token = match.group(0)
- self._current_line = self._current_line[len(token):]
self.token = token
else:
- self.token = self._current_line[0]
- self._current_line = self._current_line[1:]
- self._SkipWhitespace()
+ self.token = self._current_line[self._column]
# text.encode('string_escape') does not seem to satisfy our needs as it
@@ -645,7 +662,7 @@ class _Tokenizer(object):
# C++ unescaping function allows hex escapes to be any length. So,
# "\0011".encode('string_escape') ends up being "\\x011", which will be
# decoded in C++ as a single-character string with char code 0x11.
-def _CEscape(text):
+def _CEscape(text, as_utf8):
def escape(c):
o = ord(c)
if o == 10: return r"\n" # optional escape
@@ -656,12 +673,13 @@ def _CEscape(text):
if o == 34: return r'\"' # necessary escape
if o == 92: return r"\\" # necessary escape
- if o >= 127 or o < 32: return "\\%03o" % o # necessary escapes
+ # necessary escapes
+ if not as_utf8 and (o >= 127 or o < 32): return "\\%03o" % o
return c
return "".join([escape(c) for c in text])
-_CUNESCAPE_HEX = re.compile('\\\\x([0-9a-fA-F]{2}|[0-9a-f-A-F])')
+_CUNESCAPE_HEX = re.compile('\\\\x([0-9a-fA-F]{2}|[0-9a-fA-F])')
def _CUnescape(text):