diff options
author | liujisi@google.com <liujisi@google.com@630680e5-0e50-0410-840e-4b1c322b438d> | 2010-11-02 13:14:58 +0000 |
---|---|---|
committer | liujisi@google.com <liujisi@google.com@630680e5-0e50-0410-840e-4b1c322b438d> | 2010-11-02 13:14:58 +0000 |
commit | 33165fe0d5c265c92f2a67fc2b437b567c24e294 (patch) | |
tree | 52def0850ddd2e976da238d1a437fbda79c96e44 /python/google/protobuf/text_format.py | |
parent | 80aa23df6c63750e8cdfdcf3996fbc37d63cac61 (diff) |
Submit recent changes from internal branch. See CHANGES.txt for more details.
Diffstat (limited to 'python/google/protobuf/text_format.py')
-rwxr-xr-x | python/google/protobuf/text_format.py | 92 |
1 files changed, 55 insertions, 37 deletions
diff --git a/python/google/protobuf/text_format.py b/python/google/protobuf/text_format.py index cc6ac902..6d77b543 100755 --- a/python/google/protobuf/text_format.py +++ b/python/google/protobuf/text_format.py @@ -53,24 +53,26 @@ class ParseError(Exception): """Thrown in case of ASCII parsing error.""" -def MessageToString(message): +def MessageToString(message, as_utf8=False, as_one_line=False): out = cStringIO.StringIO() - PrintMessage(message, out) + PrintMessage(message, out, as_utf8=as_utf8, as_one_line=as_one_line) result = out.getvalue() out.close() + if as_one_line: + return result.rstrip() return result -def PrintMessage(message, out, indent = 0): +def PrintMessage(message, out, indent=0, as_utf8=False, as_one_line=False): for field, value in message.ListFields(): if field.label == descriptor.FieldDescriptor.LABEL_REPEATED: for element in value: - PrintField(field, element, out, indent) + PrintField(field, element, out, indent, as_utf8, as_one_line) else: - PrintField(field, value, out, indent) + PrintField(field, value, out, indent, as_utf8, as_one_line) -def PrintField(field, value, out, indent = 0): +def PrintField(field, value, out, indent=0, as_utf8=False, as_one_line=False): """Print a single field name/value pair. For repeated fields, the value should be a single element.""" @@ -96,23 +98,35 @@ def PrintField(field, value, out, indent = 0): # don't include it. out.write(': ') - PrintFieldValue(field, value, out, indent) - out.write('\n') + PrintFieldValue(field, value, out, indent, as_utf8, as_one_line) + if as_one_line: + out.write(' ') + else: + out.write('\n') -def PrintFieldValue(field, value, out, indent = 0): +def PrintFieldValue(field, value, out, indent=0, + as_utf8=False, as_one_line=False): """Print a single field value (not including name). For repeated fields, the value should be a single element.""" if field.cpp_type == descriptor.FieldDescriptor.CPPTYPE_MESSAGE: - out.write(' {\n') - PrintMessage(value, out, indent + 2) - out.write(' ' * indent + '}') + if as_one_line: + out.write(' { ') + PrintMessage(value, out, indent, as_utf8, as_one_line) + out.write('}') + else: + out.write(' {\n') + PrintMessage(value, out, indent + 2, as_utf8, as_one_line) + out.write(' ' * indent + '}') elif field.cpp_type == descriptor.FieldDescriptor.CPPTYPE_ENUM: out.write(field.enum_type.values_by_number[value].name) elif field.cpp_type == descriptor.FieldDescriptor.CPPTYPE_STRING: out.write('\"') - out.write(_CEscape(value)) + if type(value) is unicode: + out.write(_CEscape(value.encode('utf-8'), as_utf8)) + else: + out.write(_CEscape(value, as_utf8)) out.write('\"') elif field.cpp_type == descriptor.FieldDescriptor.CPPTYPE_BOOL: if value: @@ -334,10 +348,10 @@ class _Tokenizer(object): Returns: True iff the end was reached. """ - return not self._lines and not self._current_line + return self.token == '' def _PopLine(self): - while not self._current_line: + while len(self._current_line) <= self._column: if not self._lines: self._current_line = '' return @@ -348,11 +362,10 @@ class _Tokenizer(object): def _SkipWhitespace(self): while True: self._PopLine() - match = re.match(self._WHITESPACE, self._current_line) + match = self._WHITESPACE.match(self._current_line, self._column) if not match: break length = len(match.group(0)) - self._current_line = self._current_line[length:] self._column += length def TryConsume(self, token): @@ -402,7 +415,7 @@ class _Tokenizer(object): ParseError: If an identifier couldn't be consumed. """ result = self.token - if not re.match(self._IDENTIFIER, result): + if not self._IDENTIFIER.match(result): raise self._ParseError('Expected identifier.') self.NextToken() return result @@ -481,13 +494,13 @@ class _Tokenizer(object): ParseError: If a floating point number couldn't be consumed. """ text = self.token - if re.match(self._FLOAT_INFINITY, text): + if self._FLOAT_INFINITY.match(text): self.NextToken() if text.startswith('-'): return -_INFINITY return _INFINITY - if re.match(self._FLOAT_NAN, text): + if self._FLOAT_NAN.match(text): self.NextToken() return _NAN @@ -507,10 +520,10 @@ class _Tokenizer(object): Raises: ParseError: If a boolean value couldn't be consumed. """ - if self.token == 'true': + if self.token in ('true', 't', '1'): self.NextToken() return True - elif self.token == 'false': + elif self.token in ('false', 'f', '0'): self.NextToken() return False else: @@ -525,7 +538,11 @@ class _Tokenizer(object): Raises: ParseError: If a string value couldn't be consumed. """ - return unicode(self.ConsumeByteString(), 'utf-8') + bytes = self.ConsumeByteString() + try: + return unicode(bytes, 'utf-8') + except UnicodeDecodeError, e: + raise self._StringParseError(e) def ConsumeByteString(self): """Consumes a byte array value. @@ -609,7 +626,7 @@ class _Tokenizer(object): def _ParseError(self, message): """Creates and *returns* a ParseError for the current token.""" return ParseError('%d:%d : %s' % ( - self._line + 1, self._column + 1, message)) + self._line + 1, self._column - len(self.token) + 1, message)) def _IntegerParseError(self, e): return self._ParseError('Couldn\'t parse integer: ' + str(e)) @@ -617,27 +634,27 @@ class _Tokenizer(object): def _FloatParseError(self, e): return self._ParseError('Couldn\'t parse number: ' + str(e)) + def _StringParseError(self, e): + return self._ParseError('Couldn\'t parse string: ' + str(e)) + def NextToken(self): """Reads the next meaningful token.""" self._previous_line = self._line self._previous_column = self._column - if self.AtEnd(): - self.token = '' - return + self._column += len(self.token) + self._SkipWhitespace() - # Make sure there is data to work on. - self._PopLine() + if not self._lines and len(self._current_line) <= self._column: + self.token = '' + return - match = re.match(self._TOKEN, self._current_line) + match = self._TOKEN.match(self._current_line, self._column) if match: token = match.group(0) - self._current_line = self._current_line[len(token):] self.token = token else: - self.token = self._current_line[0] - self._current_line = self._current_line[1:] - self._SkipWhitespace() + self.token = self._current_line[self._column] # text.encode('string_escape') does not seem to satisfy our needs as it @@ -645,7 +662,7 @@ class _Tokenizer(object): # C++ unescaping function allows hex escapes to be any length. So, # "\0011".encode('string_escape') ends up being "\\x011", which will be # decoded in C++ as a single-character string with char code 0x11. -def _CEscape(text): +def _CEscape(text, as_utf8): def escape(c): o = ord(c) if o == 10: return r"\n" # optional escape @@ -656,12 +673,13 @@ def _CEscape(text): if o == 34: return r'\"' # necessary escape if o == 92: return r"\\" # necessary escape - if o >= 127 or o < 32: return "\\%03o" % o # necessary escapes + # necessary escapes + if not as_utf8 and (o >= 127 or o < 32): return "\\%03o" % o return c return "".join([escape(c) for c in text]) -_CUNESCAPE_HEX = re.compile('\\\\x([0-9a-fA-F]{2}|[0-9a-f-A-F])') +_CUNESCAPE_HEX = re.compile('\\\\x([0-9a-fA-F]{2}|[0-9a-fA-F])') def _CUnescape(text): |