#region Copyright notice and license

// Protocol Buffers - Google's data interchange format
// Copyright 2008 Google Inc. All rights reserved.
// http://github.com/jskeet/dotnet-protobufs/
// Original C++/Java/Python code:
// http://code.google.com/p/protobuf/
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
//     * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#endregion

using System;
using System.Globalization;
using System.Text.RegularExpressions;

namespace Google.ProtocolBuffers
{
    /// <summary>
    /// Represents a stream of tokens parsed from a string.
    /// </summary>
    /// <remarks>
    /// The tokenizer always holds one "current" token; the Consume* methods
    /// inspect it and, on success, advance to the following token. An empty
    /// current token means the end of the input has been reached.
    /// </remarks>
    internal sealed class TextTokenizer
    {
        private readonly string text;
        private string currentToken;

        /// <summary>
        /// The character index within the text to perform the next regex match at.
        /// </summary>
        private int matchPos = 0;

        /// <summary>
        /// The character index within the text at which the current token begins.
        /// </summary>
        private int pos = 0;

        /// <summary>
        /// The line number of the current token.
        /// </summary>
        private int line = 0;

        /// <summary>
        /// The column number of the current token.
        /// </summary>
        private int column = 0;

        /// <summary>
        /// The line number of the previous token.
        /// </summary>
        private int previousLine = 0;

        /// <summary>
        /// The column number of the previous token.
        /// </summary>
        private int previousColumn = 0;

        // Note: atomic groups used to mimic possessive quantifiers in Java in both of these regexes
        internal static readonly Regex WhitespaceAndCommentPattern = new Regex(
            "\\G(?>(\\s|(#.*$))+)",
            FrameworkPortability.CompiledRegexWhereAvailable | RegexOptions.Multiline);

        // Each quoted-string alternative must exclude ITS OWN quote character from
        // the body character class. Because the body is an atomic group, a class
        // that lets the closing quote through swallows it and the terminator can
        // then only match at end of line.
        private static readonly Regex TokenPattern = new Regex(
            "\\G[a-zA-Z_](?>[0-9a-zA-Z_+-]*)|" + // an identifier
            "\\G[0-9+-](?>[0-9a-zA-Z_.+-]*)|" + // a number
            "\\G\"(?>([^\"\\\n\\\\]|\\\\.)*)(\"|\\\\?$)|" + // a double-quoted string
            "\\G\'(?>([^\'\\\n\\\\]|\\\\.)*)(\'|\\\\?$)", // a single-quoted string
            FrameworkPortability.CompiledRegexWhereAvailable | RegexOptions.Multiline);

        private static readonly Regex DoubleInfinity = new Regex(
            "^-?inf(inity)?$",
            FrameworkPortability.CompiledRegexWhereAvailable | RegexOptions.IgnoreCase);

        private static readonly Regex FloatInfinity = new Regex(
            "^-?inf(inity)?f?$",
            FrameworkPortability.CompiledRegexWhereAvailable | RegexOptions.IgnoreCase);

        private static readonly Regex FloatNan = new Regex(
            "^nanf?$",
            FrameworkPortability.CompiledRegexWhereAvailable | RegexOptions.IgnoreCase);

        /// <summary>
        /// Constructs a tokenizer that parses tokens from the given text.
        /// Leading whitespace and comments are skipped and the first token is
        /// read immediately.
        /// </summary>
        /// <param name="text">The text to tokenize.</param>
        public TextTokenizer(string text)
        {
            this.text = text;
            SkipWhitespace();
            NextToken();
        }

        /// <summary>
        /// Are we at the end of the input?
        /// </summary>
        public bool AtEnd
        {
            get { return currentToken.Length == 0; }
        }

        /// <summary>
        /// Advances to the next token.
        /// </summary>
        /// <remarks>
        /// Remembers the position of the token being left, brings the line and
        /// column counters up to the start of the new token, and then reads it.
        /// At the end of the text the current token becomes the empty string.
        /// </remarks>
        public void NextToken()
        {
            previousLine = line;
            previousColumn = column;

            // Advance the line counter to the current position.
            while (pos < matchPos)
            {
                if (text[pos] == '\n')
                {
                    ++line;
                    column = 0;
                }
                else
                {
                    ++column;
                }
                ++pos;
            }

            // Match the next token.
            if (matchPos == text.Length)
            {
                // EOF
                currentToken = "";
            }
            else
            {
                Match match = TokenPattern.Match(text, matchPos);
                if (match.Success)
                {
                    currentToken = match.Value;
                    matchPos += match.Length;
                }
                else
                {
                    // Take one character.
                    currentToken = text[matchPos].ToString();
                    matchPos++;
                }
                SkipWhitespace();
            }
        }

        /// <summary>
        /// Skip over any whitespace so that matchPos starts at the next token.
        /// </summary>
        private void SkipWhitespace()
        {
            Match match = WhitespaceAndCommentPattern.Match(text, matchPos);
            if (match.Success)
            {
                matchPos += match.Length;
            }
        }

        /// <summary>
        /// If the next token exactly matches the given token, consume it and return
        /// true. Otherwise, return false without doing anything.
        /// </summary>
        /// <param name="token">The exact token text to look for.</param>
        public bool TryConsume(string token)
        {
            if (currentToken == token)
            {
                NextToken();
                return true;
            }
            return false;
        }

        /// <summary>
        /// If the next token exactly matches the specified one, consume it.
        /// Otherwise, throw a FormatException.
        /// </summary>
        /// <param name="token">The exact token text that is required.</param>
        /// <exception cref="FormatException">The next token is not <paramref name="token"/>.</exception>
        public void Consume(string token)
        {
            if (!TryConsume(token))
            {
                throw CreateFormatException("Expected \"" + token + "\".");
            }
        }

        /// <summary>
        /// Returns true if the next token is an integer, but does not consume it.
        /// </summary>
        /// <remarks>
        /// Only the first character is examined: a digit, '-' or '+' counts.
        /// </remarks>
        public bool LookingAtInteger()
        {
            if (currentToken.Length == 0)
            {
                return false;
            }

            char c = currentToken[0];
            return ('0' <= c && c <= '9') || c == '-' || c == '+';
        }

        /// <summary>
        /// If the next token is an identifier, consume it and return its value.
        /// Otherwise, throw a FormatException.
        /// </summary>
        /// <remarks>
        /// A token is accepted when it is non-empty and every character is an
        /// ASCII letter, an ASCII digit, '_' or '.'.
        /// </remarks>
        /// <exception cref="FormatException">The next token is not an identifier.</exception>
        public string ConsumeIdentifier()
        {
            // The empty token marks end of input; without this check it would pass
            // the per-character loop below and be returned as a valid identifier.
            if (currentToken.Length == 0)
            {
                throw CreateFormatException("Expected identifier.");
            }

            foreach (char c in currentToken)
            {
                if (('a' <= c && c <= 'z') ||
                    ('A' <= c && c <= 'Z') ||
                    ('0' <= c && c <= '9') ||
                    (c == '_') || (c == '.'))
                {
                    // OK
                }
                else
                {
                    throw CreateFormatException("Expected identifier.");
                }
            }

            string result = currentToken;
            NextToken();
            return result;
        }

        /// <summary>
        /// If the next token is a 32-bit signed integer, consume it and return its
        /// value. Otherwise, throw a FormatException.
        /// </summary>
        public int ConsumeInt32()
        {
            try
            {
                int result = TextFormat.ParseInt32(currentToken);
                NextToken();
                return result;
            }
            catch (FormatException e)
            {
                throw CreateIntegerParseException(e);
            }
        }

        /// <summary>
        /// If the next token is a 32-bit unsigned integer, consume it and return its
        /// value. Otherwise, throw a FormatException.
        /// </summary>
        public uint ConsumeUInt32()
        {
            try
            {
                uint result = TextFormat.ParseUInt32(currentToken);
                NextToken();
                return result;
            }
            catch (FormatException e)
            {
                throw CreateIntegerParseException(e);
            }
        }

        /// <summary>
        /// If the next token is a 64-bit signed integer, consume it and return its
        /// value. Otherwise, throw a FormatException.
        /// </summary>
        public long ConsumeInt64()
        {
            try
            {
                long result = TextFormat.ParseInt64(currentToken);
                NextToken();
                return result;
            }
            catch (FormatException e)
            {
                throw CreateIntegerParseException(e);
            }
        }

        /// <summary>
        /// If the next token is a 64-bit unsigned integer, consume it and return its
        /// value. Otherwise, throw a FormatException.
        /// </summary>
        public ulong ConsumeUInt64()
        {
            try
            {
                ulong result = TextFormat.ParseUInt64(currentToken);
                NextToken();
                return result;
            }
            catch (FormatException e)
            {
                throw CreateIntegerParseException(e);
            }
        }

        /// <summary>
        /// If the next token is a double, consume it and return its value.
        /// Otherwise, throw a FormatException.
        /// </summary>
        public double ConsumeDouble()
        {
            // We need to parse infinity and nan separately because
            // double.Parse() does not accept "inf", "infinity", or "nan".
            if (DoubleInfinity.IsMatch(currentToken))
            {
                // The regex guarantees a non-empty token, so indexing is safe; a
                // plain character test avoids a culture-sensitive StartsWith.
                bool negative = currentToken[0] == '-';
                NextToken();
                return negative ? double.NegativeInfinity : double.PositiveInfinity;
            }
            if (currentToken.Equals("nan", StringComparison.OrdinalIgnoreCase))
            {
                NextToken();
                return Double.NaN;
            }

            try
            {
                double result = double.Parse(currentToken, FrameworkPortability.InvariantCulture);
                NextToken();
                return result;
            }
            catch (FormatException e)
            {
                throw CreateFloatParseException(e);
            }
            catch (OverflowException e)
            {
                throw CreateFloatParseException(e);
            }
        }

        /// <summary>
        /// If the next token is a float, consume it and return its value.
        /// Otherwise, throw a FormatException.
        /// </summary>
        /// <remarks>
        /// A single trailing 'f' suffix (as in "1.5f") is permitted and ignored.
        /// </remarks>
        public float ConsumeFloat()
        {
            // We need to parse infinity and nan separately because
            // float.Parse() does not accept "inf", "infinity", or "nan".
            if (FloatInfinity.IsMatch(currentToken))
            {
                // The regex guarantees a non-empty token, so indexing is safe; a
                // plain character test avoids a culture-sensitive StartsWith.
                bool negative = currentToken[0] == '-';
                NextToken();
                return negative ? float.NegativeInfinity : float.PositiveInfinity;
            }
            if (FloatNan.IsMatch(currentToken))
            {
                NextToken();
                return float.NaN;
            }

            // Strip at most ONE 'f' suffix, and do it on a local copy: the current
            // token must stay intact if parsing fails, and input such as "1.5ff"
            // must not be silently accepted.
            string number = currentToken;
            if (number.Length > 0 && number[number.Length - 1] == 'f')
            {
                number = number.Substring(0, number.Length - 1);
            }

            try
            {
                float result = float.Parse(number, FrameworkPortability.InvariantCulture);
                NextToken();
                return result;
            }
            catch (FormatException e)
            {
                throw CreateFloatParseException(e);
            }
            catch (OverflowException e)
            {
                throw CreateFloatParseException(e);
            }
        }

        /// <summary>
        /// If the next token is a Boolean, consume it and return its value.
        /// Otherwise, throw a FormatException.
        /// </summary>
        /// <remarks>
        /// Only the exact, case-sensitive tokens "true" and "false" are accepted.
        /// </remarks>
        public bool ConsumeBoolean()
        {
            if (currentToken == "true")
            {
                NextToken();
                return true;
            }
            if (currentToken == "false")
            {
                NextToken();
                return false;
            }
            throw CreateFormatException("Expected \"true\" or \"false\".");
        }

        /// <summary>
        /// If the next token is a string, consume it and return its (unescaped) value.
        /// Otherwise, throw a FormatException.
        /// </summary>
        public string ConsumeString()
        {
            return ConsumeByteString().ToStringUtf8();
        }

        /// <summary>
        /// If the next token is a string, consume it, unescape it as a
        /// ByteString and return it. Otherwise, throw a FormatException.
        /// </summary>
        /// <exception cref="FormatException">
        /// The next token does not start with a quote, is not closed by the same
        /// quote character, or its contents cannot be unescaped.
        /// </exception>
        public ByteString ConsumeByteString()
        {
            char quote = currentToken.Length > 0 ? currentToken[0] : '\0';
            if (quote != '\"' && quote != '\'')
            {
                throw CreateFormatException("Expected string.");
            }

            // A lone quote character is both "first" and "last", hence the length check.
            if (currentToken.Length < 2 ||
                currentToken[currentToken.Length - 1] != quote)
            {
                throw CreateFormatException("String missing ending quote.");
            }

            try
            {
                string escaped = currentToken.Substring(1, currentToken.Length - 2);
                ByteString result = TextFormat.UnescapeBytes(escaped);
                NextToken();
                return result;
            }
            catch (FormatException e)
            {
                // Re-wrap so the message carries the current line and column.
                throw CreateFormatException(e.Message);
            }
        }

        /// <summary>
        /// Returns a format exception with the current line and column numbers
        /// in the description, suitable for throwing.
        /// </summary>
        /// <param name="description">The message to append after the position.</param>
        public FormatException CreateFormatException(string description)
        {
            // Note: People generally prefer one-based line and column numbers.
            return new FormatException((line + 1) + ":" + (column + 1) + ": " + description);
        }

        /// <summary>
        /// Returns a format exception with the line and column numbers of the
        /// previous token in the description, suitable for throwing.
        /// </summary>
        /// <param name="description">The message to append after the position.</param>
        public FormatException CreateFormatExceptionPreviousToken(string description)
        {
            // Note: People generally prefer one-based line and column numbers.
            return new FormatException((previousLine + 1) + ":" + (previousColumn + 1) + ": " + description);
        }

        /// <summary>
        /// Constructs an appropriate FormatException for the given existing exception
        /// when trying to parse an integer.
        /// </summary>
        private FormatException CreateIntegerParseException(FormatException e)
        {
            return CreateFormatException("Couldn't parse integer: " + e.Message);
        }

        /// <summary>
        /// Constructs an appropriate FormatException for the given existing exception
        /// when trying to parse a float or double.
        /// </summary>
        private FormatException CreateFloatParseException(Exception e)
        {
            return CreateFormatException("Couldn't parse number: " + e.Message);
        }
    }
}