1 files changed, 501 insertions, 0 deletions
diff --git a/csharp/src/ProtocolBuffers/TextTokenizer.cs b/csharp/src/ProtocolBuffers/TextTokenizer.cs
new file mode 100644
index 00000000..5bb27fd0
--- /dev/null
+++ b/csharp/src/ProtocolBuffers/TextTokenizer.cs
@@ -0,0 +1,501 @@
+#region Copyright notice and license
+
+// Protocol Buffers - Google's data interchange format
+// Copyright 2008 Google Inc.  All rights reserved.
+// http://github.com/jskeet/dotnet-protobufs/
+// Original C++/Java/Python code:
+// http://code.google.com/p/protobuf/
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#endregion
+
+using System;
+using System.Globalization;
+using System.Text.RegularExpressions;
+
+namespace Google.ProtocolBuffers
+{
+    /// <summary>
+    /// Represents a stream of tokens parsed from a string.
+    /// </summary>
+    internal sealed class TextTokenizer
+    {
+        /// <summary>The complete input text being tokenized.</summary>
+        private readonly string text;
+
+        /// <summary>
+        /// The token most recently read by NextToken. It is the empty string
+        /// once the end of the input has been reached.
+        /// </summary>
+        private string currentToken;
+
+        /// <summary>
+        /// The character index within the text to perform the next regex match at.
+        /// SkipWhitespace moves it past whitespace and comments.
+        /// </summary>
+        private int matchPos;
+
+        /// <summary>
+        /// The character index within the text at which the current token begins;
+        /// NextToken walks it forward while counting lines and columns.
+        /// </summary>
+        private int pos;
+
+        /// <summary>Zero-based line number of the current token.</summary>
+        private int line;
+
+        /// <summary>Zero-based column number of the current token.</summary>
+        private int column;
+
+        /// <summary>Zero-based line number of the previous token.</summary>
+        private int previousLine;
+
+        /// <summary>Zero-based column number of the previous token.</summary>
+        private int previousColumn;
+
+        // Matches a run of whitespace and '#'-to-end-of-line comments starting exactly at the match position.
+        // Note: atomic groups are used to mimic Java's possessive quantifiers in this regex and in TokenPattern.
+        internal static readonly Regex WhitespaceAndCommentPattern = new Regex(
+            "\\G(?>(\\s|(#.*$))+)",
+            FrameworkPortability.CompiledRegexWhereAvailable | RegexOptions.Multiline);
+
+        private static readonly Regex TokenPattern = new Regex(
+            "\\G[a-zA-Z_](?>[0-9a-zA-Z_+-]*)|" + // an identifier
+            "\\G[0-9+-](?>[0-9a-zA-Z_.+-]*)|" + // a number
+            "\\G\"(?>([^\"\\\n\\\\]|\\\\.)*)(\"|\\\\?$)|" + // a double-quoted string (may end unterminated at end of line)
+            "\\G\'(?>([^\'\\\n\\\\]|\\\\.)*)(\'|\\\\?$)", // a single-quoted string; its body excludes ', not ", so it stops at the closing quote
+            FrameworkPortability.CompiledRegexWhereAvailable | RegexOptions.Multiline); // Multiline: '$' matches at each line end
+
+        // Case-insensitive spellings of the special floating-point values that
+        // ConsumeDouble and ConsumeFloat check for before calling Parse. The
+        // float variants additionally allow a trailing 'f'.
+        private static readonly Regex DoubleInfinity = new Regex(
+            "^-?inf(inity)?$", FrameworkPortability.CompiledRegexWhereAvailable | RegexOptions.IgnoreCase);
+
+        private static readonly Regex FloatInfinity = new Regex(
+            "^-?inf(inity)?f?$", FrameworkPortability.CompiledRegexWhereAvailable | RegexOptions.IgnoreCase);
+
+        private static readonly Regex FloatNan = new Regex(
+            "^nanf?$", FrameworkPortability.CompiledRegexWhereAvailable | RegexOptions.IgnoreCase);
+
+        /// <summary>Constructs a tokenizer that parses tokens from the given text.</summary>
+        /// <remarks>Skips any leading whitespace and comments, then reads the first token.</remarks>
+        public TextTokenizer(string text)
+        {
+            this.text = text;
+            SkipWhitespace();
+            NextToken();
+        }
+
+        /// <summary>
+        /// Whether the end of the input has been reached, i.e. the current token is empty.
+        /// </summary>
+        public bool AtEnd
+        {
+            get { return currentToken.Length < 1; }
+        }
+
+        /// <summary>
+        /// Advances to the next token.
+        /// </summary>
+        /// <remarks>
+        /// Records the position of the token being left as the previous one,
+        /// brings the line and column counters up to the start of the new token,
+        /// reads it into currentToken and skips the whitespace that follows it.
+        /// Text that starts no token pattern is returned one character at a time.
+        /// At end of input the current token becomes the empty string.
+        /// </remarks>
+        public void NextToken()
+        {
+            previousLine = line;
+            previousColumn = column;
+
+            // Count lines and columns over everything between the old token
+            // start and the new one.
+            for (; pos < matchPos; pos++)
+            {
+                if (text[pos] != '\n')
+                {
+                    column++;
+                    continue;
+                }
+
+                // A newline starts the next line at column zero.
+                line++;
+                column = 0;
+            }
+
+            if (matchPos == text.Length)
+            {
+                // Nothing left: an empty token is what AtEnd tests for.
+                currentToken = "";
+                return;
+            }
+
+            // No match means the text here starts no token pattern (punctuation,
+            // for instance), so a single character is taken instead.
+            Match found = TokenPattern.Match(text, matchPos);
+            currentToken = found.Success ? found.Value : text[matchPos].ToString();
+            matchPos += currentToken.Length;
+
+            // Leave matchPos at the start of whatever follows, so that the next
+            // call can match a token there directly.
+            SkipWhitespace();
+        }
+
+        /// <summary>Moves matchPos past any whitespace and comments so that it starts at the next token.</summary>
+        /// <remarks>matchPos is left unchanged when WhitespaceAndCommentPattern does not match there.</remarks>
+        private void SkipWhitespace()
+        {
+            Match skipped = WhitespaceAndCommentPattern.Match(text, matchPos);
+            if (!skipped.Success)
+            {
+                return;
+            }
+            matchPos += skipped.Length;
+        }
+
+        /// <summary>
+        /// Consumes the current token only if it exactly equals the given token.
+        /// </summary>
+        /// <returns>True if the token matched and was consumed; false if nothing was done.</returns>
+        public bool TryConsume(string token)
+        {
+            if (currentToken != token)
+            {
+                return false;
+            }
+            NextToken();
+            return true;
+        }
+
+        /// <summary>
+        /// Consumes the current token, which must exactly equal the specified one.
+        /// </summary>
+        /// <param name="token">The exact token text expected.</param>
+        /// <exception cref="FormatException">
+        /// The current token differs from the specified one; the message names
+        /// the expected token.
+        /// </exception>
+        public void Consume(string token)
+        {
+            if (TryConsume(token))
+            {
+                return;
+            }
+
+            throw CreateFormatException("Expected \"" + token + "\".");
+        }
+
+        /// <summary>
+        /// Returns true if the current token starts like an integer (an ASCII digit,
+        /// '-' or '+'), without consuming it. Only the first character is examined.
+        /// </summary>
+        public bool LookingAtInteger()
+        {
+            if (currentToken.Length > 0)
+            {
+                char first = currentToken[0];
+                return first == '-' || first == '+' || (first >= '0' && first <= '9');
+            }
+            return false;
+        }
+
+        /// <summary>
+        /// If the current token is a non-empty run of ASCII letters, digits, '_' and '.',
+        /// consumes it and returns its text. Otherwise throws a FormatException;
+        /// this includes the empty token that marks the end of the input.
+        /// </summary>
+        public string ConsumeIdentifier()
+        {
+            // The empty end-of-input token has no characters to reject, so it has
+            // to be ruled out up front rather than returned as an "identifier".
+            bool valid = currentToken.Length > 0;
+            foreach (char c in currentToken)
+            {
+                valid &= ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') ||
+                         ('0' <= c && c <= '9') || c == '_' || c == '.';
+            }
+
+            if (!valid)
+            {
+                throw CreateFormatException("Expected identifier.");
+            }
+
+            string result = currentToken;
+            NextToken();
+            return result;
+        }
+
+        /// <summary>Parses the current token as a 32-bit signed integer and consumes it.</summary>
+        /// <returns>The value produced by TextFormat.ParseInt32.</returns>
+        /// <exception cref="FormatException">TextFormat.ParseInt32 rejected the token.</exception>
+        public int ConsumeInt32()
+        {
+            int value;
+            try
+            {
+                value = TextFormat.ParseInt32(currentToken);
+            }
+            catch (FormatException e)
+            {
+                throw CreateIntegerParseException(e);
+            }
+            NextToken();
+            return value;
+        }
+
+        /// <summary>Parses the current token as a 32-bit unsigned integer and consumes it.</summary>
+        /// <returns>The value produced by TextFormat.ParseUInt32.</returns>
+        /// <exception cref="FormatException">TextFormat.ParseUInt32 rejected the token.</exception>
+        public uint ConsumeUInt32()
+        {
+            uint value;
+            try
+            {
+                value = TextFormat.ParseUInt32(currentToken);
+            }
+            catch (FormatException e)
+            {
+                throw CreateIntegerParseException(e);
+            }
+            NextToken();
+            return value;
+        }
+
+        /// <summary>Parses the current token as a 64-bit signed integer and consumes it.</summary>
+        /// <returns>The value produced by TextFormat.ParseInt64.</returns>
+        /// <exception cref="FormatException">TextFormat.ParseInt64 rejected the token.</exception>
+        public long ConsumeInt64()
+        {
+            long value;
+            try
+            {
+                value = TextFormat.ParseInt64(currentToken);
+            }
+            catch (FormatException e)
+            {
+                throw CreateIntegerParseException(e);
+            }
+            NextToken();
+            return value;
+        }
+
+        /// <summary>Parses the current token as a 64-bit unsigned integer and consumes it.</summary>
+        /// <returns>The value produced by TextFormat.ParseUInt64.</returns>
+        /// <exception cref="FormatException">TextFormat.ParseUInt64 rejected the token.</exception>
+        public ulong ConsumeUInt64()
+        {
+            ulong value;
+            try
+            {
+                value = TextFormat.ParseUInt64(currentToken);
+            }
+            catch (FormatException e)
+            {
+                throw CreateIntegerParseException(e);
+            }
+            NextToken();
+            return value;
+        }
+
+        /// <summary>Parses the current token as a double and consumes it.</summary>
+        /// <returns>The parsed value; "inf", "infinity" (optionally negated) and "nan" are accepted in any case.</returns>
+        /// <exception cref="FormatException">The token is none of those spellings and double.Parse rejected it.</exception>
+        public double ConsumeDouble()
+        {
+            // double.Parse() does not accept "inf", "infinity", or "nan", so those
+            // spellings are recognised separately before falling back to it.
+            double value;
+            if (DoubleInfinity.IsMatch(currentToken))
+            {
+                value = currentToken.StartsWith("-") ? double.NegativeInfinity : double.PositiveInfinity;
+            }
+            else if (currentToken.Equals("nan", StringComparison.OrdinalIgnoreCase))
+            {
+                value = Double.NaN;
+            }
+            else
+            {
+                try
+                {
+                    value = double.Parse(currentToken, FrameworkPortability.InvariantCulture);
+                }
+                catch (FormatException e)
+                {
+                    throw CreateFloatParseException(e);
+                }
+                catch (OverflowException e)
+                {
+                    throw CreateFloatParseException(e);
+                }
+            }
+
+            NextToken();
+            return value;
+        }
+
+        /// <summary>
+        /// Parses the current token as a float, allowing one trailing 'f', and consumes it.
+        /// </summary>
+        /// <exception cref="FormatException">The token is not a float; it is left unconsumed and unchanged.</exception>
+        public float ConsumeFloat()
+        {
+            // float.Parse() does not accept "inf", "infinity", or "nan", so handle those spellings first.
+            if (FloatInfinity.IsMatch(currentToken))
+            {
+                bool negative = currentToken.StartsWith("-");
+                NextToken();
+                return negative ? float.NegativeInfinity : float.PositiveInfinity;
+            }
+            if (FloatNan.IsMatch(currentToken))
+            {
+                NextToken();
+                return float.NaN;
+            }
+
+            // Strip a single 'f' suffix into a local: the token itself must survive a
+            // failed parse, and "1.5ff" must not be accepted as 1.5.
+            string digits = currentToken.Length > 0 && currentToken[currentToken.Length - 1] == 'f'
+                ? currentToken.Substring(0, currentToken.Length - 1)
+                : currentToken;
+
+            try
+            {
+                float result = float.Parse(digits, FrameworkPortability.InvariantCulture);
+                NextToken();
+                return result;
+            }
+            catch (FormatException e)
+            {
+                throw CreateFloatParseException(e);
+            }
+            catch (OverflowException e)
+            {
+                throw CreateFloatParseException(e);
+            }
+        }
+
+        /// <summary>
+        /// Consumes the current token if it is exactly "true" or "false" and returns
+        /// the corresponding value. Otherwise throws a FormatException.
+        /// </summary>
+        public bool ConsumeBoolean()
+        {
+            switch (currentToken)
+            {
+                case "true":
+                    NextToken();
+                    return true;
+                case "false":
+                    NextToken();
+                    return false;
+                default:
+                    throw CreateFormatException("Expected \"true\" or \"false\".");
+            }
+        }
+
+        /// <summary>Consumes the current token as a quoted string and returns its unescaped value.</summary>
+        /// <returns>The bytes from ConsumeByteString, converted with ToStringUtf8.</returns>
+        /// <exception cref="FormatException">Thrown by ConsumeByteString when the token is not a valid string.</exception>
+        public string ConsumeString()
+        {
+            ByteString unescaped = ConsumeByteString();
+            return unescaped.ToStringUtf8();
+        }
+
+        /// <summary>Consumes the current token as a quoted string and returns its unescaped bytes.</summary>
+        /// <returns>The result of TextFormat.UnescapeBytes applied to the text between the quotes.</returns>
+        /// <exception cref="FormatException">The token is not quoted, lacks its closing quote, or fails to unescape.</exception>
+        public ByteString ConsumeByteString()
+        {
+            char opening = currentToken.Length == 0 ? '\0' : currentToken[0];
+            if (opening != '\"' && opening != '\'')
+            {
+                throw CreateFormatException("Expected string.");
+            }
+
+            int last = currentToken.Length - 1;
+            if (last < 1 || currentToken[last] != opening)
+            {
+                throw CreateFormatException("String missing ending quote.");
+            }
+            // Unescape only the text between the two quotes.
+            ByteString result;
+            try
+            {
+                result = TextFormat.UnescapeBytes(currentToken.Substring(1, last - 1));
+            }
+            catch (FormatException e)
+            {
+                throw CreateFormatException(e.Message);
+            }
+
+            NextToken();
+            return result;
+        }
+
+        /// <summary>
+        /// Returns (without throwing) a FormatException whose message is prefixed with the
+        /// one-based line and column of the current token, as people generally expect.
+        /// </summary>
+        public FormatException CreateFormatException(string description)
+        {
+            string location = (line + 1) + ":" + (column + 1) + ": ";
+            return new FormatException(location + description);
+        }
+
+        /// <summary>
+        /// Returns (without throwing) a FormatException whose message is prefixed with the
+        /// one-based line and column of the previous token.
+        /// </summary>
+        public FormatException CreateFormatExceptionPreviousToken(string description)
+        {
+            string location = (previousLine + 1) + ":" + (previousColumn + 1) + ": ";
+            return new FormatException(location + description);
+        }
+
+        /// <summary>Builds the FormatException reported when an integer token fails to parse.</summary>
+        /// <param name="e">The original exception; only its message is carried over.</param>
+        /// <returns>The result of CreateFormatException for "Couldn't parse integer: " plus that message.</returns>
+        private FormatException CreateIntegerParseException(FormatException e)
+        {
+            string description = "Couldn't parse integer: " + e.Message;
+            return CreateFormatException(description);
+        }
+
+        /// <summary>Builds the FormatException reported when a float or double token fails to parse.</summary>
+        /// <param name="e">The original exception; only its message is carried over.</param>
+        /// <returns>The result of CreateFormatException for "Couldn't parse number: " plus that message.</returns>
+        private FormatException CreateFloatParseException(Exception e)
+        {
+            string description = "Couldn't parse number: " + e.Message;
+            return CreateFormatException(description);
+        }
+    }
+}
+\ No newline at end of file