diff options
author | 2017-11-19 21:10:04 -0800 | |
---|---|---|
committer | 2017-11-19 22:01:46 -0800 | |
commit | 8ffb773f43c8dc54801ca1d111854e7e881c93c9 (patch) | |
tree | 38133a2fc612597a75fed1d13e5b4042f58a2b7e /vendor/github.com/tdewolff/parse/js/lex.go |
First commit
Diffstat (limited to 'vendor/github.com/tdewolff/parse/js/lex.go')
-rw-r--r-- | vendor/github.com/tdewolff/parse/js/lex.go | 650 |
1 files changed, 650 insertions, 0 deletions
diff --git a/vendor/github.com/tdewolff/parse/js/lex.go b/vendor/github.com/tdewolff/parse/js/lex.go new file mode 100644 index 0000000..96ffcd8 --- /dev/null +++ b/vendor/github.com/tdewolff/parse/js/lex.go @@ -0,0 +1,650 @@ +// Package js is an ECMAScript5.1 lexer following the specifications at http://www.ecma-international.org/ecma-262/5.1/. +package js // import "github.com/tdewolff/parse/js" + +import ( + "io" + "strconv" + "unicode" + + "github.com/tdewolff/parse/buffer" +) + +var identifierStart = []*unicode.RangeTable{unicode.Lu, unicode.Ll, unicode.Lt, unicode.Lm, unicode.Lo, unicode.Nl, unicode.Other_ID_Start} +var identifierContinue = []*unicode.RangeTable{unicode.Lu, unicode.Ll, unicode.Lt, unicode.Lm, unicode.Lo, unicode.Nl, unicode.Mn, unicode.Mc, unicode.Nd, unicode.Pc, unicode.Other_ID_Continue} + +//////////////////////////////////////////////////////////////// + +// TokenType determines the type of token, eg. a number or a semicolon. +type TokenType uint32 + +// TokenType values. +const ( + ErrorToken TokenType = iota // extra token when errors occur + UnknownToken // extra token when no token can be matched + WhitespaceToken // space \t \v \f + LineTerminatorToken // \r \n \r\n + CommentToken + IdentifierToken + PunctuatorToken /* { } ( ) [ ] . ; , < > <= >= == != === !== + - * % ++ -- << >> + >>> & | ^ ! ~ && || ? : = += -= *= %= <<= >>= >>>= &= |= ^= / /= >= */ + NumericToken + StringToken + RegexpToken + TemplateToken +) + +// TokenState determines a state in which next token should be read +type TokenState uint32 + +// TokenState values +const ( + ExprState TokenState = iota + StmtParensState + SubscriptState + PropNameState +) + +// ParsingContext determines the context in which following token should be parsed. +// This affects parsing regular expressions and template literals. +type ParsingContext uint32 + +// ParsingContext values +const ( + GlobalContext ParsingContext = iota + StmtParensContext + ExprParensContext + BracesContext + TemplateContext +) + +// String returns the string representation of a TokenType. +func (tt TokenType) String() string { + switch tt { + case ErrorToken: + return "Error" + case UnknownToken: + return "Unknown" + case WhitespaceToken: + return "Whitespace" + case LineTerminatorToken: + return "LineTerminator" + case CommentToken: + return "Comment" + case IdentifierToken: + return "Identifier" + case PunctuatorToken: + return "Punctuator" + case NumericToken: + return "Numeric" + case StringToken: + return "String" + case RegexpToken: + return "Regexp" + case TemplateToken: + return "Template" + } + return "Invalid(" + strconv.Itoa(int(tt)) + ")" +} + +//////////////////////////////////////////////////////////////// + +// Lexer is the state for the lexer. +type Lexer struct { + r *buffer.Lexer + stack []ParsingContext + state TokenState + emptyLine bool +} + +// NewLexer returns a new Lexer for a given io.Reader. +func NewLexer(r io.Reader) *Lexer { + return &Lexer{ + r: buffer.NewLexer(r), + stack: make([]ParsingContext, 0, 16), + state: ExprState, + emptyLine: true, + } +} + +func (l *Lexer) enterContext(context ParsingContext) { + l.stack = append(l.stack, context) +} + +func (l *Lexer) leaveContext() ParsingContext { + ctx := GlobalContext + if last := len(l.stack) - 1; last >= 0 { + ctx, l.stack = l.stack[last], l.stack[:last] + } + return ctx +} + +// Err returns the error encountered during lexing, this is often io.EOF but also other errors can be returned. +func (l *Lexer) Err() error { + return l.r.Err() +} + +// Restore restores the NULL byte at the end of the buffer. +func (l *Lexer) Restore() { + l.r.Restore() +} + +// Next returns the next Token. It returns ErrorToken when an error was encountered. Using Err() one can retrieve the error message. +func (l *Lexer) Next() (TokenType, []byte) { + tt := UnknownToken + c := l.r.Peek(0) + switch c { + case '(': + if l.state == StmtParensState { + l.enterContext(StmtParensContext) + } else { + l.enterContext(ExprParensContext) + } + l.state = ExprState + l.r.Move(1) + tt = PunctuatorToken + case ')': + if l.leaveContext() == StmtParensContext { + l.state = ExprState + } else { + l.state = SubscriptState + } + l.r.Move(1) + tt = PunctuatorToken + case '{': + l.enterContext(BracesContext) + l.state = ExprState + l.r.Move(1) + tt = PunctuatorToken + case '}': + if l.leaveContext() == TemplateContext && l.consumeTemplateToken() { + tt = TemplateToken + } else { + // will work incorrectly for objects or functions divided by something, + // but that's an extremely rare case + l.state = ExprState + l.r.Move(1) + tt = PunctuatorToken + } + case ']': + l.state = SubscriptState + l.r.Move(1) + tt = PunctuatorToken + case '[', ';', ',', '~', '?', ':': + l.state = ExprState + l.r.Move(1) + tt = PunctuatorToken + case '<', '>', '=', '!', '+', '-', '*', '%', '&', '|', '^': + if (c == '<' || (l.emptyLine && c == '-')) && l.consumeCommentToken() { + return CommentToken, l.r.Shift() + } else if l.consumeLongPunctuatorToken() { + l.state = ExprState + tt = PunctuatorToken + } + case '/': + if l.consumeCommentToken() { + return CommentToken, l.r.Shift() + } else if l.state == ExprState && l.consumeRegexpToken() { + l.state = SubscriptState + tt = RegexpToken + } else if l.consumeLongPunctuatorToken() { + l.state = ExprState + tt = PunctuatorToken + } + case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '.': + if l.consumeNumericToken() { + tt = NumericToken + l.state = SubscriptState + } else if c == '.' { + l.state = PropNameState + l.r.Move(1) + tt = PunctuatorToken + } + case '\'', '"': + if l.consumeStringToken() { + l.state = SubscriptState + tt = StringToken + } + case ' ', '\t', '\v', '\f': + l.r.Move(1) + for l.consumeWhitespace() { + } + return WhitespaceToken, l.r.Shift() + case '\n', '\r': + l.r.Move(1) + for l.consumeLineTerminator() { + } + tt = LineTerminatorToken + case '`': + if l.consumeTemplateToken() { + tt = TemplateToken + } + default: + if l.consumeIdentifierToken() { + tt = IdentifierToken + if l.state != PropNameState { + switch hash := ToHash(l.r.Lexeme()); hash { + case 0, This, False, True, Null: + l.state = SubscriptState + case If, While, For, With: + l.state = StmtParensState + default: + // This will include keywords that can't be followed by a regexp, but only + // by a specified char (like `switch` or `try`), but we don't check for syntax + // errors as we don't attempt to parse a full JS grammar when streaming + l.state = ExprState + } + } else { + l.state = SubscriptState + } + } else if c >= 0xC0 { + if l.consumeWhitespace() { + for l.consumeWhitespace() { + } + return WhitespaceToken, l.r.Shift() + } else if l.consumeLineTerminator() { + for l.consumeLineTerminator() { + } + tt = LineTerminatorToken + } + } else if l.Err() != nil { + return ErrorToken, nil + } + } + + l.emptyLine = tt == LineTerminatorToken + + if tt == UnknownToken { + _, n := l.r.PeekRune(0) + l.r.Move(n) + } + return tt, l.r.Shift() +} + +//////////////////////////////////////////////////////////////// + +/* +The following functions follow the specifications at http://www.ecma-international.org/ecma-262/5.1/ +*/ + +func (l *Lexer) consumeWhitespace() bool { + c := l.r.Peek(0) + if c == ' ' || c == '\t' || c == '\v' || c == '\f' { + l.r.Move(1) + return true + } else if c >= 0xC0 { + if r, n := l.r.PeekRune(0); r == '\u00A0' || r == '\uFEFF' || unicode.Is(unicode.Zs, r) { + l.r.Move(n) + return true + } + } + return false +} + +func (l *Lexer) consumeLineTerminator() bool { + c := l.r.Peek(0) + if c == '\n' { + l.r.Move(1) + return true + } else if c == '\r' { + if l.r.Peek(1) == '\n' { + l.r.Move(2) + } else { + l.r.Move(1) + } + return true + } else if c >= 0xC0 { + if r, n := l.r.PeekRune(0); r == '\u2028' || r == '\u2029' { + l.r.Move(n) + return true + } + } + return false +} + +func (l *Lexer) consumeDigit() bool { + if c := l.r.Peek(0); c >= '0' && c <= '9' { + l.r.Move(1) + return true + } + return false +} + +func (l *Lexer) consumeHexDigit() bool { + if c := l.r.Peek(0); (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F') { + l.r.Move(1) + return true + } + return false +} + +func (l *Lexer) consumeBinaryDigit() bool { + if c := l.r.Peek(0); c == '0' || c == '1' { + l.r.Move(1) + return true + } + return false +} + +func (l *Lexer) consumeOctalDigit() bool { + if c := l.r.Peek(0); c >= '0' && c <= '7' { + l.r.Move(1) + return true + } + return false +} + +func (l *Lexer) consumeUnicodeEscape() bool { + if l.r.Peek(0) != '\\' || l.r.Peek(1) != 'u' { + return false + } + mark := l.r.Pos() + l.r.Move(2) + if c := l.r.Peek(0); c == '{' { + l.r.Move(1) + if l.consumeHexDigit() { + for l.consumeHexDigit() { + } + if c := l.r.Peek(0); c == '}' { + l.r.Move(1) + return true + } + } + l.r.Rewind(mark) + return false + } else if !l.consumeHexDigit() || !l.consumeHexDigit() || !l.consumeHexDigit() || !l.consumeHexDigit() { + l.r.Rewind(mark) + return false + } + return true +} + +func (l *Lexer) consumeSingleLineComment() { + for { + c := l.r.Peek(0) + if c == '\r' || c == '\n' || c == 0 { + break + } else if c >= 0xC0 { + if r, _ := l.r.PeekRune(0); r == '\u2028' || r == '\u2029' { + break + } + } + l.r.Move(1) + } +} + +//////////////////////////////////////////////////////////////// + +func (l *Lexer) consumeCommentToken() bool { + c := l.r.Peek(0) + if c == '/' { + c = l.r.Peek(1) + if c == '/' { + // single line + l.r.Move(2) + l.consumeSingleLineComment() + } else if c == '*' { + // multi line + l.r.Move(2) + for { + c := l.r.Peek(0) + if c == '*' && l.r.Peek(1) == '/' { + l.r.Move(2) + return true + } else if c == 0 { + break + } else if l.consumeLineTerminator() { + l.emptyLine = true + } else { + l.r.Move(1) + } + } + } else { + return false + } + } else if c == '<' && l.r.Peek(1) == '!' && l.r.Peek(2) == '-' && l.r.Peek(3) == '-' { + // opening HTML-style single line comment + l.r.Move(4) + l.consumeSingleLineComment() + } else if c == '-' && l.r.Peek(1) == '-' && l.r.Peek(2) == '>' { + // closing HTML-style single line comment + // (only if current line didn't contain any meaningful tokens) + l.r.Move(3) + l.consumeSingleLineComment() + } else { + return false + } + return true +} + +func (l *Lexer) consumeLongPunctuatorToken() bool { + c := l.r.Peek(0) + if c == '!' || c == '=' || c == '+' || c == '-' || c == '*' || c == '/' || c == '%' || c == '&' || c == '|' || c == '^' { + l.r.Move(1) + if l.r.Peek(0) == '=' { + l.r.Move(1) + if (c == '!' || c == '=') && l.r.Peek(0) == '=' { + l.r.Move(1) + } + } else if (c == '+' || c == '-' || c == '&' || c == '|') && l.r.Peek(0) == c { + l.r.Move(1) + } else if c == '=' && l.r.Peek(0) == '>' { + l.r.Move(1) + } + } else { // c == '<' || c == '>' + l.r.Move(1) + if l.r.Peek(0) == c { + l.r.Move(1) + if c == '>' && l.r.Peek(0) == '>' { + l.r.Move(1) + } + } + if l.r.Peek(0) == '=' { + l.r.Move(1) + } + } + return true +} + +func (l *Lexer) consumeIdentifierToken() bool { + c := l.r.Peek(0) + if (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '$' || c == '_' { + l.r.Move(1) + } else if c >= 0xC0 { + if r, n := l.r.PeekRune(0); unicode.IsOneOf(identifierStart, r) { + l.r.Move(n) + } else { + return false + } + } else if !l.consumeUnicodeEscape() { + return false + } + for { + c := l.r.Peek(0) + if (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') || c == '$' || c == '_' { + l.r.Move(1) + } else if c >= 0xC0 { + if r, n := l.r.PeekRune(0); r == '\u200C' || r == '\u200D' || unicode.IsOneOf(identifierContinue, r) { + l.r.Move(n) + } else { + break + } + } else { + break + } + } + return true +} + +func (l *Lexer) consumeNumericToken() bool { + // assume to be on 0 1 2 3 4 5 6 7 8 9 . + mark := l.r.Pos() + c := l.r.Peek(0) + if c == '0' { + l.r.Move(1) + if l.r.Peek(0) == 'x' || l.r.Peek(0) == 'X' { + l.r.Move(1) + if l.consumeHexDigit() { + for l.consumeHexDigit() { + } + } else { + l.r.Move(-1) // return just the zero + } + return true + } else if l.r.Peek(0) == 'b' || l.r.Peek(0) == 'B' { + l.r.Move(1) + if l.consumeBinaryDigit() { + for l.consumeBinaryDigit() { + } + } else { + l.r.Move(-1) // return just the zero + } + return true + } else if l.r.Peek(0) == 'o' || l.r.Peek(0) == 'O' { + l.r.Move(1) + if l.consumeOctalDigit() { + for l.consumeOctalDigit() { + } + } else { + l.r.Move(-1) // return just the zero + } + return true + } + } else if c != '.' { + for l.consumeDigit() { + } + } + if l.r.Peek(0) == '.' { + l.r.Move(1) + if l.consumeDigit() { + for l.consumeDigit() { + } + } else if c != '.' { + // . could belong to the next token + l.r.Move(-1) + return true + } else { + l.r.Rewind(mark) + return false + } + } + mark = l.r.Pos() + c = l.r.Peek(0) + if c == 'e' || c == 'E' { + l.r.Move(1) + c = l.r.Peek(0) + if c == '+' || c == '-' { + l.r.Move(1) + } + if !l.consumeDigit() { + // e could belong to the next token + l.r.Rewind(mark) + return true + } + for l.consumeDigit() { + } + } + return true +} + +func (l *Lexer) consumeStringToken() bool { + // assume to be on ' or " + mark := l.r.Pos() + delim := l.r.Peek(0) + l.r.Move(1) + for { + c := l.r.Peek(0) + if c == delim { + l.r.Move(1) + break + } else if c == '\\' { + l.r.Move(1) + if !l.consumeLineTerminator() { + if c := l.r.Peek(0); c == delim || c == '\\' { + l.r.Move(1) + } + } + continue + } else if c == '\n' || c == '\r' { + l.r.Rewind(mark) + return false + } else if c >= 0xC0 { + if r, _ := l.r.PeekRune(0); r == '\u2028' || r == '\u2029' { + l.r.Rewind(mark) + return false + } + } else if c == 0 { + break + } + l.r.Move(1) + } + return true +} + +func (l *Lexer) consumeRegexpToken() bool { + // assume to be on / and not /* + mark := l.r.Pos() + l.r.Move(1) + inClass := false + for { + c := l.r.Peek(0) + if !inClass && c == '/' { + l.r.Move(1) + break + } else if c == '[' { + inClass = true + } else if c == ']' { + inClass = false + } else if c == '\\' { + l.r.Move(1) + if l.consumeLineTerminator() { + l.r.Rewind(mark) + return false + } + } else if l.consumeLineTerminator() { + l.r.Rewind(mark) + return false + } else if c == 0 { + return true + } + l.r.Move(1) + } + // flags + for { + c := l.r.Peek(0) + if (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') || c == '$' || c == '_' { + l.r.Move(1) + } else if c >= 0xC0 { + if r, n := l.r.PeekRune(0); r == '\u200C' || r == '\u200D' || unicode.IsOneOf(identifierContinue, r) { + l.r.Move(n) + } else { + break + } + } else { + break + } + } + return true +} + +func (l *Lexer) consumeTemplateToken() bool { + // assume to be on ` or } when already within template + mark := l.r.Pos() + l.r.Move(1) + for { + c := l.r.Peek(0) + if c == '`' { + l.state = SubscriptState + l.r.Move(1) + return true + } else if c == '$' && l.r.Peek(1) == '{' { + l.enterContext(TemplateContext) + l.state = ExprState + l.r.Move(2) + return true + } else if c == 0 { + l.r.Rewind(mark) + return false + } + l.r.Move(1) + } +} |