diff options
Diffstat (limited to 'vendor/github.com/tdewolff/parse/js')
-rw-r--r-- | vendor/github.com/tdewolff/parse/js/README.md | 89 | ||||
-rw-r--r-- | vendor/github.com/tdewolff/parse/js/hash.go | 156 | ||||
-rw-r--r-- | vendor/github.com/tdewolff/parse/js/hash_test.go | 18 | ||||
-rw-r--r-- | vendor/github.com/tdewolff/parse/js/lex.go | 650 | ||||
-rw-r--r-- | vendor/github.com/tdewolff/parse/js/lex_test.go | 152 |
5 files changed, 1065 insertions, 0 deletions
diff --git a/vendor/github.com/tdewolff/parse/js/README.md b/vendor/github.com/tdewolff/parse/js/README.md new file mode 100644 index 0000000..7b06d7c --- /dev/null +++ b/vendor/github.com/tdewolff/parse/js/README.md @@ -0,0 +1,89 @@ +# JS [![GoDoc](http://godoc.org/github.com/tdewolff/parse/js?status.svg)](http://godoc.org/github.com/tdewolff/parse/js) [![GoCover](http://gocover.io/_badge/github.com/tdewolff/parse/js)](http://gocover.io/github.com/tdewolff/parse/js) + +This package is a JS lexer (ECMA-262, edition 6.0) written in [Go][1]. It follows the specification at [ECMAScript Language Specification](http://www.ecma-international.org/ecma-262/6.0/). The lexer takes an io.Reader and converts it into tokens until the EOF. + +## Installation +Run the following command + + go get github.com/tdewolff/parse/js + +or add the following import and run the project with `go get` + + import "github.com/tdewolff/parse/js" + +## Lexer +### Usage +The following initializes a new Lexer with io.Reader `r`: +``` go +l := js.NewLexer(r) +``` + +To tokenize until EOF or an error, use: +``` go +for { + tt, text := l.Next() + switch tt { + case js.ErrorToken: + // error or EOF set in l.Err() + return + // ... + } +} +``` + +All tokens (see [ECMAScript Language Specification](http://www.ecma-international.org/ecma-262/6.0/)): +``` go +ErrorToken TokenType = iota // extra token when errors occur +UnknownToken // extra token when no token can be matched +WhitespaceToken // space \t \v \f +LineTerminatorToken // \r \n \r\n +CommentToken +IdentifierToken // also: null true false +PunctuatorToken /* { } ( ) [ ] . ; , < > <= >= == != === !== + - * % ++ -- << >> + >>> & | ^ ! ~ && || ? 
: = += -= *= %= <<= >>= >>>= &= |= ^= / /= => */ +NumericToken +StringToken +RegexpToken +TemplateToken +``` + +### Quirks +Because the ECMAScript specification for `PunctuatorToken` (which includes the `/` and `/=` symbols) and `RegexpToken` depends on a parser state to differentiate between the two, the lexer (to remain modular) uses different rules. It aims to correctly disambiguate contexts and returns `RegexpToken` or `PunctuatorToken` where appropriate, with only a few exceptions which don't make much sense at runtime and so don't happen in real-world code: function literal division (`x = function y(){} / z`) and object literal division (`x = {y:1} / z`). + +Another interesting case introduced by ES2015 is the `yield` operator in generator functions vs `yield` as an identifier in regular functions. This was done for backward compatibility, but is very hard to disambiguate correctly on a lexer level without essentially implementing the entire parsing spec as a state machine and hurting performance, code readability and maintainability, so, instead, `yield` is just always assumed to be an operator. In combination with the above paragraph, this means that, for example, `yield /x/i` will always be parsed as `yield`-ing a regular expression and not as the `yield` identifier divided by `x` and then `i`. There is no evidence though that this pattern occurs in any popular libraries. + +### Examples +``` go +package main + +import ( + "os" + + "github.com/tdewolff/parse/js" +) + +// Tokenize JS from stdin. +func main() { + l := js.NewLexer(os.Stdin) + for { + tt, text := l.Next() + switch tt { + case js.ErrorToken: + if l.Err() != io.EOF { + fmt.Println("Error on line", l.Line(), ":", l.Err()) + } + return + case js.IdentifierToken: + fmt.Println("Identifier", string(text)) + case js.NumericToken: + fmt.Println("Numeric", string(text)) + // ... + } + } +} +``` + +## License +Released under the [MIT license](https://github.com/tdewolff/parse/blob/master/LICENSE.md). 
+ +[1]: http://golang.org/ "Go Language" diff --git a/vendor/github.com/tdewolff/parse/js/hash.go b/vendor/github.com/tdewolff/parse/js/hash.go new file mode 100644 index 0000000..3de86b2 --- /dev/null +++ b/vendor/github.com/tdewolff/parse/js/hash.go @@ -0,0 +1,156 @@ +package js + +// generated by hasher -file hash.go -type Hash; DO NOT EDIT, except for adding more constants to the list and rerun go generate + +// uses github.com/tdewolff/hasher +//go:generate hasher -type=Hash -file=hash.go + +// Hash defines perfect hashes for a predefined list of strings +type Hash uint32 + +// Unique hash definitions to be used instead of strings +const ( + Break Hash = 0x5 // break + Case Hash = 0x3404 // case + Catch Hash = 0xba05 // catch + Class Hash = 0x505 // class + Const Hash = 0x2c05 // const + Continue Hash = 0x3e08 // continue + Debugger Hash = 0x8408 // debugger + Default Hash = 0xab07 // default + Delete Hash = 0xcd06 // delete + Do Hash = 0x4c02 // do + Else Hash = 0x3704 // else + Enum Hash = 0x3a04 // enum + Export Hash = 0x1806 // export + Extends Hash = 0x4507 // extends + False Hash = 0x5a05 // false + Finally Hash = 0x7a07 // finally + For Hash = 0xc403 // for + Function Hash = 0x4e08 // function + If Hash = 0x5902 // if + Implements Hash = 0x5f0a // implements + Import Hash = 0x6906 // import + In Hash = 0x4202 // in + Instanceof Hash = 0x710a // instanceof + Interface Hash = 0x8c09 // interface + Let Hash = 0xcf03 // let + New Hash = 0x1203 // new + Null Hash = 0x5504 // null + Package Hash = 0x9507 // package + Private Hash = 0x9c07 // private + Protected Hash = 0xa309 // protected + Public Hash = 0xb506 // public + Return Hash = 0xd06 // return + Static Hash = 0x2f06 // static + Super Hash = 0x905 // super + Switch Hash = 0x2606 // switch + This Hash = 0x2304 // this + Throw Hash = 0x1d05 // throw + True Hash = 0xb104 // true + Try Hash = 0x6e03 // try + Typeof Hash = 0xbf06 // typeof + Var Hash = 0xc703 // var + Void Hash = 0xca04 // void + While Hash 
= 0x1405 // while + With Hash = 0x2104 // with + Yield Hash = 0x8005 // yield +) + +// String returns the hash' name. +func (i Hash) String() string { + start := uint32(i >> 8) + n := uint32(i & 0xff) + if start+n > uint32(len(_Hash_text)) { + return "" + } + return _Hash_text[start : start+n] +} + +// ToHash returns the hash whose name is s. It returns zero if there is no +// such hash. It is case sensitive. +func ToHash(s []byte) Hash { + if len(s) == 0 || len(s) > _Hash_maxLen { + return 0 + } + h := uint32(_Hash_hash0) + for i := 0; i < len(s); i++ { + h ^= uint32(s[i]) + h *= 16777619 + } + if i := _Hash_table[h&uint32(len(_Hash_table)-1)]; int(i&0xff) == len(s) { + t := _Hash_text[i>>8 : i>>8+i&0xff] + for i := 0; i < len(s); i++ { + if t[i] != s[i] { + goto NEXT + } + } + return i + } +NEXT: + if i := _Hash_table[(h>>16)&uint32(len(_Hash_table)-1)]; int(i&0xff) == len(s) { + t := _Hash_text[i>>8 : i>>8+i&0xff] + for i := 0; i < len(s); i++ { + if t[i] != s[i] { + return 0 + } + } + return i + } + return 0 +} + +const _Hash_hash0 = 0x9acb0442 +const _Hash_maxLen = 10 +const _Hash_text = "breakclassupereturnewhilexporthrowithiswitchconstaticaselsen" + + "umcontinuextendsdofunctionullifalseimplementsimportryinstanc" + + "eofinallyieldebuggerinterfacepackageprivateprotectedefaultru" + + "epublicatchtypeoforvarvoidelete" + +var _Hash_table = [1 << 6]Hash{ + 0x0: 0x2f06, // static + 0x1: 0x9c07, // private + 0x3: 0xb104, // true + 0x6: 0x5a05, // false + 0x7: 0x4c02, // do + 0x9: 0x2c05, // const + 0xa: 0x2606, // switch + 0xb: 0x6e03, // try + 0xc: 0x1203, // new + 0xd: 0x4202, // in + 0xf: 0x8005, // yield + 0x10: 0x5f0a, // implements + 0x11: 0xc403, // for + 0x12: 0x505, // class + 0x13: 0x3a04, // enum + 0x16: 0xc703, // var + 0x17: 0x5902, // if + 0x19: 0xcf03, // let + 0x1a: 0x9507, // package + 0x1b: 0xca04, // void + 0x1c: 0xcd06, // delete + 0x1f: 0x5504, // null + 0x20: 0x1806, // export + 0x21: 0xd06, // return + 0x23: 0x4507, // extends + 0x25: 
0x2304, // this + 0x26: 0x905, // super + 0x27: 0x1405, // while + 0x29: 0x5, // break + 0x2b: 0x3e08, // continue + 0x2e: 0x3404, // case + 0x2f: 0xab07, // default + 0x31: 0x8408, // debugger + 0x32: 0x1d05, // throw + 0x33: 0xbf06, // typeof + 0x34: 0x2104, // with + 0x35: 0xba05, // catch + 0x36: 0x4e08, // function + 0x37: 0x710a, // instanceof + 0x38: 0xa309, // protected + 0x39: 0x8c09, // interface + 0x3b: 0xb506, // public + 0x3c: 0x3704, // else + 0x3d: 0x7a07, // finally + 0x3f: 0x6906, // import +} diff --git a/vendor/github.com/tdewolff/parse/js/hash_test.go b/vendor/github.com/tdewolff/parse/js/hash_test.go new file mode 100644 index 0000000..fa6a213 --- /dev/null +++ b/vendor/github.com/tdewolff/parse/js/hash_test.go @@ -0,0 +1,18 @@ +package js // import "github.com/tdewolff/parse/js" + +import ( + "testing" + + "github.com/tdewolff/test" +) + +func TestHashTable(t *testing.T) { + test.T(t, ToHash([]byte("break")), Break, "'break' must resolve to hash.Break") + test.T(t, ToHash([]byte("var")), Var, "'var' must resolve to hash.Var") + test.T(t, Break.String(), "break") + test.T(t, ToHash([]byte("")), Hash(0), "empty string must resolve to zero") + test.T(t, Hash(0xffffff).String(), "") + test.T(t, ToHash([]byte("breaks")), Hash(0), "'breaks' must resolve to zero") + test.T(t, ToHash([]byte("sdf")), Hash(0), "'sdf' must resolve to zero") + test.T(t, ToHash([]byte("uio")), Hash(0), "'uio' must resolve to zero") +} diff --git a/vendor/github.com/tdewolff/parse/js/lex.go b/vendor/github.com/tdewolff/parse/js/lex.go new file mode 100644 index 0000000..96ffcd8 --- /dev/null +++ b/vendor/github.com/tdewolff/parse/js/lex.go @@ -0,0 +1,650 @@ +// Package js is an ECMAScript5.1 lexer following the specifications at http://www.ecma-international.org/ecma-262/5.1/. 
+package js // import "github.com/tdewolff/parse/js" + +import ( + "io" + "strconv" + "unicode" + + "github.com/tdewolff/parse/buffer" +) + +var identifierStart = []*unicode.RangeTable{unicode.Lu, unicode.Ll, unicode.Lt, unicode.Lm, unicode.Lo, unicode.Nl, unicode.Other_ID_Start} +var identifierContinue = []*unicode.RangeTable{unicode.Lu, unicode.Ll, unicode.Lt, unicode.Lm, unicode.Lo, unicode.Nl, unicode.Mn, unicode.Mc, unicode.Nd, unicode.Pc, unicode.Other_ID_Continue} + +//////////////////////////////////////////////////////////////// + +// TokenType determines the type of token, eg. a number or a semicolon. +type TokenType uint32 + +// TokenType values. +const ( + ErrorToken TokenType = iota // extra token when errors occur + UnknownToken // extra token when no token can be matched + WhitespaceToken // space \t \v \f + LineTerminatorToken // \r \n \r\n + CommentToken + IdentifierToken + PunctuatorToken /* { } ( ) [ ] . ; , < > <= >= == != === !== + - * % ++ -- << >> + >>> & | ^ ! ~ && || ? : = += -= *= %= <<= >>= >>>= &= |= ^= / /= >= */ + NumericToken + StringToken + RegexpToken + TemplateToken +) + +// TokenState determines a state in which next token should be read +type TokenState uint32 + +// TokenState values +const ( + ExprState TokenState = iota + StmtParensState + SubscriptState + PropNameState +) + +// ParsingContext determines the context in which following token should be parsed. +// This affects parsing regular expressions and template literals. +type ParsingContext uint32 + +// ParsingContext values +const ( + GlobalContext ParsingContext = iota + StmtParensContext + ExprParensContext + BracesContext + TemplateContext +) + +// String returns the string representation of a TokenType. 
+func (tt TokenType) String() string { + switch tt { + case ErrorToken: + return "Error" + case UnknownToken: + return "Unknown" + case WhitespaceToken: + return "Whitespace" + case LineTerminatorToken: + return "LineTerminator" + case CommentToken: + return "Comment" + case IdentifierToken: + return "Identifier" + case PunctuatorToken: + return "Punctuator" + case NumericToken: + return "Numeric" + case StringToken: + return "String" + case RegexpToken: + return "Regexp" + case TemplateToken: + return "Template" + } + return "Invalid(" + strconv.Itoa(int(tt)) + ")" +} + +//////////////////////////////////////////////////////////////// + +// Lexer is the state for the lexer. +type Lexer struct { + r *buffer.Lexer + stack []ParsingContext + state TokenState + emptyLine bool +} + +// NewLexer returns a new Lexer for a given io.Reader. +func NewLexer(r io.Reader) *Lexer { + return &Lexer{ + r: buffer.NewLexer(r), + stack: make([]ParsingContext, 0, 16), + state: ExprState, + emptyLine: true, + } +} + +func (l *Lexer) enterContext(context ParsingContext) { + l.stack = append(l.stack, context) +} + +func (l *Lexer) leaveContext() ParsingContext { + ctx := GlobalContext + if last := len(l.stack) - 1; last >= 0 { + ctx, l.stack = l.stack[last], l.stack[:last] + } + return ctx +} + +// Err returns the error encountered during lexing, this is often io.EOF but also other errors can be returned. +func (l *Lexer) Err() error { + return l.r.Err() +} + +// Restore restores the NULL byte at the end of the buffer. +func (l *Lexer) Restore() { + l.r.Restore() +} + +// Next returns the next Token. It returns ErrorToken when an error was encountered. Using Err() one can retrieve the error message. 
+func (l *Lexer) Next() (TokenType, []byte) { + tt := UnknownToken + c := l.r.Peek(0) + switch c { + case '(': + if l.state == StmtParensState { + l.enterContext(StmtParensContext) + } else { + l.enterContext(ExprParensContext) + } + l.state = ExprState + l.r.Move(1) + tt = PunctuatorToken + case ')': + if l.leaveContext() == StmtParensContext { + l.state = ExprState + } else { + l.state = SubscriptState + } + l.r.Move(1) + tt = PunctuatorToken + case '{': + l.enterContext(BracesContext) + l.state = ExprState + l.r.Move(1) + tt = PunctuatorToken + case '}': + if l.leaveContext() == TemplateContext && l.consumeTemplateToken() { + tt = TemplateToken + } else { + // will work incorrectly for objects or functions divided by something, + // but that's an extremely rare case + l.state = ExprState + l.r.Move(1) + tt = PunctuatorToken + } + case ']': + l.state = SubscriptState + l.r.Move(1) + tt = PunctuatorToken + case '[', ';', ',', '~', '?', ':': + l.state = ExprState + l.r.Move(1) + tt = PunctuatorToken + case '<', '>', '=', '!', '+', '-', '*', '%', '&', '|', '^': + if (c == '<' || (l.emptyLine && c == '-')) && l.consumeCommentToken() { + return CommentToken, l.r.Shift() + } else if l.consumeLongPunctuatorToken() { + l.state = ExprState + tt = PunctuatorToken + } + case '/': + if l.consumeCommentToken() { + return CommentToken, l.r.Shift() + } else if l.state == ExprState && l.consumeRegexpToken() { + l.state = SubscriptState + tt = RegexpToken + } else if l.consumeLongPunctuatorToken() { + l.state = ExprState + tt = PunctuatorToken + } + case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '.': + if l.consumeNumericToken() { + tt = NumericToken + l.state = SubscriptState + } else if c == '.' 
{ + l.state = PropNameState + l.r.Move(1) + tt = PunctuatorToken + } + case '\'', '"': + if l.consumeStringToken() { + l.state = SubscriptState + tt = StringToken + } + case ' ', '\t', '\v', '\f': + l.r.Move(1) + for l.consumeWhitespace() { + } + return WhitespaceToken, l.r.Shift() + case '\n', '\r': + l.r.Move(1) + for l.consumeLineTerminator() { + } + tt = LineTerminatorToken + case '`': + if l.consumeTemplateToken() { + tt = TemplateToken + } + default: + if l.consumeIdentifierToken() { + tt = IdentifierToken + if l.state != PropNameState { + switch hash := ToHash(l.r.Lexeme()); hash { + case 0, This, False, True, Null: + l.state = SubscriptState + case If, While, For, With: + l.state = StmtParensState + default: + // This will include keywords that can't be followed by a regexp, but only + // by a specified char (like `switch` or `try`), but we don't check for syntax + // errors as we don't attempt to parse a full JS grammar when streaming + l.state = ExprState + } + } else { + l.state = SubscriptState + } + } else if c >= 0xC0 { + if l.consumeWhitespace() { + for l.consumeWhitespace() { + } + return WhitespaceToken, l.r.Shift() + } else if l.consumeLineTerminator() { + for l.consumeLineTerminator() { + } + tt = LineTerminatorToken + } + } else if l.Err() != nil { + return ErrorToken, nil + } + } + + l.emptyLine = tt == LineTerminatorToken + + if tt == UnknownToken { + _, n := l.r.PeekRune(0) + l.r.Move(n) + } + return tt, l.r.Shift() +} + +//////////////////////////////////////////////////////////////// + +/* +The following functions follow the specifications at http://www.ecma-international.org/ecma-262/5.1/ +*/ + +func (l *Lexer) consumeWhitespace() bool { + c := l.r.Peek(0) + if c == ' ' || c == '\t' || c == '\v' || c == '\f' { + l.r.Move(1) + return true + } else if c >= 0xC0 { + if r, n := l.r.PeekRune(0); r == '\u00A0' || r == '\uFEFF' || unicode.Is(unicode.Zs, r) { + l.r.Move(n) + return true + } + } + return false +} + +func (l *Lexer) 
consumeLineTerminator() bool { + c := l.r.Peek(0) + if c == '\n' { + l.r.Move(1) + return true + } else if c == '\r' { + if l.r.Peek(1) == '\n' { + l.r.Move(2) + } else { + l.r.Move(1) + } + return true + } else if c >= 0xC0 { + if r, n := l.r.PeekRune(0); r == '\u2028' || r == '\u2029' { + l.r.Move(n) + return true + } + } + return false +} + +func (l *Lexer) consumeDigit() bool { + if c := l.r.Peek(0); c >= '0' && c <= '9' { + l.r.Move(1) + return true + } + return false +} + +func (l *Lexer) consumeHexDigit() bool { + if c := l.r.Peek(0); (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F') { + l.r.Move(1) + return true + } + return false +} + +func (l *Lexer) consumeBinaryDigit() bool { + if c := l.r.Peek(0); c == '0' || c == '1' { + l.r.Move(1) + return true + } + return false +} + +func (l *Lexer) consumeOctalDigit() bool { + if c := l.r.Peek(0); c >= '0' && c <= '7' { + l.r.Move(1) + return true + } + return false +} + +func (l *Lexer) consumeUnicodeEscape() bool { + if l.r.Peek(0) != '\\' || l.r.Peek(1) != 'u' { + return false + } + mark := l.r.Pos() + l.r.Move(2) + if c := l.r.Peek(0); c == '{' { + l.r.Move(1) + if l.consumeHexDigit() { + for l.consumeHexDigit() { + } + if c := l.r.Peek(0); c == '}' { + l.r.Move(1) + return true + } + } + l.r.Rewind(mark) + return false + } else if !l.consumeHexDigit() || !l.consumeHexDigit() || !l.consumeHexDigit() || !l.consumeHexDigit() { + l.r.Rewind(mark) + return false + } + return true +} + +func (l *Lexer) consumeSingleLineComment() { + for { + c := l.r.Peek(0) + if c == '\r' || c == '\n' || c == 0 { + break + } else if c >= 0xC0 { + if r, _ := l.r.PeekRune(0); r == '\u2028' || r == '\u2029' { + break + } + } + l.r.Move(1) + } +} + +//////////////////////////////////////////////////////////////// + +func (l *Lexer) consumeCommentToken() bool { + c := l.r.Peek(0) + if c == '/' { + c = l.r.Peek(1) + if c == '/' { + // single line + l.r.Move(2) + l.consumeSingleLineComment() + } else if c == '*' 
{ + // multi line + l.r.Move(2) + for { + c := l.r.Peek(0) + if c == '*' && l.r.Peek(1) == '/' { + l.r.Move(2) + return true + } else if c == 0 { + break + } else if l.consumeLineTerminator() { + l.emptyLine = true + } else { + l.r.Move(1) + } + } + } else { + return false + } + } else if c == '<' && l.r.Peek(1) == '!' && l.r.Peek(2) == '-' && l.r.Peek(3) == '-' { + // opening HTML-style single line comment + l.r.Move(4) + l.consumeSingleLineComment() + } else if c == '-' && l.r.Peek(1) == '-' && l.r.Peek(2) == '>' { + // closing HTML-style single line comment + // (only if current line didn't contain any meaningful tokens) + l.r.Move(3) + l.consumeSingleLineComment() + } else { + return false + } + return true +} + +func (l *Lexer) consumeLongPunctuatorToken() bool { + c := l.r.Peek(0) + if c == '!' || c == '=' || c == '+' || c == '-' || c == '*' || c == '/' || c == '%' || c == '&' || c == '|' || c == '^' { + l.r.Move(1) + if l.r.Peek(0) == '=' { + l.r.Move(1) + if (c == '!' || c == '=') && l.r.Peek(0) == '=' { + l.r.Move(1) + } + } else if (c == '+' || c == '-' || c == '&' || c == '|') && l.r.Peek(0) == c { + l.r.Move(1) + } else if c == '=' && l.r.Peek(0) == '>' { + l.r.Move(1) + } + } else { // c == '<' || c == '>' + l.r.Move(1) + if l.r.Peek(0) == c { + l.r.Move(1) + if c == '>' && l.r.Peek(0) == '>' { + l.r.Move(1) + } + } + if l.r.Peek(0) == '=' { + l.r.Move(1) + } + } + return true +} + +func (l *Lexer) consumeIdentifierToken() bool { + c := l.r.Peek(0) + if (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '$' || c == '_' { + l.r.Move(1) + } else if c >= 0xC0 { + if r, n := l.r.PeekRune(0); unicode.IsOneOf(identifierStart, r) { + l.r.Move(n) + } else { + return false + } + } else if !l.consumeUnicodeEscape() { + return false + } + for { + c := l.r.Peek(0) + if (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') || c == '$' || c == '_' { + l.r.Move(1) + } else if c >= 0xC0 { + if r, n := l.r.PeekRune(0); r == '\u200C' || r == 
'\u200D' || unicode.IsOneOf(identifierContinue, r) { + l.r.Move(n) + } else { + break + } + } else { + break + } + } + return true +} + +func (l *Lexer) consumeNumericToken() bool { + // assume to be on 0 1 2 3 4 5 6 7 8 9 . + mark := l.r.Pos() + c := l.r.Peek(0) + if c == '0' { + l.r.Move(1) + if l.r.Peek(0) == 'x' || l.r.Peek(0) == 'X' { + l.r.Move(1) + if l.consumeHexDigit() { + for l.consumeHexDigit() { + } + } else { + l.r.Move(-1) // return just the zero + } + return true + } else if l.r.Peek(0) == 'b' || l.r.Peek(0) == 'B' { + l.r.Move(1) + if l.consumeBinaryDigit() { + for l.consumeBinaryDigit() { + } + } else { + l.r.Move(-1) // return just the zero + } + return true + } else if l.r.Peek(0) == 'o' || l.r.Peek(0) == 'O' { + l.r.Move(1) + if l.consumeOctalDigit() { + for l.consumeOctalDigit() { + } + } else { + l.r.Move(-1) // return just the zero + } + return true + } + } else if c != '.' { + for l.consumeDigit() { + } + } + if l.r.Peek(0) == '.' { + l.r.Move(1) + if l.consumeDigit() { + for l.consumeDigit() { + } + } else if c != '.' { + // . 
could belong to the next token + l.r.Move(-1) + return true + } else { + l.r.Rewind(mark) + return false + } + } + mark = l.r.Pos() + c = l.r.Peek(0) + if c == 'e' || c == 'E' { + l.r.Move(1) + c = l.r.Peek(0) + if c == '+' || c == '-' { + l.r.Move(1) + } + if !l.consumeDigit() { + // e could belong to the next token + l.r.Rewind(mark) + return true + } + for l.consumeDigit() { + } + } + return true +} + +func (l *Lexer) consumeStringToken() bool { + // assume to be on ' or " + mark := l.r.Pos() + delim := l.r.Peek(0) + l.r.Move(1) + for { + c := l.r.Peek(0) + if c == delim { + l.r.Move(1) + break + } else if c == '\\' { + l.r.Move(1) + if !l.consumeLineTerminator() { + if c := l.r.Peek(0); c == delim || c == '\\' { + l.r.Move(1) + } + } + continue + } else if c == '\n' || c == '\r' { + l.r.Rewind(mark) + return false + } else if c >= 0xC0 { + if r, _ := l.r.PeekRune(0); r == '\u2028' || r == '\u2029' { + l.r.Rewind(mark) + return false + } + } else if c == 0 { + break + } + l.r.Move(1) + } + return true +} + +func (l *Lexer) consumeRegexpToken() bool { + // assume to be on / and not /* + mark := l.r.Pos() + l.r.Move(1) + inClass := false + for { + c := l.r.Peek(0) + if !inClass && c == '/' { + l.r.Move(1) + break + } else if c == '[' { + inClass = true + } else if c == ']' { + inClass = false + } else if c == '\\' { + l.r.Move(1) + if l.consumeLineTerminator() { + l.r.Rewind(mark) + return false + } + } else if l.consumeLineTerminator() { + l.r.Rewind(mark) + return false + } else if c == 0 { + return true + } + l.r.Move(1) + } + // flags + for { + c := l.r.Peek(0) + if (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') || c == '$' || c == '_' { + l.r.Move(1) + } else if c >= 0xC0 { + if r, n := l.r.PeekRune(0); r == '\u200C' || r == '\u200D' || unicode.IsOneOf(identifierContinue, r) { + l.r.Move(n) + } else { + break + } + } else { + break + } + } + return true +} + +func (l *Lexer) consumeTemplateToken() bool { + // assume to be on ` or } 
when already within template + mark := l.r.Pos() + l.r.Move(1) + for { + c := l.r.Peek(0) + if c == '`' { + l.state = SubscriptState + l.r.Move(1) + return true + } else if c == '$' && l.r.Peek(1) == '{' { + l.enterContext(TemplateContext) + l.state = ExprState + l.r.Move(2) + return true + } else if c == 0 { + l.r.Rewind(mark) + return false + } + l.r.Move(1) + } +} diff --git a/vendor/github.com/tdewolff/parse/js/lex_test.go b/vendor/github.com/tdewolff/parse/js/lex_test.go new file mode 100644 index 0000000..b379321 --- /dev/null +++ b/vendor/github.com/tdewolff/parse/js/lex_test.go @@ -0,0 +1,152 @@ +package js // import "github.com/tdewolff/parse/js" + +import ( + "bytes" + "fmt" + "io" + "testing" + + "github.com/tdewolff/test" +) + +type TTs []TokenType + +func TestTokens(t *testing.T) { + var tokenTests = []struct { + js string + expected []TokenType + }{ + {" \t\v\f\u00A0\uFEFF\u2000", TTs{}}, // WhitespaceToken + {"\n\r\r\n\u2028\u2029", TTs{LineTerminatorToken}}, + {"5.2 .04 0x0F 5e99", TTs{NumericToken, NumericToken, NumericToken, NumericToken}}, + {"a = 'string'", TTs{IdentifierToken, PunctuatorToken, StringToken}}, + {"/*comment*/ //comment", TTs{CommentToken, CommentToken}}, + {"{ } ( ) [ ]", TTs{PunctuatorToken, PunctuatorToken, PunctuatorToken, PunctuatorToken, PunctuatorToken, PunctuatorToken}}, + {". ; , < > <=", TTs{PunctuatorToken, PunctuatorToken, PunctuatorToken, PunctuatorToken, PunctuatorToken, PunctuatorToken}}, + {">= == != === !==", TTs{PunctuatorToken, PunctuatorToken, PunctuatorToken, PunctuatorToken, PunctuatorToken}}, + {"+ - * % ++ --", TTs{PunctuatorToken, PunctuatorToken, PunctuatorToken, PunctuatorToken, PunctuatorToken, PunctuatorToken}}, + {"<< >> >>> & | ^", TTs{PunctuatorToken, PunctuatorToken, PunctuatorToken, PunctuatorToken, PunctuatorToken, PunctuatorToken}}, + {"! ~ && || ? 
:", TTs{PunctuatorToken, PunctuatorToken, PunctuatorToken, PunctuatorToken, PunctuatorToken, PunctuatorToken}}, + {"= += -= *= %= <<=", TTs{PunctuatorToken, PunctuatorToken, PunctuatorToken, PunctuatorToken, PunctuatorToken, PunctuatorToken}}, + {">>= >>>= &= |= ^= =>", TTs{PunctuatorToken, PunctuatorToken, PunctuatorToken, PunctuatorToken, PunctuatorToken, PunctuatorToken}}, + {"a = /.*/g;", TTs{IdentifierToken, PunctuatorToken, RegexpToken, PunctuatorToken}}, + + {"/*co\nm\u2028m/*ent*/ //co//mment\u2029//comment", TTs{CommentToken, CommentToken, LineTerminatorToken, CommentToken}}, + {"<!-", TTs{PunctuatorToken, PunctuatorToken, PunctuatorToken}}, + {"1<!--2\n", TTs{NumericToken, CommentToken, LineTerminatorToken}}, + {"x=y-->10\n", TTs{IdentifierToken, PunctuatorToken, IdentifierToken, PunctuatorToken, PunctuatorToken, NumericToken, LineTerminatorToken}}, + {" /*comment*/ -->nothing\n", TTs{CommentToken, CommentToken, LineTerminatorToken}}, + {"1 /*comment\nmultiline*/ -->nothing\n", TTs{NumericToken, CommentToken, CommentToken, LineTerminatorToken}}, + {"$ _\u200C \\u2000 \u200C", TTs{IdentifierToken, IdentifierToken, IdentifierToken, UnknownToken}}, + {">>>=>>>>=", TTs{PunctuatorToken, PunctuatorToken, PunctuatorToken}}, + {"1/", TTs{NumericToken, PunctuatorToken}}, + {"1/=", TTs{NumericToken, PunctuatorToken}}, + {"010xF", TTs{NumericToken, NumericToken, IdentifierToken}}, + {"50e+-0", TTs{NumericToken, IdentifierToken, PunctuatorToken, PunctuatorToken, NumericToken}}, + {"'str\\i\\'ng'", TTs{StringToken}}, + {"'str\\\\'abc", TTs{StringToken, IdentifierToken}}, + {"'str\\\ni\\\\u00A0ng'", TTs{StringToken}}, + {"a = /[a-z/]/g", TTs{IdentifierToken, PunctuatorToken, RegexpToken}}, + {"a=/=/g1", TTs{IdentifierToken, PunctuatorToken, RegexpToken}}, + {"a = /'\\\\/\n", TTs{IdentifierToken, PunctuatorToken, RegexpToken, LineTerminatorToken}}, + {"a=/\\//g1", TTs{IdentifierToken, PunctuatorToken, RegexpToken}}, + {"new RegExp(a + /\\d{1,2}/.source)", 
TTs{IdentifierToken, IdentifierToken, PunctuatorToken, IdentifierToken, PunctuatorToken, RegexpToken, PunctuatorToken, IdentifierToken, PunctuatorToken}}, + + {"0b0101 0o0707 0b17", TTs{NumericToken, NumericToken, NumericToken, NumericToken}}, + {"`template`", TTs{TemplateToken}}, + {"`a${x+y}b`", TTs{TemplateToken, IdentifierToken, PunctuatorToken, IdentifierToken, TemplateToken}}, + {"`temp\nlate`", TTs{TemplateToken}}, + {"`outer${{x: 10}}bar${ raw`nested${2}endnest` }end`", TTs{TemplateToken, PunctuatorToken, IdentifierToken, PunctuatorToken, NumericToken, PunctuatorToken, TemplateToken, IdentifierToken, TemplateToken, NumericToken, TemplateToken, TemplateToken}}, + + // early endings + {"'string", TTs{StringToken}}, + {"'\n '\u2028", TTs{UnknownToken, LineTerminatorToken, UnknownToken, LineTerminatorToken}}, + {"'str\\\U00100000ing\\0'", TTs{StringToken}}, + {"'strin\\00g'", TTs{StringToken}}, + {"/*comment", TTs{CommentToken}}, + {"a=/regexp", TTs{IdentifierToken, PunctuatorToken, RegexpToken}}, + {"\\u002", TTs{UnknownToken, IdentifierToken}}, + + // coverage + {"Ø a〉", TTs{IdentifierToken, IdentifierToken, UnknownToken}}, + {"0xg 0.f", TTs{NumericToken, IdentifierToken, NumericToken, PunctuatorToken, IdentifierToken}}, + {"0bg 0og", TTs{NumericToken, IdentifierToken, NumericToken, IdentifierToken}}, + {"\u00A0\uFEFF\u2000", TTs{}}, + {"\u2028\u2029", TTs{LineTerminatorToken}}, + {"\\u0029ident", TTs{IdentifierToken}}, + {"\\u{0029FEF}ident", TTs{IdentifierToken}}, + {"\\u{}", TTs{UnknownToken, IdentifierToken, PunctuatorToken, PunctuatorToken}}, + {"\\ugident", TTs{UnknownToken, IdentifierToken}}, + {"'str\u2028ing'", TTs{UnknownToken, IdentifierToken, LineTerminatorToken, IdentifierToken, StringToken}}, + {"a=/\\\n", TTs{IdentifierToken, PunctuatorToken, PunctuatorToken, UnknownToken, LineTerminatorToken}}, + {"a=/x/\u200C\u3009", TTs{IdentifierToken, PunctuatorToken, RegexpToken, UnknownToken}}, + {"a=/x\n", TTs{IdentifierToken, PunctuatorToken, 
PunctuatorToken, IdentifierToken, LineTerminatorToken}}, + + {"return /abc/;", TTs{IdentifierToken, RegexpToken, PunctuatorToken}}, + {"yield /abc/;", TTs{IdentifierToken, RegexpToken, PunctuatorToken}}, + {"a/b/g", TTs{IdentifierToken, PunctuatorToken, IdentifierToken, PunctuatorToken, IdentifierToken}}, + {"{}/1/g", TTs{PunctuatorToken, PunctuatorToken, RegexpToken}}, + {"i(0)/1/g", TTs{IdentifierToken, PunctuatorToken, NumericToken, PunctuatorToken, PunctuatorToken, NumericToken, PunctuatorToken, IdentifierToken}}, + {"if(0)/1/g", TTs{IdentifierToken, PunctuatorToken, NumericToken, PunctuatorToken, RegexpToken}}, + {"a.if(0)/1/g", TTs{IdentifierToken, PunctuatorToken, IdentifierToken, PunctuatorToken, NumericToken, PunctuatorToken, PunctuatorToken, NumericToken, PunctuatorToken, IdentifierToken}}, + {"while(0)/1/g", TTs{IdentifierToken, PunctuatorToken, NumericToken, PunctuatorToken, RegexpToken}}, + {"for(;;)/1/g", TTs{IdentifierToken, PunctuatorToken, PunctuatorToken, PunctuatorToken, PunctuatorToken, RegexpToken}}, + {"with(0)/1/g", TTs{IdentifierToken, PunctuatorToken, NumericToken, PunctuatorToken, RegexpToken}}, + {"this/1/g", TTs{IdentifierToken, PunctuatorToken, NumericToken, PunctuatorToken, IdentifierToken}}, + {"case /1/g:", TTs{IdentifierToken, RegexpToken, PunctuatorToken}}, + {"function f(){}/1/g", TTs{IdentifierToken, IdentifierToken, PunctuatorToken, PunctuatorToken, PunctuatorToken, PunctuatorToken, RegexpToken}}, + {"this.return/1/g", TTs{IdentifierToken, PunctuatorToken, IdentifierToken, PunctuatorToken, NumericToken, PunctuatorToken, IdentifierToken}}, + {"(a+b)/1/g", TTs{PunctuatorToken, IdentifierToken, PunctuatorToken, IdentifierToken, PunctuatorToken, PunctuatorToken, NumericToken, PunctuatorToken, IdentifierToken}}, + + // go fuzz + {"`", TTs{UnknownToken}}, + } + + for _, tt := range tokenTests { + t.Run(tt.js, func(t *testing.T) { + l := NewLexer(bytes.NewBufferString(tt.js)) + i := 0 + j := 0 + for { + token, _ := l.Next() + j++ + if 
token == ErrorToken { + test.T(t, l.Err(), io.EOF) + test.T(t, i, len(tt.expected), "when error occurred we must be at the end") + break + } else if token == WhitespaceToken { + continue + } + if i < len(tt.expected) { + if token != tt.expected[i] { + test.String(t, token.String(), tt.expected[i].String(), "token types must match") + break + } + } else { + test.Fail(t, "index", i, "must not exceed expected token types size", len(tt.expected)) + break + } + i++ + } + }) + } + + test.T(t, WhitespaceToken.String(), "Whitespace") + test.T(t, TokenType(100).String(), "Invalid(100)") +} + +//////////////////////////////////////////////////////////////// + +func ExampleNewLexer() { + l := NewLexer(bytes.NewBufferString("var x = 'lorem ipsum';")) + out := "" + for { + tt, data := l.Next() + if tt == ErrorToken { + break + } + out += string(data) + } + fmt.Println(out) + // Output: var x = 'lorem ipsum'; +} |