diff options
Diffstat (limited to 'vendor/github.com/tdewolff/parse/css/lex.go')
-rw-r--r-- | vendor/github.com/tdewolff/parse/css/lex.go | 710 |
1 files changed, 710 insertions, 0 deletions
diff --git a/vendor/github.com/tdewolff/parse/css/lex.go b/vendor/github.com/tdewolff/parse/css/lex.go new file mode 100644 index 0000000..3924bb7 --- /dev/null +++ b/vendor/github.com/tdewolff/parse/css/lex.go @@ -0,0 +1,710 @@
// Package css is a CSS3 lexer and parser following the specifications at http://www.w3.org/TR/css-syntax-3/.
package css // import "github.com/tdewolff/parse/css"

// TODO: \uFFFD replacement character for NULL bytes in strings for example, or at least don't end the string early

import (
	"bytes"
	"io"
	"strconv"

	"github.com/tdewolff/parse"
	"github.com/tdewolff/parse/buffer"
)

// TokenType determines the type of token, eg. a number or a semicolon.
type TokenType uint32

// TokenType values. They are numbered consecutively by iota, starting with
// ErrorToken at zero, so the declaration order fixes each numeric value.
const (
	ErrorToken TokenType = iota // extra token when errors occur
	IdentToken
	FunctionToken   // rgb( rgba( ...
	AtKeywordToken  // @abc
	HashToken       // #abc
	StringToken     // "abc" or 'abc'
	BadStringToken  // string cut short by a newline
	URLToken        // url(...)
	BadURLToken     // url(...) with malformed contents
	DelimToken      // any unmatched character
	NumberToken     // 5
	PercentageToken // 5%
	DimensionToken  // 5em
	UnicodeRangeToken        // U+554A
	IncludeMatchToken        // ~=
	DashMatchToken           // |=
	PrefixMatchToken         // ^=
	SuffixMatchToken         // $=
	SubstringMatchToken      // *=
	ColumnToken              // ||
	WhitespaceToken          // space \t \r \n \f
	CDOToken                 // <!--
	CDCToken                 // -->
	ColonToken               // :
	SemicolonToken           // ;
	CommaToken               // ,
	LeftBracketToken         // [
	RightBracketToken        // ]
	LeftParenthesisToken     // (
	RightParenthesisToken    // )
	LeftBraceToken           // {
	RightBraceToken          // }
	CommentToken             // extra token for comments
	EmptyToken
	CustomPropertyNameToken  // --abc
	CustomPropertyValueToken
)

// String returns the string representation of a TokenType.
+func (tt TokenType) String() string { + switch tt { + case ErrorToken: + return "Error" + case IdentToken: + return "Ident" + case FunctionToken: + return "Function" + case AtKeywordToken: + return "AtKeyword" + case HashToken: + return "Hash" + case StringToken: + return "String" + case BadStringToken: + return "BadString" + case URLToken: + return "URL" + case BadURLToken: + return "BadURL" + case DelimToken: + return "Delim" + case NumberToken: + return "Number" + case PercentageToken: + return "Percentage" + case DimensionToken: + return "Dimension" + case UnicodeRangeToken: + return "UnicodeRange" + case IncludeMatchToken: + return "IncludeMatch" + case DashMatchToken: + return "DashMatch" + case PrefixMatchToken: + return "PrefixMatch" + case SuffixMatchToken: + return "SuffixMatch" + case SubstringMatchToken: + return "SubstringMatch" + case ColumnToken: + return "Column" + case WhitespaceToken: + return "Whitespace" + case CDOToken: + return "CDO" + case CDCToken: + return "CDC" + case ColonToken: + return "Colon" + case SemicolonToken: + return "Semicolon" + case CommaToken: + return "Comma" + case LeftBracketToken: + return "LeftBracket" + case RightBracketToken: + return "RightBracket" + case LeftParenthesisToken: + return "LeftParenthesis" + case RightParenthesisToken: + return "RightParenthesis" + case LeftBraceToken: + return "LeftBrace" + case RightBraceToken: + return "RightBrace" + case CommentToken: + return "Comment" + case EmptyToken: + return "Empty" + case CustomPropertyNameToken: + return "CustomPropertyName" + case CustomPropertyValueToken: + return "CustomPropertyValue" + } + return "Invalid(" + strconv.Itoa(int(tt)) + ")" +} + +//////////////////////////////////////////////////////////////// + +// Lexer is the state for the lexer. +type Lexer struct { + r *buffer.Lexer +} + +// NewLexer returns a new Lexer for a given io.Reader. 
// The reader is handed to buffer.NewLexer and the result becomes the Lexer's
// only field.
func NewLexer(r io.Reader) *Lexer {
	return &Lexer{
		buffer.NewLexer(r),
	}
}

// Err returns the error encountered during lexing, this is often io.EOF but also other errors can be returned.
// It forwards to the underlying buffer lexer.
func (l *Lexer) Err() error {
	return l.r.Err()
}

// Restore restores the NULL byte at the end of the buffer.
// It forwards to the underlying buffer lexer.
func (l *Lexer) Restore() {
	l.r.Restore()
}

// Next returns the next Token. It returns ErrorToken when an error was encountered. Using Err() one can retrieve the error message.
//
// Dispatch is on the byte under the cursor. Each case tries the consumers that
// can begin with that byte, in the order written, and the first one that
// succeeds decides the token type; the token data is whatever l.r.Shift()
// returns. When no consumer succeeds, exactly one byte is emitted as a
// DelimToken. ErrorToken (with nil data) is only produced for a NULL byte
// while Err() is non-nil.
func (l *Lexer) Next() (TokenType, []byte) {
	switch l.r.Peek(0) {
	case ' ', '\t', '\n', '\r', '\f':
		// a run of whitespace bytes collapses into one token
		l.r.Move(1)
		for l.consumeWhitespace() {
		}
		return WhitespaceToken, l.r.Shift()
	case ':':
		l.r.Move(1)
		return ColonToken, l.r.Shift()
	case ';':
		l.r.Move(1)
		return SemicolonToken, l.r.Shift()
	case ',':
		l.r.Move(1)
		return CommaToken, l.r.Shift()
	case '(', ')', '[', ']', '{', '}':
		if t := l.consumeBracket(); t != ErrorToken {
			return t, l.r.Shift()
		}
	case '#':
		if l.consumeHashToken() {
			return HashToken, l.r.Shift()
		}
	case '"', '\'':
		if t := l.consumeString(); t != ErrorToken {
			return t, l.r.Shift()
		}
	case '.', '+':
		if t := l.consumeNumeric(); t != ErrorToken {
			return t, l.r.Shift()
		}
	case '-':
		// The order matters: a signed number (-5), then an identifier
		// (-abc), then CDC (-->), and finally a custom property (--abc).
		if t := l.consumeNumeric(); t != ErrorToken {
			return t, l.r.Shift()
		} else if t := l.consumeIdentlike(); t != ErrorToken {
			return t, l.r.Shift()
		} else if l.consumeCDCToken() {
			return CDCToken, l.r.Shift()
		} else if l.consumeCustomVariableToken() {
			return CustomPropertyNameToken, l.r.Shift()
		}
	case '@':
		if l.consumeAtKeywordToken() {
			return AtKeywordToken, l.r.Shift()
		}
	case '$', '*', '^', '~':
		if t := l.consumeMatch(); t != ErrorToken {
			return t, l.r.Shift()
		}
	case '/':
		if l.consumeComment() {
			return CommentToken, l.r.Shift()
		}
	case '<':
		if l.consumeCDOToken() {
			return CDOToken, l.r.Shift()
		}
	case '\\':
		// an escape can start an identifier
		if t := l.consumeIdentlike(); t != ErrorToken {
			return t, l.r.Shift()
		}
	case 'u', 'U':
		// U+... is tried before falling back to an ordinary identifier
		if l.consumeUnicodeRangeToken() {
			return UnicodeRangeToken, l.r.Shift()
		} else if t := l.consumeIdentlike(); t != ErrorToken {
			return t, l.r.Shift()
		}
	case '|':
		// |= is tried before ||
		if t := l.consumeMatch(); t != ErrorToken {
			return t, l.r.Shift()
		} else if l.consumeColumnToken() {
			return ColumnToken, l.r.Shift()
		}
	case 0:
		// A NULL byte with no pending error is not the end of input; it
		// falls through and is emitted as a DelimToken.
		if l.Err() != nil {
			return ErrorToken, nil
		}
	default:
		if t := l.consumeNumeric(); t != ErrorToken {
			return t, l.r.Shift()
		} else if t := l.consumeIdentlike(); t != ErrorToken {
			return t, l.r.Shift()
		}
	}
	// can't be rune because consumeIdentlike consumes that as an identifier
	l.r.Move(1)
	return DelimToken, l.r.Shift()
}

////////////////////////////////////////////////////////////////

/*
The following functions follow the railroad diagrams in http://www.w3.org/TR/css3-syntax/

NOTE(review): the comments on these helpers assume that buffer.Lexer's Peek(i)
reads the byte i positions ahead without consuming it, Move(n) shifts the
cursor by n bytes (negative n steps back), and Pos/Rewind save and restore the
cursor. Confirm against the buffer package.
*/

// consumeByte steps over the next byte and returns true when it equals c;
// otherwise the cursor is left alone and false is returned.
func (l *Lexer) consumeByte(c byte) bool {
	if l.r.Peek(0) == c {
		l.r.Move(1)
		return true
	}
	return false
}

// consumeComment consumes a comment starting at the cursor. It returns false
// without moving when the next two bytes are not the comment opener. It
// returns true both when the closing delimiter is found and when the input
// ends first (a NULL byte with Err() non-nil), so an unterminated comment
// still counts as a comment.
func (l *Lexer) consumeComment() bool {
	if l.r.Peek(0) != '/' || l.r.Peek(1) != '*' {
		return false
	}
	l.r.Move(2)
	for {
		c := l.r.Peek(0)
		if c == 0 && l.Err() != nil {
			break
		} else if c == '*' && l.r.Peek(1) == '/' {
			l.r.Move(2)
			return true
		}
		l.r.Move(1)
	}
	return true
}

// consumeNewline consumes one newline: \n, \f, \r, or the pair \r\n (taken as
// a single newline). It returns false when the cursor is not on a newline.
func (l *Lexer) consumeNewline() bool {
	c := l.r.Peek(0)
	if c == '\n' || c == '\f' {
		l.r.Move(1)
		return true
	} else if c == '\r' {
		if l.r.Peek(1) == '\n' {
			l.r.Move(2)
		} else {
			l.r.Move(1)
		}
		return true
	}
	return false
}

// consumeWhitespace consumes a single whitespace byte (space, \t, \n, \r or
// \f) and reports whether it did. Callers loop on it to skip a whole run.
func (l *Lexer) consumeWhitespace() bool {
	c := l.r.Peek(0)
	if c == ' ' || c == '\t' || c == '\n' || c == '\r' || c == '\f' {
		l.r.Move(1)
		return true
	}
	return false
}

// consumeDigit consumes a single ASCII digit 0-9 and reports whether it did.
func (l *Lexer) consumeDigit() bool {
	c := l.r.Peek(0)
	if c >= '0' && c <= '9' {
		l.r.Move(1)
		return true
	}
	return false
}

// consumeHexDigit consumes a single hexadecimal digit (0-9, a-f, A-F) and
// reports whether it did.
func (l *Lexer) consumeHexDigit() bool {
	c := l.r.Peek(0)
	if (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F') {
		l.r.Move(1)
		return true
	}
	return false
}

// consumeEscape consumes a backslash escape at the cursor.
//
// It returns false, with the cursor unchanged, when the cursor is not on a
// backslash or when the backslash is directly followed by a newline. In every
// other case it returns true after consuming one of:
//   - the backslash, one to six hex digits and at most one whitespace byte;
//   - the backslash and the rune that PeekRune reports, when the next byte is
//     0xC0 or higher;
//   - the backslash alone, when the input ends right after it;
//   - the backslash and the single byte after it.
func (l *Lexer) consumeEscape() bool {
	if l.r.Peek(0) != '\\' {
		return false
	}
	mark := l.r.Pos()
	l.r.Move(1)
	if l.consumeNewline() {
		// backslash-newline is not an escape; undo and let the caller decide
		l.r.Rewind(mark)
		return false
	} else if l.consumeHexDigit() {
		// one digit is already consumed; allow up to five more
		for k := 1; k < 6; k++ {
			if !l.consumeHexDigit() {
				break
			}
		}
		l.consumeWhitespace()
		return true
	} else {
		c := l.r.Peek(0)
		if c >= 0xC0 {
			// presumably the lead byte of a multi-byte UTF-8 sequence; n is its width
			_, n := l.r.PeekRune(0)
			l.r.Move(n)
			return true
		} else if c == 0 && l.r.Err() != nil {
			return true
		}
	}
	l.r.Move(1)
	return true
}

// consumeIdentToken consumes an identifier: an optional leading '-', then one
// start character (ASCII letter, '_', any byte >= 0x80, or an escape),
// followed by any number of name characters (the same set plus digits and
// '-'). When no start character is found the cursor is rewound to where it
// was on entry and false is returned.
func (l *Lexer) consumeIdentToken() bool {
	mark := l.r.Pos()
	if l.r.Peek(0) == '-' {
		l.r.Move(1)
	}
	c := l.r.Peek(0)
	if !((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_' || c >= 0x80) {
		if c != '\\' || !l.consumeEscape() {
			l.r.Rewind(mark)
			return false
		}
	} else {
		l.r.Move(1)
	}
	for {
		c := l.r.Peek(0)
		if !((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') || c == '_' || c == '-' || c >= 0x80) {
			if c != '\\' || !l.consumeEscape() {
				break
			}
		} else {
			l.r.Move(1)
		}
	}
	return true
}

// support custom variables, https://www.w3.org/TR/css-variables-1/
//
// consumeCustomVariableToken consumes a name of the form --ident. The first
// '-' is stepped over unconditionally; the second must be present and is then
// accepted by consumeIdentToken as its optional leading dash. On failure the
// cursor is stepped back onto the first '-' and false is returned.
func (l *Lexer) consumeCustomVariableToken() bool {
	// expect to be on a '-'
	l.r.Move(1)
	if l.r.Peek(0) != '-' {
		l.r.Move(-1)
		return false
	}
	if !l.consumeIdentToken() {
		l.r.Move(-1)
		return false
	}
	return true
}

// consumeAtKeywordToken consumes '@' followed by an identifier. When no
// identifier follows, the cursor is stepped back onto the '@' and false is
// returned.
func (l *Lexer) consumeAtKeywordToken() bool {
	// expect to be on an '@'
	l.r.Move(1)
	if !l.consumeIdentToken() {
		l.r.Move(-1)
		return false
	}
	return true
}

// consumeHashToken consumes '#' followed by one or more name characters
// (ASCII letters, digits, '_', '-', bytes >= 0x80, or escapes). Unlike an
// identifier, the first character after '#' may be a digit or '-'. When no
// name character follows, the cursor is rewound onto the '#' and false is
// returned.
func (l *Lexer) consumeHashToken() bool {
	// expect to be on a '#'
	mark := l.r.Pos()
	l.r.Move(1)
	c := l.r.Peek(0)
	if !((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') || c == '_' || c == '-' || c >= 0x80) {
		if c != '\\' || !l.consumeEscape() {
			l.r.Rewind(mark)
			return false
		}
	} else {
		l.r.Move(1)
	}
	for {
		c := l.r.Peek(0)
		if !((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') || c == '_' || c == '-' || c >= 0x80) {
			if c != '\\' || !l.consumeEscape() {
				break
			}
		} else {
			l.r.Move(1)
		}
	}
	return true
}

// consumeNumberToken consumes a number: an optional sign, integer digits, an
// optional fraction and an optional exponent. At least one digit must appear
// before or after the decimal point; otherwise the cursor is rewound to where
// it was on entry and false is returned. A trailing '.' or 'e'/'E' that is
// not followed by digits is left unconsumed for the next token.
func (l *Lexer) consumeNumberToken() bool {
	mark := l.r.Pos()
	c := l.r.Peek(0)
	if c == '+' || c == '-' {
		l.r.Move(1)
	}
	firstDigit := l.consumeDigit()
	if firstDigit {
		for l.consumeDigit() {
		}
	}
	if l.r.Peek(0) == '.' {
		l.r.Move(1)
		if l.consumeDigit() {
			for l.consumeDigit() {
			}
		} else if firstDigit {
			// . could belong to the next token
			l.r.Move(-1)
			return true
		} else {
			l.r.Rewind(mark)
			return false
		}
	} else if !firstDigit {
		l.r.Rewind(mark)
		return false
	}
	// from here on a number has been recognised; the exponent is optional
	mark = l.r.Pos()
	c = l.r.Peek(0)
	if c == 'e' || c == 'E' {
		l.r.Move(1)
		c = l.r.Peek(0)
		if c == '+' || c == '-' {
			l.r.Move(1)
		}
		if !l.consumeDigit() {
			// e could belong to next token
			l.r.Rewind(mark)
			return true
		}
		for l.consumeDigit() {
		}
	}
	return true
}

// consumeUnicodeRangeToken consumes a unicode range introduced by "u+" or
// "U+". The accepted forms, as implemented here, are:
//   - one to six hex digits;
//   - one to six hex digits, '-', and one to six more hex digits;
//   - hex digits followed by question marks, where the question marks must
//     continue until the combined count reaches six;
//   - exactly six question marks.
//
// On any mismatch after the prefix the cursor is rewound onto the 'u'/'U' and
// false is returned.
//
// NOTE(review): when six hex digits are directly followed by '?', that '?' is
// consumed as well (k becomes 7 and the fill loop does not run), so the token
// ends up seven characters long. Confirm whether this is intended.
func (l *Lexer) consumeUnicodeRangeToken() bool {
	c := l.r.Peek(0)
	if (c != 'u' && c != 'U') || l.r.Peek(1) != '+' {
		return false
	}
	mark := l.r.Pos()
	l.r.Move(2)
	if l.consumeHexDigit() {
		// consume up to 6 hexDigits
		// k ends up as the number of hex digits consumed
		k := 1
		for ; k < 6; k++ {
			if !l.consumeHexDigit() {
				break
			}
		}

		// either a minus or a question mark or the end is expected
		if l.consumeByte('-') {
			// consume another up to 6 hexDigits
			if l.consumeHexDigit() {
				for k := 1; k < 6; k++ {
					if !l.consumeHexDigit() {
						break
					}
				}
			} else {
				l.r.Rewind(mark)
				return false
			}
		} else {
			// could be filled up to 6 characters with question marks or else regular hexDigits
			if l.consumeByte('?') {
				k++
				for ; k < 6; k++ {
					if !l.consumeByte('?') {
						l.r.Rewind(mark)
						return false
					}
				}
			}
		}
	} else {
		// consume 6 question marks
		for k := 0; k < 6; k++ {
			if !l.consumeByte('?') {
				l.r.Rewind(mark)
				return false
			}
		}
	}
	return true
}

+func (l *Lexer) consumeColumnToken() bool { + if l.r.Peek(0) == '|' && l.r.Peek(1) == '|' { + l.r.Move(2) + return true + } + return false +} + +func (l *Lexer) consumeCDOToken() bool { + if l.r.Peek(0) == '<' && l.r.Peek(1) == '!' && l.r.Peek(2) == '-' && l.r.Peek(3) == '-' { + l.r.Move(4) + return true + } + return false +} + +func (l *Lexer) consumeCDCToken() bool { + if l.r.Peek(0) == '-' && l.r.Peek(1) == '-' && l.r.Peek(2) == '>' { + l.r.Move(3) + return true + } + return false +} + +//////////////////////////////////////////////////////////////// + +// consumeMatch consumes any MatchToken. +func (l *Lexer) consumeMatch() TokenType { + if l.r.Peek(1) == '=' { + switch l.r.Peek(0) { + case '~': + l.r.Move(2) + return IncludeMatchToken + case '|': + l.r.Move(2) + return DashMatchToken + case '^': + l.r.Move(2) + return PrefixMatchToken + case '$': + l.r.Move(2) + return SuffixMatchToken + case '*': + l.r.Move(2) + return SubstringMatchToken + } + } + return ErrorToken +} + +// consumeBracket consumes any bracket token. +func (l *Lexer) consumeBracket() TokenType { + switch l.r.Peek(0) { + case '(': + l.r.Move(1) + return LeftParenthesisToken + case ')': + l.r.Move(1) + return RightParenthesisToken + case '[': + l.r.Move(1) + return LeftBracketToken + case ']': + l.r.Move(1) + return RightBracketToken + case '{': + l.r.Move(1) + return LeftBraceToken + case '}': + l.r.Move(1) + return RightBraceToken + } + return ErrorToken +} + +// consumeNumeric consumes NumberToken, PercentageToken or DimensionToken. +func (l *Lexer) consumeNumeric() TokenType { + if l.consumeNumberToken() { + if l.consumeByte('%') { + return PercentageToken + } else if l.consumeIdentToken() { + return DimensionToken + } + return NumberToken + } + return ErrorToken +} + +// consumeString consumes a string and may return BadStringToken when a newline is encountered. 
+func (l *Lexer) consumeString() TokenType { + // assume to be on " or ' + delim := l.r.Peek(0) + l.r.Move(1) + for { + c := l.r.Peek(0) + if c == 0 && l.Err() != nil { + break + } else if c == '\n' || c == '\r' || c == '\f' { + l.r.Move(1) + return BadStringToken + } else if c == delim { + l.r.Move(1) + break + } else if c == '\\' { + if !l.consumeEscape() { + l.r.Move(1) + l.consumeNewline() + } + } else { + l.r.Move(1) + } + } + return StringToken +} + +func (l *Lexer) consumeUnquotedURL() bool { + for { + c := l.r.Peek(0) + if c == 0 && l.Err() != nil || c == ')' { + break + } else if c == '"' || c == '\'' || c == '(' || c == '\\' || c == ' ' || c <= 0x1F || c == 0x7F { + if c != '\\' || !l.consumeEscape() { + return false + } + } else { + l.r.Move(1) + } + } + return true +} + +// consumeRemnantsBadUrl consumes bytes of a BadUrlToken so that normal tokenization may continue. +func (l *Lexer) consumeRemnantsBadURL() { + for { + if l.consumeByte(')') || l.Err() != nil { + break + } else if !l.consumeEscape() { + l.r.Move(1) + } + } +} + +// consumeIdentlike consumes IdentToken, FunctionToken or UrlToken. +func (l *Lexer) consumeIdentlike() TokenType { + if l.consumeIdentToken() { + if l.r.Peek(0) != '(' { + return IdentToken + } else if !parse.EqualFold(bytes.Replace(l.r.Lexeme(), []byte{'\\'}, nil, -1), []byte{'u', 'r', 'l'}) { + l.r.Move(1) + return FunctionToken + } + l.r.Move(1) + + // consume url + for l.consumeWhitespace() { + } + if c := l.r.Peek(0); c == '"' || c == '\'' { + if l.consumeString() == BadStringToken { + l.consumeRemnantsBadURL() + return BadURLToken + } + } else if !l.consumeUnquotedURL() && !l.consumeWhitespace() { + l.consumeRemnantsBadURL() + return BadURLToken + } + for l.consumeWhitespace() { + } + if !l.consumeByte(')') && l.Err() != io.EOF { + l.consumeRemnantsBadURL() + return BadURLToken + } + return URLToken + } + return ErrorToken +} |