diff options
Diffstat (limited to 'vendor/github.com/tdewolff/parse/css/lex.go')
-rw-r--r-- | vendor/github.com/tdewolff/parse/css/lex.go | 710 |
1 files changed, 0 insertions, 710 deletions
diff --git a/vendor/github.com/tdewolff/parse/css/lex.go b/vendor/github.com/tdewolff/parse/css/lex.go deleted file mode 100644 index 3924bb7..0000000 --- a/vendor/github.com/tdewolff/parse/css/lex.go +++ /dev/null @@ -1,710 +0,0 @@ -// Package css is a CSS3 lexer and parser following the specifications at http://www.w3.org/TR/css-syntax-3/. -package css // import "github.com/tdewolff/parse/css" - -// TODO: \uFFFD replacement character for NULL bytes in strings for example, or atleast don't end the string early - -import ( - "bytes" - "io" - "strconv" - - "github.com/tdewolff/parse" - "github.com/tdewolff/parse/buffer" -) - -// TokenType determines the type of token, eg. a number or a semicolon. -type TokenType uint32 - -// TokenType values. -const ( - ErrorToken TokenType = iota // extra token when errors occur - IdentToken - FunctionToken // rgb( rgba( ... - AtKeywordToken // @abc - HashToken // #abc - StringToken - BadStringToken - URLToken - BadURLToken - DelimToken // any unmatched character - NumberToken // 5 - PercentageToken // 5% - DimensionToken // 5em - UnicodeRangeToken // U+554A - IncludeMatchToken // ~= - DashMatchToken // |= - PrefixMatchToken // ^= - SuffixMatchToken // $= - SubstringMatchToken // *= - ColumnToken // || - WhitespaceToken // space \t \r \n \f - CDOToken // <!-- - CDCToken // --> - ColonToken // : - SemicolonToken // ; - CommaToken // , - LeftBracketToken // [ - RightBracketToken // ] - LeftParenthesisToken // ( - RightParenthesisToken // ) - LeftBraceToken // { - RightBraceToken // } - CommentToken // extra token for comments - EmptyToken - CustomPropertyNameToken - CustomPropertyValueToken -) - -// String returns the string representation of a TokenType. -func (tt TokenType) String() string { - switch tt { - case ErrorToken: - return "Error" - case IdentToken: - return "Ident" - case FunctionToken: - return "Function" - case AtKeywordToken: - return "AtKeyword" - case HashToken: - return "Hash" - case StringToken: - return "String" - case BadStringToken: - return "BadString" - case URLToken: - return "URL" - case BadURLToken: - return "BadURL" - case DelimToken: - return "Delim" - case NumberToken: - return "Number" - case PercentageToken: - return "Percentage" - case DimensionToken: - return "Dimension" - case UnicodeRangeToken: - return "UnicodeRange" - case IncludeMatchToken: - return "IncludeMatch" - case DashMatchToken: - return "DashMatch" - case PrefixMatchToken: - return "PrefixMatch" - case SuffixMatchToken: - return "SuffixMatch" - case SubstringMatchToken: - return "SubstringMatch" - case ColumnToken: - return "Column" - case WhitespaceToken: - return "Whitespace" - case CDOToken: - return "CDO" - case CDCToken: - return "CDC" - case ColonToken: - return "Colon" - case SemicolonToken: - return "Semicolon" - case CommaToken: - return "Comma" - case LeftBracketToken: - return "LeftBracket" - case RightBracketToken: - return "RightBracket" - case LeftParenthesisToken: - return "LeftParenthesis" - case RightParenthesisToken: - return "RightParenthesis" - case LeftBraceToken: - return "LeftBrace" - case RightBraceToken: - return "RightBrace" - case CommentToken: - return "Comment" - case EmptyToken: - return "Empty" - case CustomPropertyNameToken: - return "CustomPropertyName" - case CustomPropertyValueToken: - return "CustomPropertyValue" - } - return "Invalid(" + strconv.Itoa(int(tt)) + ")" -} - -//////////////////////////////////////////////////////////////// - -// Lexer is the state for the lexer. -type Lexer struct { - r *buffer.Lexer -} - -// NewLexer returns a new Lexer for a given io.Reader. -func NewLexer(r io.Reader) *Lexer { - return &Lexer{ - buffer.NewLexer(r), - } -} - -// Err returns the error encountered during lexing, this is often io.EOF but also other errors can be returned. -func (l *Lexer) Err() error { - return l.r.Err() -} - -// Restore restores the NULL byte at the end of the buffer. -func (l *Lexer) Restore() { - l.r.Restore() -} - -// Next returns the next Token. It returns ErrorToken when an error was encountered. Using Err() one can retrieve the error message. -func (l *Lexer) Next() (TokenType, []byte) { - switch l.r.Peek(0) { - case ' ', '\t', '\n', '\r', '\f': - l.r.Move(1) - for l.consumeWhitespace() { - } - return WhitespaceToken, l.r.Shift() - case ':': - l.r.Move(1) - return ColonToken, l.r.Shift() - case ';': - l.r.Move(1) - return SemicolonToken, l.r.Shift() - case ',': - l.r.Move(1) - return CommaToken, l.r.Shift() - case '(', ')', '[', ']', '{', '}': - if t := l.consumeBracket(); t != ErrorToken { - return t, l.r.Shift() - } - case '#': - if l.consumeHashToken() { - return HashToken, l.r.Shift() - } - case '"', '\'': - if t := l.consumeString(); t != ErrorToken { - return t, l.r.Shift() - } - case '.', '+': - if t := l.consumeNumeric(); t != ErrorToken { - return t, l.r.Shift() - } - case '-': - if t := l.consumeNumeric(); t != ErrorToken { - return t, l.r.Shift() - } else if t := l.consumeIdentlike(); t != ErrorToken { - return t, l.r.Shift() - } else if l.consumeCDCToken() { - return CDCToken, l.r.Shift() - } else if l.consumeCustomVariableToken() { - return CustomPropertyNameToken, l.r.Shift() - } - case '@': - if l.consumeAtKeywordToken() { - return AtKeywordToken, l.r.Shift() - } - case '$', '*', '^', '~': - if t := l.consumeMatch(); t != ErrorToken { - return t, l.r.Shift() - } - case '/': - if l.consumeComment() { - return CommentToken, l.r.Shift() - } - case '<': - if l.consumeCDOToken() { - return CDOToken, l.r.Shift() - } - case '\\': - if t := l.consumeIdentlike(); t != ErrorToken { - return t, l.r.Shift() - } - case 'u', 'U': - if l.consumeUnicodeRangeToken() { - return UnicodeRangeToken, l.r.Shift() - } else if t := l.consumeIdentlike(); t != ErrorToken { - return t, l.r.Shift() - } - case '|': - if t := l.consumeMatch(); t != ErrorToken { - return t, l.r.Shift() - } else if l.consumeColumnToken() { - return ColumnToken, l.r.Shift() - } - case 0: - if l.Err() != nil { - return ErrorToken, nil - } - default: - if t := l.consumeNumeric(); t != ErrorToken { - return t, l.r.Shift() - } else if t := l.consumeIdentlike(); t != ErrorToken { - return t, l.r.Shift() - } - } - // can't be rune because consumeIdentlike consumes that as an identifier - l.r.Move(1) - return DelimToken, l.r.Shift() -} - -//////////////////////////////////////////////////////////////// - -/* -The following functions follow the railroad diagrams in http://www.w3.org/TR/css3-syntax/ -*/ - -func (l *Lexer) consumeByte(c byte) bool { - if l.r.Peek(0) == c { - l.r.Move(1) - return true - } - return false -} - -func (l *Lexer) consumeComment() bool { - if l.r.Peek(0) != '/' || l.r.Peek(1) != '*' { - return false - } - l.r.Move(2) - for { - c := l.r.Peek(0) - if c == 0 && l.Err() != nil { - break - } else if c == '*' && l.r.Peek(1) == '/' { - l.r.Move(2) - return true - } - l.r.Move(1) - } - return true -} - -func (l *Lexer) consumeNewline() bool { - c := l.r.Peek(0) - if c == '\n' || c == '\f' { - l.r.Move(1) - return true - } else if c == '\r' { - if l.r.Peek(1) == '\n' { - l.r.Move(2) - } else { - l.r.Move(1) - } - return true - } - return false -} - -func (l *Lexer) consumeWhitespace() bool { - c := l.r.Peek(0) - if c == ' ' || c == '\t' || c == '\n' || c == '\r' || c == '\f' { - l.r.Move(1) - return true - } - return false -} - -func (l *Lexer) consumeDigit() bool { - c := l.r.Peek(0) - if c >= '0' && c <= '9' { - l.r.Move(1) - return true - } - return false -} - -func (l *Lexer) consumeHexDigit() bool { - c := l.r.Peek(0) - if (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F') { - l.r.Move(1) - return true - } - return false -} - -func (l *Lexer) consumeEscape() bool { - if l.r.Peek(0) != '\\' { - return false - } - mark := l.r.Pos() - l.r.Move(1) - if l.consumeNewline() { - l.r.Rewind(mark) - return false - } else if l.consumeHexDigit() { - for k := 1; k < 6; k++ { - if !l.consumeHexDigit() { - break - } - } - l.consumeWhitespace() - return true - } else { - c := l.r.Peek(0) - if c >= 0xC0 { - _, n := l.r.PeekRune(0) - l.r.Move(n) - return true - } else if c == 0 && l.r.Err() != nil { - return true - } - } - l.r.Move(1) - return true -} - -func (l *Lexer) consumeIdentToken() bool { - mark := l.r.Pos() - if l.r.Peek(0) == '-' { - l.r.Move(1) - } - c := l.r.Peek(0) - if !((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_' || c >= 0x80) { - if c != '\\' || !l.consumeEscape() { - l.r.Rewind(mark) - return false - } - } else { - l.r.Move(1) - } - for { - c := l.r.Peek(0) - if !((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') || c == '_' || c == '-' || c >= 0x80) { - if c != '\\' || !l.consumeEscape() { - break - } - } else { - l.r.Move(1) - } - } - return true -} - -// support custom variables, https://www.w3.org/TR/css-variables-1/ -func (l *Lexer) consumeCustomVariableToken() bool { - // expect to be on a '-' - l.r.Move(1) - if l.r.Peek(0) != '-' { - l.r.Move(-1) - return false - } - if !l.consumeIdentToken() { - l.r.Move(-1) - return false - } - return true -} - -func (l *Lexer) consumeAtKeywordToken() bool { - // expect to be on an '@' - l.r.Move(1) - if !l.consumeIdentToken() { - l.r.Move(-1) - return false - } - return true -} - -func (l *Lexer) consumeHashToken() bool { - // expect to be on a '#' - mark := l.r.Pos() - l.r.Move(1) - c := l.r.Peek(0) - if !((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') || c == '_' || c == '-' || c >= 0x80) { - if c != '\\' || !l.consumeEscape() { - l.r.Rewind(mark) - return false - } - } else { - l.r.Move(1) - } - for { - c := l.r.Peek(0) - if !((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') || c == '_' || c == '-' || c >= 0x80) { - if c != '\\' || !l.consumeEscape() { - break - } - } else { - l.r.Move(1) - } - } - return true -} - -func (l *Lexer) consumeNumberToken() bool { - mark := l.r.Pos() - c := l.r.Peek(0) - if c == '+' || c == '-' { - l.r.Move(1) - } - firstDigit := l.consumeDigit() - if firstDigit { - for l.consumeDigit() { - } - } - if l.r.Peek(0) == '.' { - l.r.Move(1) - if l.consumeDigit() { - for l.consumeDigit() { - } - } else if firstDigit { - // . could belong to the next token - l.r.Move(-1) - return true - } else { - l.r.Rewind(mark) - return false - } - } else if !firstDigit { - l.r.Rewind(mark) - return false - } - mark = l.r.Pos() - c = l.r.Peek(0) - if c == 'e' || c == 'E' { - l.r.Move(1) - c = l.r.Peek(0) - if c == '+' || c == '-' { - l.r.Move(1) - } - if !l.consumeDigit() { - // e could belong to next token - l.r.Rewind(mark) - return true - } - for l.consumeDigit() { - } - } - return true -} - -func (l *Lexer) consumeUnicodeRangeToken() bool { - c := l.r.Peek(0) - if (c != 'u' && c != 'U') || l.r.Peek(1) != '+' { - return false - } - mark := l.r.Pos() - l.r.Move(2) - if l.consumeHexDigit() { - // consume up to 6 hexDigits - k := 1 - for ; k < 6; k++ { - if !l.consumeHexDigit() { - break - } - } - - // either a minus or a question mark or the end is expected - if l.consumeByte('-') { - // consume another up to 6 hexDigits - if l.consumeHexDigit() { - for k := 1; k < 6; k++ { - if !l.consumeHexDigit() { - break - } - } - } else { - l.r.Rewind(mark) - return false - } - } else { - // could be filled up to 6 characters with question marks or else regular hexDigits - if l.consumeByte('?') { - k++ - for ; k < 6; k++ { - if !l.consumeByte('?') { - l.r.Rewind(mark) - return false - } - } - } - } - } else { - // consume 6 question marks - for k := 0; k < 6; k++ { - if !l.consumeByte('?') { - l.r.Rewind(mark) - return false - } - } - } - return true -} - -func (l *Lexer) consumeColumnToken() bool { - if l.r.Peek(0) == '|' && l.r.Peek(1) == '|' { - l.r.Move(2) - return true - } - return false -} - -func (l *Lexer) consumeCDOToken() bool { - if l.r.Peek(0) == '<' && l.r.Peek(1) == '!' && l.r.Peek(2) == '-' && l.r.Peek(3) == '-' { - l.r.Move(4) - return true - } - return false -} - -func (l *Lexer) consumeCDCToken() bool { - if l.r.Peek(0) == '-' && l.r.Peek(1) == '-' && l.r.Peek(2) == '>' { - l.r.Move(3) - return true - } - return false -} - -//////////////////////////////////////////////////////////////// - -// consumeMatch consumes any MatchToken. -func (l *Lexer) consumeMatch() TokenType { - if l.r.Peek(1) == '=' { - switch l.r.Peek(0) { - case '~': - l.r.Move(2) - return IncludeMatchToken - case '|': - l.r.Move(2) - return DashMatchToken - case '^': - l.r.Move(2) - return PrefixMatchToken - case '$': - l.r.Move(2) - return SuffixMatchToken - case '*': - l.r.Move(2) - return SubstringMatchToken - } - } - return ErrorToken -} - -// consumeBracket consumes any bracket token. -func (l *Lexer) consumeBracket() TokenType { - switch l.r.Peek(0) { - case '(': - l.r.Move(1) - return LeftParenthesisToken - case ')': - l.r.Move(1) - return RightParenthesisToken - case '[': - l.r.Move(1) - return LeftBracketToken - case ']': - l.r.Move(1) - return RightBracketToken - case '{': - l.r.Move(1) - return LeftBraceToken - case '}': - l.r.Move(1) - return RightBraceToken - } - return ErrorToken -} - -// consumeNumeric consumes NumberToken, PercentageToken or DimensionToken. -func (l *Lexer) consumeNumeric() TokenType { - if l.consumeNumberToken() { - if l.consumeByte('%') { - return PercentageToken - } else if l.consumeIdentToken() { - return DimensionToken - } - return NumberToken - } - return ErrorToken -} - -// consumeString consumes a string and may return BadStringToken when a newline is encountered. -func (l *Lexer) consumeString() TokenType { - // assume to be on " or ' - delim := l.r.Peek(0) - l.r.Move(1) - for { - c := l.r.Peek(0) - if c == 0 && l.Err() != nil { - break - } else if c == '\n' || c == '\r' || c == '\f' { - l.r.Move(1) - return BadStringToken - } else if c == delim { - l.r.Move(1) - break - } else if c == '\\' { - if !l.consumeEscape() { - l.r.Move(1) - l.consumeNewline() - } - } else { - l.r.Move(1) - } - } - return StringToken -} - -func (l *Lexer) consumeUnquotedURL() bool { - for { - c := l.r.Peek(0) - if c == 0 && l.Err() != nil || c == ')' { - break - } else if c == '"' || c == '\'' || c == '(' || c == '\\' || c == ' ' || c <= 0x1F || c == 0x7F { - if c != '\\' || !l.consumeEscape() { - return false - } - } else { - l.r.Move(1) - } - } - return true -} - -// consumeRemnantsBadUrl consumes bytes of a BadUrlToken so that normal tokenization may continue. -func (l *Lexer) consumeRemnantsBadURL() { - for { - if l.consumeByte(')') || l.Err() != nil { - break - } else if !l.consumeEscape() { - l.r.Move(1) - } - } -} - -// consumeIdentlike consumes IdentToken, FunctionToken or UrlToken. -func (l *Lexer) consumeIdentlike() TokenType { - if l.consumeIdentToken() { - if l.r.Peek(0) != '(' { - return IdentToken - } else if !parse.EqualFold(bytes.Replace(l.r.Lexeme(), []byte{'\\'}, nil, -1), []byte{'u', 'r', 'l'}) { - l.r.Move(1) - return FunctionToken - } - l.r.Move(1) - - // consume url - for l.consumeWhitespace() { - } - if c := l.r.Peek(0); c == '"' || c == '\'' { - if l.consumeString() == BadStringToken { - l.consumeRemnantsBadURL() - return BadURLToken - } - } else if !l.consumeUnquotedURL() && !l.consumeWhitespace() { - l.consumeRemnantsBadURL() - return BadURLToken - } - for l.consumeWhitespace() { - } - if !l.consumeByte(')') && l.Err() != io.EOF { - l.consumeRemnantsBadURL() - return BadURLToken - } - return URLToken - } - return ErrorToken -} |