aboutsummaryrefslogtreecommitdiffhomepage
path: root/vendor/github.com/tdewolff/parse/css/lex.go
diff options
context:
space:
mode:
Diffstat (limited to 'vendor/github.com/tdewolff/parse/css/lex.go')
-rw-r--r--vendor/github.com/tdewolff/parse/css/lex.go710
1 files changed, 710 insertions, 0 deletions
diff --git a/vendor/github.com/tdewolff/parse/css/lex.go b/vendor/github.com/tdewolff/parse/css/lex.go
new file mode 100644
index 0000000..3924bb7
--- /dev/null
+++ b/vendor/github.com/tdewolff/parse/css/lex.go
@@ -0,0 +1,710 @@
+// Package css is a CSS3 lexer and parser following the specifications at http://www.w3.org/TR/css-syntax-3/.
+package css // import "github.com/tdewolff/parse/css"
+
+// TODO: \uFFFD replacement character for NULL bytes in strings for example, or atleast don't end the string early
+
+import (
+ "bytes"
+ "io"
+ "strconv"
+
+ "github.com/tdewolff/parse"
+ "github.com/tdewolff/parse/buffer"
+)
+
+// TokenType determines the type of token, eg. a number or a semicolon.
+type TokenType uint32
+
+// TokenType values.
+const (
+ ErrorToken TokenType = iota // extra token when errors occur
+ IdentToken
+ FunctionToken // rgb( rgba( ...
+ AtKeywordToken // @abc
+ HashToken // #abc
+ StringToken
+ BadStringToken
+ URLToken
+ BadURLToken
+ DelimToken // any unmatched character
+ NumberToken // 5
+ PercentageToken // 5%
+ DimensionToken // 5em
+ UnicodeRangeToken // U+554A
+ IncludeMatchToken // ~=
+ DashMatchToken // |=
+ PrefixMatchToken // ^=
+ SuffixMatchToken // $=
+ SubstringMatchToken // *=
+ ColumnToken // ||
+ WhitespaceToken // space \t \r \n \f
+ CDOToken // <!--
+ CDCToken // -->
+ ColonToken // :
+ SemicolonToken // ;
+ CommaToken // ,
+ LeftBracketToken // [
+ RightBracketToken // ]
+ LeftParenthesisToken // (
+ RightParenthesisToken // )
+ LeftBraceToken // {
+ RightBraceToken // }
+ CommentToken // extra token for comments
+ EmptyToken
+ CustomPropertyNameToken
+ CustomPropertyValueToken
+)
+
+// String returns the string representation of a TokenType.
+func (tt TokenType) String() string {
+ switch tt {
+ case ErrorToken:
+ return "Error"
+ case IdentToken:
+ return "Ident"
+ case FunctionToken:
+ return "Function"
+ case AtKeywordToken:
+ return "AtKeyword"
+ case HashToken:
+ return "Hash"
+ case StringToken:
+ return "String"
+ case BadStringToken:
+ return "BadString"
+ case URLToken:
+ return "URL"
+ case BadURLToken:
+ return "BadURL"
+ case DelimToken:
+ return "Delim"
+ case NumberToken:
+ return "Number"
+ case PercentageToken:
+ return "Percentage"
+ case DimensionToken:
+ return "Dimension"
+ case UnicodeRangeToken:
+ return "UnicodeRange"
+ case IncludeMatchToken:
+ return "IncludeMatch"
+ case DashMatchToken:
+ return "DashMatch"
+ case PrefixMatchToken:
+ return "PrefixMatch"
+ case SuffixMatchToken:
+ return "SuffixMatch"
+ case SubstringMatchToken:
+ return "SubstringMatch"
+ case ColumnToken:
+ return "Column"
+ case WhitespaceToken:
+ return "Whitespace"
+ case CDOToken:
+ return "CDO"
+ case CDCToken:
+ return "CDC"
+ case ColonToken:
+ return "Colon"
+ case SemicolonToken:
+ return "Semicolon"
+ case CommaToken:
+ return "Comma"
+ case LeftBracketToken:
+ return "LeftBracket"
+ case RightBracketToken:
+ return "RightBracket"
+ case LeftParenthesisToken:
+ return "LeftParenthesis"
+ case RightParenthesisToken:
+ return "RightParenthesis"
+ case LeftBraceToken:
+ return "LeftBrace"
+ case RightBraceToken:
+ return "RightBrace"
+ case CommentToken:
+ return "Comment"
+ case EmptyToken:
+ return "Empty"
+ case CustomPropertyNameToken:
+ return "CustomPropertyName"
+ case CustomPropertyValueToken:
+ return "CustomPropertyValue"
+ }
+ return "Invalid(" + strconv.Itoa(int(tt)) + ")"
+}
+
+////////////////////////////////////////////////////////////////
+
+// Lexer is the state for the lexer.
+type Lexer struct {
+ r *buffer.Lexer
+}
+
+// NewLexer returns a new Lexer for a given io.Reader.
+func NewLexer(r io.Reader) *Lexer {
+ return &Lexer{
+ buffer.NewLexer(r),
+ }
+}
+
+// Err returns the error encountered during lexing, this is often io.EOF but also other errors can be returned.
+func (l *Lexer) Err() error {
+ return l.r.Err()
+}
+
+// Restore restores the NULL byte at the end of the buffer.
+func (l *Lexer) Restore() {
+ l.r.Restore()
+}
+
+// Next returns the next Token. It returns ErrorToken when an error was encountered. Using Err() one can retrieve the error message.
+func (l *Lexer) Next() (TokenType, []byte) {
+ switch l.r.Peek(0) {
+ case ' ', '\t', '\n', '\r', '\f':
+ l.r.Move(1)
+ for l.consumeWhitespace() {
+ }
+ return WhitespaceToken, l.r.Shift()
+ case ':':
+ l.r.Move(1)
+ return ColonToken, l.r.Shift()
+ case ';':
+ l.r.Move(1)
+ return SemicolonToken, l.r.Shift()
+ case ',':
+ l.r.Move(1)
+ return CommaToken, l.r.Shift()
+ case '(', ')', '[', ']', '{', '}':
+ if t := l.consumeBracket(); t != ErrorToken {
+ return t, l.r.Shift()
+ }
+ case '#':
+ if l.consumeHashToken() {
+ return HashToken, l.r.Shift()
+ }
+ case '"', '\'':
+ if t := l.consumeString(); t != ErrorToken {
+ return t, l.r.Shift()
+ }
+ case '.', '+':
+ if t := l.consumeNumeric(); t != ErrorToken {
+ return t, l.r.Shift()
+ }
+ case '-':
+ if t := l.consumeNumeric(); t != ErrorToken {
+ return t, l.r.Shift()
+ } else if t := l.consumeIdentlike(); t != ErrorToken {
+ return t, l.r.Shift()
+ } else if l.consumeCDCToken() {
+ return CDCToken, l.r.Shift()
+ } else if l.consumeCustomVariableToken() {
+ return CustomPropertyNameToken, l.r.Shift()
+ }
+ case '@':
+ if l.consumeAtKeywordToken() {
+ return AtKeywordToken, l.r.Shift()
+ }
+ case '$', '*', '^', '~':
+ if t := l.consumeMatch(); t != ErrorToken {
+ return t, l.r.Shift()
+ }
+ case '/':
+ if l.consumeComment() {
+ return CommentToken, l.r.Shift()
+ }
+ case '<':
+ if l.consumeCDOToken() {
+ return CDOToken, l.r.Shift()
+ }
+ case '\\':
+ if t := l.consumeIdentlike(); t != ErrorToken {
+ return t, l.r.Shift()
+ }
+ case 'u', 'U':
+ if l.consumeUnicodeRangeToken() {
+ return UnicodeRangeToken, l.r.Shift()
+ } else if t := l.consumeIdentlike(); t != ErrorToken {
+ return t, l.r.Shift()
+ }
+ case '|':
+ if t := l.consumeMatch(); t != ErrorToken {
+ return t, l.r.Shift()
+ } else if l.consumeColumnToken() {
+ return ColumnToken, l.r.Shift()
+ }
+ case 0:
+ if l.Err() != nil {
+ return ErrorToken, nil
+ }
+ default:
+ if t := l.consumeNumeric(); t != ErrorToken {
+ return t, l.r.Shift()
+ } else if t := l.consumeIdentlike(); t != ErrorToken {
+ return t, l.r.Shift()
+ }
+ }
+ // can't be rune because consumeIdentlike consumes that as an identifier
+ l.r.Move(1)
+ return DelimToken, l.r.Shift()
+}
+
+////////////////////////////////////////////////////////////////
+
+/*
+The following functions follow the railroad diagrams in http://www.w3.org/TR/css3-syntax/
+*/
+
+func (l *Lexer) consumeByte(c byte) bool {
+ if l.r.Peek(0) == c {
+ l.r.Move(1)
+ return true
+ }
+ return false
+}
+
+func (l *Lexer) consumeComment() bool {
+ if l.r.Peek(0) != '/' || l.r.Peek(1) != '*' {
+ return false
+ }
+ l.r.Move(2)
+ for {
+ c := l.r.Peek(0)
+ if c == 0 && l.Err() != nil {
+ break
+ } else if c == '*' && l.r.Peek(1) == '/' {
+ l.r.Move(2)
+ return true
+ }
+ l.r.Move(1)
+ }
+ return true
+}
+
+func (l *Lexer) consumeNewline() bool {
+ c := l.r.Peek(0)
+ if c == '\n' || c == '\f' {
+ l.r.Move(1)
+ return true
+ } else if c == '\r' {
+ if l.r.Peek(1) == '\n' {
+ l.r.Move(2)
+ } else {
+ l.r.Move(1)
+ }
+ return true
+ }
+ return false
+}
+
+func (l *Lexer) consumeWhitespace() bool {
+ c := l.r.Peek(0)
+ if c == ' ' || c == '\t' || c == '\n' || c == '\r' || c == '\f' {
+ l.r.Move(1)
+ return true
+ }
+ return false
+}
+
+func (l *Lexer) consumeDigit() bool {
+ c := l.r.Peek(0)
+ if c >= '0' && c <= '9' {
+ l.r.Move(1)
+ return true
+ }
+ return false
+}
+
+func (l *Lexer) consumeHexDigit() bool {
+ c := l.r.Peek(0)
+ if (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F') {
+ l.r.Move(1)
+ return true
+ }
+ return false
+}
+
+func (l *Lexer) consumeEscape() bool {
+ if l.r.Peek(0) != '\\' {
+ return false
+ }
+ mark := l.r.Pos()
+ l.r.Move(1)
+ if l.consumeNewline() {
+ l.r.Rewind(mark)
+ return false
+ } else if l.consumeHexDigit() {
+ for k := 1; k < 6; k++ {
+ if !l.consumeHexDigit() {
+ break
+ }
+ }
+ l.consumeWhitespace()
+ return true
+ } else {
+ c := l.r.Peek(0)
+ if c >= 0xC0 {
+ _, n := l.r.PeekRune(0)
+ l.r.Move(n)
+ return true
+ } else if c == 0 && l.r.Err() != nil {
+ return true
+ }
+ }
+ l.r.Move(1)
+ return true
+}
+
+func (l *Lexer) consumeIdentToken() bool {
+ mark := l.r.Pos()
+ if l.r.Peek(0) == '-' {
+ l.r.Move(1)
+ }
+ c := l.r.Peek(0)
+ if !((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_' || c >= 0x80) {
+ if c != '\\' || !l.consumeEscape() {
+ l.r.Rewind(mark)
+ return false
+ }
+ } else {
+ l.r.Move(1)
+ }
+ for {
+ c := l.r.Peek(0)
+ if !((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') || c == '_' || c == '-' || c >= 0x80) {
+ if c != '\\' || !l.consumeEscape() {
+ break
+ }
+ } else {
+ l.r.Move(1)
+ }
+ }
+ return true
+}
+
+// support custom variables, https://www.w3.org/TR/css-variables-1/
+func (l *Lexer) consumeCustomVariableToken() bool {
+ // expect to be on a '-'
+ l.r.Move(1)
+ if l.r.Peek(0) != '-' {
+ l.r.Move(-1)
+ return false
+ }
+ if !l.consumeIdentToken() {
+ l.r.Move(-1)
+ return false
+ }
+ return true
+}
+
+func (l *Lexer) consumeAtKeywordToken() bool {
+ // expect to be on an '@'
+ l.r.Move(1)
+ if !l.consumeIdentToken() {
+ l.r.Move(-1)
+ return false
+ }
+ return true
+}
+
+func (l *Lexer) consumeHashToken() bool {
+ // expect to be on a '#'
+ mark := l.r.Pos()
+ l.r.Move(1)
+ c := l.r.Peek(0)
+ if !((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') || c == '_' || c == '-' || c >= 0x80) {
+ if c != '\\' || !l.consumeEscape() {
+ l.r.Rewind(mark)
+ return false
+ }
+ } else {
+ l.r.Move(1)
+ }
+ for {
+ c := l.r.Peek(0)
+ if !((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') || c == '_' || c == '-' || c >= 0x80) {
+ if c != '\\' || !l.consumeEscape() {
+ break
+ }
+ } else {
+ l.r.Move(1)
+ }
+ }
+ return true
+}
+
+func (l *Lexer) consumeNumberToken() bool {
+ mark := l.r.Pos()
+ c := l.r.Peek(0)
+ if c == '+' || c == '-' {
+ l.r.Move(1)
+ }
+ firstDigit := l.consumeDigit()
+ if firstDigit {
+ for l.consumeDigit() {
+ }
+ }
+ if l.r.Peek(0) == '.' {
+ l.r.Move(1)
+ if l.consumeDigit() {
+ for l.consumeDigit() {
+ }
+ } else if firstDigit {
+ // . could belong to the next token
+ l.r.Move(-1)
+ return true
+ } else {
+ l.r.Rewind(mark)
+ return false
+ }
+ } else if !firstDigit {
+ l.r.Rewind(mark)
+ return false
+ }
+ mark = l.r.Pos()
+ c = l.r.Peek(0)
+ if c == 'e' || c == 'E' {
+ l.r.Move(1)
+ c = l.r.Peek(0)
+ if c == '+' || c == '-' {
+ l.r.Move(1)
+ }
+ if !l.consumeDigit() {
+ // e could belong to next token
+ l.r.Rewind(mark)
+ return true
+ }
+ for l.consumeDigit() {
+ }
+ }
+ return true
+}
+
+func (l *Lexer) consumeUnicodeRangeToken() bool {
+ c := l.r.Peek(0)
+ if (c != 'u' && c != 'U') || l.r.Peek(1) != '+' {
+ return false
+ }
+ mark := l.r.Pos()
+ l.r.Move(2)
+ if l.consumeHexDigit() {
+ // consume up to 6 hexDigits
+ k := 1
+ for ; k < 6; k++ {
+ if !l.consumeHexDigit() {
+ break
+ }
+ }
+
+ // either a minus or a question mark or the end is expected
+ if l.consumeByte('-') {
+ // consume another up to 6 hexDigits
+ if l.consumeHexDigit() {
+ for k := 1; k < 6; k++ {
+ if !l.consumeHexDigit() {
+ break
+ }
+ }
+ } else {
+ l.r.Rewind(mark)
+ return false
+ }
+ } else {
+ // could be filled up to 6 characters with question marks or else regular hexDigits
+ if l.consumeByte('?') {
+ k++
+ for ; k < 6; k++ {
+ if !l.consumeByte('?') {
+ l.r.Rewind(mark)
+ return false
+ }
+ }
+ }
+ }
+ } else {
+ // consume 6 question marks
+ for k := 0; k < 6; k++ {
+ if !l.consumeByte('?') {
+ l.r.Rewind(mark)
+ return false
+ }
+ }
+ }
+ return true
+}
+
+func (l *Lexer) consumeColumnToken() bool {
+ if l.r.Peek(0) == '|' && l.r.Peek(1) == '|' {
+ l.r.Move(2)
+ return true
+ }
+ return false
+}
+
+func (l *Lexer) consumeCDOToken() bool {
+ if l.r.Peek(0) == '<' && l.r.Peek(1) == '!' && l.r.Peek(2) == '-' && l.r.Peek(3) == '-' {
+ l.r.Move(4)
+ return true
+ }
+ return false
+}
+
+func (l *Lexer) consumeCDCToken() bool {
+ if l.r.Peek(0) == '-' && l.r.Peek(1) == '-' && l.r.Peek(2) == '>' {
+ l.r.Move(3)
+ return true
+ }
+ return false
+}
+
+////////////////////////////////////////////////////////////////
+
+// consumeMatch consumes any MatchToken.
+func (l *Lexer) consumeMatch() TokenType {
+ if l.r.Peek(1) == '=' {
+ switch l.r.Peek(0) {
+ case '~':
+ l.r.Move(2)
+ return IncludeMatchToken
+ case '|':
+ l.r.Move(2)
+ return DashMatchToken
+ case '^':
+ l.r.Move(2)
+ return PrefixMatchToken
+ case '$':
+ l.r.Move(2)
+ return SuffixMatchToken
+ case '*':
+ l.r.Move(2)
+ return SubstringMatchToken
+ }
+ }
+ return ErrorToken
+}
+
+// consumeBracket consumes any bracket token.
+func (l *Lexer) consumeBracket() TokenType {
+ switch l.r.Peek(0) {
+ case '(':
+ l.r.Move(1)
+ return LeftParenthesisToken
+ case ')':
+ l.r.Move(1)
+ return RightParenthesisToken
+ case '[':
+ l.r.Move(1)
+ return LeftBracketToken
+ case ']':
+ l.r.Move(1)
+ return RightBracketToken
+ case '{':
+ l.r.Move(1)
+ return LeftBraceToken
+ case '}':
+ l.r.Move(1)
+ return RightBraceToken
+ }
+ return ErrorToken
+}
+
+// consumeNumeric consumes NumberToken, PercentageToken or DimensionToken.
+func (l *Lexer) consumeNumeric() TokenType {
+ if l.consumeNumberToken() {
+ if l.consumeByte('%') {
+ return PercentageToken
+ } else if l.consumeIdentToken() {
+ return DimensionToken
+ }
+ return NumberToken
+ }
+ return ErrorToken
+}
+
+// consumeString consumes a string and may return BadStringToken when a newline is encountered.
+func (l *Lexer) consumeString() TokenType {
+ // assume to be on " or '
+ delim := l.r.Peek(0)
+ l.r.Move(1)
+ for {
+ c := l.r.Peek(0)
+ if c == 0 && l.Err() != nil {
+ break
+ } else if c == '\n' || c == '\r' || c == '\f' {
+ l.r.Move(1)
+ return BadStringToken
+ } else if c == delim {
+ l.r.Move(1)
+ break
+ } else if c == '\\' {
+ if !l.consumeEscape() {
+ l.r.Move(1)
+ l.consumeNewline()
+ }
+ } else {
+ l.r.Move(1)
+ }
+ }
+ return StringToken
+}
+
+func (l *Lexer) consumeUnquotedURL() bool {
+ for {
+ c := l.r.Peek(0)
+ if c == 0 && l.Err() != nil || c == ')' {
+ break
+ } else if c == '"' || c == '\'' || c == '(' || c == '\\' || c == ' ' || c <= 0x1F || c == 0x7F {
+ if c != '\\' || !l.consumeEscape() {
+ return false
+ }
+ } else {
+ l.r.Move(1)
+ }
+ }
+ return true
+}
+
+// consumeRemnantsBadUrl consumes bytes of a BadUrlToken so that normal tokenization may continue.
+func (l *Lexer) consumeRemnantsBadURL() {
+ for {
+ if l.consumeByte(')') || l.Err() != nil {
+ break
+ } else if !l.consumeEscape() {
+ l.r.Move(1)
+ }
+ }
+}
+
+// consumeIdentlike consumes IdentToken, FunctionToken or UrlToken.
+func (l *Lexer) consumeIdentlike() TokenType {
+ if l.consumeIdentToken() {
+ if l.r.Peek(0) != '(' {
+ return IdentToken
+ } else if !parse.EqualFold(bytes.Replace(l.r.Lexeme(), []byte{'\\'}, nil, -1), []byte{'u', 'r', 'l'}) {
+ l.r.Move(1)
+ return FunctionToken
+ }
+ l.r.Move(1)
+
+ // consume url
+ for l.consumeWhitespace() {
+ }
+ if c := l.r.Peek(0); c == '"' || c == '\'' {
+ if l.consumeString() == BadStringToken {
+ l.consumeRemnantsBadURL()
+ return BadURLToken
+ }
+ } else if !l.consumeUnquotedURL() && !l.consumeWhitespace() {
+ l.consumeRemnantsBadURL()
+ return BadURLToken
+ }
+ for l.consumeWhitespace() {
+ }
+ if !l.consumeByte(')') && l.Err() != io.EOF {
+ l.consumeRemnantsBadURL()
+ return BadURLToken
+ }
+ return URLToken
+ }
+ return ErrorToken
+}