diff options
Diffstat (limited to 'vendor/github.com/tdewolff/parse/xml/lex.go')
-rw-r--r-- | vendor/github.com/tdewolff/parse/xml/lex.go | 345 |
1 files changed, 345 insertions, 0 deletions
diff --git a/vendor/github.com/tdewolff/parse/xml/lex.go b/vendor/github.com/tdewolff/parse/xml/lex.go new file mode 100644 index 0000000..0f1393c --- /dev/null +++ b/vendor/github.com/tdewolff/parse/xml/lex.go @@ -0,0 +1,345 @@ +// Package xml is an XML1.0 lexer following the specifications at http://www.w3.org/TR/xml/. +package xml // import "github.com/tdewolff/parse/xml" + +import ( + "io" + "strconv" + + "github.com/tdewolff/parse" + "github.com/tdewolff/parse/buffer" +) + +// TokenType determines the type of token, eg. a number or a semicolon. +type TokenType uint32 + +// TokenType values. +const ( + ErrorToken TokenType = iota // extra token when errors occur + CommentToken + DOCTYPEToken + CDATAToken + StartTagToken + StartTagPIToken + StartTagCloseToken + StartTagCloseVoidToken + StartTagClosePIToken + EndTagToken + AttributeToken + TextToken +) + +// String returns the string representation of a TokenType. +func (tt TokenType) String() string { + switch tt { + case ErrorToken: + return "Error" + case CommentToken: + return "Comment" + case DOCTYPEToken: + return "DOCTYPE" + case CDATAToken: + return "CDATA" + case StartTagToken: + return "StartTag" + case StartTagPIToken: + return "StartTagPI" + case StartTagCloseToken: + return "StartTagClose" + case StartTagCloseVoidToken: + return "StartTagCloseVoid" + case StartTagClosePIToken: + return "StartTagClosePI" + case EndTagToken: + return "EndTag" + case AttributeToken: + return "Attribute" + case TextToken: + return "Text" + } + return "Invalid(" + strconv.Itoa(int(tt)) + ")" +} + +//////////////////////////////////////////////////////////////// + +// Lexer is the state for the lexer. +type Lexer struct { + r *buffer.Lexer + err error + + inTag bool + + text []byte + attrVal []byte +} + +// NewLexer returns a new Lexer for a given io.Reader. +func NewLexer(r io.Reader) *Lexer { + return &Lexer{ + r: buffer.NewLexer(r), + } +} + +// Err returns the error encountered during lexing, this is often io.EOF but also other errors can be returned. +func (l *Lexer) Err() error { + err := l.r.Err() + if err != nil { + return err + } + return l.err +} + +// Restore restores the NULL byte at the end of the buffer. +func (l *Lexer) Restore() { + l.r.Restore() +} + +// Next returns the next Token. It returns ErrorToken when an error was encountered. Using Err() one can retrieve the error message. +func (l *Lexer) Next() (TokenType, []byte) { + l.text = nil + var c byte + if l.inTag { + l.attrVal = nil + for { // before attribute name state + if c = l.r.Peek(0); c == ' ' || c == '\t' || c == '\n' || c == '\r' { + l.r.Move(1) + continue + } + break + } + if c == 0 { + l.err = parse.NewErrorLexer("unexpected null character", l.r) + return ErrorToken, nil + } else if c != '>' && (c != '/' && c != '?' || l.r.Peek(1) != '>') { + return AttributeToken, l.shiftAttribute() + } + start := l.r.Pos() + l.inTag = false + if c == '/' { + l.r.Move(2) + l.text = l.r.Lexeme()[start:] + return StartTagCloseVoidToken, l.r.Shift() + } else if c == '?' { + l.r.Move(2) + l.text = l.r.Lexeme()[start:] + return StartTagClosePIToken, l.r.Shift() + } else { + l.r.Move(1) + l.text = l.r.Lexeme()[start:] + return StartTagCloseToken, l.r.Shift() + } + } + + for { + c = l.r.Peek(0) + if c == '<' { + if l.r.Pos() > 0 { + return TextToken, l.r.Shift() + } + c = l.r.Peek(1) + if c == '/' { + l.r.Move(2) + return EndTagToken, l.shiftEndTag() + } else if c == '!' { + l.r.Move(2) + if l.at('-', '-') { + l.r.Move(2) + return CommentToken, l.shiftCommentText() + } else if l.at('[', 'C', 'D', 'A', 'T', 'A', '[') { + l.r.Move(7) + return CDATAToken, l.shiftCDATAText() + } else if l.at('D', 'O', 'C', 'T', 'Y', 'P', 'E') { + l.r.Move(8) + return DOCTYPEToken, l.shiftDOCTYPEText() + } + l.r.Move(-2) + } else if c == '?' { + l.r.Move(2) + l.inTag = true + return StartTagPIToken, l.shiftStartTag() + } + l.r.Move(1) + l.inTag = true + return StartTagToken, l.shiftStartTag() + } else if c == 0 { + if l.r.Pos() > 0 { + return TextToken, l.r.Shift() + } + l.err = parse.NewErrorLexer("unexpected null character", l.r) + return ErrorToken, nil + } + l.r.Move(1) + } +} + +// Text returns the textual representation of a token. This excludes delimiters and additional leading/trailing characters. +func (l *Lexer) Text() []byte { + return l.text +} + +// AttrVal returns the attribute value when an AttributeToken was returned from Next. +func (l *Lexer) AttrVal() []byte { + return l.attrVal +} + +//////////////////////////////////////////////////////////////// + +// The following functions follow the specifications at http://www.w3.org/html/wg/drafts/html/master/syntax.html + +func (l *Lexer) shiftDOCTYPEText() []byte { + inString := false + inBrackets := false + for { + c := l.r.Peek(0) + if c == '"' { + inString = !inString + } else if (c == '[' || c == ']') && !inString { + inBrackets = (c == '[') + } else if c == '>' && !inString && !inBrackets { + l.text = l.r.Lexeme()[9:] + l.r.Move(1) + return l.r.Shift() + } else if c == 0 { + l.text = l.r.Lexeme()[9:] + return l.r.Shift() + } + l.r.Move(1) + } +} + +func (l *Lexer) shiftCDATAText() []byte { + for { + c := l.r.Peek(0) + if c == ']' && l.r.Peek(1) == ']' && l.r.Peek(2) == '>' { + l.text = l.r.Lexeme()[9:] + l.r.Move(3) + return l.r.Shift() + } else if c == 0 { + l.text = l.r.Lexeme()[9:] + return l.r.Shift() + } + l.r.Move(1) + } +} + +func (l *Lexer) shiftCommentText() []byte { + for { + c := l.r.Peek(0) + if c == '-' && l.r.Peek(1) == '-' && l.r.Peek(2) == '>' { + l.text = l.r.Lexeme()[4:] + l.r.Move(3) + return l.r.Shift() + } else if c == 0 { + return l.r.Shift() + } + l.r.Move(1) + } +} + +func (l *Lexer) shiftStartTag() []byte { + nameStart := l.r.Pos() + for { + if c := l.r.Peek(0); c == ' ' || c == '>' || (c == '/' || c == '?') && l.r.Peek(1) == '>' || c == '\t' || c == '\n' || c == '\r' || c == 0 { + break + } + l.r.Move(1) + } + l.text = l.r.Lexeme()[nameStart:] + return l.r.Shift() +} + +func (l *Lexer) shiftAttribute() []byte { + nameStart := l.r.Pos() + var c byte + for { // attribute name state + if c = l.r.Peek(0); c == ' ' || c == '=' || c == '>' || (c == '/' || c == '?') && l.r.Peek(1) == '>' || c == '\t' || c == '\n' || c == '\r' || c == 0 { + break + } + l.r.Move(1) + } + nameEnd := l.r.Pos() + for { // after attribute name state + if c = l.r.Peek(0); c == ' ' || c == '\t' || c == '\n' || c == '\r' { + l.r.Move(1) + continue + } + break + } + if c == '=' { + l.r.Move(1) + for { // before attribute value state + if c = l.r.Peek(0); c == ' ' || c == '\t' || c == '\n' || c == '\r' { + l.r.Move(1) + continue + } + break + } + attrPos := l.r.Pos() + delim := c + if delim == '"' || delim == '\'' { // attribute value single- and double-quoted state + l.r.Move(1) + for { + c = l.r.Peek(0) + if c == delim { + l.r.Move(1) + break + } else if c == 0 { + break + } + l.r.Move(1) + if c == '\t' || c == '\n' || c == '\r' { + l.r.Lexeme()[l.r.Pos()-1] = ' ' + } + } + } else { // attribute value unquoted state + for { + if c = l.r.Peek(0); c == ' ' || c == '>' || (c == '/' || c == '?') && l.r.Peek(1) == '>' || c == '\t' || c == '\n' || c == '\r' || c == 0 { + break + } + l.r.Move(1) + } + } + l.attrVal = l.r.Lexeme()[attrPos:] + } else { + l.r.Rewind(nameEnd) + l.attrVal = nil + } + l.text = l.r.Lexeme()[nameStart:nameEnd] + return l.r.Shift() +} + +func (l *Lexer) shiftEndTag() []byte { + for { + c := l.r.Peek(0) + if c == '>' { + l.text = l.r.Lexeme()[2:] + l.r.Move(1) + break + } else if c == 0 { + l.text = l.r.Lexeme()[2:] + break + } + l.r.Move(1) + } + + end := len(l.text) + for end > 0 { + if c := l.text[end-1]; c == ' ' || c == '\t' || c == '\n' || c == '\r' { + end-- + continue + } + break + } + l.text = l.text[:end] + return l.r.Shift() +} + +//////////////////////////////////////////////////////////////// + +func (l *Lexer) at(b ...byte) bool { + for i, c := range b { + if l.r.Peek(i) != c { + return false + } + } + return true +} |