aboutsummaryrefslogtreecommitdiffhomepage
path: root/vendor/github.com/tdewolff/parse/html/lex.go
diff options
context:
space:
mode:
Diffstat (limited to 'vendor/github.com/tdewolff/parse/html/lex.go')
-rw-r--r--vendor/github.com/tdewolff/parse/html/lex.go97
1 files changed, 55 insertions, 42 deletions
diff --git a/vendor/github.com/tdewolff/parse/html/lex.go b/vendor/github.com/tdewolff/parse/html/lex.go
index c81490a..b191247 100644
--- a/vendor/github.com/tdewolff/parse/html/lex.go
+++ b/vendor/github.com/tdewolff/parse/html/lex.go
@@ -79,10 +79,10 @@ func NewLexer(r io.Reader) *Lexer {
// Err returns the error encountered during lexing, this is often io.EOF but also other errors can be returned.
func (l *Lexer) Err() error {
- if err := l.r.Err(); err != nil {
- return err
+ if l.err != nil {
+ return l.err
}
- return l.err
+ return l.r.Err()
}
// Restore restores the NULL byte at the end of the buffer.
@@ -103,8 +103,7 @@ func (l *Lexer) Next() (TokenType, []byte) {
}
break
}
- if c == 0 {
- l.err = parse.NewErrorLexer("unexpected null character", l.r)
+ if c == 0 && l.r.Err() != nil {
return ErrorToken, nil
} else if c != '>' && (c != '/' || l.r.Peek(1) != '>') {
return AttributeToken, l.shiftAttribute()
@@ -133,13 +132,16 @@ func (l *Lexer) Next() (TokenType, []byte) {
c = l.r.Peek(0)
if c == '<' {
c = l.r.Peek(1)
+ isEndTag := c == '/' && l.r.Peek(2) != '>' && (l.r.Peek(2) != 0 || l.r.PeekErr(2) == nil)
if l.r.Pos() > 0 {
- if c == '/' && l.r.Peek(2) != 0 || 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' || c == '!' || c == '?' {
+ if isEndTag || 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' || c == '!' || c == '?' {
+ // return currently buffered texttoken so that we can return tag next iteration
return TextToken, l.r.Shift()
}
- } else if c == '/' && l.r.Peek(2) != 0 {
+ } else if isEndTag {
l.r.Move(2)
- if c = l.r.Peek(0); c != '>' && !('a' <= c && c <= 'z' || 'A' <= c && c <= 'Z') {
+ // only endtags that are not followed by > or EOF arrive here
+ if c = l.r.Peek(0); !('a' <= c && c <= 'z' || 'A' <= c && c <= 'Z') {
return CommentToken, l.shiftBogusComment()
}
return EndTagToken, l.shiftEndTag()
@@ -154,11 +156,10 @@ func (l *Lexer) Next() (TokenType, []byte) {
l.r.Move(1)
return CommentToken, l.shiftBogusComment()
}
- } else if c == 0 {
+ } else if c == 0 && l.r.Err() != nil {
if l.r.Pos() > 0 {
return TextToken, l.r.Shift()
}
- l.err = parse.NewErrorLexer("unexpected null character", l.r)
return ErrorToken, nil
}
l.r.Move(1)
@@ -182,7 +183,7 @@ func (l *Lexer) AttrVal() []byte {
func (l *Lexer) shiftRawText() []byte {
if l.rawTag == Plaintext {
for {
- if l.r.Peek(0) == 0 {
+ if l.r.Peek(0) == 0 && l.r.Err() != nil {
return l.r.Shift()
}
l.r.Move(1)
@@ -237,15 +238,16 @@ func (l *Lexer) shiftRawText() []byte {
inScript = false
}
}
- } else if c == 0 {
+ } else if c == 0 && l.r.Err() != nil {
return l.r.Shift()
+ } else {
+ l.r.Move(1)
}
- l.r.Move(1)
}
} else {
l.r.Move(1)
}
- } else if c == 0 {
+ } else if c == 0 && l.r.Err() != nil {
return l.r.Shift()
} else {
l.r.Move(1)
@@ -258,7 +260,7 @@ func (l *Lexer) readMarkup() (TokenType, []byte) {
if l.at('-', '-') {
l.r.Move(2)
for {
- if l.r.Peek(0) == 0 {
+ if l.r.Peek(0) == 0 && l.r.Err() != nil {
return CommentToken, l.r.Shift()
} else if l.at('-', '-', '>') {
l.text = l.r.Lexeme()[4:]
@@ -274,7 +276,7 @@ func (l *Lexer) readMarkup() (TokenType, []byte) {
} else if l.at('[', 'C', 'D', 'A', 'T', 'A', '[') {
l.r.Move(7)
for {
- if l.r.Peek(0) == 0 {
+ if l.r.Peek(0) == 0 && l.r.Err() != nil {
return TextToken, l.r.Shift()
} else if l.at(']', ']', '>') {
l.r.Move(3)
@@ -289,7 +291,7 @@ func (l *Lexer) readMarkup() (TokenType, []byte) {
l.r.Move(1)
}
for {
- if c := l.r.Peek(0); c == '>' || c == 0 {
+ if c := l.r.Peek(0); c == '>' || c == 0 && l.r.Err() != nil {
l.text = l.r.Lexeme()[9:]
if c == '>' {
l.r.Move(1)
@@ -310,7 +312,7 @@ func (l *Lexer) shiftBogusComment() []byte {
l.text = l.r.Lexeme()[2:]
l.r.Move(1)
return l.r.Shift()
- } else if c == 0 {
+ } else if c == 0 && l.r.Err() != nil {
l.text = l.r.Lexeme()[2:]
return l.r.Shift()
}
@@ -320,19 +322,25 @@ func (l *Lexer) shiftBogusComment() []byte {
func (l *Lexer) shiftStartTag() (TokenType, []byte) {
for {
- if c := l.r.Peek(0); c == ' ' || c == '>' || c == '/' && l.r.Peek(1) == '>' || c == '\t' || c == '\n' || c == '\r' || c == '\f' || c == 0 {
+ if c := l.r.Peek(0); c == ' ' || c == '>' || c == '/' && l.r.Peek(1) == '>' || c == '\t' || c == '\n' || c == '\r' || c == '\f' || c == 0 && l.r.Err() != nil {
break
}
l.r.Move(1)
}
l.text = parse.ToLower(l.r.Lexeme()[1:])
if h := ToHash(l.text); h == Textarea || h == Title || h == Style || h == Xmp || h == Iframe || h == Script || h == Plaintext || h == Svg || h == Math {
- if h == Svg {
- l.inTag = false
- return SvgToken, l.shiftXml(h)
- } else if h == Math {
+ if h == Svg || h == Math {
+ data := l.shiftXml(h)
+ if l.err != nil {
+ return ErrorToken, nil
+ }
+
l.inTag = false
- return MathToken, l.shiftXml(h)
+ if h == Svg {
+ return SvgToken, data
+ } else {
+ return MathToken, data
+ }
}
l.rawTag = h
}
@@ -343,7 +351,7 @@ func (l *Lexer) shiftAttribute() []byte {
nameStart := l.r.Pos()
var c byte
for { // attribute name state
- if c = l.r.Peek(0); c == ' ' || c == '=' || c == '>' || c == '/' && l.r.Peek(1) == '>' || c == '\t' || c == '\n' || c == '\r' || c == '\f' || c == 0 {
+ if c = l.r.Peek(0); c == ' ' || c == '=' || c == '>' || c == '/' && l.r.Peek(1) == '>' || c == '\t' || c == '\n' || c == '\r' || c == '\f' || c == 0 && l.r.Err() != nil {
break
}
l.r.Move(1)
@@ -374,14 +382,14 @@ func (l *Lexer) shiftAttribute() []byte {
if c == delim {
l.r.Move(1)
break
- } else if c == 0 {
+ } else if c == 0 && l.r.Err() != nil {
break
}
l.r.Move(1)
}
} else { // attribute value unquoted state
for {
- if c := l.r.Peek(0); c == ' ' || c == '>' || c == '\t' || c == '\n' || c == '\r' || c == '\f' || c == 0 {
+ if c := l.r.Peek(0); c == ' ' || c == '>' || c == '\t' || c == '\n' || c == '\r' || c == '\f' || c == 0 && l.r.Err() != nil {
break
}
l.r.Move(1)
@@ -403,7 +411,7 @@ func (l *Lexer) shiftEndTag() []byte {
l.text = l.r.Lexeme()[2:]
l.r.Move(1)
break
- } else if c == 0 {
+ } else if c == 0 && l.r.Err() != nil {
l.text = l.r.Lexeme()[2:]
break
}
@@ -422,6 +430,8 @@ func (l *Lexer) shiftEndTag() []byte {
return parse.ToLower(l.r.Shift())
}
+// shiftXml parses the content of a svg or math tag according to the XML 1.1 specifications, including the tag itself.
+// So far we have already parsed `<svg` or `<math`.
func (l *Lexer) shiftXml(rawTag Hash) []byte {
inQuote := false
for {
@@ -429,26 +439,26 @@ func (l *Lexer) shiftXml(rawTag Hash) []byte {
if c == '"' {
inQuote = !inQuote
l.r.Move(1)
- } else if c == '<' && !inQuote {
- if l.r.Peek(1) == '/' {
- mark := l.r.Pos()
- l.r.Move(2)
- for {
- if c = l.r.Peek(0); !('a' <= c && c <= 'z' || 'A' <= c && c <= 'Z') {
- break
- }
- l.r.Move(1)
- }
- if h := ToHash(parse.ToLower(parse.Copy(l.r.Lexeme()[mark+2:]))); h == rawTag { // copy so that ToLower doesn't change the case of the underlying slice
+ } else if c == '<' && !inQuote && l.r.Peek(1) == '/' {
+ mark := l.r.Pos()
+ l.r.Move(2)
+ for {
+ if c = l.r.Peek(0); !('a' <= c && c <= 'z' || 'A' <= c && c <= 'Z') {
break
}
- } else {
l.r.Move(1)
}
+ if h := ToHash(parse.ToLower(parse.Copy(l.r.Lexeme()[mark+2:]))); h == rawTag { // copy so that ToLower doesn't change the case of the underlying slice
+ break
+ }
} else if c == 0 {
+ if l.r.Err() == nil {
+ l.err = parse.NewErrorLexer("unexpected null character", l.r)
+ }
return l.r.Shift()
+ } else {
+ l.r.Move(1)
}
- l.r.Move(1)
}
for {
@@ -457,7 +467,10 @@ func (l *Lexer) shiftXml(rawTag Hash) []byte {
l.r.Move(1)
break
} else if c == 0 {
- break
+ if l.r.Err() == nil {
+ l.err = parse.NewErrorLexer("unexpected null character", l.r)
+ }
+ return l.r.Shift()
}
l.r.Move(1)
}