diff options
Diffstat (limited to 'vendor/github.com/tdewolff/parse/html/lex_test.go')
-rw-r--r-- | vendor/github.com/tdewolff/parse/html/lex_test.go | 262 |
1 files changed, 262 insertions, 0 deletions
diff --git a/vendor/github.com/tdewolff/parse/html/lex_test.go b/vendor/github.com/tdewolff/parse/html/lex_test.go new file mode 100644 index 0000000..5f4ca0b --- /dev/null +++ b/vendor/github.com/tdewolff/parse/html/lex_test.go @@ -0,0 +1,262 @@ +package html // import "github.com/tdewolff/parse/html" + +import ( + "bytes" + "fmt" + "io" + "testing" + + "github.com/tdewolff/parse" + "github.com/tdewolff/test" +) + +type TTs []TokenType + +func TestTokens(t *testing.T) { + var tokenTests = []struct { + html string + expected []TokenType + }{ + {"<html></html>", TTs{StartTagToken, StartTagCloseToken, EndTagToken}}, + {"<img/>", TTs{StartTagToken, StartTagVoidToken}}, + {"<!-- comment -->", TTs{CommentToken}}, + {"<!-- comment --!>", TTs{CommentToken}}, + {"<p>text</p>", TTs{StartTagToken, StartTagCloseToken, TextToken, EndTagToken}}, + {"<input type='button'/>", TTs{StartTagToken, AttributeToken, StartTagVoidToken}}, + {"<input type='button' value=''/>", TTs{StartTagToken, AttributeToken, AttributeToken, StartTagVoidToken}}, + {"<input type='=/>' \r\n\t\f value=\"'\" name=x checked />", TTs{StartTagToken, AttributeToken, AttributeToken, AttributeToken, AttributeToken, StartTagVoidToken}}, + {"<!doctype>", TTs{DoctypeToken}}, + {"<!doctype html>", TTs{DoctypeToken}}, + {"<?bogus>", TTs{CommentToken}}, + {"</0bogus>", TTs{CommentToken}}, + {"<!bogus>", TTs{CommentToken}}, + {"< ", TTs{TextToken}}, + {"</", TTs{TextToken}}, + + // raw tags + {"<title><p></p></title>", TTs{StartTagToken, StartTagCloseToken, TextToken, EndTagToken}}, + {"<TITLE><p></p></TITLE>", TTs{StartTagToken, StartTagCloseToken, TextToken, EndTagToken}}, + {"<plaintext></plaintext>", TTs{StartTagToken, StartTagCloseToken, TextToken}}, + {"<script></script>", TTs{StartTagToken, StartTagCloseToken, EndTagToken}}, + {"<script>var x='</script>';</script>", TTs{StartTagToken, StartTagCloseToken, TextToken, EndTagToken, TextToken, EndTagToken}}, + {"<script><!--var x='</script>';--></script>", TTs{StartTagToken, StartTagCloseToken, TextToken, EndTagToken, TextToken, EndTagToken}}, + {"<script><!--var x='<script></script>';--></script>", TTs{StartTagToken, StartTagCloseToken, TextToken, EndTagToken}}, + {"<script><!--var x='<script>';--></script>", TTs{StartTagToken, StartTagCloseToken, TextToken, EndTagToken}}, + {"<![CDATA[ test ]]>", TTs{TextToken}}, + {"<svg>text</svg>", TTs{SvgToken}}, + {"<math>text</math>", TTs{MathToken}}, + {`<svg>text<x a="</svg>"></x></svg>`, TTs{SvgToken}}, + {"<a><svg>text</svg></a>", TTs{StartTagToken, StartTagCloseToken, SvgToken, EndTagToken}}, + + // early endings + {"<!-- comment", TTs{CommentToken}}, + {"<? bogus comment", TTs{CommentToken}}, + {"<foo", TTs{StartTagToken}}, + {"</foo", TTs{EndTagToken}}, + {"<foo x", TTs{StartTagToken, AttributeToken}}, + {"<foo x=", TTs{StartTagToken, AttributeToken}}, + {"<foo x='", TTs{StartTagToken, AttributeToken}}, + {"<foo x=''", TTs{StartTagToken, AttributeToken}}, + {"<!DOCTYPE note SYSTEM", TTs{DoctypeToken}}, + {"<![CDATA[ test", TTs{TextToken}}, + {"<script>", TTs{StartTagToken, StartTagCloseToken}}, + {"<script><!--", TTs{StartTagToken, StartTagCloseToken, TextToken}}, + {"<script><!--var x='<script></script>';-->", TTs{StartTagToken, StartTagCloseToken, TextToken}}, + + // go-fuzz + {"</>", TTs{EndTagToken}}, + } + for _, tt := range tokenTests { + t.Run(tt.html, func(t *testing.T) { + l := NewLexer(bytes.NewBufferString(tt.html)) + i := 0 + for { + token, _ := l.Next() + if token == ErrorToken { + test.T(t, l.Err(), io.EOF) + test.T(t, i, len(tt.expected), "when error occurred we must be at the end") + break + } + test.That(t, i < len(tt.expected), "index", i, "must not exceed expected token types size", len(tt.expected)) + if i < len(tt.expected) { + test.T(t, token, tt.expected[i], "token types must match") + } + i++ + } + }) + } + + test.T(t, TokenType(100).String(), "Invalid(100)") +} + +func TestTags(t *testing.T) { + var tagTests = []struct { + html string + expected string + }{ + {"<foo:bar.qux-norf/>", "foo:bar.qux-norf"}, + {"<foo?bar/qux>", "foo?bar/qux"}, + {"<!DOCTYPE note SYSTEM \"Note.dtd\">", " note SYSTEM \"Note.dtd\""}, + {"</foo >", "foo"}, + + // early endings + {"<foo ", "foo"}, + } + for _, tt := range tagTests { + t.Run(tt.html, func(t *testing.T) { + l := NewLexer(bytes.NewBufferString(tt.html)) + for { + token, _ := l.Next() + if token == ErrorToken { + test.T(t, l.Err(), io.EOF) + test.Fail(t, "when error occurred we must be at the end") + break + } else if token == StartTagToken || token == EndTagToken || token == DoctypeToken { + test.String(t, string(l.Text()), tt.expected) + break + } + } + }) + } +} + +func TestAttributes(t *testing.T) { + var attributeTests = []struct { + attr string + expected []string + }{ + {"<foo a=\"b\" />", []string{"a", "\"b\""}}, + {"<foo \nchecked \r\n value\r=\t'=/>\"' />", []string{"checked", "", "value", "'=/>\"'"}}, + {"<foo bar=\" a \n\t\r b \" />", []string{"bar", "\" a \n\t\r b \""}}, + {"<foo a/>", []string{"a", ""}}, + {"<foo /=/>", []string{"/", "/"}}, + + // early endings + {"<foo x", []string{"x", ""}}, + {"<foo x=", []string{"x", ""}}, + {"<foo x='", []string{"x", "'"}}, + } + for _, tt := range attributeTests { + t.Run(tt.attr, func(t *testing.T) { + l := NewLexer(bytes.NewBufferString(tt.attr)) + i := 0 + for { + token, _ := l.Next() + if token == ErrorToken { + test.T(t, l.Err(), io.EOF) + test.T(t, i, len(tt.expected), "when error occurred we must be at the end") + break + } else if token == AttributeToken { + test.That(t, i+1 < len(tt.expected), "index", i+1, "must not exceed expected attributes size", len(tt.expected)) + if i+1 < len(tt.expected) { + test.String(t, string(l.Text()), tt.expected[i], "attribute keys must match") + test.String(t, string(l.AttrVal()), tt.expected[i+1], "attribute keys must match") + i += 2 + } + } + } + }) + } +} + +func TestErrors(t *testing.T) { + var errorTests = []struct { + html string + col int + }{ + {"a\x00b", 2}, + } + for _, tt := range errorTests { + t.Run(tt.html, func(t *testing.T) { + l := NewLexer(bytes.NewBufferString(tt.html)) + for { + token, _ := l.Next() + if token == ErrorToken { + if tt.col == 0 { + test.T(t, l.Err(), io.EOF) + } else if perr, ok := l.Err().(*parse.Error); ok { + test.T(t, perr.Col, tt.col) + } else { + test.Fail(t, "bad error:", l.Err()) + } + break + } + } + }) + } +} + +//////////////////////////////////////////////////////////////// + +var J int +var ss = [][]byte{ + []byte(" style"), + []byte("style"), + []byte(" \r\n\tstyle"), + []byte(" style"), + []byte(" x"), + []byte("x"), +} + +func BenchmarkWhitespace1(b *testing.B) { + for i := 0; i < b.N; i++ { + for _, s := range ss { + j := 0 + for { + if c := s[j]; c == ' ' || c == '\t' || c == '\n' || c == '\r' || c == '\f' { + j++ + } else { + break + } + } + J += j + } + } +} + +func BenchmarkWhitespace2(b *testing.B) { + for i := 0; i < b.N; i++ { + for _, s := range ss { + j := 0 + for { + if c := s[j]; c == ' ' || c == '\t' || c == '\n' || c == '\r' || c == '\f' { + j++ + continue + } + break + } + J += j + } + } +} + +func BenchmarkWhitespace3(b *testing.B) { + for i := 0; i < b.N; i++ { + for _, s := range ss { + j := 0 + for { + if c := s[j]; c != ' ' && c != '\t' && c != '\n' && c != '\r' && c != '\f' { + break + } + j++ + } + J += j + } + } +} + +//////////////////////////////////////////////////////////////// + +func ExampleNewLexer() { + l := NewLexer(bytes.NewBufferString("<span class='user'>John Doe</span>")) + out := "" + for { + tt, data := l.Next() + if tt == ErrorToken { + break + } + out += string(data) + } + fmt.Println(out) + // Output: <span class='user'>John Doe</span> +} |