diff options
Diffstat (limited to 'vendor/github.com/tdewolff/parse/html')
-rw-r--r-- | vendor/github.com/tdewolff/parse/html/README.md | 98 | ||||
-rw-r--r-- | vendor/github.com/tdewolff/parse/html/hash.go | 831 | ||||
-rw-r--r-- | vendor/github.com/tdewolff/parse/html/hash_test.go | 58 | ||||
-rw-r--r-- | vendor/github.com/tdewolff/parse/html/lex.go | 485 | ||||
-rw-r--r-- | vendor/github.com/tdewolff/parse/html/lex_test.go | 262 | ||||
-rw-r--r-- | vendor/github.com/tdewolff/parse/html/util.go | 129 | ||||
-rw-r--r-- | vendor/github.com/tdewolff/parse/html/util_test.go | 43 |
7 files changed, 1906 insertions, 0 deletions
diff --git a/vendor/github.com/tdewolff/parse/html/README.md b/vendor/github.com/tdewolff/parse/html/README.md new file mode 100644 index 0000000..edca629 --- /dev/null +++ b/vendor/github.com/tdewolff/parse/html/README.md @@ -0,0 +1,98 @@ +# HTML [![GoDoc](http://godoc.org/github.com/tdewolff/parse/html?status.svg)](http://godoc.org/github.com/tdewolff/parse/html) [![GoCover](http://gocover.io/_badge/github.com/tdewolff/parse/html)](http://gocover.io/github.com/tdewolff/parse/html) + +This package is an HTML5 lexer written in [Go][1]. It follows the specification at [The HTML syntax](http://www.w3.org/TR/html5/syntax.html). The lexer takes an io.Reader and converts it into tokens until the EOF. + +## Installation +Run the following command + + go get github.com/tdewolff/parse/html + +or add the following import and run the project with `go get` + + import "github.com/tdewolff/parse/html" + +## Lexer +### Usage +The following initializes a new Lexer with io.Reader `r`: +``` go +l := html.NewLexer(r) +``` + +To tokenize until EOF or an error, use: +``` go +for { + tt, data := l.Next() + switch tt { + case html.ErrorToken: + // error or EOF set in l.Err() + return + case html.StartTagToken: + // ... + for { + ttAttr, dataAttr := l.Next() + if ttAttr != html.AttributeToken { + break + } + // ... + } + // ... + } +} +``` + +All tokens: +``` go +ErrorToken TokenType = iota // extra token when errors occur +CommentToken +DoctypeToken +StartTagToken +StartTagCloseToken +StartTagVoidToken +EndTagToken +AttributeToken +TextToken +``` + +### Examples +``` go +package main + +import ( + "os" + + "github.com/tdewolff/parse/html" +) + +// Tokenize HTML from stdin. 
+func main() { + l := html.NewLexer(os.Stdin) + for { + tt, data := l.Next() + switch tt { + case html.ErrorToken: + if l.Err() != io.EOF { + fmt.Println("Error on line", l.Line(), ":", l.Err()) + } + return + case html.StartTagToken: + fmt.Println("Tag", string(data)) + for { + ttAttr, dataAttr := l.Next() + if ttAttr != html.AttributeToken { + break + } + + key := dataAttr + val := l.AttrVal() + fmt.Println("Attribute", string(key), "=", string(val)) + } + // ... + } + } +} +``` + +## License +Released under the [MIT license](https://github.com/tdewolff/parse/blob/master/LICENSE.md). + +[1]: http://golang.org/ "Go Language" diff --git a/vendor/github.com/tdewolff/parse/html/hash.go b/vendor/github.com/tdewolff/parse/html/hash.go new file mode 100644 index 0000000..382e5c5 --- /dev/null +++ b/vendor/github.com/tdewolff/parse/html/hash.go @@ -0,0 +1,831 @@ +package html + +// generated by hasher -type=Hash -file=hash.go; DO NOT EDIT, except for adding more constants to the list and rerun go generate + +// uses github.com/tdewolff/hasher +//go:generate hasher -type=Hash -file=hash.go + +// Hash defines perfect hashes for a predefined list of strings +type Hash uint32 + +// Unique hash definitions to be used instead of strings +const ( + A Hash = 0x1 // a + Abbr Hash = 0x4 // abbr + Accept Hash = 0x3206 // accept + Accept_Charset Hash = 0x320e // accept-charset + Accesskey Hash = 0x4409 // accesskey + Acronym Hash = 0xbb07 // acronym + Action Hash = 0x2ba06 // action + Address Hash = 0x67e07 // address + Align Hash = 0x1605 // align + Alink Hash = 0xd205 // alink + Allowfullscreen Hash = 0x23d0f // allowfullscreen + Alt Hash = 0xee03 // alt + Annotation Hash = 0x2070a // annotation + AnnotationXml Hash = 0x2070d // annotationXml + Applet Hash = 0x14506 // applet + Area Hash = 0x38d04 // area + Article Hash = 0x40e07 // article + Aside Hash = 0x8305 // aside + Async Hash = 0xfa05 // async + Audio Hash = 0x11605 // audio + Autocomplete Hash = 0x12e0c // autocomplete + 
Autofocus Hash = 0x13a09 // autofocus + Autoplay Hash = 0x14f08 // autoplay + Axis Hash = 0x15704 // axis + B Hash = 0x101 // b + Background Hash = 0x1e0a // background + Base Hash = 0x45404 // base + Basefont Hash = 0x45408 // basefont + Bdi Hash = 0xcb03 // bdi + Bdo Hash = 0x18403 // bdo + Bgcolor Hash = 0x19707 // bgcolor + Bgsound Hash = 0x19e07 // bgsound + Big Hash = 0x1a603 // big + Blink Hash = 0x1a905 // blink + Blockquote Hash = 0x1ae0a // blockquote + Body Hash = 0x4004 // body + Border Hash = 0x33806 // border + Br Hash = 0x202 // br + Button Hash = 0x1b806 // button + Canvas Hash = 0x7f06 // canvas + Caption Hash = 0x27f07 // caption + Center Hash = 0x62a06 // center + Challenge Hash = 0x1e509 // challenge + Charset Hash = 0x3907 // charset + Checked Hash = 0x3b407 // checked + Cite Hash = 0xfe04 // cite + Class Hash = 0x1c305 // class + Classid Hash = 0x1c307 // classid + Clear Hash = 0x41205 // clear + Code Hash = 0x1d604 // code + Codebase Hash = 0x45008 // codebase + Codetype Hash = 0x1d608 // codetype + Col Hash = 0x19903 // col + Colgroup Hash = 0x1ee08 // colgroup + Color Hash = 0x19905 // color + Cols Hash = 0x20204 // cols + Colspan Hash = 0x20207 // colspan + Command Hash = 0x21407 // command + Compact Hash = 0x21b07 // compact + Content Hash = 0x4a907 // content + Contenteditable Hash = 0x4a90f // contenteditable + Contextmenu Hash = 0x3bd0b // contextmenu + Controls Hash = 0x22a08 // controls + Coords Hash = 0x23606 // coords + Crossorigin Hash = 0x25b0b // crossorigin + Data Hash = 0x4c004 // data + Datalist Hash = 0x4c008 // datalist + Datetime Hash = 0x2ea08 // datetime + Dd Hash = 0x31602 // dd + Declare Hash = 0x8607 // declare + Default Hash = 0x5407 // default + DefaultChecked Hash = 0x5040e // defaultChecked + DefaultMuted Hash = 0x5650c // defaultMuted + DefaultSelected Hash = 0x540f // defaultSelected + Defer Hash = 0x6205 // defer + Del Hash = 0x7203 // del + Desc Hash = 0x7c04 // desc + Details Hash = 0x9207 // details + Dfn 
Hash = 0xab03 // dfn + Dialog Hash = 0xcc06 // dialog + Dir Hash = 0xd903 // dir + Dirname Hash = 0xd907 // dirname + Disabled Hash = 0x10408 // disabled + Div Hash = 0x10b03 // div + Dl Hash = 0x1a402 // dl + Download Hash = 0x48608 // download + Draggable Hash = 0x1c909 // draggable + Dropzone Hash = 0x41908 // dropzone + Dt Hash = 0x60602 // dt + Em Hash = 0x6e02 // em + Embed Hash = 0x6e05 // embed + Enabled Hash = 0x4e07 // enabled + Enctype Hash = 0x2cf07 // enctype + Face Hash = 0x62804 // face + Fieldset Hash = 0x26c08 // fieldset + Figcaption Hash = 0x27c0a // figcaption + Figure Hash = 0x29006 // figure + Font Hash = 0x45804 // font + Footer Hash = 0xf106 // footer + For Hash = 0x29c03 // for + ForeignObject Hash = 0x29c0d // foreignObject + Foreignobject Hash = 0x2a90d // foreignobject + Form Hash = 0x2b604 // form + Formaction Hash = 0x2b60a // formaction + Formenctype Hash = 0x2cb0b // formenctype + Formmethod Hash = 0x2d60a // formmethod + Formnovalidate Hash = 0x2e00e // formnovalidate + Formtarget Hash = 0x2f50a // formtarget + Frame Hash = 0xa305 // frame + Frameborder Hash = 0x3330b // frameborder + Frameset Hash = 0xa308 // frameset + H1 Hash = 0x19502 // h1 + H2 Hash = 0x32402 // h2 + H3 Hash = 0x34902 // h3 + H4 Hash = 0x38602 // h4 + H5 Hash = 0x60802 // h5 + H6 Hash = 0x2ff02 // h6 + Head Hash = 0x37204 // head + Header Hash = 0x37206 // header + Headers Hash = 0x37207 // headers + Height Hash = 0x30106 // height + Hgroup Hash = 0x30906 // hgroup + Hidden Hash = 0x31406 // hidden + High Hash = 0x32104 // high + Hr Hash = 0xaf02 // hr + Href Hash = 0xaf04 // href + Hreflang Hash = 0xaf08 // hreflang + Html Hash = 0x30504 // html + Http_Equiv Hash = 0x3260a // http-equiv + I Hash = 0x601 // i + Icon Hash = 0x4a804 // icon + Id Hash = 0x8502 // id + Iframe Hash = 0x33206 // iframe + Image Hash = 0x33e05 // image + Img Hash = 0x34303 // img + Inert Hash = 0x55005 // inert + Input Hash = 0x47305 // input + Ins Hash = 0x26403 // ins + Isindex Hash 
= 0x15907 // isindex + Ismap Hash = 0x34b05 // ismap + Itemid Hash = 0xff06 // itemid + Itemprop Hash = 0x58808 // itemprop + Itemref Hash = 0x62207 // itemref + Itemscope Hash = 0x35609 // itemscope + Itemtype Hash = 0x36008 // itemtype + Kbd Hash = 0xca03 // kbd + Keygen Hash = 0x4a06 // keygen + Keytype Hash = 0x68807 // keytype + Kind Hash = 0xd604 // kind + Label Hash = 0x7405 // label + Lang Hash = 0xb304 // lang + Language Hash = 0xb308 // language + Legend Hash = 0x1d006 // legend + Li Hash = 0x1702 // li + Link Hash = 0xd304 // link + List Hash = 0x4c404 // list + Listing Hash = 0x4c407 // listing + Longdesc Hash = 0x7808 // longdesc + Loop Hash = 0x12104 // loop + Low Hash = 0x23f03 // low + Main Hash = 0x1004 // main + Malignmark Hash = 0xc10a // malignmark + Manifest Hash = 0x65e08 // manifest + Map Hash = 0x14403 // map + Mark Hash = 0xc704 // mark + Marquee Hash = 0x36807 // marquee + Math Hash = 0x36f04 // math + Max Hash = 0x37e03 // max + Maxlength Hash = 0x37e09 // maxlength + Media Hash = 0xde05 // media + Mediagroup Hash = 0xde0a // mediagroup + Menu Hash = 0x3c404 // menu + Meta Hash = 0x4d304 // meta + Meter Hash = 0x2f005 // meter + Method Hash = 0x2da06 // method + Mglyph Hash = 0x34406 // mglyph + Mi Hash = 0x2c02 // mi + Min Hash = 0x2c03 // min + Mn Hash = 0x2e302 // mn + Mo Hash = 0x4f702 // mo + Ms Hash = 0x35902 // ms + Mtext Hash = 0x38805 // mtext + Multiple Hash = 0x39608 // multiple + Muted Hash = 0x39e05 // muted + Name Hash = 0xdc04 // name + Nav Hash = 0x1303 // nav + Nobr Hash = 0x1a04 // nobr + Noembed Hash = 0x6c07 // noembed + Noframes Hash = 0xa108 // noframes + Nohref Hash = 0xad06 // nohref + Noresize Hash = 0x24b08 // noresize + Noscript Hash = 0x31908 // noscript + Noshade Hash = 0x4ff07 // noshade + Novalidate Hash = 0x2e40a // novalidate + Nowrap Hash = 0x59106 // nowrap + Object Hash = 0x2b006 // object + Ol Hash = 0x17102 // ol + Onabort Hash = 0x1bc07 // onabort + Onafterprint Hash = 0x2840c // onafterprint + 
Onbeforeprint Hash = 0x2be0d // onbeforeprint + Onbeforeunload Hash = 0x6720e // onbeforeunload + Onblur Hash = 0x17e06 // onblur + Oncancel Hash = 0x11a08 // oncancel + Oncanplay Hash = 0x18609 // oncanplay + Oncanplaythrough Hash = 0x18610 // oncanplaythrough + Onchange Hash = 0x42f08 // onchange + Onclick Hash = 0x6b607 // onclick + Onclose Hash = 0x3a307 // onclose + Oncontextmenu Hash = 0x3bb0d // oncontextmenu + Oncuechange Hash = 0x3c80b // oncuechange + Ondblclick Hash = 0x3d30a // ondblclick + Ondrag Hash = 0x3dd06 // ondrag + Ondragend Hash = 0x3dd09 // ondragend + Ondragenter Hash = 0x3e60b // ondragenter + Ondragleave Hash = 0x3f10b // ondragleave + Ondragover Hash = 0x3fc0a // ondragover + Ondragstart Hash = 0x4060b // ondragstart + Ondrop Hash = 0x41706 // ondrop + Ondurationchange Hash = 0x42710 // ondurationchange + Onemptied Hash = 0x41e09 // onemptied + Onended Hash = 0x43707 // onended + Onerror Hash = 0x43e07 // onerror + Onfocus Hash = 0x44507 // onfocus + Onhashchange Hash = 0x4650c // onhashchange + Oninput Hash = 0x47107 // oninput + Oninvalid Hash = 0x47809 // oninvalid + Onkeydown Hash = 0x48109 // onkeydown + Onkeypress Hash = 0x48e0a // onkeypress + Onkeyup Hash = 0x49e07 // onkeyup + Onload Hash = 0x4b806 // onload + Onloadeddata Hash = 0x4b80c // onloadeddata + Onloadedmetadata Hash = 0x4cb10 // onloadedmetadata + Onloadstart Hash = 0x4e10b // onloadstart + Onmessage Hash = 0x4ec09 // onmessage + Onmousedown Hash = 0x4f50b // onmousedown + Onmousemove Hash = 0x5120b // onmousemove + Onmouseout Hash = 0x51d0a // onmouseout + Onmouseover Hash = 0x52a0b // onmouseover + Onmouseup Hash = 0x53509 // onmouseup + Onmousewheel Hash = 0x53e0c // onmousewheel + Onoffline Hash = 0x54a09 // onoffline + Ononline Hash = 0x55508 // ononline + Onpagehide Hash = 0x55d0a // onpagehide + Onpageshow Hash = 0x5710a // onpageshow + Onpause Hash = 0x57d07 // onpause + Onplay Hash = 0x59c06 // onplay + Onplaying Hash = 0x59c09 // onplaying + Onpopstate Hash = 
0x5a50a // onpopstate + Onprogress Hash = 0x5af0a // onprogress + Onratechange Hash = 0x5be0c // onratechange + Onreset Hash = 0x5ca07 // onreset + Onresize Hash = 0x5d108 // onresize + Onscroll Hash = 0x5d908 // onscroll + Onseeked Hash = 0x5e408 // onseeked + Onseeking Hash = 0x5ec09 // onseeking + Onselect Hash = 0x5f508 // onselect + Onshow Hash = 0x5ff06 // onshow + Onstalled Hash = 0x60a09 // onstalled + Onstorage Hash = 0x61309 // onstorage + Onsubmit Hash = 0x61c08 // onsubmit + Onsuspend Hash = 0x63009 // onsuspend + Ontimeupdate Hash = 0x4590c // ontimeupdate + Onunload Hash = 0x63908 // onunload + Onvolumechange Hash = 0x6410e // onvolumechange + Onwaiting Hash = 0x64f09 // onwaiting + Open Hash = 0x58e04 // open + Optgroup Hash = 0x12308 // optgroup + Optimum Hash = 0x65807 // optimum + Option Hash = 0x66e06 // option + Output Hash = 0x52406 // output + P Hash = 0xc01 // p + Param Hash = 0xc05 // param + Pattern Hash = 0x9b07 // pattern + Pauseonexit Hash = 0x57f0b // pauseonexit + Picture Hash = 0xe707 // picture + Ping Hash = 0x12a04 // ping + Placeholder Hash = 0x16b0b // placeholder + Plaintext Hash = 0x1f509 // plaintext + Poster Hash = 0x30e06 // poster + Pre Hash = 0x34f03 // pre + Preload Hash = 0x34f07 // preload + Profile Hash = 0x66707 // profile + Progress Hash = 0x5b108 // progress + Prompt Hash = 0x59606 // prompt + Public Hash = 0x4a406 // public + Q Hash = 0x8d01 // q + Radiogroup Hash = 0x30a // radiogroup + Rb Hash = 0x1d02 // rb + Readonly Hash = 0x38e08 // readonly + Rel Hash = 0x35003 // rel + Required Hash = 0x8b08 // required + Rev Hash = 0x29403 // rev + Reversed Hash = 0x29408 // reversed + Rows Hash = 0x6604 // rows + Rowspan Hash = 0x6607 // rowspan + Rp Hash = 0x28a02 // rp + Rt Hash = 0x1c102 // rt + Rtc Hash = 0x1c103 // rtc + Ruby Hash = 0xf604 // ruby + Rules Hash = 0x17505 // rules + S Hash = 0x3d01 // s + Samp Hash = 0x9804 // samp + Sandbox Hash = 0x16307 // sandbox + Scope Hash = 0x35a05 // scope + Scoped Hash = 
0x35a06 // scoped + Script Hash = 0x31b06 // script + Scrolling Hash = 0x5db09 // scrolling + Seamless Hash = 0x3a808 // seamless + Section Hash = 0x17907 // section + Select Hash = 0x5f706 // select + Selected Hash = 0x5f708 // selected + Shape Hash = 0x23105 // shape + Size Hash = 0x24f04 // size + Sizes Hash = 0x24f05 // sizes + Small Hash = 0x23b05 // small + Sortable Hash = 0x25308 // sortable + Source Hash = 0x26606 // source + Spacer Hash = 0x37806 // spacer + Span Hash = 0x6904 // span + Spellcheck Hash = 0x3af0a // spellcheck + Src Hash = 0x44b03 // src + Srcdoc Hash = 0x44b06 // srcdoc + Srclang Hash = 0x49707 // srclang + Srcset Hash = 0x5b806 // srcset + Start Hash = 0x40c05 // start + Step Hash = 0x66404 // step + Strike Hash = 0x68406 // strike + Strong Hash = 0x68f06 // strong + Style Hash = 0x69505 // style + Sub Hash = 0x61e03 // sub + Summary Hash = 0x69a07 // summary + Sup Hash = 0x6a103 // sup + Svg Hash = 0x6a403 // svg + System Hash = 0x6a706 // system + Tabindex Hash = 0x4d908 // tabindex + Table Hash = 0x25605 // table + Target Hash = 0x2f906 // target + Tbody Hash = 0x3f05 // tbody + Td Hash = 0xaa02 // td + Template Hash = 0x6aa08 // template + Text Hash = 0x1fa04 // text + Textarea Hash = 0x38908 // textarea + Tfoot Hash = 0xf005 // tfoot + Th Hash = 0x18f02 // th + Thead Hash = 0x37105 // thead + Time Hash = 0x2ee04 // time + Title Hash = 0x14a05 // title + Tr Hash = 0x1fd02 // tr + Track Hash = 0x1fd05 // track + Translate Hash = 0x22109 // translate + Truespeed Hash = 0x27309 // truespeed + Tt Hash = 0x9d02 // tt + Type Hash = 0x11204 // type + Typemustmatch Hash = 0x1da0d // typemustmatch + U Hash = 0xb01 // u + Ul Hash = 0x5802 // ul + Undeterminate Hash = 0x250d // undeterminate + Usemap Hash = 0x14106 // usemap + Valign Hash = 0x1506 // valign + Value Hash = 0x10d05 // value + Valuetype Hash = 0x10d09 // valuetype + Var Hash = 0x32f03 // var + Video Hash = 0x6b205 // video + Visible Hash = 0x6bd07 // visible + Vlink Hash = 0x6c405 
// vlink + Wbr Hash = 0x57a03 // wbr + Width Hash = 0x60405 // width + Wrap Hash = 0x59304 // wrap + Xmlns Hash = 0x15f05 // xmlns + Xmp Hash = 0x16903 // xmp +) + +// String returns the hash' name. +func (i Hash) String() string { + start := uint32(i >> 8) + n := uint32(i & 0xff) + if start+n > uint32(len(_Hash_text)) { + return "" + } + return _Hash_text[start : start+n] +} + +// ToHash returns the hash whose name is s. It returns zero if there is no +// such hash. It is case sensitive. +func ToHash(s []byte) Hash { + if len(s) == 0 || len(s) > _Hash_maxLen { + return 0 + } + h := uint32(_Hash_hash0) + for i := 0; i < len(s); i++ { + h ^= uint32(s[i]) + h *= 16777619 + } + if i := _Hash_table[h&uint32(len(_Hash_table)-1)]; int(i&0xff) == len(s) { + t := _Hash_text[i>>8 : i>>8+i&0xff] + for i := 0; i < len(s); i++ { + if t[i] != s[i] { + goto NEXT + } + } + return i + } +NEXT: + if i := _Hash_table[(h>>16)&uint32(len(_Hash_table)-1)]; int(i&0xff) == len(s) { + t := _Hash_text[i>>8 : i>>8+i&0xff] + for i := 0; i < len(s); i++ { + if t[i] != s[i] { + return 0 + } + } + return i + } + return 0 +} + +const _Hash_hash0 = 0x5334b67c +const _Hash_maxLen = 16 +const _Hash_text = "abbradiogrouparamainavalignobrbackgroundeterminateaccept-cha" + + "rsetbodyaccesskeygenabledefaultSelectedeferowspanoembedelabe" + + "longdescanvasideclarequiredetailsampatternoframesetdfnohrefl" + + "anguageacronymalignmarkbdialogalinkindirnamediagroupictureal" + + "tfooterubyasyncitemidisabledivaluetypeaudioncancelooptgroupi" + + "ngautocompleteautofocusemappletitleautoplayaxisindexmlnsandb" + + "oxmplaceholderulesectionblurbdoncanplaythrough1bgcolorbgsoun" + + "dlbigblinkblockquotebuttonabortclassidraggablegendcodetypemu" + + "stmatchallengecolgrouplaintextrackcolspannotationXmlcommandc" + + "ompactranslatecontrolshapecoordsmallowfullscreenoresizesorta" + + "blecrossoriginsourcefieldsetruespeedfigcaptionafterprintfigu" + + "reversedforeignObjectforeignobjectformactionbeforeprintforme" + + 
"nctypeformmethodformnovalidatetimeterformtargeth6heightmlhgr" + + "ouposterhiddenoscripthigh2http-equivariframeborderimageimgly" + + "ph3ismapreloaditemscopeditemtypemarqueematheaderspacermaxlen" + + "gth4mtextareadonlymultiplemutedoncloseamlesspellcheckedoncon" + + "textmenuoncuechangeondblclickondragendondragenterondragleave" + + "ondragoverondragstarticlearondropzonemptiedondurationchangeo" + + "nendedonerroronfocusrcdocodebasefontimeupdateonhashchangeoni" + + "nputoninvalidonkeydownloadonkeypressrclangonkeyupublicontent" + + "editableonloadeddatalistingonloadedmetadatabindexonloadstart" + + "onmessageonmousedownoshadefaultCheckedonmousemoveonmouseoutp" + + "utonmouseoveronmouseuponmousewheelonofflinertononlineonpageh" + + "idefaultMutedonpageshowbronpauseonexitempropenowrapromptonpl" + + "ayingonpopstateonprogressrcsetonratechangeonresetonresizeons" + + "crollingonseekedonseekingonselectedonshowidth5onstalledonsto" + + "rageonsubmitemrefacenteronsuspendonunloadonvolumechangeonwai" + + "tingoptimumanifesteprofileoptionbeforeunloaddresstrikeytypes" + + "trongstylesummarysupsvgsystemplatevideonclickvisiblevlink" + +var _Hash_table = [1 << 9]Hash{ + 0x0: 0x2cb0b, // formenctype + 0x1: 0x2d60a, // formmethod + 0x2: 0x3c80b, // oncuechange + 0x3: 0x3dd06, // ondrag + 0x6: 0x68406, // strike + 0x7: 0x6b205, // video + 0x9: 0x4a907, // content + 0xa: 0x4e07, // enabled + 0xb: 0x59106, // nowrap + 0xc: 0xd304, // link + 0xe: 0x28a02, // rp + 0xf: 0x2840c, // onafterprint + 0x10: 0x14506, // applet + 0x11: 0xf005, // tfoot + 0x12: 0x5040e, // defaultChecked + 0x13: 0x3330b, // frameborder + 0x14: 0xf106, // footer + 0x15: 0x5f708, // selected + 0x16: 0x49707, // srclang + 0x18: 0x52a0b, // onmouseover + 0x19: 0x1d604, // code + 0x1b: 0x47809, // oninvalid + 0x1c: 0x62804, // face + 0x1e: 0x3bd0b, // contextmenu + 0x1f: 0xa308, // frameset + 0x21: 0x5650c, // defaultMuted + 0x22: 0x19905, // color + 0x23: 0x59c06, // onplay + 0x25: 0x2f005, // meter + 0x26: 0x61309, // 
onstorage + 0x27: 0x38e08, // readonly + 0x29: 0x66707, // profile + 0x2a: 0x8607, // declare + 0x2b: 0xb01, // u + 0x2c: 0x31908, // noscript + 0x2d: 0x65e08, // manifest + 0x2e: 0x1b806, // button + 0x2f: 0x2ea08, // datetime + 0x30: 0x47305, // input + 0x31: 0x5407, // default + 0x32: 0x1d608, // codetype + 0x33: 0x2a90d, // foreignobject + 0x34: 0x36807, // marquee + 0x36: 0x19707, // bgcolor + 0x37: 0x19502, // h1 + 0x39: 0x1e0a, // background + 0x3b: 0x2f50a, // formtarget + 0x41: 0x2f906, // target + 0x43: 0x23b05, // small + 0x44: 0x45008, // codebase + 0x45: 0x55005, // inert + 0x47: 0x38805, // mtext + 0x48: 0x6607, // rowspan + 0x49: 0x2be0d, // onbeforeprint + 0x4a: 0x55508, // ononline + 0x4c: 0x29006, // figure + 0x4d: 0x4cb10, // onloadedmetadata + 0x4e: 0xbb07, // acronym + 0x50: 0x39608, // multiple + 0x51: 0x320e, // accept-charset + 0x52: 0x24f05, // sizes + 0x53: 0x29c0d, // foreignObject + 0x55: 0x2e40a, // novalidate + 0x56: 0x55d0a, // onpagehide + 0x57: 0x2e302, // mn + 0x58: 0x38602, // h4 + 0x5a: 0x1c102, // rt + 0x5b: 0xd205, // alink + 0x5e: 0x59606, // prompt + 0x5f: 0x17102, // ol + 0x61: 0x5d108, // onresize + 0x64: 0x69a07, // summary + 0x65: 0x5a50a, // onpopstate + 0x66: 0x38d04, // area + 0x68: 0x64f09, // onwaiting + 0x6b: 0xdc04, // name + 0x6c: 0x23606, // coords + 0x6d: 0x34303, // img + 0x6e: 0x66404, // step + 0x6f: 0x5ec09, // onseeking + 0x70: 0x32104, // high + 0x71: 0x49e07, // onkeyup + 0x72: 0x5f706, // select + 0x73: 0x1fd05, // track + 0x74: 0x34b05, // ismap + 0x76: 0x47107, // oninput + 0x77: 0x8d01, // q + 0x78: 0x48109, // onkeydown + 0x79: 0x33e05, // image + 0x7a: 0x2b604, // form + 0x7b: 0x60a09, // onstalled + 0x7c: 0xe707, // picture + 0x7d: 0x42f08, // onchange + 0x7e: 0x1a905, // blink + 0x7f: 0xee03, // alt + 0x80: 0xfa05, // async + 0x82: 0x1702, // li + 0x84: 0x2c02, // mi + 0x85: 0xff06, // itemid + 0x86: 0x11605, // audio + 0x87: 0x31b06, // script + 0x8b: 0x44b06, // srcdoc + 0x8e: 0xc704, // mark + 
0x8f: 0x18403, // bdo + 0x91: 0x5120b, // onmousemove + 0x93: 0x3c404, // menu + 0x94: 0x45804, // font + 0x95: 0x14f08, // autoplay + 0x96: 0x6c405, // vlink + 0x98: 0x6e02, // em + 0x9a: 0x5b806, // srcset + 0x9b: 0x1ee08, // colgroup + 0x9c: 0x58e04, // open + 0x9d: 0x1d006, // legend + 0x9e: 0x4e10b, // onloadstart + 0xa2: 0x22109, // translate + 0xa3: 0x6e05, // embed + 0xa4: 0x1c305, // class + 0xa6: 0x6aa08, // template + 0xa7: 0x37206, // header + 0xa9: 0x4b806, // onload + 0xaa: 0x37105, // thead + 0xab: 0x5db09, // scrolling + 0xac: 0xc05, // param + 0xae: 0x9b07, // pattern + 0xaf: 0x9207, // details + 0xb1: 0x4a406, // public + 0xb3: 0x4f50b, // onmousedown + 0xb4: 0x14403, // map + 0xb6: 0x25b0b, // crossorigin + 0xb7: 0x1506, // valign + 0xb9: 0x1bc07, // onabort + 0xba: 0x66e06, // option + 0xbb: 0x26606, // source + 0xbc: 0x6205, // defer + 0xbd: 0x1e509, // challenge + 0xbf: 0x10d05, // value + 0xc0: 0x23d0f, // allowfullscreen + 0xc1: 0xca03, // kbd + 0xc2: 0x2070d, // annotationXml + 0xc3: 0x5be0c, // onratechange + 0xc4: 0x4f702, // mo + 0xc6: 0x3af0a, // spellcheck + 0xc7: 0x2c03, // min + 0xc8: 0x4b80c, // onloadeddata + 0xc9: 0x41205, // clear + 0xca: 0x42710, // ondurationchange + 0xcb: 0x1a04, // nobr + 0xcd: 0x27309, // truespeed + 0xcf: 0x30906, // hgroup + 0xd0: 0x40c05, // start + 0xd3: 0x41908, // dropzone + 0xd5: 0x7405, // label + 0xd8: 0xde0a, // mediagroup + 0xd9: 0x17e06, // onblur + 0xdb: 0x27f07, // caption + 0xdd: 0x7c04, // desc + 0xde: 0x15f05, // xmlns + 0xdf: 0x30106, // height + 0xe0: 0x21407, // command + 0xe2: 0x57f0b, // pauseonexit + 0xe3: 0x68f06, // strong + 0xe4: 0x43e07, // onerror + 0xe5: 0x61c08, // onsubmit + 0xe6: 0xb308, // language + 0xe7: 0x48608, // download + 0xe9: 0x53509, // onmouseup + 0xec: 0x2cf07, // enctype + 0xed: 0x5f508, // onselect + 0xee: 0x2b006, // object + 0xef: 0x1f509, // plaintext + 0xf0: 0x3d30a, // ondblclick + 0xf1: 0x18610, // oncanplaythrough + 0xf2: 0xd903, // dir + 0xf3: 0x38908, 
// textarea + 0xf4: 0x12a04, // ping + 0xf5: 0x2da06, // method + 0xf6: 0x22a08, // controls + 0xf7: 0x37806, // spacer + 0xf8: 0x6a403, // svg + 0xf9: 0x30504, // html + 0xfa: 0x3d01, // s + 0xfc: 0xcc06, // dialog + 0xfe: 0x1da0d, // typemustmatch + 0xff: 0x3b407, // checked + 0x101: 0x30e06, // poster + 0x102: 0x3260a, // http-equiv + 0x103: 0x44b03, // src + 0x104: 0x10408, // disabled + 0x105: 0x37207, // headers + 0x106: 0x5af0a, // onprogress + 0x107: 0x26c08, // fieldset + 0x108: 0x32f03, // var + 0x10a: 0xa305, // frame + 0x10b: 0x36008, // itemtype + 0x10c: 0x3fc0a, // ondragover + 0x10d: 0x13a09, // autofocus + 0x10f: 0x601, // i + 0x110: 0x35902, // ms + 0x111: 0x45404, // base + 0x113: 0x35a05, // scope + 0x114: 0x3206, // accept + 0x115: 0x58808, // itemprop + 0x117: 0xfe04, // cite + 0x118: 0x3907, // charset + 0x119: 0x14a05, // title + 0x11a: 0x68807, // keytype + 0x11b: 0x1fa04, // text + 0x11c: 0x65807, // optimum + 0x11e: 0x37204, // head + 0x121: 0x21b07, // compact + 0x123: 0x63009, // onsuspend + 0x124: 0x4c404, // list + 0x125: 0x4590c, // ontimeupdate + 0x126: 0x62a06, // center + 0x127: 0x31406, // hidden + 0x129: 0x35609, // itemscope + 0x12c: 0x1a402, // dl + 0x12d: 0x17907, // section + 0x12e: 0x11a08, // oncancel + 0x12f: 0x6b607, // onclick + 0x130: 0xde05, // media + 0x131: 0x52406, // output + 0x132: 0x4c008, // datalist + 0x133: 0x53e0c, // onmousewheel + 0x134: 0x45408, // basefont + 0x135: 0x37e09, // maxlength + 0x136: 0x6bd07, // visible + 0x137: 0x2e00e, // formnovalidate + 0x139: 0x16903, // xmp + 0x13a: 0x101, // b + 0x13b: 0x5710a, // onpageshow + 0x13c: 0xf604, // ruby + 0x13d: 0x16b0b, // placeholder + 0x13e: 0x4c407, // listing + 0x140: 0x26403, // ins + 0x141: 0x62207, // itemref + 0x144: 0x540f, // defaultSelected + 0x146: 0x3f10b, // ondragleave + 0x147: 0x1ae0a, // blockquote + 0x148: 0x59304, // wrap + 0x14a: 0x1a603, // big + 0x14b: 0x35003, // rel + 0x14c: 0x41706, // ondrop + 0x14e: 0x6a706, // system + 0x14f: 
0x30a, // radiogroup + 0x150: 0x25605, // table + 0x152: 0x57a03, // wbr + 0x153: 0x3bb0d, // oncontextmenu + 0x155: 0x250d, // undeterminate + 0x157: 0x20204, // cols + 0x158: 0x16307, // sandbox + 0x159: 0x1303, // nav + 0x15a: 0x37e03, // max + 0x15b: 0x7808, // longdesc + 0x15c: 0x60405, // width + 0x15d: 0x34902, // h3 + 0x15e: 0x19e07, // bgsound + 0x161: 0x10d09, // valuetype + 0x162: 0x69505, // style + 0x164: 0x3f05, // tbody + 0x165: 0x40e07, // article + 0x169: 0xcb03, // bdi + 0x16a: 0x67e07, // address + 0x16b: 0x23105, // shape + 0x16c: 0x2ba06, // action + 0x16e: 0x1fd02, // tr + 0x16f: 0xaa02, // td + 0x170: 0x3dd09, // ondragend + 0x171: 0x5802, // ul + 0x172: 0x33806, // border + 0x174: 0x4a06, // keygen + 0x175: 0x4004, // body + 0x177: 0x1c909, // draggable + 0x178: 0x2b60a, // formaction + 0x17b: 0x34406, // mglyph + 0x17d: 0x1d02, // rb + 0x17e: 0x2ff02, // h6 + 0x17f: 0x41e09, // onemptied + 0x180: 0x5ca07, // onreset + 0x181: 0x1004, // main + 0x182: 0x12104, // loop + 0x183: 0x48e0a, // onkeypress + 0x184: 0x9d02, // tt + 0x186: 0x20207, // colspan + 0x188: 0x36f04, // math + 0x189: 0x1605, // align + 0x18a: 0xa108, // noframes + 0x18b: 0xaf02, // hr + 0x18c: 0xc10a, // malignmark + 0x18e: 0x23f03, // low + 0x18f: 0x8502, // id + 0x190: 0x6604, // rows + 0x191: 0x29403, // rev + 0x192: 0x63908, // onunload + 0x193: 0x39e05, // muted + 0x194: 0x35a06, // scoped + 0x195: 0x31602, // dd + 0x196: 0x60602, // dt + 0x197: 0x6720e, // onbeforeunload + 0x199: 0x2070a, // annotation + 0x19a: 0x29408, // reversed + 0x19c: 0x11204, // type + 0x19d: 0x57d07, // onpause + 0x19e: 0xd604, // kind + 0x19f: 0x4c004, // data + 0x1a0: 0x4ff07, // noshade + 0x1a3: 0x17505, // rules + 0x1a4: 0x12308, // optgroup + 0x1a5: 0x202, // br + 0x1a7: 0x1, // a + 0x1a8: 0x51d0a, // onmouseout + 0x1aa: 0x54a09, // onoffline + 0x1ab: 0x6410e, // onvolumechange + 0x1ae: 0x61e03, // sub + 0x1b3: 0x29c03, // for + 0x1b5: 0x8b08, // required + 0x1b6: 0x5b108, // progress + 
0x1b7: 0x14106, // usemap + 0x1b8: 0x7f06, // canvas + 0x1b9: 0x4a804, // icon + 0x1bb: 0x1c103, // rtc + 0x1bc: 0x8305, // aside + 0x1bd: 0x2ee04, // time + 0x1be: 0x4060b, // ondragstart + 0x1c0: 0x27c0a, // figcaption + 0x1c1: 0xaf04, // href + 0x1c2: 0x33206, // iframe + 0x1c3: 0x18609, // oncanplay + 0x1c4: 0x6904, // span + 0x1c5: 0x34f03, // pre + 0x1c6: 0x6c07, // noembed + 0x1c8: 0x5e408, // onseeked + 0x1c9: 0x4d304, // meta + 0x1ca: 0x32402, // h2 + 0x1cb: 0x3a808, // seamless + 0x1cc: 0xab03, // dfn + 0x1cd: 0x15704, // axis + 0x1cf: 0x3e60b, // ondragenter + 0x1d0: 0x18f02, // th + 0x1d1: 0x4650c, // onhashchange + 0x1d2: 0xb304, // lang + 0x1d3: 0x44507, // onfocus + 0x1d5: 0x24f04, // size + 0x1d8: 0x12e0c, // autocomplete + 0x1d9: 0xaf08, // hreflang + 0x1da: 0x9804, // samp + 0x1de: 0x19903, // col + 0x1df: 0x10b03, // div + 0x1e0: 0x25308, // sortable + 0x1e1: 0x7203, // del + 0x1e3: 0x3a307, // onclose + 0x1e6: 0xd907, // dirname + 0x1e8: 0x1c307, // classid + 0x1e9: 0x34f07, // preload + 0x1ea: 0x4d908, // tabindex + 0x1eb: 0x60802, // h5 + 0x1ec: 0x5d908, // onscroll + 0x1ed: 0x4a90f, // contenteditable + 0x1ee: 0x4ec09, // onmessage + 0x1ef: 0x4, // abbr + 0x1f0: 0x15907, // isindex + 0x1f1: 0x6a103, // sup + 0x1f3: 0x24b08, // noresize + 0x1f5: 0x59c09, // onplaying + 0x1f6: 0x4409, // accesskey + 0x1fa: 0xc01, // p + 0x1fb: 0x43707, // onended + 0x1fc: 0x5ff06, // onshow + 0x1fe: 0xad06, // nohref +} diff --git a/vendor/github.com/tdewolff/parse/html/hash_test.go b/vendor/github.com/tdewolff/parse/html/hash_test.go new file mode 100644 index 0000000..c905ba3 --- /dev/null +++ b/vendor/github.com/tdewolff/parse/html/hash_test.go @@ -0,0 +1,58 @@ +package html // import "github.com/tdewolff/parse/html" + +import ( + "bytes" + "testing" + + "github.com/tdewolff/test" +) + +func TestHashTable(t *testing.T) { + test.T(t, ToHash([]byte("address")), Address, "'address' must resolve to Address") + test.T(t, Address.String(), "address") + test.T(t, 
Accept_Charset.String(), "accept-charset") + test.T(t, ToHash([]byte("")), Hash(0), "empty string must resolve to zero") + test.T(t, Hash(0xffffff).String(), "") + test.T(t, ToHash([]byte("iter")), Hash(0), "'iter' must resolve to zero") + test.T(t, ToHash([]byte("test")), Hash(0), "'test' must resolve to zero") +} + +//////////////////////////////////////////////////////////////// + +var result int + +// naive scenario +func BenchmarkCompareBytes(b *testing.B) { + var r int + val := []byte("span") + for n := 0; n < b.N; n++ { + if bytes.Equal(val, []byte("span")) { + r++ + } + } + result = r +} + +// using-atoms scenario +func BenchmarkFindAndCompareAtom(b *testing.B) { + var r int + val := []byte("span") + for n := 0; n < b.N; n++ { + if ToHash(val) == Span { + r++ + } + } + result = r +} + +// using-atoms worst-case scenario +func BenchmarkFindAtomCompareBytes(b *testing.B) { + var r int + val := []byte("zzzz") + for n := 0; n < b.N; n++ { + if h := ToHash(val); h == 0 && bytes.Equal(val, []byte("zzzz")) { + r++ + } + } + result = r +} diff --git a/vendor/github.com/tdewolff/parse/html/lex.go b/vendor/github.com/tdewolff/parse/html/lex.go new file mode 100644 index 0000000..c81490a --- /dev/null +++ b/vendor/github.com/tdewolff/parse/html/lex.go @@ -0,0 +1,485 @@ +// Package html is an HTML5 lexer following the specifications at http://www.w3.org/TR/html5/syntax.html. +package html // import "github.com/tdewolff/parse/html" + +import ( + "io" + "strconv" + + "github.com/tdewolff/parse" + "github.com/tdewolff/parse/buffer" +) + +// TokenType determines the type of token, eg. a number or a semicolon. +type TokenType uint32 + +// TokenType values. +const ( + ErrorToken TokenType = iota // extra token when errors occur + CommentToken + DoctypeToken + StartTagToken + StartTagCloseToken + StartTagVoidToken + EndTagToken + AttributeToken + TextToken + SvgToken + MathToken +) + +// String returns the string representation of a TokenType. 
+func (tt TokenType) String() string { + switch tt { + case ErrorToken: + return "Error" + case CommentToken: + return "Comment" + case DoctypeToken: + return "Doctype" + case StartTagToken: + return "StartTag" + case StartTagCloseToken: + return "StartTagClose" + case StartTagVoidToken: + return "StartTagVoid" + case EndTagToken: + return "EndTag" + case AttributeToken: + return "Attribute" + case TextToken: + return "Text" + case SvgToken: + return "Svg" + case MathToken: + return "Math" + } + return "Invalid(" + strconv.Itoa(int(tt)) + ")" +} + +//////////////////////////////////////////////////////////////// + +// Lexer is the state for the lexer. +type Lexer struct { + r *buffer.Lexer + err error + + rawTag Hash + inTag bool + + text []byte + attrVal []byte +} + +// NewLexer returns a new Lexer for a given io.Reader. +func NewLexer(r io.Reader) *Lexer { + return &Lexer{ + r: buffer.NewLexer(r), + } +} + +// Err returns the error encountered during lexing, this is often io.EOF but also other errors can be returned. +func (l *Lexer) Err() error { + if err := l.r.Err(); err != nil { + return err + } + return l.err +} + +// Restore restores the NULL byte at the end of the buffer. +func (l *Lexer) Restore() { + l.r.Restore() +} + +// Next returns the next Token. It returns ErrorToken when an error was encountered. Using Err() one can retrieve the error message. 
+func (l *Lexer) Next() (TokenType, []byte) { + l.text = nil + var c byte + if l.inTag { + l.attrVal = nil + for { // before attribute name state + if c = l.r.Peek(0); c == ' ' || c == '\t' || c == '\n' || c == '\r' || c == '\f' { + l.r.Move(1) + continue + } + break + } + if c == 0 { + l.err = parse.NewErrorLexer("unexpected null character", l.r) + return ErrorToken, nil + } else if c != '>' && (c != '/' || l.r.Peek(1) != '>') { + return AttributeToken, l.shiftAttribute() + } + start := l.r.Pos() + l.inTag = false + if c == '/' { + l.r.Move(2) + l.text = l.r.Lexeme()[start:] + return StartTagVoidToken, l.r.Shift() + } + l.r.Move(1) + l.text = l.r.Lexeme()[start:] + return StartTagCloseToken, l.r.Shift() + } + + if l.rawTag != 0 { + if rawText := l.shiftRawText(); len(rawText) > 0 { + l.rawTag = 0 + return TextToken, rawText + } + l.rawTag = 0 + } + + for { + c = l.r.Peek(0) + if c == '<' { + c = l.r.Peek(1) + if l.r.Pos() > 0 { + if c == '/' && l.r.Peek(2) != 0 || 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' || c == '!' || c == '?' { + return TextToken, l.r.Shift() + } + } else if c == '/' && l.r.Peek(2) != 0 { + l.r.Move(2) + if c = l.r.Peek(0); c != '>' && !('a' <= c && c <= 'z' || 'A' <= c && c <= 'Z') { + return CommentToken, l.shiftBogusComment() + } + return EndTagToken, l.shiftEndTag() + } else if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' { + l.r.Move(1) + l.inTag = true + return l.shiftStartTag() + } else if c == '!' { + l.r.Move(2) + return l.readMarkup() + } else if c == '?' { + l.r.Move(1) + return CommentToken, l.shiftBogusComment() + } + } else if c == 0 { + if l.r.Pos() > 0 { + return TextToken, l.r.Shift() + } + l.err = parse.NewErrorLexer("unexpected null character", l.r) + return ErrorToken, nil + } + l.r.Move(1) + } +} + +// Text returns the textual representation of a token. This excludes delimiters and additional leading/trailing characters. 
+func (l *Lexer) Text() []byte { + return l.text +} + +// AttrVal returns the attribute value when an AttributeToken was returned from Next. +func (l *Lexer) AttrVal() []byte { + return l.attrVal +} + +//////////////////////////////////////////////////////////////// + +// The following functions follow the specifications at http://www.w3.org/html/wg/drafts/html/master/syntax.html + +func (l *Lexer) shiftRawText() []byte { + if l.rawTag == Plaintext { + for { + if l.r.Peek(0) == 0 { + return l.r.Shift() + } + l.r.Move(1) + } + } else { // RCDATA, RAWTEXT and SCRIPT + for { + c := l.r.Peek(0) + if c == '<' { + if l.r.Peek(1) == '/' { + mark := l.r.Pos() + l.r.Move(2) + for { + if c = l.r.Peek(0); !('a' <= c && c <= 'z' || 'A' <= c && c <= 'Z') { + break + } + l.r.Move(1) + } + if h := ToHash(parse.ToLower(parse.Copy(l.r.Lexeme()[mark+2:]))); h == l.rawTag { // copy so that ToLower doesn't change the case of the underlying slice + l.r.Rewind(mark) + return l.r.Shift() + } + } else if l.rawTag == Script && l.r.Peek(1) == '!' 
&& l.r.Peek(2) == '-' && l.r.Peek(3) == '-' { + l.r.Move(4) + inScript := false + for { + c := l.r.Peek(0) + if c == '-' && l.r.Peek(1) == '-' && l.r.Peek(2) == '>' { + l.r.Move(3) + break + } else if c == '<' { + isEnd := l.r.Peek(1) == '/' + if isEnd { + l.r.Move(2) + } else { + l.r.Move(1) + } + mark := l.r.Pos() + for { + if c = l.r.Peek(0); !('a' <= c && c <= 'z' || 'A' <= c && c <= 'Z') { + break + } + l.r.Move(1) + } + if h := ToHash(parse.ToLower(parse.Copy(l.r.Lexeme()[mark:]))); h == Script { // copy so that ToLower doesn't change the case of the underlying slice + if !isEnd { + inScript = true + } else { + if !inScript { + l.r.Rewind(mark - 2) + return l.r.Shift() + } + inScript = false + } + } + } else if c == 0 { + return l.r.Shift() + } + l.r.Move(1) + } + } else { + l.r.Move(1) + } + } else if c == 0 { + return l.r.Shift() + } else { + l.r.Move(1) + } + } + } +} + +func (l *Lexer) readMarkup() (TokenType, []byte) { + if l.at('-', '-') { + l.r.Move(2) + for { + if l.r.Peek(0) == 0 { + return CommentToken, l.r.Shift() + } else if l.at('-', '-', '>') { + l.text = l.r.Lexeme()[4:] + l.r.Move(3) + return CommentToken, l.r.Shift() + } else if l.at('-', '-', '!', '>') { + l.text = l.r.Lexeme()[4:] + l.r.Move(4) + return CommentToken, l.r.Shift() + } + l.r.Move(1) + } + } else if l.at('[', 'C', 'D', 'A', 'T', 'A', '[') { + l.r.Move(7) + for { + if l.r.Peek(0) == 0 { + return TextToken, l.r.Shift() + } else if l.at(']', ']', '>') { + l.r.Move(3) + return TextToken, l.r.Shift() + } + l.r.Move(1) + } + } else { + if l.atCaseInsensitive('d', 'o', 'c', 't', 'y', 'p', 'e') { + l.r.Move(7) + if l.r.Peek(0) == ' ' { + l.r.Move(1) + } + for { + if c := l.r.Peek(0); c == '>' || c == 0 { + l.text = l.r.Lexeme()[9:] + if c == '>' { + l.r.Move(1) + } + return DoctypeToken, l.r.Shift() + } + l.r.Move(1) + } + } + } + return CommentToken, l.shiftBogusComment() +} + +func (l *Lexer) shiftBogusComment() []byte { + for { + c := l.r.Peek(0) + if c == '>' { + l.text = 
l.r.Lexeme()[2:] + l.r.Move(1) + return l.r.Shift() + } else if c == 0 { + l.text = l.r.Lexeme()[2:] + return l.r.Shift() + } + l.r.Move(1) + } +} + +func (l *Lexer) shiftStartTag() (TokenType, []byte) { + for { + if c := l.r.Peek(0); c == ' ' || c == '>' || c == '/' && l.r.Peek(1) == '>' || c == '\t' || c == '\n' || c == '\r' || c == '\f' || c == 0 { + break + } + l.r.Move(1) + } + l.text = parse.ToLower(l.r.Lexeme()[1:]) + if h := ToHash(l.text); h == Textarea || h == Title || h == Style || h == Xmp || h == Iframe || h == Script || h == Plaintext || h == Svg || h == Math { + if h == Svg { + l.inTag = false + return SvgToken, l.shiftXml(h) + } else if h == Math { + l.inTag = false + return MathToken, l.shiftXml(h) + } + l.rawTag = h + } + return StartTagToken, l.r.Shift() +} + +func (l *Lexer) shiftAttribute() []byte { + nameStart := l.r.Pos() + var c byte + for { // attribute name state + if c = l.r.Peek(0); c == ' ' || c == '=' || c == '>' || c == '/' && l.r.Peek(1) == '>' || c == '\t' || c == '\n' || c == '\r' || c == '\f' || c == 0 { + break + } + l.r.Move(1) + } + nameEnd := l.r.Pos() + for { // after attribute name state + if c = l.r.Peek(0); c == ' ' || c == '\t' || c == '\n' || c == '\r' || c == '\f' { + l.r.Move(1) + continue + } + break + } + if c == '=' { + l.r.Move(1) + for { // before attribute value state + if c = l.r.Peek(0); c == ' ' || c == '\t' || c == '\n' || c == '\r' || c == '\f' { + l.r.Move(1) + continue + } + break + } + attrPos := l.r.Pos() + delim := c + if delim == '"' || delim == '\'' { // attribute value single- and double-quoted state + l.r.Move(1) + for { + c := l.r.Peek(0) + if c == delim { + l.r.Move(1) + break + } else if c == 0 { + break + } + l.r.Move(1) + } + } else { // attribute value unquoted state + for { + if c := l.r.Peek(0); c == ' ' || c == '>' || c == '\t' || c == '\n' || c == '\r' || c == '\f' || c == 0 { + break + } + l.r.Move(1) + } + } + l.attrVal = l.r.Lexeme()[attrPos:] + } else { + l.r.Rewind(nameEnd) + 
l.attrVal = nil + } + l.text = parse.ToLower(l.r.Lexeme()[nameStart:nameEnd]) + return l.r.Shift() +} + +func (l *Lexer) shiftEndTag() []byte { + for { + c := l.r.Peek(0) + if c == '>' { + l.text = l.r.Lexeme()[2:] + l.r.Move(1) + break + } else if c == 0 { + l.text = l.r.Lexeme()[2:] + break + } + l.r.Move(1) + } + + end := len(l.text) + for end > 0 { + if c := l.text[end-1]; c == ' ' || c == '\t' || c == '\n' || c == '\r' { + end-- + continue + } + break + } + l.text = l.text[:end] + return parse.ToLower(l.r.Shift()) +} + +func (l *Lexer) shiftXml(rawTag Hash) []byte { + inQuote := false + for { + c := l.r.Peek(0) + if c == '"' { + inQuote = !inQuote + l.r.Move(1) + } else if c == '<' && !inQuote { + if l.r.Peek(1) == '/' { + mark := l.r.Pos() + l.r.Move(2) + for { + if c = l.r.Peek(0); !('a' <= c && c <= 'z' || 'A' <= c && c <= 'Z') { + break + } + l.r.Move(1) + } + if h := ToHash(parse.ToLower(parse.Copy(l.r.Lexeme()[mark+2:]))); h == rawTag { // copy so that ToLower doesn't change the case of the underlying slice + break + } + } else { + l.r.Move(1) + } + } else if c == 0 { + return l.r.Shift() + } + l.r.Move(1) + } + + for { + c := l.r.Peek(0) + if c == '>' { + l.r.Move(1) + break + } else if c == 0 { + break + } + l.r.Move(1) + } + return l.r.Shift() +} + +//////////////////////////////////////////////////////////////// + +func (l *Lexer) at(b ...byte) bool { + for i, c := range b { + if l.r.Peek(i) != c { + return false + } + } + return true +} + +func (l *Lexer) atCaseInsensitive(b ...byte) bool { + for i, c := range b { + if l.r.Peek(i) != c && (l.r.Peek(i)+('a'-'A')) != c { + return false + } + } + return true +} diff --git a/vendor/github.com/tdewolff/parse/html/lex_test.go b/vendor/github.com/tdewolff/parse/html/lex_test.go new file mode 100644 index 0000000..5f4ca0b --- /dev/null +++ b/vendor/github.com/tdewolff/parse/html/lex_test.go @@ -0,0 +1,262 @@ +package html // import "github.com/tdewolff/parse/html" + +import ( + "bytes" + "fmt" + "io" + 
"testing" + + "github.com/tdewolff/parse" + "github.com/tdewolff/test" +) + +type TTs []TokenType + +func TestTokens(t *testing.T) { + var tokenTests = []struct { + html string + expected []TokenType + }{ + {"<html></html>", TTs{StartTagToken, StartTagCloseToken, EndTagToken}}, + {"<img/>", TTs{StartTagToken, StartTagVoidToken}}, + {"<!-- comment -->", TTs{CommentToken}}, + {"<!-- comment --!>", TTs{CommentToken}}, + {"<p>text</p>", TTs{StartTagToken, StartTagCloseToken, TextToken, EndTagToken}}, + {"<input type='button'/>", TTs{StartTagToken, AttributeToken, StartTagVoidToken}}, + {"<input type='button' value=''/>", TTs{StartTagToken, AttributeToken, AttributeToken, StartTagVoidToken}}, + {"<input type='=/>' \r\n\t\f value=\"'\" name=x checked />", TTs{StartTagToken, AttributeToken, AttributeToken, AttributeToken, AttributeToken, StartTagVoidToken}}, + {"<!doctype>", TTs{DoctypeToken}}, + {"<!doctype html>", TTs{DoctypeToken}}, + {"<?bogus>", TTs{CommentToken}}, + {"</0bogus>", TTs{CommentToken}}, + {"<!bogus>", TTs{CommentToken}}, + {"< ", TTs{TextToken}}, + {"</", TTs{TextToken}}, + + // raw tags + {"<title><p></p></title>", TTs{StartTagToken, StartTagCloseToken, TextToken, EndTagToken}}, + {"<TITLE><p></p></TITLE>", TTs{StartTagToken, StartTagCloseToken, TextToken, EndTagToken}}, + {"<plaintext></plaintext>", TTs{StartTagToken, StartTagCloseToken, TextToken}}, + {"<script></script>", TTs{StartTagToken, StartTagCloseToken, EndTagToken}}, + {"<script>var x='</script>';</script>", TTs{StartTagToken, StartTagCloseToken, TextToken, EndTagToken, TextToken, EndTagToken}}, + {"<script><!--var x='</script>';--></script>", TTs{StartTagToken, StartTagCloseToken, TextToken, EndTagToken, TextToken, EndTagToken}}, + {"<script><!--var x='<script></script>';--></script>", TTs{StartTagToken, StartTagCloseToken, TextToken, EndTagToken}}, + {"<script><!--var x='<script>';--></script>", TTs{StartTagToken, StartTagCloseToken, TextToken, EndTagToken}}, + {"<![CDATA[ test ]]>", 
TTs{TextToken}}, + {"<svg>text</svg>", TTs{SvgToken}}, + {"<math>text</math>", TTs{MathToken}}, + {`<svg>text<x a="</svg>"></x></svg>`, TTs{SvgToken}}, + {"<a><svg>text</svg></a>", TTs{StartTagToken, StartTagCloseToken, SvgToken, EndTagToken}}, + + // early endings + {"<!-- comment", TTs{CommentToken}}, + {"<? bogus comment", TTs{CommentToken}}, + {"<foo", TTs{StartTagToken}}, + {"</foo", TTs{EndTagToken}}, + {"<foo x", TTs{StartTagToken, AttributeToken}}, + {"<foo x=", TTs{StartTagToken, AttributeToken}}, + {"<foo x='", TTs{StartTagToken, AttributeToken}}, + {"<foo x=''", TTs{StartTagToken, AttributeToken}}, + {"<!DOCTYPE note SYSTEM", TTs{DoctypeToken}}, + {"<![CDATA[ test", TTs{TextToken}}, + {"<script>", TTs{StartTagToken, StartTagCloseToken}}, + {"<script><!--", TTs{StartTagToken, StartTagCloseToken, TextToken}}, + {"<script><!--var x='<script></script>';-->", TTs{StartTagToken, StartTagCloseToken, TextToken}}, + + // go-fuzz + {"</>", TTs{EndTagToken}}, + } + for _, tt := range tokenTests { + t.Run(tt.html, func(t *testing.T) { + l := NewLexer(bytes.NewBufferString(tt.html)) + i := 0 + for { + token, _ := l.Next() + if token == ErrorToken { + test.T(t, l.Err(), io.EOF) + test.T(t, i, len(tt.expected), "when error occurred we must be at the end") + break + } + test.That(t, i < len(tt.expected), "index", i, "must not exceed expected token types size", len(tt.expected)) + if i < len(tt.expected) { + test.T(t, token, tt.expected[i], "token types must match") + } + i++ + } + }) + } + + test.T(t, TokenType(100).String(), "Invalid(100)") +} + +func TestTags(t *testing.T) { + var tagTests = []struct { + html string + expected string + }{ + {"<foo:bar.qux-norf/>", "foo:bar.qux-norf"}, + {"<foo?bar/qux>", "foo?bar/qux"}, + {"<!DOCTYPE note SYSTEM \"Note.dtd\">", " note SYSTEM \"Note.dtd\""}, + {"</foo >", "foo"}, + + // early endings + {"<foo ", "foo"}, + } + for _, tt := range tagTests { + t.Run(tt.html, func(t *testing.T) { + l := 
NewLexer(bytes.NewBufferString(tt.html)) + for { + token, _ := l.Next() + if token == ErrorToken { + test.T(t, l.Err(), io.EOF) + test.Fail(t, "when error occurred we must be at the end") + break + } else if token == StartTagToken || token == EndTagToken || token == DoctypeToken { + test.String(t, string(l.Text()), tt.expected) + break + } + } + }) + } +} + +func TestAttributes(t *testing.T) { + var attributeTests = []struct { + attr string + expected []string + }{ + {"<foo a=\"b\" />", []string{"a", "\"b\""}}, + {"<foo \nchecked \r\n value\r=\t'=/>\"' />", []string{"checked", "", "value", "'=/>\"'"}}, + {"<foo bar=\" a \n\t\r b \" />", []string{"bar", "\" a \n\t\r b \""}}, + {"<foo a/>", []string{"a", ""}}, + {"<foo /=/>", []string{"/", "/"}}, + + // early endings + {"<foo x", []string{"x", ""}}, + {"<foo x=", []string{"x", ""}}, + {"<foo x='", []string{"x", "'"}}, + } + for _, tt := range attributeTests { + t.Run(tt.attr, func(t *testing.T) { + l := NewLexer(bytes.NewBufferString(tt.attr)) + i := 0 + for { + token, _ := l.Next() + if token == ErrorToken { + test.T(t, l.Err(), io.EOF) + test.T(t, i, len(tt.expected), "when error occurred we must be at the end") + break + } else if token == AttributeToken { + test.That(t, i+1 < len(tt.expected), "index", i+1, "must not exceed expected attributes size", len(tt.expected)) + if i+1 < len(tt.expected) { + test.String(t, string(l.Text()), tt.expected[i], "attribute keys must match") + test.String(t, string(l.AttrVal()), tt.expected[i+1], "attribute keys must match") + i += 2 + } + } + } + }) + } +} + +func TestErrors(t *testing.T) { + var errorTests = []struct { + html string + col int + }{ + {"a\x00b", 2}, + } + for _, tt := range errorTests { + t.Run(tt.html, func(t *testing.T) { + l := NewLexer(bytes.NewBufferString(tt.html)) + for { + token, _ := l.Next() + if token == ErrorToken { + if tt.col == 0 { + test.T(t, l.Err(), io.EOF) + } else if perr, ok := l.Err().(*parse.Error); ok { + test.T(t, perr.Col, tt.col) + } 
else { + test.Fail(t, "bad error:", l.Err()) + } + break + } + } + }) + } +} + +//////////////////////////////////////////////////////////////// + +var J int +var ss = [][]byte{ + []byte(" style"), + []byte("style"), + []byte(" \r\n\tstyle"), + []byte(" style"), + []byte(" x"), + []byte("x"), +} + +func BenchmarkWhitespace1(b *testing.B) { + for i := 0; i < b.N; i++ { + for _, s := range ss { + j := 0 + for { + if c := s[j]; c == ' ' || c == '\t' || c == '\n' || c == '\r' || c == '\f' { + j++ + } else { + break + } + } + J += j + } + } +} + +func BenchmarkWhitespace2(b *testing.B) { + for i := 0; i < b.N; i++ { + for _, s := range ss { + j := 0 + for { + if c := s[j]; c == ' ' || c == '\t' || c == '\n' || c == '\r' || c == '\f' { + j++ + continue + } + break + } + J += j + } + } +} + +func BenchmarkWhitespace3(b *testing.B) { + for i := 0; i < b.N; i++ { + for _, s := range ss { + j := 0 + for { + if c := s[j]; c != ' ' && c != '\t' && c != '\n' && c != '\r' && c != '\f' { + break + } + j++ + } + J += j + } + } +} + +//////////////////////////////////////////////////////////////// + +func ExampleNewLexer() { + l := NewLexer(bytes.NewBufferString("<span class='user'>John Doe</span>")) + out := "" + for { + tt, data := l.Next() + if tt == ErrorToken { + break + } + out += string(data) + } + fmt.Println(out) + // Output: <span class='user'>John Doe</span> +} diff --git a/vendor/github.com/tdewolff/parse/html/util.go b/vendor/github.com/tdewolff/parse/html/util.go new file mode 100644 index 0000000..c8c3aab --- /dev/null +++ b/vendor/github.com/tdewolff/parse/html/util.go @@ -0,0 +1,129 @@ +package html // import "github.com/tdewolff/parse/html" + +import "github.com/tdewolff/parse" + +var ( + singleQuoteEntityBytes = []byte("'") + doubleQuoteEntityBytes = []byte(""") +) + +var charTable = [256]bool{ + // ASCII + false, false, false, false, false, false, false, false, + false, true, true, true, true, true, false, false, // tab, new line, vertical tab, form feed, 
carriage return + false, false, false, false, false, false, false, false, + false, false, false, false, false, false, false, false, + + true, false, true, false, false, false, true, true, // space, ", &, ' + false, false, false, false, false, false, false, false, + false, false, false, false, false, false, false, false, + false, false, false, false, true, true, true, false, // <, =, > + + false, false, false, false, false, false, false, false, + false, false, false, false, false, false, false, false, + false, false, false, false, false, false, false, false, + false, false, false, false, false, false, false, false, + + true, false, false, false, false, false, false, false, // ` + false, false, false, false, false, false, false, false, + false, false, false, false, false, false, false, false, + false, false, false, false, false, false, false, false, + + // non-ASCII + false, false, false, false, false, false, false, false, + false, false, false, false, false, false, false, false, + false, false, false, false, false, false, false, false, + false, false, false, false, false, false, false, false, + + false, false, false, false, false, false, false, false, + false, false, false, false, false, false, false, false, + false, false, false, false, false, false, false, false, + false, false, false, false, false, false, false, false, + + false, false, false, false, false, false, false, false, + false, false, false, false, false, false, false, false, + false, false, false, false, false, false, false, false, + false, false, false, false, false, false, false, false, + + false, false, false, false, false, false, false, false, + false, false, false, false, false, false, false, false, + false, false, false, false, false, false, false, false, + false, false, false, false, false, false, false, false, +} + +// EscapeAttrVal returns the escaped attribute value bytes without quotes. 
+func EscapeAttrVal(buf *[]byte, orig, b []byte) []byte { + singles := 0 + doubles := 0 + unquoted := true + entities := false + for i, c := range b { + if charTable[c] { + if c == '&' { + entities = true + if quote, n := parse.QuoteEntity(b[i:]); n > 0 { + if quote == '"' { + unquoted = false + doubles++ + } else { + unquoted = false + singles++ + } + } + } else { + unquoted = false + if c == '"' { + doubles++ + } else if c == '\'' { + singles++ + } + } + } + } + if unquoted { + return b + } else if !entities && len(orig) == len(b)+2 && (singles == 0 && orig[0] == '\'' || doubles == 0 && orig[0] == '"') { + return orig + } + + n := len(b) + 2 + var quote byte + var escapedQuote []byte + if doubles > singles { + n += singles * 4 + quote = '\'' + escapedQuote = singleQuoteEntityBytes + } else { + n += doubles * 4 + quote = '"' + escapedQuote = doubleQuoteEntityBytes + } + if n > cap(*buf) { + *buf = make([]byte, 0, n) // maximum size, not actual size + } + t := (*buf)[:n] // maximum size, not actual size + t[0] = quote + j := 1 + start := 0 + for i, c := range b { + if c == '&' { + if entityQuote, n := parse.QuoteEntity(b[i:]); n > 0 { + j += copy(t[j:], b[start:i]) + if entityQuote != quote { + t[j] = entityQuote + j++ + } else { + j += copy(t[j:], escapedQuote) + } + start = i + n + } + } else if c == quote { + j += copy(t[j:], b[start:i]) + j += copy(t[j:], escapedQuote) + start = i + 1 + } + } + j += copy(t[j:], b[start:]) + t[j] = quote + return t[:j+1] +} diff --git a/vendor/github.com/tdewolff/parse/html/util_test.go b/vendor/github.com/tdewolff/parse/html/util_test.go new file mode 100644 index 0000000..3722a08 --- /dev/null +++ b/vendor/github.com/tdewolff/parse/html/util_test.go @@ -0,0 +1,43 @@ +package html // import "github.com/tdewolff/parse/html" + +import ( + "testing" + + "github.com/tdewolff/test" +) + +func TestEscapeAttrVal(t *testing.T) { + var escapeAttrValTests = []struct { + attrVal string + expected string + }{ + {"xyz", "xyz"}, + {"", ""}, 
+ {"x&z", "x&z"}, + {"x/z", "x/z"}, + {"x'z", "\"x'z\""}, + {"x\"z", "'x\"z'"}, + {"'x\"z'", "'x\"z'"}, + {"'x'\"'z'", "\"x'"'z\""}, + {"\"x"'"z\"", "'x\"'\"z'"}, + {"\"x'z\"", "\"x'z\""}, + {"'x"z'", "'x\"z'"}, + {"'x\">'", "'x\">'"}, + {"You're encouraged to log in; however, it's not mandatory. [o]", "\"You're encouraged to log in; however, it's not mandatory. [o]\""}, + {"a'b=\"\"", "'a'b=\"\"'"}, + {"x<z", "\"x<z\""}, + {"'x\"'\"z'", "'x\"'\"z'"}, + } + var buf []byte + for _, tt := range escapeAttrValTests { + t.Run(tt.attrVal, func(t *testing.T) { + b := []byte(tt.attrVal) + orig := b + if len(b) > 1 && (b[0] == '"' || b[0] == '\'') && b[0] == b[len(b)-1] { + b = b[1 : len(b)-1] + } + val := EscapeAttrVal(&buf, orig, []byte(b)) + test.String(t, string(val), tt.expected) + }) + } +} |