diff options
author | Frédéric Guillot <fred@miniflux.net> | 2017-11-19 21:10:04 -0800 |
---|---|---|
committer | Frédéric Guillot <fred@miniflux.net> | 2017-11-19 22:01:46 -0800 |
commit | 8ffb773f43c8dc54801ca1d111854e7e881c93c9 (patch) | |
tree | 38133a2fc612597a75fed1d13e5b4042f58a2b7e /reader/sanitizer |
First commit
Diffstat (limited to 'reader/sanitizer')
-rw-r--r-- | reader/sanitizer/sanitizer.go | 360 | ||||
-rw-r--r-- | reader/sanitizer/sanitizer_test.go | 144 | ||||
-rw-r--r-- | reader/sanitizer/strip_tags.go | 35 | ||||
-rw-r--r-- | reader/sanitizer/strip_tags_test.go | 17 |
4 files changed, 556 insertions, 0 deletions
diff --git a/reader/sanitizer/sanitizer.go b/reader/sanitizer/sanitizer.go new file mode 100644 index 0000000..6af034c --- /dev/null +++ b/reader/sanitizer/sanitizer.go @@ -0,0 +1,360 @@ +// Copyright 2017 Frédéric Guillot. All rights reserved. +// Use of this source code is governed by the Apache 2.0 +// license that can be found in the LICENSE file. + +package sanitizer + +import ( + "bytes" + "fmt" + "github.com/miniflux/miniflux2/reader/url" + "io" + "strings" + + "golang.org/x/net/html" +) + +// Sanitize returns safe HTML. +func Sanitize(baseURL, input string) string { + tokenizer := html.NewTokenizer(bytes.NewBufferString(input)) + var buffer bytes.Buffer + var tagStack []string + + for { + if tokenizer.Next() == html.ErrorToken { + err := tokenizer.Err() + if err == io.EOF { + return buffer.String() + } + + return "" + } + + token := tokenizer.Token() + switch token.Type { + case html.TextToken: + buffer.WriteString(token.Data) + case html.StartTagToken: + tagName := token.DataAtom.String() + + if !isPixelTracker(tagName, token.Attr) && isValidTag(tagName) { + attrNames, htmlAttributes := sanitizeAttributes(baseURL, tagName, token.Attr) + + if hasRequiredAttributes(tagName, attrNames) { + if len(attrNames) > 0 { + buffer.WriteString("<" + tagName + " " + htmlAttributes + ">") + } else { + buffer.WriteString("<" + tagName + ">") + } + + tagStack = append(tagStack, tagName) + } + } + case html.EndTagToken: + tagName := token.DataAtom.String() + if isValidTag(tagName) && inList(tagName, tagStack) { + buffer.WriteString(fmt.Sprintf("</%s>", tagName)) + } + case html.SelfClosingTagToken: + tagName := token.DataAtom.String() + if !isPixelTracker(tagName, token.Attr) && isValidTag(tagName) { + attrNames, htmlAttributes := sanitizeAttributes(baseURL, tagName, token.Attr) + + if hasRequiredAttributes(tagName, attrNames) { + if len(attrNames) > 0 { + buffer.WriteString("<" + tagName + " " + htmlAttributes + "/>") + } else { + buffer.WriteString("<" + tagName + "/>") + } + } + } + } + } +} + +func sanitizeAttributes(baseURL, tagName string, attributes []html.Attribute) (attrNames []string, html string) { + var htmlAttrs []string + var err error + + for _, attribute := range attributes { + value := attribute.Val + + if !isValidAttribute(tagName, attribute.Key) { + continue + } + + if isExternalResourceAttribute(attribute.Key) { + if tagName == "iframe" && !isValidIframeSource(attribute.Val) { + continue + } else { + value, err = url.GetAbsoluteURL(baseURL, value) + if err != nil { + continue + } + + if !hasValidScheme(value) || isBlacklistedResource(value) { + continue + } + } + } + + attrNames = append(attrNames, attribute.Key) + htmlAttrs = append(htmlAttrs, fmt.Sprintf(`%s="%s"`, attribute.Key, value)) + } + + extraAttrNames, extraHTMLAttributes := getExtraAttributes(tagName) + if len(extraAttrNames) > 0 { + attrNames = append(attrNames, extraAttrNames...) + htmlAttrs = append(htmlAttrs, extraHTMLAttributes...) + } + + return attrNames, strings.Join(htmlAttrs, " ") +} + +func getExtraAttributes(tagName string) ([]string, []string) { + if tagName == "a" { + return []string{"rel", "target", "referrerpolicy"}, []string{`rel="noopener noreferrer"`, `target="_blank"`, `referrerpolicy="no-referrer"`} + } + + if tagName == "video" || tagName == "audio" { + return []string{"controls"}, []string{"controls"} + } + + return nil, nil +} + +func isValidTag(tagName string) bool { + for element := range getTagWhitelist() { + if tagName == element { + return true + } + } + + return false +} + +func isValidAttribute(tagName, attributeName string) bool { + for element, attributes := range getTagWhitelist() { + if tagName == element { + if inList(attributeName, attributes) { + return true + } + } + } + + return false +} + +func isExternalResourceAttribute(attribute string) bool { + switch attribute { + case "src", "href", "poster", "cite": + return true + default: + return false + } +} + +func isPixelTracker(tagName string, attributes []html.Attribute) bool { + if tagName == "img" { + hasHeight := false + hasWidth := false + + for _, attribute := range attributes { + if attribute.Key == "height" && attribute.Val == "1" { + hasHeight = true + } + + if attribute.Key == "width" && attribute.Val == "1" { + hasWidth = true + } + } + + return hasHeight && hasWidth + } + + return false +} + +func hasRequiredAttributes(tagName string, attributes []string) bool { + elements := make(map[string][]string) + elements["a"] = []string{"href"} + elements["iframe"] = []string{"src"} + elements["img"] = []string{"src"} + elements["source"] = []string{"src"} + + for element, attrs := range elements { + if tagName == element { + for _, attribute := range attributes { + for _, attr := range attrs { + if attr == attribute { + return true + } + } + } + + return false + } + } + + return true +} + +func hasValidScheme(src string) bool { + // See https://www.iana.org/assignments/uri-schemes/uri-schemes.xhtml + whitelist := []string{ + "apt://", + "bitcoin://", + "callto://", + "ed2k://", + "facetime://", + "feed://", + "ftp://", + "geo://", + "gopher://", + "git://", + "http://", + "https://", + "irc://", + "irc6://", + "ircs://", + "itms://", + "jabber://", + "magnet://", + "mailto://", + "maps://", + "news://", + "nfs://", + "nntp://", + "rtmp://", + "sip://", + "sips://", + "skype://", + "smb://", + "sms://", + "spotify://", + "ssh://", + "sftp://", + "steam://", + "svn://", + "tel://", + "webcal://", + "xmpp://", + } + + for _, prefix := range whitelist { + if strings.HasPrefix(src, prefix) { + return true + } + } + + return false +} + +func isBlacklistedResource(src string) bool { + blacklist := []string{ + "feedsportal.com", + "api.flattr.com", + "stats.wordpress.com", + "plus.google.com/share", + "twitter.com/share", + "feeds.feedburner.com", + } + + for _, element := range blacklist { + if strings.Contains(src, element) { + return true + } + } + + return false +} + +func isValidIframeSource(src string) bool { + whitelist := []string{ + "http://www.youtube.com", + "https://www.youtube.com", + "http://player.vimeo.com", + "https://player.vimeo.com", + "http://www.dailymotion.com", + "https://www.dailymotion.com", + "http://vk.com", + "https://vk.com", + } + + for _, prefix := range whitelist { + if strings.HasPrefix(src, prefix) { + return true + } + } + + return false +} + +func getTagWhitelist() map[string][]string { + whitelist := make(map[string][]string) + whitelist["img"] = []string{"alt", "title", "src"} + whitelist["audio"] = []string{"src"} + whitelist["video"] = []string{"poster", "height", "width", "src"} + whitelist["source"] = []string{"src", "type"} + whitelist["dt"] = []string{} + whitelist["dd"] = []string{} + whitelist["dl"] = []string{} + whitelist["table"] = []string{} + whitelist["caption"] = []string{} + whitelist["thead"] = []string{} + whitelist["tfooter"] = []string{} + whitelist["tr"] = []string{} + whitelist["td"] = []string{"rowspan", "colspan"} + whitelist["th"] = []string{"rowspan", "colspan"} + whitelist["h1"] = []string{} + whitelist["h2"] = []string{} + whitelist["h3"] = []string{} + whitelist["h4"] = []string{} + whitelist["h5"] = []string{} + whitelist["h6"] = []string{} + whitelist["strong"] = []string{} + whitelist["em"] = []string{} + whitelist["code"] = []string{} + whitelist["pre"] = []string{} + whitelist["blockquote"] = []string{} + whitelist["q"] = []string{"cite"} + whitelist["p"] = []string{} + whitelist["ul"] = []string{} + whitelist["li"] = []string{} + whitelist["ol"] = []string{} + whitelist["br"] = []string{} + whitelist["del"] = []string{} + whitelist["a"] = []string{"href", "title"} + whitelist["figure"] = []string{} + whitelist["figcaption"] = []string{} + whitelist["cite"] = []string{} + whitelist["time"] = []string{"datetime"} + whitelist["abbr"] = []string{"title"} + whitelist["acronym"] = []string{"title"} + whitelist["wbr"] = []string{} + whitelist["dfn"] = []string{} + whitelist["sub"] = []string{} + whitelist["sup"] = []string{} + whitelist["var"] = []string{} + whitelist["samp"] = []string{} + whitelist["s"] = []string{} + whitelist["del"] = []string{} + whitelist["ins"] = []string{} + whitelist["kbd"] = []string{} + whitelist["rp"] = []string{} + whitelist["rt"] = []string{} + whitelist["rtc"] = []string{} + whitelist["ruby"] = []string{} + whitelist["iframe"] = []string{"width", "height", "frameborder", "src", "allowfullscreen"} + return whitelist +} + +func inList(needle string, haystack []string) bool { + for _, element := range haystack { + if element == needle { + return true + } + } + + return false +} diff --git a/reader/sanitizer/sanitizer_test.go b/reader/sanitizer/sanitizer_test.go new file mode 100644 index 0000000..73862d3 --- /dev/null +++ b/reader/sanitizer/sanitizer_test.go @@ -0,0 +1,144 @@ +// Copyright 2017 Frédéric Guillot. All rights reserved. +// Use of this source code is governed by the Apache 2.0 +// license that can be found in the LICENSE file. + +package sanitizer + +import "testing" + +func TestValidInput(t *testing.T) { + input := `<p>This is a <strong>text</strong> with an image: <img src="http://example.org/" alt="Test">.</p>` + output := Sanitize("http://example.org/", input) + + if input != output { + t.Errorf(`Wrong output: "%s" != "%s"`, input, output) + } +} + +func TestSelfClosingTags(t *testing.T) { + input := `<p>This <br> is a <strong>text</strong> <br/>with an image: <img src="http://example.org/" alt="Test"/>.</p>` + output := Sanitize("http://example.org/", input) + + if input != output { + t.Errorf(`Wrong output: "%s" != "%s"`, input, output) + } +} + +func TestTable(t *testing.T) { + input := `<table><tr><th>A</th><th colspan="2">B</th></tr><tr><td>C</td><td>D</td><td>E</td></tr></table>` + output := Sanitize("http://example.org/", input) + + if input != output { + t.Errorf(`Wrong output: "%s" != "%s"`, input, output) + } +} + +func TestRelativeURL(t *testing.T) { + input := `This <a href="/test.html">link is relative</a> and this image: <img src="../folder/image.png"/>` + expected := `This <a href="http://example.org/test.html" rel="noopener noreferrer" target="_blank" referrerpolicy="no-referrer">link is relative</a> and this image: <img src="http://example.org/folder/image.png"/>` + output := Sanitize("http://example.org/", input) + + if expected != output { + t.Errorf(`Wrong output: "%s" != "%s"`, expected, output) + } +} + +func TestProtocolRelativeURL(t *testing.T) { + input := `This <a href="//static.example.org/index.html">link is relative</a>.` + expected := `This <a href="https://static.example.org/index.html" rel="noopener noreferrer" target="_blank" referrerpolicy="no-referrer">link is relative</a>.` + output := Sanitize("http://example.org/", input) + + if expected != output { + t.Errorf(`Wrong output: "%s" != "%s"`, expected, output) + } +} + +func TestInvalidTag(t *testing.T) { + input := `<p>My invalid <b>tag</b>.</p>` + expected := `<p>My invalid tag.</p>` + output := Sanitize("http://example.org/", input) + + if expected != output { + t.Errorf(`Wrong output: "%s" != "%s"`, expected, output) + } +} + +func TestVideoTag(t *testing.T) { + input := `<p>My valid <video src="videofile.webm" autoplay poster="posterimage.jpg">fallback</video>.</p>` + expected := `<p>My valid <video src="http://example.org/videofile.webm" poster="http://example.org/posterimage.jpg" controls>fallback</video>.</p>` + output := Sanitize("http://example.org/", input) + + if expected != output { + t.Errorf(`Wrong output: "%s" != "%s"`, expected, output) + } +} + +func TestAudioAndSourceTag(t *testing.T) { + input := `<p>My music <audio controls="controls"><source src="foo.wav" type="audio/wav"></audio>.</p>` + expected := `<p>My music <audio controls><source src="http://example.org/foo.wav" type="audio/wav"></audio>.</p>` + output := Sanitize("http://example.org/", input) + + if expected != output { + t.Errorf(`Wrong output: "%s" != "%s"`, expected, output) + } +} + +func TestUnknownTag(t *testing.T) { + input := `<p>My invalid <unknown>tag</unknown>.</p>` + expected := `<p>My invalid tag.</p>` + output := Sanitize("http://example.org/", input) + + if expected != output { + t.Errorf(`Wrong output: "%s" != "%s"`, expected, output) + } +} + +func TestInvalidNestedTag(t *testing.T) { + input := `<p>My invalid <b>tag with some <em>valid</em> tag</b>.</p>` + expected := `<p>My invalid tag with some <em>valid</em> tag.</p>` + output := Sanitize("http://example.org/", input) + + if expected != output { + t.Errorf(`Wrong output: "%s" != "%s"`, expected, output) + } +} + +func TestInvalidIFrame(t *testing.T) { + input := `<iframe src="http://example.org/"></iframe>` + expected := `` + output := Sanitize("http://example.org/", input) + + if expected != output { + t.Errorf(`Wrong output: "%s" != "%s"`, expected, output) + } +} + +func TestInvalidURLScheme(t *testing.T) { + input := `<p>This link is <a src="file:///etc/passwd">not valid</a></p>` + expected := `<p>This link is not valid</p>` + output := Sanitize("http://example.org/", input) + + if expected != output { + t.Errorf(`Wrong output: "%s" != "%s"`, expected, output) + } +} + +func TestBlacklistedLink(t *testing.T) { + input := `<p>This image is not valid <img src="https://stats.wordpress.com/some-tracker"></p>` + expected := `<p>This image is not valid </p>` + output := Sanitize("http://example.org/", input) + + if expected != output { + t.Errorf(`Wrong output: "%s" != "%s"`, expected, output) + } +} + +func TestPixelTracker(t *testing.T) { + input := `<p><img src="https://tracker1.example.org/" height="1" width="1"> and <img src="https://tracker2.example.org/" height="1" width="1"/></p>` + expected := `<p> and </p>` + output := Sanitize("http://example.org/", input) + + if expected != output { + t.Errorf(`Wrong output: "%s" != "%s"`, expected, output) + } +} diff --git a/reader/sanitizer/strip_tags.go b/reader/sanitizer/strip_tags.go new file mode 100644 index 0000000..f3cecd1 --- /dev/null +++ b/reader/sanitizer/strip_tags.go @@ -0,0 +1,35 @@ +// Copyright 2017 Frédéric Guillot. All rights reserved. +// Use of this source code is governed by the Apache 2.0 +// license that can be found in the LICENSE file. + +package sanitizer + +import ( + "bytes" + "io" + + "golang.org/x/net/html" +) + +// StripTags removes all HTML/XML tags from the input string. +func StripTags(input string) string { + tokenizer := html.NewTokenizer(bytes.NewBufferString(input)) + var buffer bytes.Buffer + + for { + if tokenizer.Next() == html.ErrorToken { + err := tokenizer.Err() + if err == io.EOF { + return buffer.String() + } + + return "" + } + + token := tokenizer.Token() + switch token.Type { + case html.TextToken: + buffer.WriteString(token.Data) + } + } +} diff --git a/reader/sanitizer/strip_tags_test.go b/reader/sanitizer/strip_tags_test.go new file mode 100644 index 0000000..763ccc9 --- /dev/null +++ b/reader/sanitizer/strip_tags_test.go @@ -0,0 +1,17 @@ +// Copyright 2017 Frédéric Guillot. All rights reserved. +// Use of this source code is governed by the Apache 2.0 +// license that can be found in the LICENSE file. + +package sanitizer + +import "testing" + +func TestStripTags(t *testing.T) { + input := `This <a href="/test.html">link is relative</a> and <strong>this</strong> image: <img src="../folder/image.png"/>` + expected := `This link is relative and this image: ` + output := StripTags(input) + + if expected != output { + t.Errorf(`Wrong output: "%s" != "%s"`, expected, output) + } +} |