aboutsummaryrefslogtreecommitdiffhomepage
path: root/reader/sanitizer
diff options
context:
space:
mode:
authorGravatar Frédéric Guillot <fred@miniflux.net>2017-11-19 21:10:04 -0800
committerGravatar Frédéric Guillot <fred@miniflux.net>2017-11-19 22:01:46 -0800
commit8ffb773f43c8dc54801ca1d111854e7e881c93c9 (patch)
tree38133a2fc612597a75fed1d13e5b4042f58a2b7e /reader/sanitizer
First commit
Diffstat (limited to 'reader/sanitizer')
-rw-r--r--reader/sanitizer/sanitizer.go360
-rw-r--r--reader/sanitizer/sanitizer_test.go144
-rw-r--r--reader/sanitizer/strip_tags.go35
-rw-r--r--reader/sanitizer/strip_tags_test.go17
4 files changed, 556 insertions, 0 deletions
diff --git a/reader/sanitizer/sanitizer.go b/reader/sanitizer/sanitizer.go
new file mode 100644
index 0000000..6af034c
--- /dev/null
+++ b/reader/sanitizer/sanitizer.go
@@ -0,0 +1,360 @@
+// Copyright 2017 Frédéric Guillot. All rights reserved.
+// Use of this source code is governed by the Apache 2.0
+// license that can be found in the LICENSE file.
+
+package sanitizer
+
+import (
+ "bytes"
+ "fmt"
+ "github.com/miniflux/miniflux2/reader/url"
+ "io"
+ "strings"
+
+ "golang.org/x/net/html"
+)
+
+// Sanitize returns safe HTML.
+//
+// The input is tokenized and rebuilt from scratch. Only tags accepted by
+// isValidTag are written back, with the attributes produced by
+// sanitizeAttributes (which receives baseURL), and images flagged by
+// isPixelTracker are dropped. Text is escaped again before being written:
+// the tokenizer returns it unescaped, so without this step an input such as
+// "&lt;script&gt;" would come out as live markup.
+//
+// An empty string is returned when the tokenizer fails with an error other
+// than io.EOF.
+func Sanitize(baseURL, input string) string {
+	tokenizer := html.NewTokenizer(bytes.NewBufferString(input))
+	var buffer bytes.Buffer
+	// Names of the start tags written so far that have not been closed yet.
+	var tagStack []string
+
+	for {
+		if tokenizer.Next() == html.ErrorToken {
+			if tokenizer.Err() == io.EOF {
+				return buffer.String()
+			}
+
+			return ""
+		}
+
+		token := tokenizer.Token()
+		switch token.Type {
+		case html.TextToken:
+			// token.Data is unescaped ("a<b", not "a&lt;b"): escape it again.
+			buffer.WriteString(html.EscapeString(token.Data))
+		case html.StartTagToken, html.SelfClosingTagToken:
+			tagName := token.DataAtom.String()
+			if isPixelTracker(tagName, token.Attr) || !isValidTag(tagName) {
+				continue
+			}
+
+			attrNames, htmlAttributes := sanitizeAttributes(baseURL, tagName, token.Attr)
+			if !hasRequiredAttributes(tagName, attrNames) {
+				continue
+			}
+
+			closing := ">"
+			if token.Type == html.SelfClosingTagToken {
+				closing = "/>"
+			}
+
+			if len(attrNames) > 0 {
+				buffer.WriteString("<" + tagName + " " + htmlAttributes + closing)
+			} else {
+				buffer.WriteString("<" + tagName + closing)
+			}
+
+			// Only real start tags can be matched by an end tag later on.
+			if token.Type == html.StartTagToken {
+				tagStack = append(tagStack, tagName)
+			}
+		case html.EndTagToken:
+			tagName := token.DataAtom.String()
+			if !isValidTag(tagName) {
+				continue
+			}
+
+			// Close the most recently opened matching tag and forget it, so
+			// that surplus end tags in the input are not written out.
+			for i := len(tagStack) - 1; i >= 0; i-- {
+				if tagStack[i] == tagName {
+					tagStack = append(tagStack[:i], tagStack[i+1:]...)
+					buffer.WriteString(fmt.Sprintf("</%s>", tagName))
+					break
+				}
+			}
+		}
+	}
+}
+
+// sanitizeAttributes filters the attributes of a tag and renders the kept
+// ones as an HTML attribute string.
+//
+// Attributes rejected by isValidAttribute are skipped. For attributes that
+// reference an external resource, an iframe source must pass
+// isValidIframeSource, then the value is resolved against baseURL and must
+// have an allowed scheme and not be blacklisted. Values are HTML-escaped
+// before being quoted so that a value containing a double quote cannot
+// terminate the attribute early. The attributes from getExtraAttributes are
+// appended last.
+//
+// It returns the names of the attributes that were kept and the rendered,
+// space-separated attribute string.
+func sanitizeAttributes(baseURL, tagName string, attributes []html.Attribute) ([]string, string) {
+	var attrNames []string
+	var htmlAttrs []string
+
+	for _, attribute := range attributes {
+		if !isValidAttribute(tagName, attribute.Key) {
+			continue
+		}
+
+		value := attribute.Val
+
+		if isExternalResourceAttribute(attribute.Key) {
+			if tagName == "iframe" && !isValidIframeSource(attribute.Val) {
+				continue
+			}
+
+			absoluteURL, err := url.GetAbsoluteURL(baseURL, value)
+			if err != nil {
+				continue
+			}
+
+			if !hasValidScheme(absoluteURL) || isBlacklistedResource(absoluteURL) {
+				continue
+			}
+
+			value = absoluteURL
+		}
+
+		attrNames = append(attrNames, attribute.Key)
+		htmlAttrs = append(htmlAttrs, fmt.Sprintf(`%s="%s"`, attribute.Key, html.EscapeString(value)))
+	}
+
+	extraAttrNames, extraHTMLAttributes := getExtraAttributes(tagName)
+	if len(extraAttrNames) > 0 {
+		attrNames = append(attrNames, extraAttrNames...)
+		htmlAttrs = append(htmlAttrs, extraHTMLAttributes...)
+	}
+
+	return attrNames, strings.Join(htmlAttrs, " ")
+}
+
+// getExtraAttributes returns the attributes that are always added to a tag:
+// the attribute names first, then the matching rendered HTML attributes.
+// Both results are nil for tags that get no extra attributes.
+func getExtraAttributes(tagName string) ([]string, []string) {
+	switch tagName {
+	case "a":
+		names := []string{"rel", "target", "referrerpolicy"}
+		rendered := []string{`rel="noopener noreferrer"`, `target="_blank"`, `referrerpolicy="no-referrer"`}
+		return names, rendered
+	case "video", "audio":
+		return []string{"controls"}, []string{"controls"}
+	default:
+		return nil, nil
+	}
+}
+
+// isValidTag reports whether tagName is a key of the tag whitelist.
+func isValidTag(tagName string) bool {
+	_, allowed := getTagWhitelist()[tagName]
+	return allowed
+}
+
+// isValidAttribute reports whether attributeName is listed in the tag
+// whitelist for tagName. Tags that are not whitelisted have no valid
+// attribute.
+func isValidAttribute(tagName, attributeName string) bool {
+	allowedAttributes, known := getTagWhitelist()[tagName]
+	if !known {
+		return false
+	}
+
+	return inList(attributeName, allowedAttributes)
+}
+
+// isExternalResourceAttribute reports whether the attribute is one of the
+// URL-carrying attributes: src, href, poster or cite.
+func isExternalResourceAttribute(attribute string) bool {
+	return attribute == "src" ||
+		attribute == "href" ||
+		attribute == "poster" ||
+		attribute == "cite"
+}
+
+// isPixelTracker reports whether the tag is an img element that declares
+// both height="1" and width="1".
+func isPixelTracker(tagName string, attributes []html.Attribute) bool {
+	if tagName != "img" {
+		return false
+	}
+
+	var unitHeight, unitWidth bool
+	for _, attr := range attributes {
+		if attr.Val != "1" {
+			continue
+		}
+
+		switch attr.Key {
+		case "height":
+			unitHeight = true
+		case "width":
+			unitWidth = true
+		}
+	}
+
+	return unitHeight && unitWidth
+}
+
+// hasRequiredAttributes reports whether the attribute names kept for a tag
+// include the one the tag cannot do without: href for a, src for iframe, img
+// and source. Every other tag has no required attribute and always passes.
+func hasRequiredAttributes(tagName string, attributes []string) bool {
+	var required string
+
+	switch tagName {
+	case "a":
+		required = "href"
+	case "iframe", "img", "source":
+		required = "src"
+	default:
+		return true
+	}
+
+	for _, name := range attributes {
+		if name == required {
+			return true
+		}
+	}
+
+	return false
+}
+
+// hasValidScheme reports whether src starts with one of the allowed URL
+// schemes.
+//
+// Schemes whose URLs have no authority part (mailto:, tel:, magnet:, ...) are
+// matched on "scheme:" only; requiring "//" after them would reject every
+// real-world URL of that kind. Schemes are case-insensitive, so the
+// comparison is made on a lowercased copy of src.
+func hasValidScheme(src string) bool {
+	// See https://www.iana.org/assignments/uri-schemes/uri-schemes.xhtml
+	whitelist := []string{
+		"apt:",
+		"bitcoin:",
+		"callto:",
+		"ed2k://",
+		"facetime:",
+		"feed:",
+		"ftp://",
+		"geo:",
+		"gopher://",
+		"git://",
+		"http://",
+		"https://",
+		"irc://",
+		"irc6://",
+		"ircs://",
+		"itms://",
+		"jabber://",
+		"magnet:",
+		"mailto:",
+		"maps:",
+		"news:",
+		"nfs://",
+		"nntp:",
+		"rtmp://",
+		"sip:",
+		"sips:",
+		"skype:",
+		"smb://",
+		"sms:",
+		"spotify:",
+		"ssh://",
+		"sftp://",
+		"steam://",
+		"svn://",
+		"tel:",
+		"webcal://",
+		"xmpp:",
+	}
+
+	lowered := strings.ToLower(src)
+	for _, prefix := range whitelist {
+		if strings.HasPrefix(lowered, prefix) {
+			return true
+		}
+	}
+
+	return false
+}
+
+// isBlacklistedResource reports whether src contains one of the blacklisted
+// host or path fragments.
+func isBlacklistedResource(src string) bool {
+	for _, fragment := range [...]string{
+		"feedsportal.com",
+		"api.flattr.com",
+		"stats.wordpress.com",
+		"plus.google.com/share",
+		"twitter.com/share",
+		"feeds.feedburner.com",
+	} {
+		if strings.Contains(src, fragment) {
+			return true
+		}
+	}
+
+	return false
+}
+
+// isValidIframeSource reports whether src points to one of the whitelisted
+// iframe providers.
+//
+// A bare prefix match is not enough: "https://www.youtube.com.evil.example/"
+// and "https://vk.com@evil.example/" both start with a whitelisted origin
+// while pointing somewhere else. The origin must therefore be followed by
+// the end of the string or by the start of the path, query or fragment.
+// Sources that spell out a port after the host are rejected as a result.
+func isValidIframeSource(src string) bool {
+	whitelist := []string{
+		"http://www.youtube.com",
+		"https://www.youtube.com",
+		"http://player.vimeo.com",
+		"https://player.vimeo.com",
+		"http://www.dailymotion.com",
+		"https://www.dailymotion.com",
+		"http://vk.com",
+		"https://vk.com",
+	}
+
+	for _, origin := range whitelist {
+		if !strings.HasPrefix(src, origin) {
+			continue
+		}
+
+		rest := src[len(origin):]
+		if rest == "" || rest[0] == '/' || rest[0] == '?' || rest[0] == '#' {
+			return true
+		}
+	}
+
+	return false
+}
+
+// getTagWhitelist returns the allowed HTML tags, each mapped to the list of
+// attributes that may be kept on it. A tag mapped to an empty list is allowed
+// without any attribute of its own.
+//
+// A new map is built on every call, so callers may modify the result.
+func getTagWhitelist() map[string][]string {
+	return map[string][]string{
+		"img":        {"alt", "title", "src"},
+		"audio":      {"src"},
+		"video":      {"poster", "height", "width", "src"},
+		"source":     {"src", "type"},
+		"dt":         {},
+		"dd":         {},
+		"dl":         {},
+		"table":      {},
+		"caption":    {},
+		"thead":      {},
+		"tbody":      {},
+		"tfoot":      {}, // the table footer element is "tfoot", not "tfooter"
+		"tr":         {},
+		"td":         {"rowspan", "colspan"},
+		"th":         {"rowspan", "colspan"},
+		"h1":         {},
+		"h2":         {},
+		"h3":         {},
+		"h4":         {},
+		"h5":         {},
+		"h6":         {},
+		"strong":     {},
+		"em":         {},
+		"code":       {},
+		"pre":        {},
+		"blockquote": {},
+		"q":          {"cite"},
+		"p":          {},
+		"ul":         {},
+		"li":         {},
+		"ol":         {},
+		"br":         {},
+		"del":        {},
+		"a":          {"href", "title"},
+		"figure":     {},
+		"figcaption": {},
+		"cite":       {},
+		"time":       {"datetime"},
+		"abbr":       {"title"},
+		"acronym":    {"title"},
+		"wbr":        {},
+		"dfn":        {},
+		"sub":        {},
+		"sup":        {},
+		"var":        {},
+		"samp":       {},
+		"s":          {},
+		"ins":        {},
+		"kbd":        {},
+		"rp":         {},
+		"rt":         {},
+		"rtc":        {},
+		"ruby":       {},
+		"iframe":     {"width", "height", "frameborder", "src", "allowfullscreen"},
+	}
+}
+
+// inList reports whether needle is one of the elements of haystack.
+func inList(needle string, haystack []string) bool {
+	for i := range haystack {
+		if haystack[i] == needle {
+			return true
+		}
+	}
+
+	return false
+}
diff --git a/reader/sanitizer/sanitizer_test.go b/reader/sanitizer/sanitizer_test.go
new file mode 100644
index 0000000..73862d3
--- /dev/null
+++ b/reader/sanitizer/sanitizer_test.go
@@ -0,0 +1,144 @@
+// Copyright 2017 Frédéric Guillot. All rights reserved.
+// Use of this source code is governed by the Apache 2.0
+// license that can be found in the LICENSE file.
+
+package sanitizer
+
+import "testing"
+
+// TestValidInput checks that markup made only of whitelisted tags and
+// attributes comes out of Sanitize unchanged.
+func TestValidInput(t *testing.T) {
+	const markup = `<p>This is a <strong>text</strong> with an image: <img src="http://example.org/" alt="Test">.</p>`
+
+	if result := Sanitize("http://example.org/", markup); markup != result {
+		t.Errorf(`Wrong output: "%s" != "%s"`, markup, result)
+	}
+}
+
+// TestSelfClosingTags checks that both the "<br>" and the "<br/>" forms, as
+// well as a self-closed image, are written back exactly as given.
+func TestSelfClosingTags(t *testing.T) {
+	const markup = `<p>This <br> is a <strong>text</strong> <br/>with an image: <img src="http://example.org/" alt="Test"/>.</p>`
+
+	if result := Sanitize("http://example.org/", markup); markup != result {
+		t.Errorf(`Wrong output: "%s" != "%s"`, markup, result)
+	}
+}
+
+// TestTable checks that table markup, including a colspan attribute, is
+// preserved by Sanitize.
+func TestTable(t *testing.T) {
+	const markup = `<table><tr><th>A</th><th colspan="2">B</th></tr><tr><td>C</td><td>D</td><td>E</td></tr></table>`
+
+	if result := Sanitize("http://example.org/", markup); markup != result {
+		t.Errorf(`Wrong output: "%s" != "%s"`, markup, result)
+	}
+}
+
+// TestRelativeURL checks that relative link and image URLs are made absolute
+// using the base URL, and that links receive the extra rel, target and
+// referrerpolicy attributes.
+func TestRelativeURL(t *testing.T) {
+	const (
+		markup = `This <a href="/test.html">link is relative</a> and this image: <img src="../folder/image.png"/>`
+		want   = `This <a href="http://example.org/test.html" rel="noopener noreferrer" target="_blank" referrerpolicy="no-referrer">link is relative</a> and this image: <img src="http://example.org/folder/image.png"/>`
+	)
+
+	if got := Sanitize("http://example.org/", markup); want != got {
+		t.Errorf(`Wrong output: "%s" != "%s"`, want, got)
+	}
+}
+
+// TestProtocolRelativeURL checks that a link starting with "//" is written
+// back with the https scheme.
+func TestProtocolRelativeURL(t *testing.T) {
+	const (
+		markup = `This <a href="//static.example.org/index.html">link is relative</a>.`
+		want   = `This <a href="https://static.example.org/index.html" rel="noopener noreferrer" target="_blank" referrerpolicy="no-referrer">link is relative</a>.`
+	)
+
+	if got := Sanitize("http://example.org/", markup); want != got {
+		t.Errorf(`Wrong output: "%s" != "%s"`, want, got)
+	}
+}
+
+// TestInvalidTag checks that a tag missing from the whitelist (here <b>) is
+// removed while its text content is kept.
+func TestInvalidTag(t *testing.T) {
+	const (
+		markup = `<p>My invalid <b>tag</b>.</p>`
+		want   = `<p>My invalid tag.</p>`
+	)
+
+	if got := Sanitize("http://example.org/", markup); want != got {
+		t.Errorf(`Wrong output: "%s" != "%s"`, want, got)
+	}
+}
+
+// TestVideoTag checks that a video tag keeps its src and poster attributes
+// as absolute URLs, loses autoplay and gains the controls attribute.
+func TestVideoTag(t *testing.T) {
+	const (
+		markup = `<p>My valid <video src="videofile.webm" autoplay poster="posterimage.jpg">fallback</video>.</p>`
+		want   = `<p>My valid <video src="http://example.org/videofile.webm" poster="http://example.org/posterimage.jpg" controls>fallback</video>.</p>`
+	)
+
+	if got := Sanitize("http://example.org/", markup); want != got {
+		t.Errorf(`Wrong output: "%s" != "%s"`, want, got)
+	}
+}
+
+// TestAudioAndSourceTag checks that an audio tag ends up with a bare controls
+// attribute and that the nested source tag gets an absolute src.
+func TestAudioAndSourceTag(t *testing.T) {
+	const (
+		markup = `<p>My music <audio controls="controls"><source src="foo.wav" type="audio/wav"></audio>.</p>`
+		want   = `<p>My music <audio controls><source src="http://example.org/foo.wav" type="audio/wav"></audio>.</p>`
+	)
+
+	if got := Sanitize("http://example.org/", markup); want != got {
+		t.Errorf(`Wrong output: "%s" != "%s"`, want, got)
+	}
+}
+
+// TestUnknownTag checks that a made-up tag is removed while its text content
+// is kept.
+func TestUnknownTag(t *testing.T) {
+	const (
+		markup = `<p>My invalid <unknown>tag</unknown>.</p>`
+		want   = `<p>My invalid tag.</p>`
+	)
+
+	if got := Sanitize("http://example.org/", markup); want != got {
+		t.Errorf(`Wrong output: "%s" != "%s"`, want, got)
+	}
+}
+
+// TestInvalidNestedTag checks that a whitelisted tag nested inside a removed
+// one is still written back.
+func TestInvalidNestedTag(t *testing.T) {
+	const (
+		markup = `<p>My invalid <b>tag with some <em>valid</em> tag</b>.</p>`
+		want   = `<p>My invalid tag with some <em>valid</em> tag.</p>`
+	)
+
+	if got := Sanitize("http://example.org/", markup); want != got {
+		t.Errorf(`Wrong output: "%s" != "%s"`, want, got)
+	}
+}
+
+// TestInvalidIFrame checks that an iframe whose source is not a whitelisted
+// provider is removed entirely.
+func TestInvalidIFrame(t *testing.T) {
+	const (
+		markup = `<iframe src="http://example.org/"></iframe>`
+		want   = ``
+	)
+
+	if got := Sanitize("http://example.org/", markup); want != got {
+		t.Errorf(`Wrong output: "%s" != "%s"`, want, got)
+	}
+}
+
+// TestInvalidURLScheme checks that a link whose URL uses a scheme outside the
+// whitelist is removed while its text is kept.
+//
+// The URL is carried by href on purpose: src is not an allowed attribute of
+// <a>, so a link using src would be dropped for lacking href before the
+// scheme is ever looked at.
+func TestInvalidURLScheme(t *testing.T) {
+	input := `<p>This link is <a href="file:///etc/passwd">not valid</a></p>`
+	expected := `<p>This link is not valid</p>`
+	output := Sanitize("http://example.org/", input)
+
+	if expected != output {
+		t.Errorf(`Wrong output: "%s" != "%s"`, expected, output)
+	}
+}
+
+// TestBlacklistedLink checks that an image served from a blacklisted host is
+// removed.
+func TestBlacklistedLink(t *testing.T) {
+	const (
+		markup = `<p>This image is not valid <img src="https://stats.wordpress.com/some-tracker"></p>`
+		want   = `<p>This image is not valid </p>`
+	)
+
+	if got := Sanitize("http://example.org/", markup); want != got {
+		t.Errorf(`Wrong output: "%s" != "%s"`, want, got)
+	}
+}
+
+// TestPixelTracker checks that 1x1 images are removed in both the open and
+// the self-closed form.
+func TestPixelTracker(t *testing.T) {
+	const (
+		markup = `<p><img src="https://tracker1.example.org/" height="1" width="1"> and <img src="https://tracker2.example.org/" height="1" width="1"/></p>`
+		want   = `<p> and </p>`
+	)
+
+	if got := Sanitize("http://example.org/", markup); want != got {
+		t.Errorf(`Wrong output: "%s" != "%s"`, want, got)
+	}
+}
diff --git a/reader/sanitizer/strip_tags.go b/reader/sanitizer/strip_tags.go
new file mode 100644
index 0000000..f3cecd1
--- /dev/null
+++ b/reader/sanitizer/strip_tags.go
@@ -0,0 +1,35 @@
+// Copyright 2017 Frédéric Guillot. All rights reserved.
+// Use of this source code is governed by the Apache 2.0
+// license that can be found in the LICENSE file.
+
+package sanitizer
+
+import (
+ "bytes"
+ "io"
+
+ "golang.org/x/net/html"
+)
+
+// StripTags removes all HTML/XML tags from the input string.
+//
+// Only the content of text tokens is kept. An empty string is returned when
+// the tokenizer stops on an error other than io.EOF.
+func StripTags(input string) string {
+	var text bytes.Buffer
+	tokenizer := html.NewTokenizer(bytes.NewBufferString(input))
+
+	for {
+		switch tokenizer.Next() {
+		case html.ErrorToken:
+			if tokenizer.Err() == io.EOF {
+				return text.String()
+			}
+
+			return ""
+		case html.TextToken:
+			text.WriteString(tokenizer.Token().Data)
+		}
+	}
+}
diff --git a/reader/sanitizer/strip_tags_test.go b/reader/sanitizer/strip_tags_test.go
new file mode 100644
index 0000000..763ccc9
--- /dev/null
+++ b/reader/sanitizer/strip_tags_test.go
@@ -0,0 +1,17 @@
+// Copyright 2017 Frédéric Guillot. All rights reserved.
+// Use of this source code is governed by the Apache 2.0
+// license that can be found in the LICENSE file.
+
+package sanitizer
+
+import "testing"
+
+// TestStripTags checks that every tag is removed and only the text between
+// the tags is left.
+func TestStripTags(t *testing.T) {
+	const (
+		markup = `This <a href="/test.html">link is relative</a> and <strong>this</strong> image: <img src="../folder/image.png"/>`
+		want   = `This link is relative and this image: `
+	)
+
+	if got := StripTags(markup); want != got {
+		t.Errorf(`Wrong output: "%s" != "%s"`, want, got)
+	}
+}