Allow the scraper to parse XHTML documents

Previously, only the "text/html" content type was accepted; "application/xhtml+xml" is now accepted as well.
author: Frédéric Guillot <fred@miniflux.net> 2018-11-03 13:44:13 -0700
committer: Frédéric Guillot <fred@miniflux.net> 2018-11-03 13:44:13 -0700
commit: 3b6e44c33131eb06287d070c3f4b6d19b465f4f5 (patch)
tree: b49cc679415691a4ce48da0aefe3e2afa06a80db /reader/scraper/scraper.go
parent: 1ff9950a551e1f96415cf918cd306b4eefb65c07 (diff)
1 files changed, 7 insertions, 1 deletions
diff --git a/reader/scraper/scraper.go b/reader/scraper/scraper.go
index b62d1ca..58f37d5 100644
--- a/reader/scraper/scraper.go
+++ b/reader/scraper/scraper.go
@@ -34,7 +34,7 @@ func Fetch(websiteURL, rules, userAgent string) (string, error) {
 		return "", errors.New("scraper: unable to download web page")
 	}
 
-	if !strings.Contains(response.ContentType, "text/html") {
+	if !isWhitelistedContentType(response.ContentType) {
 		return "", fmt.Errorf("scraper: this resource is not a HTML document (%s)", response.ContentType)
 	}
 
@@ -99,3 +99,9 @@ func getPredefinedScraperRules(websiteURL string) string {
 
 	return ""
 }
+
+func isWhitelistedContentType(contentType string) bool {
+	contentType = strings.ToLower(contentType)
+	return strings.HasPrefix(contentType, "text/html") ||
+		strings.HasPrefix(contentType, "application/xhtml+xml")
+}
author	Frédéric Guillot <fred@miniflux.net>	2018-11-03 13:44:13 -0700
committer	Frédéric Guillot <fred@miniflux.net>	2018-11-03 13:44:13 -0700
commit	3b6e44c33131eb06287d070c3f4b6d19b465f4f5 (patch)
tree	b49cc679415691a4ce48da0aefe3e2afa06a80db /reader/scraper/scraper.go
parent	1ff9950a551e1f96415cf918cd306b4eefb65c07 (diff)