about | summary | refs | log | tree | commit | diff | homepage
path: root/reader
diff options
context:
space:
mode:
author: Frédéric Guillot <fred@miniflux.net>, 2018-11-03 13:44:13 -0700
committer: Frédéric Guillot <fred@miniflux.net>, 2018-11-03 13:44:13 -0700
commit: 3b6e44c33131eb06287d070c3f4b6d19b465f4f5 (patch)
tree: b49cc679415691a4ce48da0aefe3e2afa06a80db /reader
parent: 1ff9950a551e1f96415cf918cd306b4eefb65c07 (diff)
Allow the scraper to parse XHTML documents
Only "text/html" was authorized before.
Diffstat (limited to 'reader')
-rw-r--r-- reader/scraper/scraper.go 8
-rw-r--r-- reader/scraper/scraper_test.go 21
2 files changed, 28 insertions, 1 deletions
diff --git a/reader/scraper/scraper.go b/reader/scraper/scraper.go
index b62d1ca..58f37d5 100644
--- a/reader/scraper/scraper.go
+++ b/reader/scraper/scraper.go
@@ -34,7 +34,7 @@ func Fetch(websiteURL, rules, userAgent string) (string, error) {
return "", errors.New("scraper: unable to download web page")
}
- if !strings.Contains(response.ContentType, "text/html") {
+ if !isWhitelistedContentType(response.ContentType) {
return "", fmt.Errorf("scraper: this resource is not a HTML document (%s)", response.ContentType)
}
@@ -99,3 +99,9 @@ func getPredefinedScraperRules(websiteURL string) string {
return ""
}
+
+func isWhitelistedContentType(contentType string) bool { // isWhitelistedContentType reports whether contentType begins with "text/html" or "application/xhtml+xml", ignoring case.
+ contentType = strings.ToLower(contentType) // lowercase first so the prefix comparisons below are case-insensitive
+ return strings.HasPrefix(contentType, "text/html") || // prefix match: trailing parameters such as "; charset=utf-8" are accepted
+ strings.HasPrefix(contentType, "application/xhtml+xml") // NOTE(review): any longer value sharing these prefixes also matches; leading whitespace is not trimmed
+}
diff --git a/reader/scraper/scraper_test.go b/reader/scraper/scraper_test.go
index e43ac11..3f7e1ef 100644
--- a/reader/scraper/scraper_test.go
+++ b/reader/scraper/scraper_test.go
@@ -19,3 +19,24 @@ func TestGetPredefinedRules(t *testing.T) {
t.Error("A rule not defined should not return anything")
}
}
+
+func TestWhitelistedContentTypes(t *testing.T) { // TestWhitelistedContentTypes checks isWhitelistedContentType against a table of content type strings.
+ scenarios := map[string]bool{ // key: content type passed in; value: expected return value
+ "text/html": true,
+ "TeXt/hTmL": true, // mixed case is expected to match
+ "application/xhtml+xml": true,
+ "text/html; charset=utf-8": true, // a charset parameter after the media type is expected to match
+ "application/xhtml+xml; charset=utf-8": true,
+ "text/css": false, // the remaining entries are non-HTML types expected to be rejected
+ "application/javascript": false,
+ "image/png": false,
+ "application/pdf": false,
+ }
+
+ for inputValue, expectedResult := range scenarios { // Go map iteration order is unspecified, so scenarios run in arbitrary order
+ actualResult := isWhitelistedContentType(inputValue)
+ if actualResult != expectedResult {
+ t.Errorf(`Unexpected result for content type whitelist, got "%v" instead of "%v"`, actualResult, expectedResult) // NOTE(review): inputValue is not part of the message, so a failure does not show which scenario failed
+ }
+ }
+}