diff options
author | Frédéric Guillot <fred@miniflux.net> | 2018-11-03 13:44:13 -0700 |
---|---|---|
committer | Frédéric Guillot <fred@miniflux.net> | 2018-11-03 13:44:13 -0700 |
commit | 3b6e44c33131eb06287d070c3f4b6d19b465f4f5 (patch) | |
tree | b49cc679415691a4ce48da0aefe3e2afa06a80db /reader | |
parent | 1ff9950a551e1f96415cf918cd306b4eefb65c07 (diff) |
Allow the scraper to parse XHTML documents
Only "text/html" was authorized before.
Diffstat (limited to 'reader')
-rw-r--r-- | reader/scraper/scraper.go | 8 | ||||
-rw-r--r-- | reader/scraper/scraper_test.go | 21 |
2 files changed, 28 insertions, 1 deletion
```diff
diff --git a/reader/scraper/scraper.go b/reader/scraper/scraper.go
index b62d1ca..58f37d5 100644
--- a/reader/scraper/scraper.go
+++ b/reader/scraper/scraper.go
@@ -34,7 +34,7 @@ func Fetch(websiteURL, rules, userAgent string) (string, error) {
 		return "", errors.New("scraper: unable to download web page")
 	}
 
-	if !strings.Contains(response.ContentType, "text/html") {
+	if !isWhitelistedContentType(response.ContentType) {
 		return "", fmt.Errorf("scraper: this resource is not a HTML document (%s)", response.ContentType)
 	}
 
@@ -99,3 +99,9 @@ func getPredefinedScraperRules(websiteURL string) string {
 
 	return ""
 }
+
+func isWhitelistedContentType(contentType string) bool {
+	contentType = strings.ToLower(contentType)
+	return strings.HasPrefix(contentType, "text/html") ||
+		strings.HasPrefix(contentType, "application/xhtml+xml")
+}
diff --git a/reader/scraper/scraper_test.go b/reader/scraper/scraper_test.go
index e43ac11..3f7e1ef 100644
--- a/reader/scraper/scraper_test.go
+++ b/reader/scraper/scraper_test.go
@@ -19,3 +19,24 @@ func TestGetPredefinedRules(t *testing.T) {
 		t.Error("A rule not defined should not return anything")
 	}
 }
+
+func TestWhitelistedContentTypes(t *testing.T) {
+	scenarios := map[string]bool{
+		"text/html": true,
+		"TeXt/hTmL": true,
+		"application/xhtml+xml": true,
+		"text/html; charset=utf-8": true,
+		"application/xhtml+xml; charset=utf-8": true,
+		"text/css": false,
+		"application/javascript": false,
+		"image/png": false,
+		"application/pdf": false,
+	}
+
+	for inputValue, expectedResult := range scenarios {
+		actualResult := isWhitelistedContentType(inputValue)
+		if actualResult != expectedResult {
+			t.Errorf(`Unexpected result for content type whitelist, got "%v" instead of "%v"`, actualResult, expectedResult)
+		}
+	}
+}
```