about summary refs log tree commit diff homepage
path: root/reader/scraper
diff options
context:
space:
mode:
author: Frédéric Guillot <fred@miniflux.net> 2018-10-14 11:46:41 -0700
committer: Frédéric Guillot <fred@miniflux.net> 2018-10-14 11:46:41 -0700
commit: 5870f0426002c8e26a9ff472b23e15d7bf1235f7 (patch)
tree: d303a45c49d0433a5d2f7bed1bcce25d85a8bc0f /reader/scraper
parent: d5ff4191b6087b96b88221c6d939f73931001f2a (diff)
Simplify feed parser and format detection
- Avoid doing multiple buffer copies
- Move parser and format detection logic to its own package
Diffstat (limited to 'reader/scraper')
-rw-r--r-- reader/scraper/scraper.go 13
1 file changed, 6 insertions, 7 deletions
diff --git a/reader/scraper/scraper.go b/reader/scraper/scraper.go
index 7aa7084..b62d1ca 100644
--- a/reader/scraper/scraper.go
+++ b/reader/scraper/scraper.go
@@ -18,7 +18,7 @@ import (
"github.com/PuerkitoBio/goquery"
)
-// Fetch downloads a web page a returns relevant contents.
+// Fetch downloads a web page and returns relevant contents.
func Fetch(websiteURL, rules, userAgent string) (string, error) {
clt := client.New(websiteURL)
if userAgent != "" {
@@ -38,8 +38,7 @@ func Fetch(websiteURL, rules, userAgent string) (string, error) {
return "", fmt.Errorf("scraper: this resource is not a HTML document (%s)", response.ContentType)
}
- page, err := response.NormalizeBodyEncoding()
- if err != nil {
+ if err = response.EnsureUnicodeBody(); err != nil {
return "", err
}
@@ -52,11 +51,11 @@ func Fetch(websiteURL, rules, userAgent string) (string, error) {
var content string
if rules != "" {
- logger.Debug(`[Scraper] Using rules "%s" for "%s"`, rules, websiteURL)
- content, err = scrapContent(page, rules)
+ logger.Debug(`[Scraper] Using rules %q for %q`, rules, websiteURL)
+ content, err = scrapContent(response.Body, rules)
} else {
- logger.Debug(`[Scraper] Using readability for "%s"`, websiteURL)
- content, err = readability.ExtractContent(page)
+ logger.Debug(`[Scraper] Using readability for %q`, websiteURL)
+ content, err = readability.ExtractContent(response.Body)
}
if err != nil {