From 5870f0426002c8e26a9ff472b23e15d7bf1235f7 Mon Sep 17 00:00:00 2001
From: Frédéric Guillot
Date: Sun, 14 Oct 2018 11:46:41 -0700
Subject: Simplify feed parser and format detection

- Avoid doing multiple buffer copies
- Move parser and format detection logic to its own package
---
 reader/scraper/scraper.go | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

(limited to 'reader/scraper')

diff --git a/reader/scraper/scraper.go b/reader/scraper/scraper.go
index 7aa7084..b62d1ca 100644
--- a/reader/scraper/scraper.go
+++ b/reader/scraper/scraper.go
@@ -18,7 +18,7 @@ import (
 	"github.com/PuerkitoBio/goquery"
 )
 
-// Fetch downloads a web page a returns relevant contents.
+// Fetch downloads a web page and returns relevant contents.
 func Fetch(websiteURL, rules, userAgent string) (string, error) {
 	clt := client.New(websiteURL)
 	if userAgent != "" {
@@ -38,8 +38,7 @@ func Fetch(websiteURL, rules, userAgent string) (string, error) {
 		return "", fmt.Errorf("scraper: this resource is not a HTML document (%s)", response.ContentType)
 	}
 
-	page, err := response.NormalizeBodyEncoding()
-	if err != nil {
+	if err = response.EnsureUnicodeBody(); err != nil {
 		return "", err
 	}
 
@@ -52,11 +51,11 @@ func Fetch(websiteURL, rules, userAgent string) (string, error) {
 
 	var content string
 	if rules != "" {
-		logger.Debug(`[Scraper] Using rules "%s" for "%s"`, rules, websiteURL)
-		content, err = scrapContent(page, rules)
+		logger.Debug(`[Scraper] Using rules %q for %q`, rules, websiteURL)
+		content, err = scrapContent(response.Body, rules)
 	} else {
-		logger.Debug(`[Scraper] Using readability for "%s"`, websiteURL)
-		content, err = readability.ExtractContent(page)
+		logger.Debug(`[Scraper] Using readability for %q`, websiteURL)
+		content, err = readability.ExtractContent(response.Body)
 	}
 
 	if err != nil {
--
cgit v1.2.3
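
The central change above swaps the copy-returning NormalizeBodyEncoding() for EnsureUnicodeBody(), which normalizes response.Body in place so the later consumers (scrapContent, readability.ExtractContent) read from the same stream instead of from a second buffer. A minimal sketch of that in-place pattern, assuming a response type that wraps an io.Reader and uses the charset detection from golang.org/x/net/html/charset; the struct and field layout here are illustrative, not the actual miniflux client code:

package client

import (
	"io"

	"golang.org/x/net/html/charset"
)

// Response is a simplified stand-in for the real HTTP client response type.
type Response struct {
	Body        io.Reader
	ContentType string
}

// EnsureUnicodeBody rewraps Body with a converting reader so that every
// later read yields UTF-8, without buffering the whole payload first.
func (r *Response) EnsureUnicodeBody() error {
	// charset.NewReader picks the source encoding from the Content-Type
	// header and, if needed, from the first bytes of the stream.
	body, err := charset.NewReader(r.Body, r.ContentType)
	if err != nil {
		return err
	}
	r.Body = body
	return nil
}

Because charset.NewReader wraps the underlying reader rather than draining it, decoding happens lazily as the scraper reads, which fits the commit's stated goal of avoiding multiple buffer copies.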