From 8e1ed8bef3b65cb75c476ebafc1492fea97754d5 Mon Sep 17 00:00:00 2001 From: cinput <59012355+cinput@users.noreply.github.com> Date: Sat, 21 Dec 2019 21:18:31 -0800 Subject: Return outer HTML when scraping elements --- reader/scraper/scraper.go | 8 +------ reader/scraper/scraper_test.go | 36 +++++++++++++++++++++++++++++- reader/scraper/testdata/iframe.html | 12 ++++++++++ reader/scraper/testdata/iframe.html-result | 1 + reader/scraper/testdata/img.html | 12 ++++++++++ reader/scraper/testdata/img.html-result | 1 + reader/scraper/testdata/p.html | 10 +++++++++ reader/scraper/testdata/p.html-result | 1 + 8 files changed, 73 insertions(+), 8 deletions(-) create mode 100644 reader/scraper/testdata/iframe.html create mode 100644 reader/scraper/testdata/iframe.html-result create mode 100644 reader/scraper/testdata/img.html create mode 100644 reader/scraper/testdata/img.html-result create mode 100644 reader/scraper/testdata/p.html create mode 100644 reader/scraper/testdata/p.html-result diff --git a/reader/scraper/scraper.go b/reader/scraper/scraper.go index fd5ba33..045bfbc 100644 --- a/reader/scraper/scraper.go +++ b/reader/scraper/scraper.go @@ -75,13 +75,7 @@ func scrapContent(page io.Reader, rules string) (string, error) { document.Find(rules).Each(func(i int, s *goquery.Selection) { var content string - // For some inline elements, we get the parent. - if s.Is("img") || s.Is("iframe") { - content, _ = s.Parent().Html() - } else { - content, _ = s.Html() - } - + content, _ = goquery.OuterHtml(s) contents += content }) diff --git a/reader/scraper/scraper_test.go b/reader/scraper/scraper_test.go index 3f7e1ef..14e13ee 100644 --- a/reader/scraper/scraper_test.go +++ b/reader/scraper/scraper_test.go @@ -4,7 +4,12 @@ package scraper // import "miniflux.app/reader/scraper" -import "testing" +import ( + "bytes" + "io/ioutil" + "strings" + "testing" +) func TestGetPredefinedRules(t *testing.T) { if getPredefinedScraperRules("http://www.phoronix.com/") == "" { @@ -40,3 +45,32 @@ func TestWhitelistedContentTypes(t *testing.T) { } } } + +func TestSelectorRules(t *testing.T) { + var ruleTestCases = map[string]string { + "img.html": "article > img", + "iframe.html": "article > iframe", + "p.html": "article > p", + } + + for filename, rule := range ruleTestCases { + html, err := ioutil.ReadFile("testdata/" + filename) + if err != nil { + t.Fatalf(`Unable to read file %q: %v`, filename, err) + } + + actualResult, err := scrapContent(bytes.NewReader(html), rule) + if err != nil { + t.Fatalf(`Scraping error for %q - %q: %v`, filename, rule, err) + } + + expectedResult, err := ioutil.ReadFile("testdata/" + filename + "-result") + if err != nil { + t.Fatalf(`Unable to read file %q: %v`, filename, err) + } + + if actualResult != strings.TrimSpace(string(expectedResult)) { + t.Errorf(`Unexpected result for %q, got "%s" instead of "%s"`, rule, actualResult, expectedResult) + } + } +} diff --git a/reader/scraper/testdata/iframe.html b/reader/scraper/testdata/iframe.html new file mode 100644 index 0000000..5095b7d --- /dev/null +++ b/reader/scraper/testdata/iframe.html @@ -0,0 +1,12 @@ + + +
+Lorem ipsum dolor sit amet, consectetuer adipiscing ept.
+Apquam tincidunt mauris eu risus.
+Vestibulum auctor dapibus neque.
+Lorem ipsum dolor sit amet, consectetuer adipiscing ept.
Apquam tincidunt mauris eu risus.
Vestibulum auctor dapibus neque.
-- cgit v1.2.3