diff options
author | cinput <59012355+cinput@users.noreply.github.com> | 2019-12-21 21:18:31 -0800 |
---|---|---|
committer | Frédéric Guillot <fred@miniflux.net> | 2019-12-21 21:18:31 -0800 |
commit | 8e1ed8bef3b65cb75c476ebafc1492fea97754d5 (patch) | |
tree | f539b6d354ccb4ecec548fc42bcf58a21c63ab74 | |
parent | 30f22fbd78fcbd8e46f4c449928bfca289913efd (diff) |
Return outer HTML when scraping elements
-rw-r--r-- | reader/scraper/scraper.go | 8 |
-rw-r--r-- | reader/scraper/scraper_test.go | 36 |
-rw-r--r-- | reader/scraper/testdata/iframe.html | 12 |
-rw-r--r-- | reader/scraper/testdata/iframe.html-result | 1 |
-rw-r--r-- | reader/scraper/testdata/img.html | 12 |
-rw-r--r-- | reader/scraper/testdata/img.html-result | 1 |
-rw-r--r-- | reader/scraper/testdata/p.html | 10 |
-rw-r--r-- | reader/scraper/testdata/p.html-result | 1 |
8 files changed, 73 insertions, 8 deletions
diff --git a/reader/scraper/scraper.go b/reader/scraper/scraper.go index fd5ba33..045bfbc 100644 --- a/reader/scraper/scraper.go +++ b/reader/scraper/scraper.go @@ -75,13 +75,7 @@ func scrapContent(page io.Reader, rules string) (string, error) { document.Find(rules).Each(func(i int, s *goquery.Selection) { var content string - // For some inline elements, we get the parent. - if s.Is("img") || s.Is("iframe") { - content, _ = s.Parent().Html() - } else { - content, _ = s.Html() - } - + content, _ = goquery.OuterHtml(s) contents += content }) diff --git a/reader/scraper/scraper_test.go b/reader/scraper/scraper_test.go index 3f7e1ef..14e13ee 100644 --- a/reader/scraper/scraper_test.go +++ b/reader/scraper/scraper_test.go @@ -4,7 +4,12 @@ package scraper // import "miniflux.app/reader/scraper" -import "testing" +import ( + "bytes" + "io/ioutil" + "strings" + "testing" +) func TestGetPredefinedRules(t *testing.T) { if getPredefinedScraperRules("http://www.phoronix.com/") == "" { @@ -40,3 +45,32 @@ func TestWhitelistedContentTypes(t *testing.T) { } } } + +func TestSelectorRules(t *testing.T) { + var ruleTestCases = map[string]string { + "img.html": "article > img", + "iframe.html": "article > iframe", + "p.html": "article > p", + } + + for filename, rule := range ruleTestCases { + html, err := ioutil.ReadFile("testdata/" + filename) + if err != nil { + t.Fatalf(`Unable to read file %q: %v`, filename, err) + } + + actualResult, err := scrapContent(bytes.NewReader(html), rule) + if err != nil { + t.Fatalf(`Scraping error for %q - %q: %v`, filename, rule, err) + } + + expectedResult, err := ioutil.ReadFile("testdata/" + filename + "-result") + if err != nil { + t.Fatalf(`Unable to read file %q: %v`, filename, err) + } + + if actualResult != strings.TrimSpace(string(expectedResult)) { + t.Errorf(`Unexpected result for %q, got "%s" instead of "%s"`, rule, actualResult, expectedResult) + } + } +} diff --git a/reader/scraper/testdata/iframe.html 
b/reader/scraper/testdata/iframe.html new file mode 100644 index 0000000..5095b7d --- /dev/null +++ b/reader/scraper/testdata/iframe.html @@ -0,0 +1,12 @@ +<!DOCTYPE html> +<html lang="en-US"> + <body> + <article> + <iframe id="1" src="about:blank"></iframe> + <iframe id="2" src="about:blank"></iframe> + <iframe id="3" src="about:blank"></iframe> + <iframe id="4" src="about:blank"></iframe> + <iframe id="5" src="about:blank"></iframe> + </article> + </body> +</html> diff --git a/reader/scraper/testdata/iframe.html-result b/reader/scraper/testdata/iframe.html-result new file mode 100644 index 0000000..2d2a5ed --- /dev/null +++ b/reader/scraper/testdata/iframe.html-result @@ -0,0 +1 @@ +<iframe id="1" src="about:blank"></iframe><iframe id="2" src="about:blank"></iframe><iframe id="3" src="about:blank"></iframe><iframe id="4" src="about:blank"></iframe><iframe id="5" src="about:blank"></iframe> diff --git a/reader/scraper/testdata/img.html b/reader/scraper/testdata/img.html new file mode 100644 index 0000000..33ec780 --- /dev/null +++ b/reader/scraper/testdata/img.html @@ -0,0 +1,12 @@ +<!DOCTYPE html> +<html lang="en-US"> + <body> + <article> + <img id="1" src="#" alt="" /> + <img id="2" src="#" alt="" /> + <img id="3" src="#" alt="" /> + <img id="4" src="#" alt="" /> + <img id="5" src="#" alt="" /> + </article> + </body> +</html> diff --git a/reader/scraper/testdata/img.html-result b/reader/scraper/testdata/img.html-result new file mode 100644 index 0000000..54e1019 --- /dev/null +++ b/reader/scraper/testdata/img.html-result @@ -0,0 +1 @@ +<img id="1" src="#" alt=""/><img id="2" src="#" alt=""/><img id="3" src="#" alt=""/><img id="4" src="#" alt=""/><img id="5" src="#" alt=""/> diff --git a/reader/scraper/testdata/p.html b/reader/scraper/testdata/p.html new file mode 100644 index 0000000..45a24c0 --- /dev/null +++ b/reader/scraper/testdata/p.html @@ -0,0 +1,10 @@ +<!DOCTYPE html> +<html lang="en-US"> + <body> + <article> + <p>Lorem ipsum dolor sit amet, consectetuer 
adipiscing ept.</p> + <p>Apquam tincidunt mauris eu risus.</p> + <p>Vestibulum auctor dapibus neque.</p> + </article> + </body> +</html> diff --git a/reader/scraper/testdata/p.html-result b/reader/scraper/testdata/p.html-result new file mode 100644 index 0000000..430f015 --- /dev/null +++ b/reader/scraper/testdata/p.html-result @@ -0,0 +1 @@ +<p>Lorem ipsum dolor sit amet, consectetuer adipiscing ept.</p><p>Apquam tincidunt mauris eu risus.</p><p>Vestibulum auctor dapibus neque.</p> |