about | summary | refs | log | tree | commit | diff | homepage
diff options
context:
space:
mode:
author: cinput <59012355+cinput@users.noreply.github.com> 2019-12-21 21:18:31 -0800
committer: Frédéric Guillot <fred@miniflux.net> 2019-12-21 21:18:31 -0800
commit: 8e1ed8bef3b65cb75c476ebafc1492fea97754d5 (patch)
tree: f539b6d354ccb4ecec548fc42bcf58a21c63ab74
parent: 30f22fbd78fcbd8e46f4c449928bfca289913efd (diff)
Return outer HTML when scraping elements
-rw-r--r-- reader/scraper/scraper.go 8
-rw-r--r-- reader/scraper/scraper_test.go 36
-rw-r--r-- reader/scraper/testdata/iframe.html 12
-rw-r--r-- reader/scraper/testdata/iframe.html-result 1
-rw-r--r-- reader/scraper/testdata/img.html 12
-rw-r--r-- reader/scraper/testdata/img.html-result 1
-rw-r--r-- reader/scraper/testdata/p.html 10
-rw-r--r-- reader/scraper/testdata/p.html-result 1
8 files changed, 73 insertions, 8 deletions
diff --git a/reader/scraper/scraper.go b/reader/scraper/scraper.go
index fd5ba33..045bfbc 100644
--- a/reader/scraper/scraper.go
+++ b/reader/scraper/scraper.go
@@ -75,13 +75,7 @@ func scrapContent(page io.Reader, rules string) (string, error) {
document.Find(rules).Each(func(i int, s *goquery.Selection) {
var content string
- // For some inline elements, we get the parent.
- if s.Is("img") || s.Is("iframe") {
- content, _ = s.Parent().Html()
- } else {
- content, _ = s.Html()
- }
-
+ content, _ = goquery.OuterHtml(s)
contents += content
})
diff --git a/reader/scraper/scraper_test.go b/reader/scraper/scraper_test.go
index 3f7e1ef..14e13ee 100644
--- a/reader/scraper/scraper_test.go
+++ b/reader/scraper/scraper_test.go
@@ -4,7 +4,12 @@
package scraper // import "miniflux.app/reader/scraper"
-import "testing"
+import (
+ "bytes"
+ "io/ioutil"
+ "strings"
+ "testing"
+)
func TestGetPredefinedRules(t *testing.T) {
if getPredefinedScraperRules("http://www.phoronix.com/") == "" {
@@ -40,3 +45,32 @@ func TestWhitelistedContentTypes(t *testing.T) {
}
}
}
+
+func TestSelectorRules(t *testing.T) {
+ var ruleTestCases = map[string]string {
+ "img.html": "article > img",
+ "iframe.html": "article > iframe",
+ "p.html": "article > p",
+ }
+
+ for filename, rule := range ruleTestCases {
+ html, err := ioutil.ReadFile("testdata/" + filename)
+ if err != nil {
+ t.Fatalf(`Unable to read file %q: %v`, filename, err)
+ }
+
+ actualResult, err := scrapContent(bytes.NewReader(html), rule)
+ if err != nil {
+ t.Fatalf(`Scraping error for %q - %q: %v`, filename, rule, err)
+ }
+
+ expectedResult, err := ioutil.ReadFile("testdata/" + filename + "-result")
+ if err != nil {
+ t.Fatalf(`Unable to read file %q: %v`, filename, err)
+ }
+
+ if actualResult != strings.TrimSpace(string(expectedResult)) {
+ t.Errorf(`Unexpected result for %q, got "%s" instead of "%s"`, rule, actualResult, expectedResult)
+ }
+ }
+}
diff --git a/reader/scraper/testdata/iframe.html b/reader/scraper/testdata/iframe.html
new file mode 100644
index 0000000..5095b7d
--- /dev/null
+++ b/reader/scraper/testdata/iframe.html
@@ -0,0 +1,12 @@
+<!DOCTYPE html>
+<html lang="en-US">
+ <body>
+ <article>
+ <iframe id="1" src="about:blank"></iframe>
+ <iframe id="2" src="about:blank"></iframe>
+ <iframe id="3" src="about:blank"></iframe>
+ <iframe id="4" src="about:blank"></iframe>
+ <iframe id="5" src="about:blank"></iframe>
+ </article>
+ </body>
+</html>
diff --git a/reader/scraper/testdata/iframe.html-result b/reader/scraper/testdata/iframe.html-result
new file mode 100644
index 0000000..2d2a5ed
--- /dev/null
+++ b/reader/scraper/testdata/iframe.html-result
@@ -0,0 +1 @@
+<iframe id="1" src="about:blank"></iframe><iframe id="2" src="about:blank"></iframe><iframe id="3" src="about:blank"></iframe><iframe id="4" src="about:blank"></iframe><iframe id="5" src="about:blank"></iframe>
diff --git a/reader/scraper/testdata/img.html b/reader/scraper/testdata/img.html
new file mode 100644
index 0000000..33ec780
--- /dev/null
+++ b/reader/scraper/testdata/img.html
@@ -0,0 +1,12 @@
+<!DOCTYPE html>
+<html lang="en-US">
+ <body>
+ <article>
+ <img id="1" src="#" alt="" />
+ <img id="2" src="#" alt="" />
+ <img id="3" src="#" alt="" />
+ <img id="4" src="#" alt="" />
+ <img id="5" src="#" alt="" />
+ </article>
+ </body>
+</html>
diff --git a/reader/scraper/testdata/img.html-result b/reader/scraper/testdata/img.html-result
new file mode 100644
index 0000000..54e1019
--- /dev/null
+++ b/reader/scraper/testdata/img.html-result
@@ -0,0 +1 @@
+<img id="1" src="#" alt=""/><img id="2" src="#" alt=""/><img id="3" src="#" alt=""/><img id="4" src="#" alt=""/><img id="5" src="#" alt=""/>
diff --git a/reader/scraper/testdata/p.html b/reader/scraper/testdata/p.html
new file mode 100644
index 0000000..45a24c0
--- /dev/null
+++ b/reader/scraper/testdata/p.html
@@ -0,0 +1,10 @@
+<!DOCTYPE html>
+<html lang="en-US">
+ <body>
+ <article>
+ <p>Lorem ipsum dolor sit amet, consectetuer adipiscing ept.</p>
+ <p>Apquam tincidunt mauris eu risus.</p>
+ <p>Vestibulum auctor dapibus neque.</p>
+ </article>
+ </body>
+</html>
diff --git a/reader/scraper/testdata/p.html-result b/reader/scraper/testdata/p.html-result
new file mode 100644
index 0000000..430f015
--- /dev/null
+++ b/reader/scraper/testdata/p.html-result
@@ -0,0 +1 @@
+<p>Lorem ipsum dolor sit amet, consectetuer adipiscing ept.</p><p>Apquam tincidunt mauris eu risus.</p><p>Vestibulum auctor dapibus neque.</p>