Add scraper rules

author: Frédéric Guillot <fred@miniflux.net> 2017-12-10 20:51:04 -0800
committer: Frédéric Guillot <fred@miniflux.net> 2017-12-10 20:51:04 -0800
commit: 87ccad5c7f1edf8bce37af547ca1659326398fa8 (patch)
tree: 4998a78ac68b1c2c03e6152d26567928cdf042ec /reader/scraper
parent: 7a35c58f53d76356292e3e0ca9c91add3595a9e0 (diff)
3 files changed, 91 insertions, 2 deletions
diff --git a/reader/scraper/rules.go b/reader/scraper/rules.go
new file mode 100644
index 0000000..ae6c4a5
--- /dev/null
+++ b/reader/scraper/rules.go
@@ -0,0 +1,16 @@
+// Copyright 2017 Frédéric Guillot. All rights reserved.
+// Use of this source code is governed by the Apache 2.0
+// license that can be found in the LICENSE file.
+
+package scraper
+
+// predefinedRules maps a website domain to the CSS selectors used to scrape
+// pages of that domain. Keep the entries sorted alphabetically by domain.
+var predefinedRules = map[string]string{
+	"lemonde.fr":        "div#articleBody",
+	"lesjoiesducode.fr": ".blog-post-content img",
+	"linux.com":         "div.content, div[property]",
+	"opensource.com":    "div[property]",
+	"phoronix.com":      "div.content",
+	"techcrunch.com":    "div.article-entry",
+}
diff --git a/reader/scraper/scraper.go b/reader/scraper/scraper.go
index 6c51862..b79a088 100644
--- a/reader/scraper/scraper.go
+++ b/reader/scraper/scraper.go
@@ -6,14 +6,19 @@ package scraper
 
 import (
 	"errors"
+	"io"
+	"log"
+	"strings"
 
+	"github.com/PuerkitoBio/goquery"
 	"github.com/miniflux/miniflux2/http"
 	"github.com/miniflux/miniflux2/reader/readability"
 	"github.com/miniflux/miniflux2/reader/sanitizer"
+	"github.com/miniflux/miniflux2/url"
 )
 
 // Fetch download a web page a returns relevant contents.
-func Fetch(websiteURL string) (string, error) {
+func Fetch(websiteURL, rules string) (string, error) {
 	client := http.NewClient(websiteURL)
 	response, err := client.Get()
 	if err != nil {
@@ -29,10 +34,57 @@ func Fetch(websiteURL string) (string, error) {
 		return "", err
 	}
 
-	content, err := readability.ExtractContent(page)
+	var content string
+	if rules == "" {
+		rules = getPredefinedScraperRules(websiteURL)
+	}
+
+	if rules != "" {
+		log.Printf(`[Scraper] Using rules "%s" for "%s"`, rules, websiteURL)
+		content, err = scrapContent(page, rules)
+	} else {
+		log.Printf(`[Scraper] Using readability for "%s"`, websiteURL)
+		content, err = readability.ExtractContent(page)
+	}
+
 	if err != nil {
 		return "", err
 	}
 
 	return sanitizer.Sanitize(websiteURL, content), nil
 }
+
+// scrapContent parses page as HTML and concatenates the inner HTML of each element
+// matched by the CSS selectors in rules; it fails only when the page cannot be parsed.
+func scrapContent(page io.Reader, rules string) (string, error) {
+	doc, err := goquery.NewDocumentFromReader(page)
+	if err != nil {
+		return "", err
+	}
+
+	var fragments string
+	doc.Find(rules).Each(func(_ int, match *goquery.Selection) {
+		target := match
+		if match.Is("img") {
+			// An <img> match contributes its parent's inner HTML instead of its own.
+			target = match.Parent()
+		}
+		// Best effort: a rendering error is ignored and whatever Html returned is appended.
+		inner, _ := target.Html()
+		fragments += inner
+	})
+
+	return fragments, nil
+}
+
+// getPredefinedScraperRules returns the rules registered for the value url.Domain reports
+// for websiteURL, or for a parent domain of it (dot boundary, not substring); "" if none.
+func getPredefinedScraperRules(websiteURL string) string {
+	urlDomain := url.Domain(websiteURL)
+	for domain, rules := range predefinedRules {
+		if urlDomain == domain || strings.HasSuffix(urlDomain, "."+domain) {
+			return rules
+		}
+	}
+	return ""
+}
diff --git a/reader/scraper/scraper_test.go b/reader/scraper/scraper_test.go
new file mode 100644
index 0000000..b493e25
--- /dev/null
+++ b/reader/scraper/scraper_test.go
@@ -0,0 +1,21 @@
+// Copyright 2017 Frédéric Guillot. All rights reserved.
+// Use of this source code is governed by the Apache 2.0
+// license that can be found in the LICENSE file.
+
+package scraper
+
+import "testing"
+
+// TestGetPredefinedRules verifies that URLs on known domains resolve to a rule and
+// that a URL on a domain without any rule resolves to the empty string.
+func TestGetPredefinedRules(t *testing.T) {
+	if rules := getPredefinedScraperRules("http://www.phoronix.com/"); len(rules) == 0 {
+		t.Error("Unable to find rule for phoronix.com")
+	}
+	if rules := getPredefinedScraperRules("https://www.linux.com/"); len(rules) == 0 {
+		t.Error("Unable to find rule for linux.com")
+	}
+	if rules := getPredefinedScraperRules("https://example.org/"); len(rules) != 0 {
+		t.Error("A rule not defined should not return anything")
+	}
+}
author	Frédéric Guillot <fred@miniflux.net>	2017-12-10 20:51:04 -0800
committer	Frédéric Guillot <fred@miniflux.net>	2017-12-10 20:51:04 -0800
commit	87ccad5c7f1edf8bce37af547ca1659326398fa8 (patch)
tree	4998a78ac68b1c2c03e6152d26567928cdf042ec /reader/scraper
parent	7a35c58f53d76356292e3e0ca9c91add3595a9e0 (diff)