author    Frédéric Guillot <fred@miniflux.net>    2017-12-12 19:19:36 -0800
committer Frédéric Guillot <fred@miniflux.net>    2017-12-12 19:19:36 -0800
commit    ef097f02fe76572d2b1887c28da3f2bd83a993a0 (patch)
tree      285dd219fab64119728e03b421ab230392ac5e3e /reader
parent    33445e5b681bbdffaf0925ed020ecdcc49687f15 (diff)
Add the possibility to enable crawler for feeds
Diffstat (limited to 'reader')
-rw-r--r--  reader/feed/handler.go        |  5
-rw-r--r--  reader/processor/processor.go | 20
-rw-r--r--  reader/scraper/scraper.go     |  5
3 files changed, 25 insertions, 5 deletions
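
Before the diff, a hedged sketch of the new call shape this commit introduces; the handler value, IDs, and feed URL below are illustrative assumptions, and only the added crawler argument comes from the change to CreateFeed in the diff.

// Illustrative sketch only: subscribing to a feed with the crawler enabled.
// h is an assumed *feed.Handler; userID and categoryID are assumed to exist.
newFeed, err := h.CreateFeed(userID, categoryID, "https://example.org/feed.xml", true)
if err != nil {
    log.Println("[Subscription]", err)
}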
diff --git a/reader/feed/handler.go b/reader/feed/handler.go
index 7a98613..6b94627 100644
--- a/reader/feed/handler.go
+++ b/reader/feed/handler.go
@@ -33,7 +33,7 @@ type Handler struct {
}
// CreateFeed fetch, parse and store a new feed.
-func (h *Handler) CreateFeed(userID, categoryID int64, url string) (*model.Feed, error) {
+func (h *Handler) CreateFeed(userID, categoryID int64, url string, crawler bool) (*model.Feed, error) {
defer helper.ExecutionTime(time.Now(), fmt.Sprintf("[Handler:CreateFeed] feedUrl=%s", url))
if !h.store.CategoryExists(userID, categoryID) {
@@ -65,6 +65,7 @@ func (h *Handler) CreateFeed(userID, categoryID int64, url string) (*model.Feed,
}
feedProcessor := processor.NewFeedProcessor(subscription)
+ feedProcessor.WithCrawler(crawler)
feedProcessor.Process()
subscription.Category = &model.Category{ID: categoryID}
@@ -72,6 +73,7 @@ func (h *Handler) CreateFeed(userID, categoryID int64, url string) (*model.Feed,
subscription.LastModifiedHeader = response.LastModified
subscription.FeedURL = response.EffectiveURL
subscription.UserID = userID
+ subscription.Crawler = crawler
err = h.store.CreateFeed(subscription)
if err != nil {
@@ -143,6 +145,7 @@ func (h *Handler) RefreshFeed(userID, feedID int64) error {
feedProcessor := processor.NewFeedProcessor(subscription)
feedProcessor.WithScraperRules(originalFeed.ScraperRules)
feedProcessor.WithRewriteRules(originalFeed.RewriteRules)
+ feedProcessor.WithCrawler(originalFeed.Crawler)
feedProcessor.Process()
originalFeed.EtagHeader = response.ETag
diff --git a/reader/processor/processor.go b/reader/processor/processor.go
index 06dad43..7cc5cb1 100644
--- a/reader/processor/processor.go
+++ b/reader/processor/processor.go
@@ -5,9 +5,12 @@
package processor
import (
+ "log"
+
"github.com/miniflux/miniflux2/model"
"github.com/miniflux/miniflux2/reader/rewrite"
"github.com/miniflux/miniflux2/reader/sanitizer"
+ "github.com/miniflux/miniflux2/reader/scraper"
)
// FeedProcessor handles the processing of feed contents.
@@ -15,6 +18,12 @@ type FeedProcessor struct {
feed *model.Feed
scraperRules string
rewriteRules string
+ crawler bool
+}
+
+// WithCrawler enables the crawler.
+func (f *FeedProcessor) WithCrawler(value bool) {
+ f.crawler = value
}
// WithScraperRules adds scraper rules to the processing.
@@ -30,6 +39,15 @@ func (f *FeedProcessor) WithRewriteRules(rules string) {
// Process applies rewrite and scraper rules.
func (f *FeedProcessor) Process() {
for _, entry := range f.feed.Entries {
+ if f.crawler {
+ content, err := scraper.Fetch(entry.URL, f.scraperRules)
+ if err != nil {
+ log.Println("[FeedProcessor]", err)
+ } else {
+ entry.Content = content
+ }
+ }
+
entry.Content = sanitizer.Sanitize(entry.URL, entry.Content)
entry.Content = rewrite.Rewriter(entry.URL, entry.Content, f.rewriteRules)
}
@@ -37,5 +55,5 @@ func (f *FeedProcessor) Process() {
// NewFeedProcessor returns a new FeedProcessor.
func NewFeedProcessor(feed *model.Feed) *FeedProcessor {
- return &FeedProcessor{feed: feed}
+ return &FeedProcessor{feed: feed, crawler: false}
}
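
For reference, a minimal sketch of driving the processor directly with the crawler turned on; the feed value is an assumption, while WithCrawler, WithScraperRules, WithRewriteRules and Process are the methods shown in this diff.

// Sketch, not part of the commit: feed is an assumed *model.Feed with entries.
p := processor.NewFeedProcessor(feed)
p.WithCrawler(true)     // fetch full content for each entry via the scraper
p.WithScraperRules("")  // empty rules fall back to predefined rules, if any
p.WithRewriteRules("")
p.Process()             // crawl first, then sanitize and rewrite each entry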
diff --git a/reader/scraper/scraper.go b/reader/scraper/scraper.go
index b79a088..e799ad0 100644
--- a/reader/scraper/scraper.go
+++ b/reader/scraper/scraper.go
@@ -13,7 +13,6 @@ import (
"github.com/PuerkitoBio/goquery"
"github.com/miniflux/miniflux2/http"
"github.com/miniflux/miniflux2/reader/readability"
- "github.com/miniflux/miniflux2/reader/sanitizer"
"github.com/miniflux/miniflux2/url"
)
@@ -34,11 +33,11 @@ func Fetch(websiteURL, rules string) (string, error) {
return "", err
}
- var content string
if rules == "" {
rules = getPredefinedScraperRules(websiteURL)
}
+ var content string
if rules != "" {
log.Printf(`[Scraper] Using rules "%s" for "%s"`, rules, websiteURL)
content, err = scrapContent(page, rules)
@@ -51,7 +50,7 @@ func Fetch(websiteURL, rules string) (string, error) {
return "", err
}
- return sanitizer.Sanitize(websiteURL, content), nil
+ return content, nil
}
func scrapContent(page io.Reader, rules string) (string, error) {
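
Note that scraper.Fetch now returns the extracted content without sanitizing it; that responsibility moves to the caller, and in this commit FeedProcessor.Process runs sanitizer.Sanitize on the entry content after the crawl, so crawled and non-crawled entries go through the same sanitize-then-rewrite path.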