diff options
author | Frédéric Guillot <fred@miniflux.net> | 2018-12-02 20:51:06 -0800 |
---|---|---|
committer | Frédéric Guillot <fred@miniflux.net> | 2018-12-02 20:51:06 -0800 |
commit | 311a133ab87423958d850a91dea0b49543b97c2a (patch) | |
tree | d73d39d8adcc23cd69f2e25fab575277e20f3f6e /reader | |
parent | 52de36b158ab2340d2c206adabb7568d036f1aa2 (diff) |
Refactor manual entry scraper
Diffstat (limited to 'reader')
-rw-r--r-- | reader/feed/handler.go | 6 | ||||
-rw-r--r-- | reader/processor/doc.go (renamed from reader/filter/doc.go) | 4 | ||||
-rw-r--r-- | reader/processor/filter.go (renamed from reader/filter/filter.go) | 27 | ||||
-rw-r--r-- | reader/rewrite/rewriter.go | 3 | ||||
-rw-r--r-- | reader/scraper/scraper.go | 2 |
5 files changed, 31 insertions, 11 deletions
diff --git a/reader/feed/handler.go b/reader/feed/handler.go index 401048b..587f76d 100644 --- a/reader/feed/handler.go +++ b/reader/feed/handler.go @@ -14,9 +14,9 @@ import ( "miniflux.app/logger" "miniflux.app/model" "miniflux.app/reader/browser" - "miniflux.app/reader/filter" "miniflux.app/reader/icon" "miniflux.app/reader/parser" + "miniflux.app/reader/processor" "miniflux.app/storage" "miniflux.app/timer" ) @@ -63,7 +63,7 @@ func (h *Handler) CreateFeed(userID, categoryID int64, url string, crawler bool, subscription.WithClientResponse(response) subscription.CheckedNow() - filter.Apply(h.store, subscription) + processor.ProcessFeedEntries(h.store, subscription) if storeErr := h.store.CreateFeed(subscription); storeErr != nil { return nil, storeErr @@ -114,7 +114,7 @@ func (h *Handler) RefreshFeed(userID, feedID int64) error { } originalFeed.Entries = updatedFeed.Entries - filter.Apply(h.store, originalFeed) + processor.ProcessFeedEntries(h.store, originalFeed) // We don't update existing entries when the crawler is enabled (we crawl only inexisting entries). if storeErr := h.store.UpdateEntries(originalFeed.UserID, originalFeed.ID, originalFeed.Entries, !originalFeed.Crawler); storeErr != nil { diff --git a/reader/filter/doc.go b/reader/processor/doc.go index 92c18c0..df57e98 100644 --- a/reader/filter/doc.go +++ b/reader/processor/doc.go @@ -4,7 +4,7 @@ /* -Package filter applies a set of filters to feed entries. +Package processor applies rules and sanitize content for feed entries. */ -package filter // import "miniflux.app/reader/filter" +package processor // import "miniflux.app/reader/processor" diff --git a/reader/filter/filter.go b/reader/processor/filter.go index a0b9429..faceccc 100644 --- a/reader/filter/filter.go +++ b/reader/processor/filter.go @@ -2,7 +2,7 @@ // Use of this source code is governed by the Apache 2.0 // license that can be found in the LICENSE file. -package filter +package processor import ( "miniflux.app/logger" @@ -13,15 +13,15 @@ import ( "miniflux.app/storage" ) -// Apply executes all entry filters. -func Apply(store *storage.Storage, feed *model.Feed) { +// ProcessFeedEntries downloads original web page for entries and apply filters. +func ProcessFeedEntries(store *storage.Storage, feed *model.Feed) { for _, entry := range feed.Entries { if feed.Crawler { if !store.EntryURLExists(feed.UserID, entry.URL) { content, err := scraper.Fetch(entry.URL, feed.ScraperRules, feed.UserAgent) if err != nil { - logger.Error("Unable to crawl this entry: %q => %v", entry.URL, err) - } else { + logger.Error(`[Filter] Unable to crawl this entry: %q => %v`, entry.URL, err) + } else if content != "" { // We replace the entry content only if the scraper doesn't return any error. entry.Content = content } @@ -34,3 +34,20 @@ func Apply(store *storage.Storage, feed *model.Feed) { entry.Content = sanitizer.Sanitize(entry.URL, entry.Content) } } + +// ProcessEntryWebPage downloads the entry web page and apply rewrite rules. +func ProcessEntryWebPage(entry *model.Entry) error { + content, err := scraper.Fetch(entry.URL, entry.Feed.ScraperRules, entry.Feed.UserAgent) + if err != nil { + return err + } + + content = rewrite.Rewriter(entry.URL, content, entry.Feed.RewriteRules) + content = sanitizer.Sanitize(entry.URL, content) + + if content != "" { + entry.Content = content + } + + return nil +} diff --git a/reader/rewrite/rewriter.go b/reader/rewrite/rewriter.go index e9b1fae..25c742d 100644 --- a/reader/rewrite/rewriter.go +++ b/reader/rewrite/rewriter.go @@ -7,6 +7,7 @@ package rewrite // import "miniflux.app/reader/rewrite" import ( "strings" + "miniflux.app/logger" "miniflux.app/url" ) @@ -20,6 +21,8 @@ func Rewriter(entryURL, entryContent, customRewriteRules string) string { rules := strings.Split(rulesList, ",") rules = append(rules, "add_pdf_download_link") + logger.Debug(`[Rewrite] Applying rules %v for %q`, rules, entryURL) + for _, rule := range rules { switch strings.TrimSpace(rule) { case "add_image_title": diff --git a/reader/scraper/scraper.go b/reader/scraper/scraper.go index 58f37d5..fd5ba33 100644 --- a/reader/scraper/scraper.go +++ b/reader/scraper/scraper.go @@ -54,7 +54,7 @@ func Fetch(websiteURL, rules, userAgent string) (string, error) { logger.Debug(`[Scraper] Using rules %q for %q`, rules, websiteURL) content, err = scrapContent(response.Body, rules) } else { - logger.Debug(`[Scraper] Using readability for "%q`, websiteURL) + logger.Debug(`[Scraper] Using readability for %q`, websiteURL) content, err = readability.ExtractContent(response.Body) } |