From 311a133ab87423958d850a91dea0b49543b97c2a Mon Sep 17 00:00:00 2001
From: Frédéric Guillot
Date: Sun, 2 Dec 2018 20:51:06 -0800
Subject: Refactor manual entry scraper

---
 reader/feed/handler.go     |  6 +++---
 reader/filter/doc.go       | 10 ----------
 reader/filter/filter.go    | 36 ------------------------------------
 reader/processor/doc.go    | 10 ++++++++++
 reader/processor/filter.go | 53 ++++++++++++++++++++++++++++++++++++++++++++++
 reader/rewrite/rewriter.go |  3 +++
 reader/scraper/scraper.go  |  2 +-
 7 files changed, 70 insertions(+), 50 deletions(-)
 delete mode 100644 reader/filter/doc.go
 delete mode 100644 reader/filter/filter.go
 create mode 100644 reader/processor/doc.go
 create mode 100644 reader/processor/filter.go

diff --git a/reader/feed/handler.go b/reader/feed/handler.go
index 401048b..587f76d 100644
--- a/reader/feed/handler.go
+++ b/reader/feed/handler.go
@@ -14,9 +14,9 @@ import (
 	"miniflux.app/logger"
 	"miniflux.app/model"
 	"miniflux.app/reader/browser"
-	"miniflux.app/reader/filter"
 	"miniflux.app/reader/icon"
 	"miniflux.app/reader/parser"
+	"miniflux.app/reader/processor"
 	"miniflux.app/storage"
 	"miniflux.app/timer"
 )
@@ -63,7 +63,7 @@ func (h *Handler) CreateFeed(userID, categoryID int64, url string, crawler bool,
 	subscription.WithClientResponse(response)
 	subscription.CheckedNow()
 
-	filter.Apply(h.store, subscription)
+	processor.ProcessFeedEntries(h.store, subscription)
 
 	if storeErr := h.store.CreateFeed(subscription); storeErr != nil {
 		return nil, storeErr
@@ -114,7 +114,7 @@ func (h *Handler) RefreshFeed(userID, feedID int64) error {
 	}
 
 	originalFeed.Entries = updatedFeed.Entries
-	filter.Apply(h.store, originalFeed)
+	processor.ProcessFeedEntries(h.store, originalFeed)
 
 	// We don't update existing entries when the crawler is enabled (we crawl only nonexistent entries).
 	if storeErr := h.store.UpdateEntries(originalFeed.UserID, originalFeed.ID, originalFeed.Entries, !originalFeed.Crawler); storeErr != nil {
diff --git a/reader/filter/doc.go b/reader/filter/doc.go
deleted file mode 100644
index 92c18c0..0000000
--- a/reader/filter/doc.go
+++ /dev/null
@@ -1,10 +0,0 @@
-// Copyright 2018 Frédéric Guillot. All rights reserved.
-// Use of this source code is governed by the Apache 2.0
-// license that can be found in the LICENSE file.
-
-/*
-
-Package filter applies a set of filters to feed entries.
-
-*/
-package filter // import "miniflux.app/reader/filter"
diff --git a/reader/filter/filter.go b/reader/filter/filter.go
deleted file mode 100644
index a0b9429..0000000
--- a/reader/filter/filter.go
+++ /dev/null
@@ -1,36 +0,0 @@
-// Copyright 2018 Frédéric Guillot. All rights reserved.
-// Use of this source code is governed by the Apache 2.0
-// license that can be found in the LICENSE file.
-
-package filter
-
-import (
-	"miniflux.app/logger"
-	"miniflux.app/model"
-	"miniflux.app/reader/rewrite"
-	"miniflux.app/reader/sanitizer"
-	"miniflux.app/reader/scraper"
-	"miniflux.app/storage"
-)
-
-// Apply executes all entry filters.
-func Apply(store *storage.Storage, feed *model.Feed) {
-	for _, entry := range feed.Entries {
-		if feed.Crawler {
-			if !store.EntryURLExists(feed.UserID, entry.URL) {
-				content, err := scraper.Fetch(entry.URL, feed.ScraperRules, feed.UserAgent)
-				if err != nil {
-					logger.Error("Unable to crawl this entry: %q => %v", entry.URL, err)
-				} else {
-					// We replace the entry content only if the scraper doesn't return any error.
-					entry.Content = content
-				}
-			}
-		}
-
-		entry.Content = rewrite.Rewriter(entry.URL, entry.Content, feed.RewriteRules)
-
-		// The sanitizer should always run at the end of the process to make sure unsafe HTML is filtered.
-		entry.Content = sanitizer.Sanitize(entry.URL, entry.Content)
-	}
-}
diff --git a/reader/processor/doc.go b/reader/processor/doc.go
new file mode 100644
index 0000000..df57e98
--- /dev/null
+++ b/reader/processor/doc.go
@@ -0,0 +1,10 @@
+// Copyright 2018 Frédéric Guillot. All rights reserved.
+// Use of this source code is governed by the Apache 2.0
+// license that can be found in the LICENSE file.
+
+/*
+
+Package processor applies rules and sanitizes content for feed entries.
+
+*/
+package processor // import "miniflux.app/reader/processor"
diff --git a/reader/processor/filter.go b/reader/processor/filter.go
new file mode 100644
index 0000000..faceccc
--- /dev/null
+++ b/reader/processor/filter.go
@@ -0,0 +1,53 @@
+// Copyright 2018 Frédéric Guillot. All rights reserved.
+// Use of this source code is governed by the Apache 2.0
+// license that can be found in the LICENSE file.
+
+package processor
+
+import (
+	"miniflux.app/logger"
+	"miniflux.app/model"
+	"miniflux.app/reader/rewrite"
+	"miniflux.app/reader/sanitizer"
+	"miniflux.app/reader/scraper"
+	"miniflux.app/storage"
+)
+
+// ProcessFeedEntries downloads the original web page for entries and applies filters.
+func ProcessFeedEntries(store *storage.Storage, feed *model.Feed) {
+	for _, entry := range feed.Entries {
+		if feed.Crawler {
+			if !store.EntryURLExists(feed.UserID, entry.URL) {
+				content, err := scraper.Fetch(entry.URL, feed.ScraperRules, feed.UserAgent)
+				if err != nil {
+					logger.Error(`[Filter] Unable to crawl this entry: %q => %v`, entry.URL, err)
+				} else if content != "" {
+					// We replace the entry content only if the scraper returns no error and a non-empty result.
+					entry.Content = content
+				}
+			}
+		}
+
+		entry.Content = rewrite.Rewriter(entry.URL, entry.Content, feed.RewriteRules)
+
+		// The sanitizer should always run at the end of the process to make sure unsafe HTML is filtered.
+		entry.Content = sanitizer.Sanitize(entry.URL, entry.Content)
+	}
+}
+
+// ProcessEntryWebPage downloads the entry web page and applies rewrite rules.
+func ProcessEntryWebPage(entry *model.Entry) error {
+	content, err := scraper.Fetch(entry.URL, entry.Feed.ScraperRules, entry.Feed.UserAgent)
+	if err != nil {
+		return err
+	}
+
+	content = rewrite.Rewriter(entry.URL, content, entry.Feed.RewriteRules)
+	content = sanitizer.Sanitize(entry.URL, content)
+
+	if content != "" {
+		entry.Content = content
+	}
+
+	return nil
+}
diff --git a/reader/rewrite/rewriter.go b/reader/rewrite/rewriter.go
index e9b1fae..25c742d 100644
--- a/reader/rewrite/rewriter.go
+++ b/reader/rewrite/rewriter.go
@@ -7,6 +7,7 @@ package rewrite // import "miniflux.app/reader/rewrite"
 import (
 	"strings"
 
+	"miniflux.app/logger"
 	"miniflux.app/url"
 )
 
@@ -20,6 +21,8 @@ func Rewriter(entryURL, entryContent, customRewriteRules string) string {
 	rules := strings.Split(rulesList, ",")
 	rules = append(rules, "add_pdf_download_link")
 
+	logger.Debug(`[Rewrite] Applying rules %v for %q`, rules, entryURL)
+
 	for _, rule := range rules {
 		switch strings.TrimSpace(rule) {
 		case "add_image_title":
diff --git a/reader/scraper/scraper.go b/reader/scraper/scraper.go
index 58f37d5..fd5ba33 100644
--- a/reader/scraper/scraper.go
+++ b/reader/scraper/scraper.go
@@ -54,7 +54,7 @@ func Fetch(websiteURL, rules, userAgent string) (string, error) {
 		logger.Debug(`[Scraper] Using rules %q for %q`, rules, websiteURL)
 		content, err = scrapContent(response.Body, rules)
 	} else {
-		logger.Debug(`[Scraper] Using readability for "%q`, websiteURL)
+		logger.Debug(`[Scraper] Using readability for %q`, websiteURL)
 		content, err = readability.ExtractContent(response.Body)
 	}
 
--
cgit v1.2.3
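For readers wiring up the refactored package: ProcessFeedEntries runs during feed refresh, while ProcessEntryWebPage serves the manual, per-entry scrape that gives this commit its name. Below is a minimal sketch of how a caller might use the latter. It is an illustration only: the "ui" package name, the fetchOriginalContent helper, and the store.UpdateEntryContent call are assumptions, not part of this commit.

package ui // hypothetical caller package; not part of this commit

import (
	"miniflux.app/model"
	"miniflux.app/reader/processor"
	"miniflux.app/storage"
)

// fetchOriginalContent scrapes a single entry on demand. ProcessEntryWebPage
// downloads entry.URL, applies the feed's rewrite rules, sanitizes the result,
// and replaces entry.Content only when the scraped content is non-empty.
func fetchOriginalContent(store *storage.Storage, entry *model.Entry) error {
	if err := processor.ProcessEntryWebPage(entry); err != nil {
		return err
	}

	// Persisting the updated content is assumed here; the exact storage
	// method is illustrative and not defined by this diff.
	return store.UpdateEntryContent(entry)
}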