diff options
-rw-r--r-- | locale/translations.go | 7 | ||||
-rw-r--r-- | locale/translations/fr_FR.json | 3 | ||||
-rw-r--r-- | model/feed.go | 1 | ||||
-rw-r--r-- | reader/scraper/rules.go | 16 | ||||
-rw-r--r-- | reader/scraper/scraper.go | 56 | ||||
-rw-r--r-- | reader/scraper/scraper_test.go | 21 | ||||
-rw-r--r-- | server/template/html/edit_feed.html | 3 | ||||
-rw-r--r-- | server/template/views.go | 7 | ||||
-rw-r--r-- | server/ui/controller/entry.go | 10 | ||||
-rw-r--r-- | server/ui/controller/feed.go | 9 | ||||
-rw-r--r-- | server/ui/form/feed.go | 19 | ||||
-rw-r--r-- | sql/schema_version_6.sql | 1 | ||||
-rw-r--r-- | sql/sql.go | 5 | ||||
-rw-r--r-- | storage/entry_query_builder.go | 3 | ||||
-rw-r--r-- | storage/feed.go | 11 | ||||
-rw-r--r-- | storage/migration.go | 2 |
16 files changed, 140 insertions, 34 deletions
diff --git a/locale/translations.go b/locale/translations.go index 217541e..7124f1b 100644 --- a/locale/translations.go +++ b/locale/translations.go @@ -1,5 +1,5 @@ // Code generated by go generate; DO NOT EDIT. -// 2017-12-10 18:56:24.387844114 -0800 PST m=+0.029823201 +// 2017-12-10 20:08:14.447304303 -0800 PST m=+0.040286758 package locale @@ -167,12 +167,13 @@ var translations = map[string]string{ "Activate Fever API": "Activer l'API de Fever", "Fever Username": "Nom d'utilisateur pour l'API de Fever", "Fever Password": "Mot de passe pour l'API de Fever", - "Fetch original content": "Récupérer le contenu original" + "Fetch original content": "Récupérer le contenu original", + "Scraper Rules": "Règles pour récupérer le contenu original" } `, } var translationsChecksums = map[string]string{ "en_US": "6fe95384260941e8a5a3c695a655a932e0a8a6a572c1e45cb2b1ae8baa01b897", - "fr_FR": "fd629b171aefa50dd0a6100acaac8fbecbdf1a1d53e3fce984234565ec5bb5d5", + "fr_FR": "4426cea875ee2c9acb1a2b0619cb82f3a32f71aabe5d07657eaf2f6b7387c5f9", } diff --git a/locale/translations/fr_FR.json b/locale/translations/fr_FR.json index cc82efe..0a51ec3 100644 --- a/locale/translations/fr_FR.json +++ b/locale/translations/fr_FR.json @@ -151,5 +151,6 @@ "Activate Fever API": "Activer l'API de Fever", "Fever Username": "Nom d'utilisateur pour l'API de Fever", "Fever Password": "Mot de passe pour l'API de Fever", - "Fetch original content": "Récupérer le contenu original" + "Fetch original content": "Récupérer le contenu original", + "Scraper Rules": "Règles pour récupérer le contenu original" } diff --git a/model/feed.go b/model/feed.go index dbdb9d6..fb2819d 100644 --- a/model/feed.go +++ b/model/feed.go @@ -22,6 +22,7 @@ type Feed struct { LastModifiedHeader string `json:"last_modified_header,omitempty"` ParsingErrorMsg string `json:"parsing_error_message,omitempty"` ParsingErrorCount int `json:"parsing_error_count,omitempty"` + ScraperRules string `json:"scraper_rules"` Category *Category `json:"category,omitempty"` Entries Entries `json:"entries,omitempty"` Icon *FeedIcon `json:"icon,omitempty"` diff --git a/reader/scraper/rules.go b/reader/scraper/rules.go new file mode 100644 index 0000000..ae6c4a5 --- /dev/null +++ b/reader/scraper/rules.go @@ -0,0 +1,16 @@ +// Copyright 2017 Frédéric Guillot. All rights reserved. +// Use of this source code is governed by the Apache 2.0 +// license that can be found in the LICENSE file. + +package scraper + +// List of predefined scraper rules (alphabetically sorted) +// domain => CSS selectors +var predefinedRules = map[string]string{ + "lemonde.fr": "div#articleBody", + "lesjoiesducode.fr": ".blog-post-content img", + "linux.com": "div.content, div[property]", + "opensource.com": "div[property]", + "phoronix.com": "div.content", + "techcrunch.com": "div.article-entry", +} diff --git a/reader/scraper/scraper.go b/reader/scraper/scraper.go index 6c51862..b79a088 100644 --- a/reader/scraper/scraper.go +++ b/reader/scraper/scraper.go @@ -6,14 +6,19 @@ package scraper import ( "errors" + "io" + "log" + "strings" + "github.com/PuerkitoBio/goquery" "github.com/miniflux/miniflux2/http" "github.com/miniflux/miniflux2/reader/readability" "github.com/miniflux/miniflux2/reader/sanitizer" + "github.com/miniflux/miniflux2/url" ) // Fetch download a web page a returns relevant contents. -func Fetch(websiteURL string) (string, error) { +func Fetch(websiteURL, rules string) (string, error) { client := http.NewClient(websiteURL) response, err := client.Get() if err != nil { @@ -29,10 +34,57 @@ func Fetch(websiteURL string) (string, error) { return "", err } - content, err := readability.ExtractContent(page) + var content string + if rules == "" { + rules = getPredefinedScraperRules(websiteURL) + } + + if rules != "" { + log.Printf(`[Scraper] Using rules "%s" for "%s"`, rules, websiteURL) + content, err = scrapContent(page, rules) + } else { + log.Printf(`[Scraper] Using readability for "%s"`, websiteURL) + content, err = readability.ExtractContent(page) + } + if err != nil { return "", err } return sanitizer.Sanitize(websiteURL, content), nil } + +func scrapContent(page io.Reader, rules string) (string, error) { + document, err := goquery.NewDocumentFromReader(page) + if err != nil { + return "", err + } + + contents := "" + document.Find(rules).Each(func(i int, s *goquery.Selection) { + var content string + + // For some inline elements, we get the parent. + if s.Is("img") { + content, _ = s.Parent().Html() + } else { + content, _ = s.Html() + } + + contents += content + }) + + return contents, nil +} + +func getPredefinedScraperRules(websiteURL string) string { + urlDomain := url.Domain(websiteURL) + + for domain, rules := range predefinedRules { + if strings.Contains(urlDomain, domain) { + return rules + } + } + + return "" +} diff --git a/reader/scraper/scraper_test.go b/reader/scraper/scraper_test.go new file mode 100644 index 0000000..b493e25 --- /dev/null +++ b/reader/scraper/scraper_test.go @@ -0,0 +1,21 @@ +// Copyright 2017 Frédéric Guillot. All rights reserved. +// Use of this source code is governed by the Apache 2.0 +// license that can be found in the LICENSE file. + +package scraper + +import "testing" + +func TestGetPredefinedRules(t *testing.T) { + if getPredefinedScraperRules("http://www.phoronix.com/") == "" { + t.Error("Unable to find rule for phoronix.com") + } + + if getPredefinedScraperRules("https://www.linux.com/") == "" { + t.Error("Unable to find rule for linux.com") + } + + if getPredefinedScraperRules("https://example.org/") != "" { + t.Error("A rule not defined should not return anything") + } +} diff --git a/server/template/html/edit_feed.html b/server/template/html/edit_feed.html index fac2a9b..0495092 100644 --- a/server/template/html/edit_feed.html +++ b/server/template/html/edit_feed.html @@ -45,6 +45,9 @@ <label for="form-feed-url">{{ t "Feed URL" }}</label> <input type="url" name="feed_url" id="form-feed-url" placeholder="https://domain.tld/" value="{{ .form.FeedURL }}" required> + <label for="form-scraper-rules">{{ t "Scraper Rules" }}</label> + <input type="text" name="scraper_rules" id="form-scraper-rules" value="{{ .form.ScraperRules }}"> + <label for="form-category">{{ t "Category" }}</label> <select id="form-category" name="category_id"> {{ range .categories }} diff --git a/server/template/views.go b/server/template/views.go index 80d956d..420a344 100644 --- a/server/template/views.go +++ b/server/template/views.go @@ -1,5 +1,5 @@ // Code generated by go generate; DO NOT EDIT. -// 2017-12-10 18:56:24.375327888 -0800 PST m=+0.017306975 +// 2017-12-10 20:08:14.428877093 -0800 PST m=+0.021859548 package template @@ -395,6 +395,9 @@ var templateViewsMap = map[string]string{ <label for="form-feed-url">{{ t "Feed URL" }}</label> <input type="url" name="feed_url" id="form-feed-url" placeholder="https://domain.tld/" value="{{ .form.FeedURL }}" required> + <label for="form-scraper-rules">{{ t "Scraper Rules" }}</label> + <input type="text" name="scraper_rules" id="form-scraper-rules" value="{{ .form.ScraperRules }}"> + <label for="form-category">{{ t "Category" }}</label> <select id="form-category" name="category_id"> {{ range .categories }} @@ -1181,7 +1184,7 @@ var templateViewsMapChecksums = map[string]string{ "create_category": "2b82af5d2dcd67898dc5daa57a6461e6ff8121a6089b2a2a1be909f35e4a2275", "create_user": "45e226df757126d5fe7c464e295e9a34f07952cfdb71e31e49839850d35af139", "edit_category": "cee720faadcec58289b707ad30af623d2ee66c1ce23a732965463250d7ff41c5", - "edit_feed": "c5bc4c22bf7e8348d880395250545595d21fb8c8e723fc5d7cca68e25d250884", + "edit_feed": "b3c7dd5e93d58e051abcd59da31217d8e9b50587014b895d1b7c9172247b35f8", "edit_user": "82d9749d76ddbd2352816d813c4b1f6d92f2222de678b4afe5821090246735c7", "entry": "ebcf9bb35812dd02759718f7f7411267e6a6c8efd59a9aa0a0e735bcb88efeff", "feed_entries": "547c19eb36b20e350ce70ed045173b064cdcd6b114afb241c9f2dda9d88fcc27", diff --git a/server/ui/controller/entry.go b/server/ui/controller/entry.go index eb47201..5c1d3f5 100644 --- a/server/ui/controller/entry.go +++ b/server/ui/controller/entry.go @@ -40,18 +40,14 @@ func (c *Controller) FetchContent(ctx *core.Context, request *core.Request, resp return } - content, err := scraper.Fetch(entry.URL) + content, err := scraper.Fetch(entry.URL, entry.Feed.ScraperRules) if err != nil { response.JSON().ServerError(err) return } - if len(content) > len(entry.Content) { - entry.Content = content - c.store.UpdateEntryContent(entry) - } else { - content = entry.Content - } + entry.Content = content + c.store.UpdateEntryContent(entry) response.JSON().Created(map[string]string{"content": content}) } diff --git a/server/ui/controller/feed.go b/server/ui/controller/feed.go index eeb66c4..d5bc858 100644 --- a/server/ui/controller/feed.go +++ b/server/ui/controller/feed.go @@ -217,10 +217,11 @@ func (c *Controller) getFeedFormTemplateArgs(ctx *core.Context, user *model.User if feedForm == nil { args["form"] = form.FeedForm{ - SiteURL: feed.SiteURL, - FeedURL: feed.FeedURL, - Title: feed.Title, - CategoryID: feed.Category.ID, + SiteURL: feed.SiteURL, + FeedURL: feed.FeedURL, + Title: feed.Title, + ScraperRules: feed.ScraperRules, + CategoryID: feed.Category.ID, } } else { args["form"] = feedForm diff --git a/server/ui/form/feed.go b/server/ui/form/feed.go index 8a8cf20..7d07375 100644 --- a/server/ui/form/feed.go +++ b/server/ui/form/feed.go @@ -14,10 +14,11 @@ import ( // FeedForm represents a feed form in the UI type FeedForm struct { - FeedURL string - SiteURL string - Title string - CategoryID int64 + FeedURL string + SiteURL string + Title string + ScraperRules string + CategoryID int64 } // ValidateModification validates FeedForm fields @@ -34,6 +35,7 @@ func (f FeedForm) Merge(feed *model.Feed) *model.Feed { feed.Title = f.Title feed.SiteURL = f.SiteURL feed.FeedURL = f.FeedURL + feed.ScraperRules = f.ScraperRules feed.ParsingErrorCount = 0 feed.ParsingErrorMsg = "" return feed @@ -47,9 +49,10 @@ func NewFeedForm(r *http.Request) *FeedForm { } return &FeedForm{ - FeedURL: r.FormValue("feed_url"), - SiteURL: r.FormValue("site_url"), - Title: r.FormValue("title"), - CategoryID: int64(categoryID), + FeedURL: r.FormValue("feed_url"), + SiteURL: r.FormValue("site_url"), + Title: r.FormValue("title"), + ScraperRules: r.FormValue("scraper_rules"), + CategoryID: int64(categoryID), } } diff --git a/sql/schema_version_6.sql b/sql/schema_version_6.sql new file mode 100644 index 0000000..6f79ca8 --- /dev/null +++ b/sql/schema_version_6.sql @@ -0,0 +1 @@ +alter table feeds add column scraper_rules text default ''; @@ -1,5 +1,5 @@ // Code generated by go generate; DO NOT EDIT. -// 2017-12-10 18:56:24.36359961 -0800 PST m=+0.005578697 +// 2017-12-10 20:08:14.411225368 -0800 PST m=+0.004207823 package sql @@ -137,6 +137,8 @@ alter table users add column entry_direction entry_sorting_direction default 'as primary key(user_id) ) `, + "schema_version_6": `alter table feeds add column scraper_rules text default ''; +`, } var SqlMapChecksums = map[string]string{ @@ -145,4 +147,5 @@ var SqlMapChecksums = map[string]string{ "schema_version_3": "a54745dbc1c51c000f74d4e5068f1e2f43e83309f023415b1749a47d5c1e0f12", "schema_version_4": "216ea3a7d3e1704e40c797b5dc47456517c27dbb6ca98bf88812f4f63d74b5d9", "schema_version_5": "46397e2f5f2c82116786127e9f6a403e975b14d2ca7b652a48cd1ba843e6a27c", + "schema_version_6": "9d05b4fb223f0e60efc716add5048b0ca9c37511cf2041721e20505d6d798ce4", } diff --git a/storage/entry_query_builder.go b/storage/entry_query_builder.go index 143fd3b..957adaa 100644 --- a/storage/entry_query_builder.go +++ b/storage/entry_query_builder.go @@ -152,7 +152,7 @@ func (e *EntryQueryBuilder) GetEntries() (model.Entries, error) { SELECT e.id, e.user_id, e.feed_id, e.hash, e.published_at at time zone '%s', e.title, e.url, e.author, e.content, e.status, f.title as feed_title, f.feed_url, f.site_url, f.checked_at, - f.category_id, c.title as category_title, + f.category_id, c.title as category_title, f.scraper_rules, fi.icon_id FROM entries e LEFT JOIN feeds f ON f.id=e.feed_id @@ -197,6 +197,7 @@ func (e *EntryQueryBuilder) GetEntries() (model.Entries, error) { &entry.Feed.CheckedAt, &entry.Feed.Category.ID, &entry.Feed.Category.Title, + &entry.Feed.ScraperRules, &iconID, ) diff --git a/storage/feed.go b/storage/feed.go index 9d42664..e722b0b 100644 --- a/storage/feed.go +++ b/storage/feed.go @@ -52,7 +52,7 @@ func (s *Storage) Feeds(userID int64) (model.Feeds, error) { feeds := make(model.Feeds, 0) query := `SELECT f.id, f.feed_url, f.site_url, f.title, f.etag_header, f.last_modified_header, - f.user_id, f.checked_at, f.parsing_error_count, f.parsing_error_msg, + f.user_id, f.checked_at, f.parsing_error_count, f.parsing_error_msg, f.scraper_rules, f.category_id, c.title as category_title, fi.icon_id FROM feeds f @@ -84,6 +84,7 @@ func (s *Storage) Feeds(userID int64) (model.Feeds, error) { &feed.CheckedAt, &feed.ParsingErrorCount, &errorMsg, + &feed.ScraperRules, &feed.Category.ID, &feed.Category.Title, &iconID, @@ -122,7 +123,7 @@ func (s *Storage) FeedByID(userID, feedID int64) (*model.Feed, error) { query := ` SELECT f.id, f.feed_url, f.site_url, f.title, f.etag_header, f.last_modified_header, - f.user_id, f.checked_at, f.parsing_error_count, f.parsing_error_msg, + f.user_id, f.checked_at, f.parsing_error_count, f.parsing_error_msg, f.scraper_rules, f.category_id, c.title as category_title FROM feeds f LEFT JOIN categories c ON c.id=f.category_id @@ -139,6 +140,7 @@ func (s *Storage) FeedByID(userID, feedID int64) (*model.Feed, error) { &feed.CheckedAt, &feed.ParsingErrorCount, &feed.ParsingErrorMsg, + &feed.ScraperRules, &feed.Category.ID, &feed.Category.Title, ) @@ -195,8 +197,8 @@ func (s *Storage) UpdateFeed(feed *model.Feed) (err error) { query := `UPDATE feeds SET feed_url=$1, site_url=$2, title=$3, category_id=$4, etag_header=$5, last_modified_header=$6, checked_at=$7, - parsing_error_msg=$8, parsing_error_count=$9 - WHERE id=$10 AND user_id=$11` + parsing_error_msg=$8, parsing_error_count=$9, scraper_rules=$10 + WHERE id=$11 AND user_id=$12` _, err = s.db.Exec(query, feed.FeedURL, @@ -208,6 +210,7 @@ func (s *Storage) UpdateFeed(feed *model.Feed) (err error) { feed.CheckedAt, feed.ParsingErrorMsg, feed.ParsingErrorCount, + feed.ScraperRules, feed.ID, feed.UserID, ) diff --git a/storage/migration.go b/storage/migration.go index 76ad801..8f3c8cb 100644 --- a/storage/migration.go +++ b/storage/migration.go @@ -12,7 +12,7 @@ import ( "github.com/miniflux/miniflux2/sql" ) -const schemaVersion = 5 +const schemaVersion = 6 // Migrate run database migrations. func (s *Storage) Migrate() { |