aboutsummaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
-rw-r--r--locale/translations.go7
-rw-r--r--locale/translations/fr_FR.json3
-rw-r--r--model/feed.go1
-rw-r--r--reader/scraper/rules.go16
-rw-r--r--reader/scraper/scraper.go56
-rw-r--r--reader/scraper/scraper_test.go21
-rw-r--r--server/template/html/edit_feed.html3
-rw-r--r--server/template/views.go7
-rw-r--r--server/ui/controller/entry.go10
-rw-r--r--server/ui/controller/feed.go9
-rw-r--r--server/ui/form/feed.go19
-rw-r--r--sql/schema_version_6.sql1
-rw-r--r--sql/sql.go5
-rw-r--r--storage/entry_query_builder.go3
-rw-r--r--storage/feed.go11
-rw-r--r--storage/migration.go2
16 files changed, 140 insertions, 34 deletions
diff --git a/locale/translations.go b/locale/translations.go
index 217541e..7124f1b 100644
--- a/locale/translations.go
+++ b/locale/translations.go
@@ -1,5 +1,5 @@
// Code generated by go generate; DO NOT EDIT.
-// 2017-12-10 18:56:24.387844114 -0800 PST m=+0.029823201
+// 2017-12-10 20:08:14.447304303 -0800 PST m=+0.040286758
package locale
@@ -167,12 +167,13 @@ var translations = map[string]string{
"Activate Fever API": "Activer l'API de Fever",
"Fever Username": "Nom d'utilisateur pour l'API de Fever",
"Fever Password": "Mot de passe pour l'API de Fever",
- "Fetch original content": "Récupérer le contenu original"
+ "Fetch original content": "Récupérer le contenu original",
+ "Scraper Rules": "Règles pour récupérer le contenu original"
}
`,
}
var translationsChecksums = map[string]string{
"en_US": "6fe95384260941e8a5a3c695a655a932e0a8a6a572c1e45cb2b1ae8baa01b897",
- "fr_FR": "fd629b171aefa50dd0a6100acaac8fbecbdf1a1d53e3fce984234565ec5bb5d5",
+ "fr_FR": "4426cea875ee2c9acb1a2b0619cb82f3a32f71aabe5d07657eaf2f6b7387c5f9",
}
diff --git a/locale/translations/fr_FR.json b/locale/translations/fr_FR.json
index cc82efe..0a51ec3 100644
--- a/locale/translations/fr_FR.json
+++ b/locale/translations/fr_FR.json
@@ -151,5 +151,6 @@
"Activate Fever API": "Activer l'API de Fever",
"Fever Username": "Nom d'utilisateur pour l'API de Fever",
"Fever Password": "Mot de passe pour l'API de Fever",
- "Fetch original content": "Récupérer le contenu original"
+ "Fetch original content": "Récupérer le contenu original",
+ "Scraper Rules": "Règles pour récupérer le contenu original"
}
diff --git a/model/feed.go b/model/feed.go
index dbdb9d6..fb2819d 100644
--- a/model/feed.go
+++ b/model/feed.go
@@ -22,6 +22,7 @@ type Feed struct {
LastModifiedHeader string `json:"last_modified_header,omitempty"`
ParsingErrorMsg string `json:"parsing_error_message,omitempty"`
ParsingErrorCount int `json:"parsing_error_count,omitempty"`
+ ScraperRules string `json:"scraper_rules"`
Category *Category `json:"category,omitempty"`
Entries Entries `json:"entries,omitempty"`
Icon *FeedIcon `json:"icon,omitempty"`
diff --git a/reader/scraper/rules.go b/reader/scraper/rules.go
new file mode 100644
index 0000000..ae6c4a5
--- /dev/null
+++ b/reader/scraper/rules.go
@@ -0,0 +1,16 @@
+// Copyright 2017 Frédéric Guillot. All rights reserved.
+// Use of this source code is governed by the Apache 2.0
+// license that can be found in the LICENSE file.
+
+package scraper
+
+// predefinedRules is the list of predefined scraper rules (alphabetically
+// sorted): domain => CSS selectors.
+//
+// Each key is a website domain and each value is the CSS selector expression
+// used to pick the article content out of a page from that website. Fetch
+// consults this table, through getPredefinedScraperRules, only when it is
+// called with an empty rules string.
+var predefinedRules = map[string]string{
+ "lemonde.fr": "div#articleBody",
+ "lesjoiesducode.fr": ".blog-post-content img",
+ "linux.com": "div.content, div[property]",
+ "opensource.com": "div[property]",
+ "phoronix.com": "div.content",
+ "techcrunch.com": "div.article-entry",
+}
diff --git a/reader/scraper/scraper.go b/reader/scraper/scraper.go
index 6c51862..b79a088 100644
--- a/reader/scraper/scraper.go
+++ b/reader/scraper/scraper.go
@@ -6,14 +6,19 @@ package scraper
import (
"errors"
+ "io"
+ "log"
+ "strings"
+ "github.com/PuerkitoBio/goquery"
"github.com/miniflux/miniflux2/http"
"github.com/miniflux/miniflux2/reader/readability"
"github.com/miniflux/miniflux2/reader/sanitizer"
+ "github.com/miniflux/miniflux2/url"
)
// Fetch downloads a web page and returns its relevant contents.
-func Fetch(websiteURL string) (string, error) {
+func Fetch(websiteURL, rules string) (string, error) {
client := http.NewClient(websiteURL)
response, err := client.Get()
if err != nil {
@@ -29,10 +34,57 @@ func Fetch(websiteURL string) (string, error) {
return "", err
}
- content, err := readability.ExtractContent(page)
+ var content string
+ if rules == "" {
+ rules = getPredefinedScraperRules(websiteURL)
+ }
+
+ if rules != "" {
+ log.Printf(`[Scraper] Using rules "%s" for "%s"`, rules, websiteURL)
+ content, err = scrapContent(page, rules)
+ } else {
+ log.Printf(`[Scraper] Using readability for "%s"`, websiteURL)
+ content, err = readability.ExtractContent(page)
+ }
+
if err != nil {
return "", err
}
return sanitizer.Sanitize(websiteURL, content), nil
}
+
+// scrapContent extracts the HTML fragments selected by the given CSS rules.
+//
+// page is parsed as an HTML document and rules is handed unchanged to
+// goquery's Find. The markup of every match is concatenated in the order the
+// matches are visited; when nothing matches the result is an empty string.
+//
+// An error is returned only when the document cannot be parsed. A fragment
+// that fails to render is logged and left out, so one bad node does not
+// discard the rest of the content.
+func scrapContent(page io.Reader, rules string) (string, error) {
+	document, err := goquery.NewDocumentFromReader(page)
+	if err != nil {
+		return "", err
+	}
+
+	// Collect the fragments and join them once instead of growing a string
+	// with += on every match.
+	var fragments []string
+	document.Find(rules).Each(func(i int, s *goquery.Selection) {
+		target := s
+
+		// Html() returns the inner markup of an element and an <img> has
+		// none, so for images the parent's markup is used instead.
+		// NOTE(review): several matched images sharing one parent each append
+		// that parent's markup again - confirm duplicates are acceptable.
+		if s.Is("img") {
+			target = s.Parent()
+		}
+
+		content, renderErr := target.Html()
+		if renderErr != nil {
+			// Best effort: report the failure instead of hiding it, and keep
+			// going with the remaining matches.
+			log.Printf(`[Scraper] Unable to render an element matched by "%s": %v`, rules, renderErr)
+			return
+		}
+
+		fragments = append(fragments, content)
+	})
+
+	return strings.Join(fragments, ""), nil
+}
+
+func getPredefinedScraperRules(websiteURL string) string {
+ urlDomain := url.Domain(websiteURL)
+
+ for domain, rules := range predefinedRules {
+ if strings.Contains(urlDomain, domain) {
+ return rules
+ }
+ }
+
+ return ""
+}
diff --git a/reader/scraper/scraper_test.go b/reader/scraper/scraper_test.go
new file mode 100644
index 0000000..b493e25
--- /dev/null
+++ b/reader/scraper/scraper_test.go
@@ -0,0 +1,21 @@
+// Copyright 2017 Frédéric Guillot. All rights reserved.
+// Use of this source code is governed by the Apache 2.0
+// license that can be found in the LICENSE file.
+
+package scraper
+
+import "testing"
+
+// TestGetPredefinedRules checks that known domains resolve to a non-empty
+// rule and that an unknown domain resolves to no rule at all.
+func TestGetPredefinedRules(t *testing.T) {
+	scenarios := []struct {
+		websiteURL string
+		expectRule bool
+		failure    string
+	}{
+		{"http://www.phoronix.com/", true, "Unable to find rule for phoronix.com"},
+		{"https://www.linux.com/", true, "Unable to find rule for linux.com"},
+		{"https://example.org/", false, "A rule not defined should not return anything"},
+	}
+
+	for _, scenario := range scenarios {
+		hasRule := getPredefinedScraperRules(scenario.websiteURL) != ""
+		if hasRule != scenario.expectRule {
+			t.Error(scenario.failure)
+		}
+	}
+}
diff --git a/server/template/html/edit_feed.html b/server/template/html/edit_feed.html
index fac2a9b..0495092 100644
--- a/server/template/html/edit_feed.html
+++ b/server/template/html/edit_feed.html
@@ -45,6 +45,9 @@
<label for="form-feed-url">{{ t "Feed URL" }}</label>
<input type="url" name="feed_url" id="form-feed-url" placeholder="https://domain.tld/" value="{{ .form.FeedURL }}" required>
+ <label for="form-scraper-rules">{{ t "Scraper Rules" }}</label>
+ <input type="text" name="scraper_rules" id="form-scraper-rules" value="{{ .form.ScraperRules }}">
+
<label for="form-category">{{ t "Category" }}</label>
<select id="form-category" name="category_id">
{{ range .categories }}
diff --git a/server/template/views.go b/server/template/views.go
index 80d956d..420a344 100644
--- a/server/template/views.go
+++ b/server/template/views.go
@@ -1,5 +1,5 @@
// Code generated by go generate; DO NOT EDIT.
-// 2017-12-10 18:56:24.375327888 -0800 PST m=+0.017306975
+// 2017-12-10 20:08:14.428877093 -0800 PST m=+0.021859548
package template
@@ -395,6 +395,9 @@ var templateViewsMap = map[string]string{
<label for="form-feed-url">{{ t "Feed URL" }}</label>
<input type="url" name="feed_url" id="form-feed-url" placeholder="https://domain.tld/" value="{{ .form.FeedURL }}" required>
+ <label for="form-scraper-rules">{{ t "Scraper Rules" }}</label>
+ <input type="text" name="scraper_rules" id="form-scraper-rules" value="{{ .form.ScraperRules }}">
+
<label for="form-category">{{ t "Category" }}</label>
<select id="form-category" name="category_id">
{{ range .categories }}
@@ -1181,7 +1184,7 @@ var templateViewsMapChecksums = map[string]string{
"create_category": "2b82af5d2dcd67898dc5daa57a6461e6ff8121a6089b2a2a1be909f35e4a2275",
"create_user": "45e226df757126d5fe7c464e295e9a34f07952cfdb71e31e49839850d35af139",
"edit_category": "cee720faadcec58289b707ad30af623d2ee66c1ce23a732965463250d7ff41c5",
- "edit_feed": "c5bc4c22bf7e8348d880395250545595d21fb8c8e723fc5d7cca68e25d250884",
+ "edit_feed": "b3c7dd5e93d58e051abcd59da31217d8e9b50587014b895d1b7c9172247b35f8",
"edit_user": "82d9749d76ddbd2352816d813c4b1f6d92f2222de678b4afe5821090246735c7",
"entry": "ebcf9bb35812dd02759718f7f7411267e6a6c8efd59a9aa0a0e735bcb88efeff",
"feed_entries": "547c19eb36b20e350ce70ed045173b064cdcd6b114afb241c9f2dda9d88fcc27",
diff --git a/server/ui/controller/entry.go b/server/ui/controller/entry.go
index eb47201..5c1d3f5 100644
--- a/server/ui/controller/entry.go
+++ b/server/ui/controller/entry.go
@@ -40,18 +40,14 @@ func (c *Controller) FetchContent(ctx *core.Context, request *core.Request, resp
return
}
- content, err := scraper.Fetch(entry.URL)
+ content, err := scraper.Fetch(entry.URL, entry.Feed.ScraperRules)
if err != nil {
response.JSON().ServerError(err)
return
}
- if len(content) > len(entry.Content) {
- entry.Content = content
- c.store.UpdateEntryContent(entry)
- } else {
- content = entry.Content
- }
+ entry.Content = content
+ c.store.UpdateEntryContent(entry)
response.JSON().Created(map[string]string{"content": content})
}
diff --git a/server/ui/controller/feed.go b/server/ui/controller/feed.go
index eeb66c4..d5bc858 100644
--- a/server/ui/controller/feed.go
+++ b/server/ui/controller/feed.go
@@ -217,10 +217,11 @@ func (c *Controller) getFeedFormTemplateArgs(ctx *core.Context, user *model.User
if feedForm == nil {
args["form"] = form.FeedForm{
- SiteURL: feed.SiteURL,
- FeedURL: feed.FeedURL,
- Title: feed.Title,
- CategoryID: feed.Category.ID,
+ SiteURL: feed.SiteURL,
+ FeedURL: feed.FeedURL,
+ Title: feed.Title,
+ ScraperRules: feed.ScraperRules,
+ CategoryID: feed.Category.ID,
}
} else {
args["form"] = feedForm
diff --git a/server/ui/form/feed.go b/server/ui/form/feed.go
index 8a8cf20..7d07375 100644
--- a/server/ui/form/feed.go
+++ b/server/ui/form/feed.go
@@ -14,10 +14,11 @@ import (
// FeedForm represents a feed form in the UI
type FeedForm struct {
- FeedURL string
- SiteURL string
- Title string
- CategoryID int64
+ FeedURL string
+ SiteURL string
+ Title string
+ ScraperRules string
+ CategoryID int64
}
// ValidateModification validates FeedForm fields
@@ -34,6 +35,7 @@ func (f FeedForm) Merge(feed *model.Feed) *model.Feed {
feed.Title = f.Title
feed.SiteURL = f.SiteURL
feed.FeedURL = f.FeedURL
+ feed.ScraperRules = f.ScraperRules
feed.ParsingErrorCount = 0
feed.ParsingErrorMsg = ""
return feed
@@ -47,9 +49,10 @@ func NewFeedForm(r *http.Request) *FeedForm {
}
return &FeedForm{
- FeedURL: r.FormValue("feed_url"),
- SiteURL: r.FormValue("site_url"),
- Title: r.FormValue("title"),
- CategoryID: int64(categoryID),
+ FeedURL: r.FormValue("feed_url"),
+ SiteURL: r.FormValue("site_url"),
+ Title: r.FormValue("title"),
+ ScraperRules: r.FormValue("scraper_rules"),
+ CategoryID: int64(categoryID),
}
}
diff --git a/sql/schema_version_6.sql b/sql/schema_version_6.sql
new file mode 100644
index 0000000..6f79ca8
--- /dev/null
+++ b/sql/schema_version_6.sql
@@ -0,0 +1 @@
+alter table feeds add column scraper_rules text default '';
diff --git a/sql/sql.go b/sql/sql.go
index 732fc81..edbdd9c 100644
--- a/sql/sql.go
+++ b/sql/sql.go
@@ -1,5 +1,5 @@
// Code generated by go generate; DO NOT EDIT.
-// 2017-12-10 18:56:24.36359961 -0800 PST m=+0.005578697
+// 2017-12-10 20:08:14.411225368 -0800 PST m=+0.004207823
package sql
@@ -137,6 +137,8 @@ alter table users add column entry_direction entry_sorting_direction default 'as
primary key(user_id)
)
`,
+ "schema_version_6": `alter table feeds add column scraper_rules text default '';
+`,
}
var SqlMapChecksums = map[string]string{
@@ -145,4 +147,5 @@ var SqlMapChecksums = map[string]string{
"schema_version_3": "a54745dbc1c51c000f74d4e5068f1e2f43e83309f023415b1749a47d5c1e0f12",
"schema_version_4": "216ea3a7d3e1704e40c797b5dc47456517c27dbb6ca98bf88812f4f63d74b5d9",
"schema_version_5": "46397e2f5f2c82116786127e9f6a403e975b14d2ca7b652a48cd1ba843e6a27c",
+ "schema_version_6": "9d05b4fb223f0e60efc716add5048b0ca9c37511cf2041721e20505d6d798ce4",
}
diff --git a/storage/entry_query_builder.go b/storage/entry_query_builder.go
index 143fd3b..957adaa 100644
--- a/storage/entry_query_builder.go
+++ b/storage/entry_query_builder.go
@@ -152,7 +152,7 @@ func (e *EntryQueryBuilder) GetEntries() (model.Entries, error) {
SELECT
e.id, e.user_id, e.feed_id, e.hash, e.published_at at time zone '%s', e.title, e.url, e.author, e.content, e.status,
f.title as feed_title, f.feed_url, f.site_url, f.checked_at,
- f.category_id, c.title as category_title,
+ f.category_id, c.title as category_title, f.scraper_rules,
fi.icon_id
FROM entries e
LEFT JOIN feeds f ON f.id=e.feed_id
@@ -197,6 +197,7 @@ func (e *EntryQueryBuilder) GetEntries() (model.Entries, error) {
&entry.Feed.CheckedAt,
&entry.Feed.Category.ID,
&entry.Feed.Category.Title,
+ &entry.Feed.ScraperRules,
&iconID,
)
diff --git a/storage/feed.go b/storage/feed.go
index 9d42664..e722b0b 100644
--- a/storage/feed.go
+++ b/storage/feed.go
@@ -52,7 +52,7 @@ func (s *Storage) Feeds(userID int64) (model.Feeds, error) {
feeds := make(model.Feeds, 0)
query := `SELECT
f.id, f.feed_url, f.site_url, f.title, f.etag_header, f.last_modified_header,
- f.user_id, f.checked_at, f.parsing_error_count, f.parsing_error_msg,
+ f.user_id, f.checked_at, f.parsing_error_count, f.parsing_error_msg, f.scraper_rules,
f.category_id, c.title as category_title,
fi.icon_id
FROM feeds f
@@ -84,6 +84,7 @@ func (s *Storage) Feeds(userID int64) (model.Feeds, error) {
&feed.CheckedAt,
&feed.ParsingErrorCount,
&errorMsg,
+ &feed.ScraperRules,
&feed.Category.ID,
&feed.Category.Title,
&iconID,
@@ -122,7 +123,7 @@ func (s *Storage) FeedByID(userID, feedID int64) (*model.Feed, error) {
query := `
SELECT
f.id, f.feed_url, f.site_url, f.title, f.etag_header, f.last_modified_header,
- f.user_id, f.checked_at, f.parsing_error_count, f.parsing_error_msg,
+ f.user_id, f.checked_at, f.parsing_error_count, f.parsing_error_msg, f.scraper_rules,
f.category_id, c.title as category_title
FROM feeds f
LEFT JOIN categories c ON c.id=f.category_id
@@ -139,6 +140,7 @@ func (s *Storage) FeedByID(userID, feedID int64) (*model.Feed, error) {
&feed.CheckedAt,
&feed.ParsingErrorCount,
&feed.ParsingErrorMsg,
+ &feed.ScraperRules,
&feed.Category.ID,
&feed.Category.Title,
)
@@ -195,8 +197,8 @@ func (s *Storage) UpdateFeed(feed *model.Feed) (err error) {
query := `UPDATE feeds SET
feed_url=$1, site_url=$2, title=$3, category_id=$4, etag_header=$5, last_modified_header=$6, checked_at=$7,
- parsing_error_msg=$8, parsing_error_count=$9
- WHERE id=$10 AND user_id=$11`
+ parsing_error_msg=$8, parsing_error_count=$9, scraper_rules=$10
+ WHERE id=$11 AND user_id=$12`
_, err = s.db.Exec(query,
feed.FeedURL,
@@ -208,6 +210,7 @@ func (s *Storage) UpdateFeed(feed *model.Feed) (err error) {
feed.CheckedAt,
feed.ParsingErrorMsg,
feed.ParsingErrorCount,
+ feed.ScraperRules,
feed.ID,
feed.UserID,
)
diff --git a/storage/migration.go b/storage/migration.go
index 76ad801..8f3c8cb 100644
--- a/storage/migration.go
+++ b/storage/migration.go
@@ -12,7 +12,7 @@ import (
"github.com/miniflux/miniflux2/sql"
)
-const schemaVersion = 5
+const schemaVersion = 6
// Migrate runs the database migrations.
func (s *Storage) Migrate() {