diff options
author | Frédéric Guillot <fred@miniflux.net> | 2017-12-13 21:30:40 -0800 |
---|---|---|
committer | Frédéric Guillot <fred@miniflux.net> | 2017-12-13 21:30:40 -0800 |
commit | c6d9eb361454b70406cf6573b284d5e99ba3a2d2 (patch) | |
tree | b53ce39814d225a8f8f02e2036a314ff1d7a1059 /reader | |
parent | 827683ab59131ec38ed7cfa268bcaa6dc77d1412 (diff) |
Improve content scraper
Diffstat (limited to 'reader')
-rw-r--r-- | reader/rewrite/rewrite_functions.go | 8 | ||||
-rw-r--r-- | reader/rewrite/rewriter.go | 4 | ||||
-rw-r--r-- | reader/rewrite/rewriter_test.go | 10 | ||||
-rw-r--r-- | reader/scraper/rules.go | 6 | ||||
-rw-r--r-- | reader/scraper/scraper.go | 3 |
5 files changed, 31 insertions, 0 deletions
```diff
diff --git a/reader/rewrite/rewrite_functions.go b/reader/rewrite/rewrite_functions.go
index f1a5b83..6717deb 100644
--- a/reader/rewrite/rewrite_functions.go
+++ b/reader/rewrite/rewrite_functions.go
@@ -5,6 +5,7 @@
 package rewrite
 
 import (
+	"fmt"
 	"regexp"
 	"strings"
 
@@ -38,3 +39,10 @@ func addYoutubeVideo(entryURL, entryContent string) string {
 	}
 	return entryContent
 }
+
+func addPDFLink(entryURL, entryContent string) string {
+	if strings.HasSuffix(entryURL, ".pdf") {
+		return fmt.Sprintf(`<a href="%s">PDF</a><br>%s`, entryURL, entryContent)
+	}
+	return entryContent
+}
diff --git a/reader/rewrite/rewriter.go b/reader/rewrite/rewriter.go
index d4d5096..7862f4e 100644
--- a/reader/rewrite/rewriter.go
+++ b/reader/rewrite/rewriter.go
@@ -18,12 +18,16 @@ func Rewriter(entryURL, entryContent, customRewriteRules string) string {
 	}
 
 	rules := strings.Split(rulesList, ",")
+	rules = append(rules, "add_pdf_download_link")
+
 	for _, rule := range rules {
 		switch strings.TrimSpace(rule) {
 		case "add_image_title":
 			entryContent = addImageTitle(entryURL, entryContent)
 		case "add_youtube_video":
 			entryContent = addYoutubeVideo(entryURL, entryContent)
+		case "add_pdf_download_link":
+			entryContent = addPDFLink(entryURL, entryContent)
 		}
 	}
 
diff --git a/reader/rewrite/rewriter_test.go b/reader/rewrite/rewriter_test.go
index 5f13895..995508b 100644
--- a/reader/rewrite/rewriter_test.go
+++ b/reader/rewrite/rewriter_test.go
@@ -58,3 +58,13 @@ func TestRewriteWithXkcdAndNoImage(t *testing.T) {
 		t.Errorf(`Not expected output: got "%s" instead of "%s"`, output, expected)
 	}
 }
+
+func TestRewriteWithPDFLink(t *testing.T) {
+	description := "test"
+	output := Rewriter("https://example.org/document.pdf", description, ``)
+	expected := `<a href="https://example.org/document.pdf">PDF</a><br>test`
+
+	if expected != output {
+		t.Errorf(`Not expected output: got "%s" instead of "%s"`, output, expected)
+	}
+}
diff --git a/reader/scraper/rules.go b/reader/scraper/rules.go
index ae6c4a5..9dc1405 100644
--- a/reader/scraper/rules.go
+++ b/reader/scraper/rules.go
@@ -7,10 +7,16 @@ package scraper
 // List of predefined scraper rules (alphabetically sorted)
 // domain => CSS selectors
 var predefinedRules = map[string]string{
+	"github.com": "article.entry-content",
+	"igen.fr": "section.corps",
 	"lemonde.fr": "div#articleBody",
 	"lesjoiesducode.fr": ".blog-post-content img",
 	"linux.com": "div.content, div[property]",
+	"medium.com": ".section-content",
 	"opensource.com": "div[property]",
+	"osnews.com": "div.newscontent1",
 	"phoronix.com": "div.content",
 	"techcrunch.com": "div.article-entry",
+	"theregister.co.uk": "#body",
+	"wired.com": "main figure, article",
 }
diff --git a/reader/scraper/scraper.go b/reader/scraper/scraper.go
index 8f5ff15..3d1fb41 100644
--- a/reader/scraper/scraper.go
+++ b/reader/scraper/scraper.go
@@ -33,6 +33,9 @@ func Fetch(websiteURL, rules string) (string, error) {
 		return "", err
 	}
 
+	// The entry URL could be a redirect somewhere else.
+	websiteURL = response.EffectiveURL
+
 	if rules == "" {
 		rules = getPredefinedScraperRules(websiteURL)
 	}
```