aboutsummaryrefslogtreecommitdiffhomepage
path: root/reader
diff options
context:
space:
mode:
authorGravatar dzaikos <you@example.com>2018-07-09 01:22:48 -0400
committerGravatar dzaikos <you@example.com>2018-07-09 01:22:48 -0400
commit6d25e02cb5e3ddd9f3fc8dfb44cf29c98866d587 (patch)
treef73c9917433086af792304b2f290f3694a8d5579 /reader
parent8ee4280461aef523677a41a358e86f4918bcffa9 (diff)
New `add_dynamic_image` rewriter for JavaScript-loaded images.
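Searches `img` and `div` tags for various `data-*` attributes and sets the `img` tag's `src` attribute accordingly (a matching `div` is replaced by an `img`). If no such attribute is found, falls back to searching `noscript` elements for `img` tags. Includes unit tests.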
Diffstat (limited to 'reader')
-rw-r--r-- reader/rewrite/rewrite_functions.go 63
-rw-r--r-- reader/rewrite/rewriter.go 2
-rw-r--r-- reader/rewrite/rewriter_test.go 42
3 files changed, 107 insertions, 0 deletions
diff --git a/reader/rewrite/rewrite_functions.go b/reader/rewrite/rewrite_functions.go
index 412266d..4348584 100644
--- a/reader/rewrite/rewrite_functions.go
+++ b/reader/rewrite/rewrite_functions.go
@@ -14,6 +14,7 @@ import (
var (
youtubeRegex = regexp.MustCompile(`youtube\.com/watch\?v=(.*)`)
+ imgRegex = regexp.MustCompile(`<img [^>]+>`)
)
func addImageTitle(entryURL, entryContent string) string {
@@ -40,6 +41,68 @@ func addImageTitle(entryURL, entryContent string) string {
return entryContent
}
+// addDynamicImage rewrites images that are normally loaded by JavaScript so
+// that their URL ends up in a plain img src attribute.
+//
+// Every img and div element is checked against a list of known data-*
+// attributes, ordered from most to least preferred. The first one holding a
+// non-blank value becomes the src of the img; a matching div is replaced by a
+// new img that carries over the div's alt attribute. When no element matches,
+// each noscript element whose text contains exactly one <img ...> tag is
+// replaced by that tag.
+//
+// entryURL is unused. The original entryContent is returned untouched when it
+// cannot be parsed, when nothing was rewritten, or when the rewritten document
+// cannot be serialized.
+func addDynamicImage(entryURL, entryContent string) string {
+	doc, err := goquery.NewDocumentFromReader(strings.NewReader(entryContent))
+	if err != nil {
+		return entryContent
+	}
+
+	// Ordered most preferred to least preferred.
+	candidateAttrs := []string{
+		"data-src",
+		"data-original",
+		"data-orig",
+		"data-url",
+		"data-orig-file",
+		"data-large-file",
+		"data-medium-file",
+		"data-2000src",
+		"data-1000src",
+		"data-800src",
+		"data-655src",
+		"data-500src",
+		"data-380src",
+	}
+
+	changed := false
+
+	doc.Find("img,div").Each(func(_ int, elem *goquery.Selection) {
+		for _, candidateAttr := range candidateAttrs {
+			srcAttr, found := elem.Attr(candidateAttr)
+			if !found || strings.TrimSpace(srcAttr) == "" {
+				// A missing or blank attribute must not blank out src nor
+				// hide a usable lower-priority candidate.
+				continue
+			}
+
+			changed = true
+
+			if elem.Is("img") {
+				elem.SetAttr("src", srcAttr)
+			} else {
+				// Build the replacement through the DOM instead of by string
+				// concatenation: attribute values come from feed content and
+				// are escaped by the serializer, so a quote inside them
+				// cannot break out of the attribute.
+				elem.BeforeHtml(`<img/>`)
+				elem.Prev().SetAttr("src", srcAttr).SetAttr("alt", elem.AttrOr("alt", ""))
+				elem.Remove()
+			}
+
+			break
+		}
+	})
+
+	if !changed {
+		doc.Find("noscript").Each(func(_ int, noscript *goquery.Selection) {
+			// Ask for at most two matches: the fallback is only applied when
+			// the noscript text holds exactly one img tag.
+			matches := imgRegex.FindAllString(noscript.Text(), 2)
+			if len(matches) == 1 {
+				changed = true
+				noscript.ReplaceWithHtml(matches[0])
+			}
+		})
+	}
+
+	if !changed {
+		return entryContent
+	}
+
+	output, err := doc.Find("body").First().Html()
+	if err != nil {
+		// Prefer the untouched input over an empty or partial result.
+		return entryContent
+	}
+
+	return output
+}
+
+
func addYoutubeVideo(entryURL, entryContent string) string {
matches := youtubeRegex.FindStringSubmatch(entryURL)
diff --git a/reader/rewrite/rewriter.go b/reader/rewrite/rewriter.go
index 7862f4e..39a499c 100644
--- a/reader/rewrite/rewriter.go
+++ b/reader/rewrite/rewriter.go
@@ -24,6 +24,8 @@ func Rewriter(entryURL, entryContent, customRewriteRules string) string {
switch strings.TrimSpace(rule) {
case "add_image_title":
entryContent = addImageTitle(entryURL, entryContent)
+ case "add_dynamic_image":
+ entryContent = addDynamicImage(entryURL, entryContent)
case "add_youtube_video":
entryContent = addYoutubeVideo(entryURL, entryContent)
case "add_pdf_download_link":
diff --git a/reader/rewrite/rewriter_test.go b/reader/rewrite/rewriter_test.go
index 7a33b1f..2e4aeac 100644
--- a/reader/rewrite/rewriter_test.go
+++ b/reader/rewrite/rewriter_test.go
@@ -40,6 +40,7 @@ func TestRewriteWithXkcdLink(t *testing.T) {
t.Errorf(`Not expected output: got "%s" instead of "%s"`, output, expected)
}
}
+
func TestRewriteWithXkcdLinkAndImageNoTitle(t *testing.T) {
description := `<img src="https://imgs.xkcd.com/comics/thermostat.png" alt="Your problem is so terrible, I worry that, if I help you, I risk drawing the attention of whatever god of technology inflicted it on you." />`
output := Rewriter("https://xkcd.com/1912/", description, ``)
@@ -48,6 +49,7 @@ func TestRewriteWithXkcdLinkAndImageNoTitle(t *testing.T) {
t.Errorf(`Not expected output: got "%s" instead of "%s"`, output, expected)
}
}
+
func TestRewriteWithXkcdLinkAndNoImage(t *testing.T) {
description := "test"
output := Rewriter("https://xkcd.com/1912/", description, ``)
@@ -76,3 +78,43 @@ func TestRewriteWithPDFLink(t *testing.T) {
t.Errorf(`Not expected output: got "%s" instead of "%s"`, output, expected)
}
}
+
+func TestRewriteWithNoLazyImage(t *testing.T) {
+ description := `<img src="https://example.org/image.jpg" alt="Image"><noscript><p>Some text</p></noscript>`
+ output := Rewriter("https://example.org/article", description, "add_dynamic_image")
+ expected := description
+
+ if expected != output {
+ t.Errorf(`Not expected output: got "%s" instead of "%s"`, output, expected)
+ }
+}
+
+func TestRewriteWithLazyImage(t *testing.T) {
+ description := `<img src="" data-url="https://example.org/image.jpg" alt="Image"><noscript><img src="https://example.org/fallback.jpg" alt="Fallback"></noscript>`
+ output := Rewriter("https://example.org/article", description, "add_dynamic_image")
+ expected := `<img src="https://example.org/image.jpg" data-url="https://example.org/image.jpg" alt="Image"/><noscript><img src="https://example.org/fallback.jpg" alt="Fallback"></noscript>`
+
+ if expected != output {
+ t.Errorf(`Not expected output: got "%s" instead of "%s"`, output, expected)
+ }
+}
+
+func TestRewriteWithLazyDivImage(t *testing.T) {
+ description := `<div data-url="https://example.org/image.jpg" alt="Image"></div><noscript><img src="https://example.org/fallback.jpg" alt="Fallback"></noscript>`
+ output := Rewriter("https://example.org/article", description, "add_dynamic_image")
+ expected := `<img src="https://example.org/image.jpg" alt="Image"/><noscript><img src="https://example.org/fallback.jpg" alt="Fallback"></noscript>`
+
+ if expected != output {
+ t.Errorf(`Not expected output: got "%s" instead of "%s"`, output, expected)
+ }
+}
+
+func TestRewriteWithUnknownLazyNoScriptImage(t *testing.T) {
+ description := `<img src="" data-non-candidate="https://example.org/image.jpg" alt="Image"><noscript><img src="https://example.org/fallback.jpg" alt="Fallback"></noscript>`
+ output := Rewriter("https://example.org/article", description, "add_dynamic_image")
+ expected := `<img src="" data-non-candidate="https://example.org/image.jpg" alt="Image"/><img src="https://example.org/fallback.jpg" alt="Fallback"/>`
+
+ if expected != output {
+ t.Errorf(`Not expected output: got "%s" instead of "%s"`, output, expected)
+ }
+}