diff options
author | dzaikos <you@example.com> | 2018-07-09 01:22:48 -0400 |
---|---|---|
committer | dzaikos <you@example.com> | 2018-07-09 01:22:48 -0400 |
commit | 6d25e02cb5e3ddd9f3fc8dfb44cf29c98866d587 (patch) | |
tree | f73c9917433086af792304b2f290f3694a8d5579 | |
parent | 8ee4280461aef523677a41a358e86f4918bcffa9 (diff) |
New `add_dynamic_image` rewriter for JavaScript-loaded images.
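Searches `img` and `div` tags for common lazy-loading `data-*` attributes (e.g. `data-src`, `data-original`, `data-url`), checked in order of preference, and sets the `img` tag's `src` attribute to the first one found; a matching `div` is replaced with an `img` tag. If no such attribute is found anywhere in the entry, falls back to replacing each `noscript` element that contains exactly one `img` tag with that tag.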
Includes unit tests.
-rw-r--r-- | reader/rewrite/rewrite_functions.go | 63 | ||||
-rw-r--r-- | reader/rewrite/rewriter.go | 2 | ||||
-rw-r--r-- | reader/rewrite/rewriter_test.go | 42 |
3 files changed, 107 insertions, 0 deletions
diff --git a/reader/rewrite/rewrite_functions.go b/reader/rewrite/rewrite_functions.go index 412266d..4348584 100644 --- a/reader/rewrite/rewrite_functions.go +++ b/reader/rewrite/rewrite_functions.go @@ -14,6 +14,7 @@ import ( var ( youtubeRegex = regexp.MustCompile(`youtube\.com/watch\?v=(.*)`) + imgRegex = regexp.MustCompile(`<img [^>]+>`) ) func addImageTitle(entryURL, entryContent string) string { @@ -40,6 +41,68 @@ func addImageTitle(entryURL, entryContent string) string { return entryContent } +func addDynamicImage(entryURL, entryContent string) string { + doc, err := goquery.NewDocumentFromReader(strings.NewReader(entryContent)) + if err != nil { + return entryContent + } + + // Ordered most preferred to least preferred. + candidateAttrs := []string{ + "data-src", + "data-original", + "data-orig", + "data-url", + "data-orig-file", + "data-large-file", + "data-medium-file", + "data-2000src", + "data-1000src", + "data-800src", + "data-655src", + "data-500src", + "data-380src", + } + + changed := false + + doc.Find("img,div").Each(func(i int, img *goquery.Selection) { + for _, candidateAttr := range candidateAttrs { + if srcAttr, found := img.Attr(candidateAttr); found { + changed = true + + if img.Is("img") { + img.SetAttr("src",srcAttr) + } else { + altAttr := img.AttrOr("alt", "") + img.ReplaceWithHtml(`<img src="` + srcAttr + `" alt="` + altAttr + `"/>`) + } + + break; + } + } + }) + + if !changed { + doc.Find("noscript").Each(func(i int, noscript *goquery.Selection) { + matches := imgRegex.FindAllString(noscript.Text(), 2) + + if len(matches) == 1 { + changed = true + + noscript.ReplaceWithHtml(matches[0]) + } + }) + } + + if changed { + output, _ := doc.Find("body").First().Html() + return output + } + + return entryContent +} + func addYoutubeVideo(entryURL, entryContent string) string { matches := youtubeRegex.FindStringSubmatch(entryURL) diff --git a/reader/rewrite/rewriter.go b/reader/rewrite/rewriter.go index 7862f4e..39a499c 100644 --- 
a/reader/rewrite/rewriter.go +++ b/reader/rewrite/rewriter.go @@ -24,6 +24,8 @@ func Rewriter(entryURL, entryContent, customRewriteRules string) string { switch strings.TrimSpace(rule) { case "add_image_title": entryContent = addImageTitle(entryURL, entryContent) + case "add_dynamic_image": + entryContent = addDynamicImage(entryURL, entryContent) case "add_youtube_video": entryContent = addYoutubeVideo(entryURL, entryContent) case "add_pdf_download_link": diff --git a/reader/rewrite/rewriter_test.go b/reader/rewrite/rewriter_test.go index 7a33b1f..2e4aeac 100644 --- a/reader/rewrite/rewriter_test.go +++ b/reader/rewrite/rewriter_test.go @@ -40,6 +40,7 @@ func TestRewriteWithXkcdLink(t *testing.T) { t.Errorf(`Not expected output: got "%s" instead of "%s"`, output, expected) } } + func TestRewriteWithXkcdLinkAndImageNoTitle(t *testing.T) { description := `<img src="https://imgs.xkcd.com/comics/thermostat.png" alt="Your problem is so terrible, I worry that, if I help you, I risk drawing the attention of whatever god of technology inflicted it on you." 
/>` output := Rewriter("https://xkcd.com/1912/", description, ``) @@ -48,6 +49,7 @@ func TestRewriteWithXkcdLinkAndImageNoTitle(t *testing.T) { t.Errorf(`Not expected output: got "%s" instead of "%s"`, output, expected) } } + func TestRewriteWithXkcdLinkAndNoImage(t *testing.T) { description := "test" output := Rewriter("https://xkcd.com/1912/", description, ``) @@ -76,3 +78,43 @@ func TestRewriteWithPDFLink(t *testing.T) { t.Errorf(`Not expected output: got "%s" instead of "%s"`, output, expected) } } + +func TestRewriteWithNoLazyImage(t *testing.T) { + description := `<img src="https://example.org/image.jpg" alt="Image"><noscript><p>Some text</p></noscript>` + output := Rewriter("https://example.org/article", description, "add_dynamic_image") + expected := description + + if expected != output { + t.Errorf(`Not expected output: got "%s" instead of "%s"`, output, expected) + } +} + +func TestRewriteWithLazyImage(t *testing.T) { + description := `<img src="" data-url="https://example.org/image.jpg" alt="Image"><noscript><img src="https://example.org/fallback.jpg" alt="Fallback"></noscript>` + output := Rewriter("https://example.org/article", description, "add_dynamic_image") + expected := `<img src="https://example.org/image.jpg" data-url="https://example.org/image.jpg" alt="Image"/><noscript><img src="https://example.org/fallback.jpg" alt="Fallback"></noscript>` + + if expected != output { + t.Errorf(`Not expected output: got "%s" instead of "%s"`, output, expected) + } +} + +func TestRewriteWithLazyDivImage(t *testing.T) { + description := `<div data-url="https://example.org/image.jpg" alt="Image"></div><noscript><img src="https://example.org/fallback.jpg" alt="Fallback"></noscript>` + output := Rewriter("https://example.org/article", description, "add_dynamic_image") + expected := `<img src="https://example.org/image.jpg" alt="Image"/><noscript><img src="https://example.org/fallback.jpg" alt="Fallback"></noscript>` + + if expected != output { + t.Errorf(`Not 
expected output: got "%s" instead of "%s"`, output, expected) + } +} + +func TestRewriteWithUnknownLazyNoScriptImage(t *testing.T) { + description := `<img src="" data-non-candidate="https://example.org/image.jpg" alt="Image"><noscript><img src="https://example.org/fallback.jpg" alt="Fallback"></noscript>` + output := Rewriter("https://example.org/article", description, "add_dynamic_image") + expected := `<img src="" data-non-candidate="https://example.org/image.jpg" alt="Image"/><img src="https://example.org/fallback.jpg" alt="Fallback"/>` + + if expected != output { + t.Errorf(`Not expected output: got "%s" instead of "%s"`, output, expected) + } +} |