From bf632fad2e19e9ece4db5957f05727f373541917 Mon Sep 17 00:00:00 2001 From: Frédéric Guillot Date: Sat, 4 Jan 2020 15:18:24 -0800 Subject: Allow only absolute URLs in comments URL Some feeds are using invalid URLs (random text). --- reader/atom/atom_10.go | 11 ++++++++++- reader/atom/atom_10_test.go | 40 ++++++++++++++++++++++++++++++++++++++++ reader/rss/parser_test.go | 25 +++++++++++++++++++++++++ reader/rss/rss.go | 7 ++++++- url/url.go | 9 +++++++++ url/url_test.go | 15 +++++++++++++++ 6 files changed, 105 insertions(+), 2 deletions(-) diff --git a/reader/atom/atom_10.go b/reader/atom/atom_10.go index 099cbed..708cc9f 100644 --- a/reader/atom/atom_10.go +++ b/reader/atom/atom_10.go @@ -84,7 +84,7 @@ func (a *atom10Entry) Transform() *model.Entry { entry.Content = a.entryContent() entry.Title = a.entryTitle() entry.Enclosures = a.entryEnclosures() - entry.CommentsURL = a.Links.firstLinkWithRelationAndType("replies", "text/html") + entry.CommentsURL = a.entryCommentsURL() return entry } @@ -194,6 +194,15 @@ func (a *atom10Entry) entryEnclosures() model.EnclosureList { return enclosures } +// See https://tools.ietf.org/html/rfc4685#section-3 +func (a *atom10Entry) entryCommentsURL() string { + commentsURL := a.Links.firstLinkWithRelationAndType("replies", "text/html") + if url.IsAbsoluteURL(commentsURL) { + return commentsURL + } + return "" +} + type atom10Text struct { Type string `xml:"type,attr"` Data string `xml:",chardata"` diff --git a/reader/atom/atom_10_test.go b/reader/atom/atom_10_test.go index 63127c4..d614691 100644 --- a/reader/atom/atom_10_test.go +++ b/reader/atom/atom_10_test.go @@ -777,3 +777,43 @@ func TestParseRepliesLinkRelation(t *testing.T) { t.Errorf("Incorrect entry comments URL, got: %s", feed.Entries[0].CommentsURL) } } + +func TestAbsoluteCommentsURL(t *testing.T) { + data := ` + + http://www.example.org/myfeed + My Example Feed + 2005-07-28T12:00:00Z + + James + + tag:entries.com,2005:1 + My original entry + 2006-03-01T12:12:12Z + + 
+ This is my original entry + + ` + + feed, err := Parse(bytes.NewBufferString(data)) + if err != nil { + t.Fatal(err) + } + + if len(feed.Entries) != 1 { + t.Errorf("Incorrect number of entries, got: %d", len(feed.Entries)) + } + + if feed.Entries[0].URL != "http://www.example.org/entries/1" { + t.Errorf("Incorrect entry URL, got: %s", feed.Entries[0].URL) + } + + if feed.Entries[0].CommentsURL != "" { + t.Errorf("Incorrect entry comments URL, got: %s", feed.Entries[0].CommentsURL) + } +} diff --git a/reader/rss/parser_test.go b/reader/rss/parser_test.go index dd1d261..b7ebd0a 100644 --- a/reader/rss/parser_test.go +++ b/reader/rss/parser_test.go @@ -837,6 +837,31 @@ func TestParseEntryWithCommentsURL(t *testing.T) { } } +func TestParseEntryWithInvalidCommentsURL(t *testing.T) { + data := ` + + + https://example.org/ + + Item 1 + https://example.org/item1 + + Some text + + + + ` + + feed, err := Parse(bytes.NewBufferString(data)) + if err != nil { + t.Fatal(err) + } + + if feed.Entries[0].CommentsURL != "" { + t.Errorf("Incorrect entry comments URL, got: %q", feed.Entries[0].CommentsURL) + } +} + func TestParseInvalidXml(t *testing.T) { data := `garbage` _, err := Parse(bytes.NewBufferString(data)) diff --git a/reader/rss/rss.go b/reader/rss/rss.go index fd120cb..172f6f8 100644 --- a/reader/rss/rss.go +++ b/reader/rss/rss.go @@ -317,7 +317,12 @@ func (r *rssItem) entryEnclosures() model.EnclosureList { func (r *rssItem) entryCommentsURL() string { for _, commentLink := range r.CommentLinks { if commentLink.XMLName.Space == "" { - return strings.TrimSpace(commentLink.Data) + commentsURL := strings.TrimSpace(commentLink.Data) + // The comments URL is supposed to be absolute (some feeds publish an incorrect comments URL) + // See https://cyber.harvard.edu/rss/rss.html#ltcommentsgtSubelementOfLtitemgt + if url.IsAbsoluteURL(commentsURL) { + return commentsURL + } } } diff --git a/url/url.go b/url/url.go index b02348f..d0c627e 100644 --- a/url/url.go +++ b/url/url.go @@ 
-11,6 +11,15 @@ import ( "strings" ) +// IsAbsoluteURL returns true if the link is absolute. +func IsAbsoluteURL(link string) bool { + u, err := url.Parse(link) + if err != nil { + return false + } + return u.IsAbs() +} + // AbsoluteURL converts the input URL as absolute URL if necessary. func AbsoluteURL(baseURL, input string) (string, error) { if strings.HasPrefix(input, "//") { diff --git a/url/url_test.go b/url/url_test.go index 56b6e13..ea488cf 100644 --- a/url/url_test.go +++ b/url/url_test.go @@ -6,6 +6,21 @@ package url // import "miniflux.app/url" import "testing" +func TestIsAbsoluteURL(t *testing.T) { + scenarios := map[string]bool{ + "https://example.org/file.pdf": true, + "magnet:?xt.1=urn:sha1:YNCKHTQCWBTRNJIV4WNAE52SJUQCZO5C&xt.2=urn:sha1:TXGCZQTH26NL6OUQAJJPFALHG2LTGBC7": true, + "invalid url": false, + } + + for input, expected := range scenarios { + actual := IsAbsoluteURL(input) + if actual != expected { + t.Errorf(`Unexpected result, got %v instead of %v for %q`, actual, expected, input) + } + } +} + func TestAbsoluteURL(t *testing.T) { scenarios := [][]string{ []string{"https://example.org/path/file.ext", "https://example.org/folder/", "/path/file.ext"}, -- cgit v1.2.3