From 1b33bb3d1cc01067fa8091b0e73a389fc189958c Mon Sep 17 00:00:00 2001 From: Frédéric Guillot Date: Mon, 23 Dec 2019 13:29:53 -0800 Subject: Improve Podcast support (iTunes and Google Play feeds) - Add support for Google Play XML namespace - Improve existing iTunes namespace implementation --- reader/rss/dublincore.go | 12 ++ reader/rss/feedburner.go | 11 ++ reader/rss/parser_test.go | 350 ++++++++++++++++++++++++++++++++++++++++++++-- reader/rss/podcast.go | 70 ++++++++++ reader/rss/rss.go | 264 +++++++++++++++++++--------------- 5 files changed, 580 insertions(+), 127 deletions(-) create mode 100644 reader/rss/dublincore.go create mode 100644 reader/rss/feedburner.go create mode 100644 reader/rss/podcast.go diff --git a/reader/rss/dublincore.go b/reader/rss/dublincore.go new file mode 100644 index 0000000..c461ece --- /dev/null +++ b/reader/rss/dublincore.go @@ -0,0 +1,12 @@ +// Copyright 2019 Frédéric Guillot. All rights reserved. +// Use of this source code is governed by the Apache 2.0 +// license that can be found in the LICENSE file. + +package rss // import "miniflux.app/reader/rss" + +// DublinCoreElement represents Dublin Core XML elements. +type DublinCoreElement struct { + DublinCoreDate string `xml:"http://purl.org/dc/elements/1.1/ date"` + DublinCoreCreator string `xml:"http://purl.org/dc/elements/1.1/ creator"` + DublinCoreContent string `xml:"http://purl.org/rss/1.0/modules/content/ encoded"` +} diff --git a/reader/rss/feedburner.go b/reader/rss/feedburner.go new file mode 100644 index 0000000..baa4e19 --- /dev/null +++ b/reader/rss/feedburner.go @@ -0,0 +1,11 @@ +// Copyright 2019 Frédéric Guillot. All rights reserved. +// Use of this source code is governed by the Apache 2.0 +// license that can be found in the LICENSE file. + +package rss // import "miniflux.app/reader/rss" + +// FeedBurnerElement represents FeedBurner XML elements. +type FeedBurnerElement struct { + FeedBurnerLink string `xml:"http://rssnamespace.org/feedburner/ext/1.0 origLink"` + FeedBurnerEnclosureLink string `xml:"http://rssnamespace.org/feedburner/ext/1.0 origEnclosureLink"` +} diff --git a/reader/rss/parser_test.go b/reader/rss/parser_test.go index 56ae159..dd1d261 100644 --- a/reader/rss/parser_test.go +++ b/reader/rss/parser_test.go @@ -230,6 +230,59 @@ func TestParseFeedURLWithAtomLink(t *testing.T) { } } +func TestParseFeedWithWebmaster(t *testing.T) { + data := ` + + + Example + https://example.org/ + webmaster@example.com + + Test + https://example.org/item + + + ` + + feed, err := Parse(bytes.NewBufferString(data)) + if err != nil { + t.Fatal(err) + } + + expected := "webmaster@example.com" + result := feed.Entries[0].Author + if result != expected { + t.Errorf("Incorrect entry author, got %q instead of %q", result, expected) + } +} + +func TestParseFeedWithManagingEditor(t *testing.T) { + data := ` + + + Example + https://example.org/ + webmaster@example.com + editor@example.com + + Test + https://example.org/item + + + ` + + feed, err := Parse(bytes.NewBufferString(data)) + if err != nil { + t.Fatal(err) + } + + expected := "editor@example.com" + result := feed.Entries[0].Author + if result != expected { + t.Errorf("Incorrect entry author, got %q instead of %q", result, expected) + } +} + func TestParseEntryWithAuthorAndInnerHTML(t *testing.T) { data := ` @@ -250,12 +303,14 @@ func TestParseEntryWithAuthorAndInnerHTML(t *testing.T) { t.Fatal(err) } - if feed.Entries[0].Author != "by Foo Bar" { - t.Errorf("Incorrect entry author, got: %s", feed.Entries[0].Author) + expected := "by Foo Bar" + result := feed.Entries[0].Author + if result != expected { + t.Errorf("Incorrect entry author, got %q instead of %q", result, expected) } } -func TestParseEntryWithAtomAuthor(t *testing.T) { +func TestParseEntryWithNonStandardAtomAuthor(t *testing.T) { data := ` @@ -280,8 +335,68 @@ func TestParseEntryWithAtomAuthor(t *testing.T) { t.Fatal(err) } - if feed.Entries[0].Author != "Foo Bar" { - t.Errorf("Incorrect entry author, got: %s", feed.Entries[0].Author) + expected := "Foo Bar" + result := feed.Entries[0].Author + if result != expected { + t.Errorf("Incorrect entry author, got %q instead of %q", result, expected) + } +} + +func TestParseEntryWithAtomAuthorEmail(t *testing.T) { + data := ` + + + Example + https://example.org/ + + + Test + https://example.org/item + + author@example.org + + + + ` + + feed, err := Parse(bytes.NewBufferString(data)) + if err != nil { + t.Fatal(err) + } + + expected := "author@example.org" + result := feed.Entries[0].Author + if result != expected { + t.Errorf("Incorrect entry author, got %q instead of %q", result, expected) + } +} + +func TestParseEntryWithAtomAuthor(t *testing.T) { + data := ` + + + Example + https://example.org/ + + + Test + https://example.org/item + + Foo Bar + + + + ` + + feed, err := Parse(bytes.NewBufferString(data)) + if err != nil { + t.Fatal(err) + } + + expected := "Foo Bar" + result := feed.Entries[0].Author + if result != expected { + t.Errorf("Incorrect entry author, got: %q instead of %q", result, expected) } } @@ -304,8 +419,10 @@ func TestParseEntryWithDublinCoreAuthor(t *testing.T) { t.Fatal(err) } - if feed.Entries[0].Author != "Me (me@example.com)" { - t.Errorf("Incorrect entry author, got: %s", feed.Entries[0].Author) + expected := "Me (me@example.com)" + result := feed.Entries[0].Author + if result != expected { + t.Errorf("Incorrect entry author, got %q instead of %q", result, expected) } } @@ -328,8 +445,10 @@ func TestParseEntryWithItunesAuthor(t *testing.T) { t.Fatal(err) } - if feed.Entries[0].Author != "Someone" { - t.Errorf("Incorrect entry author, got: %s", feed.Entries[0].Author) + expected := "Someone" + result := feed.Entries[0].Author + if result != expected { + t.Errorf("Incorrect entry author, got %q instead of %q", result, expected) } } @@ -352,8 +471,119 @@ func TestParseFeedWithItunesAuthor(t *testing.T) { t.Fatal(err) } - if feed.Entries[0].Author != "Someone" { - t.Errorf("Incorrect entry author, got: %s", feed.Entries[0].Author) + expected := "Someone" + result := feed.Entries[0].Author + if result != expected { + t.Errorf("Incorrect entry author, got %q instead of %q", result, expected) + } +} + +func TestParseFeedWithItunesOwner(t *testing.T) { + data := ` + + + Example + https://example.org/ + + John Doe + john.doe@example.com + + + Test + https://example.org/item + + + ` + + feed, err := Parse(bytes.NewBufferString(data)) + if err != nil { + t.Fatal(err) + } + + expected := "John Doe" + result := feed.Entries[0].Author + if result != expected { + t.Errorf("Incorrect entry author, got %q instead of %q", result, expected) + } +} + +func TestParseFeedWithItunesOwnerEmail(t *testing.T) { + data := ` + + + Example + https://example.org/ + + john.doe@example.com + + + Test + https://example.org/item + + + ` + + feed, err := Parse(bytes.NewBufferString(data)) + if err != nil { + t.Fatal(err) + } + + expected := "john.doe@example.com" + result := feed.Entries[0].Author + if result != expected { + t.Errorf("Incorrect entry author, got %q instead of %q", result, expected) + } +} + +func TestParseEntryWithGooglePlayAuthor(t *testing.T) { + data := ` + + + Example + https://example.org/ + + Test + https://example.org/item + Someone + + + ` + + feed, err := Parse(bytes.NewBufferString(data)) + if err != nil { + t.Fatal(err) + } + + expected := "Someone" + result := feed.Entries[0].Author + if result != expected { + t.Errorf("Incorrect entry author, got %q instead of %q", result, expected) + } +} + +func TestParseFeedWithGooglePlayAuthor(t *testing.T) { + data := ` + + + Example + https://example.org/ + Someone + + Test + https://example.org/item + + + ` + + feed, err := Parse(bytes.NewBufferString(data)) + if err != nil { + t.Fatal(err) + } + + expected := "Someone" + result := feed.Entries[0].Author + if result != expected { + t.Errorf("Incorrect entry author, got %q instead of %q", result, expected) } } @@ -794,6 +1024,7 @@ func TestParseEntryWithMediaPeerLink(t *testing.T) { if len(feed.Entries) != 1 { t.Errorf("Incorrect number of entries, got: %d", len(feed.Entries)) } + if len(feed.Entries[0].Enclosures) != 1 { t.Fatalf("Incorrect number of enclosures, got: %d", len(feed.Entries[0].Enclosures)) } @@ -820,3 +1051,100 @@ func TestParseEntryWithMediaPeerLink(t *testing.T) { } } } + +func TestEntryDescriptionFromItunesSummary(t *testing.T) { + data := ` + + + Podcast Example + http://www.example.com/index.html + + Podcast Episode + http://example.com/episode.m4a + Tue, 08 Mar 2016 12:00:00 GMT + Episode Subtitle + Episode Summary + + + ` + + feed, err := Parse(bytes.NewBufferString(data)) + if err != nil { + t.Fatal(err) + } + + if len(feed.Entries) != 1 { + t.Errorf("Incorrect number of entries, got: %d", len(feed.Entries)) + } + + expected := "Episode Summary" + result := feed.Entries[0].Content + if expected != result { + t.Errorf(`Unexpected podcast content, got %q instead of %q`, result, expected) + } +} + +func TestEntryDescriptionFromItunesSubtitle(t *testing.T) { + data := ` + + + Podcast Example + http://www.example.com/index.html + + Podcast Episode + http://example.com/episode.m4a + Tue, 08 Mar 2016 12:00:00 GMT + Episode Subtitle + + + ` + + feed, err := Parse(bytes.NewBufferString(data)) + if err != nil { + t.Fatal(err) + } + + if len(feed.Entries) != 1 { + t.Errorf("Incorrect number of entries, got: %d", len(feed.Entries)) + } + + expected := "Episode Subtitle" + result := feed.Entries[0].Content + if expected != result { + t.Errorf(`Unexpected podcast content, got %q instead of %q`, result, expected) + } +} + +func TestEntryDescriptionFromGooglePlayDescription(t *testing.T) { + data := ` + + + Podcast Example + http://www.example.com/index.html + + Podcast Episode + http://example.com/episode.m4a + Tue, 08 Mar 2016 12:00:00 GMT + Episode Subtitle + Episode Description + + + ` + + feed, err := Parse(bytes.NewBufferString(data)) + if err != nil { + t.Fatal(err) + } + + if len(feed.Entries) != 1 { + t.Errorf("Incorrect number of entries, got: %d", len(feed.Entries)) + } + + expected := "Episode Description" + result := feed.Entries[0].Content + if expected != result { + t.Errorf(`Unexpected podcast content, got %q instead of %q`, result, expected) + } +} diff --git a/reader/rss/podcast.go b/reader/rss/podcast.go new file mode 100644 index 0000000..61501ee --- /dev/null +++ b/reader/rss/podcast.go @@ -0,0 +1,70 @@ +// Copyright 2019 Frédéric Guillot. All rights reserved. +// Use of this source code is governed by the Apache 2.0 +// license that can be found in the LICENSE file. + +package rss // import "miniflux.app/reader/rss" + +import "strings" + +// PodcastFeedElement represents iTunes and GooglePlay feed XML elements. +// Specs: +// - https://github.com/simplepie/simplepie-ng/wiki/Spec:-iTunes-Podcast-RSS +// - https://developers.google.com/search/reference/podcast/rss-feed +type PodcastFeedElement struct { + ItunesAuthor string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd channel>author"` + Subtitle string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd channel>subtitle"` + Summary string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd channel>summary"` + PodcastOwner PodcastOwner `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd channel>owner"` + GooglePlayAuthor string `xml:"http://www.google.com/schemas/play-podcasts/1.0 channel>author"` +} + +// PodcastEntryElement represents iTunes and GooglePlay entry XML elements. +type PodcastEntryElement struct { + Subtitle string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd subtitle"` + Summary string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd summary"` + GooglePlayDescription string `xml:"http://www.google.com/schemas/play-podcasts/1.0 description"` +} + +// PodcastOwner represents contact information for the podcast owner. +type PodcastOwner struct { + Name string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd name"` + Email string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd email"` +} + +// Image represents podcast artwork. +type Image struct { + URL string `xml:"href,attr"` +} + +// PodcastAuthor returns the author of the podcast. +func (e *PodcastFeedElement) PodcastAuthor() string { + author := "" + + switch { + case e.ItunesAuthor != "": + author = e.ItunesAuthor + case e.GooglePlayAuthor != "": + author = e.GooglePlayAuthor + case e.PodcastOwner.Name != "": + author = e.PodcastOwner.Name + case e.PodcastOwner.Email != "": + author = e.PodcastOwner.Email + } + + return strings.TrimSpace(author) +} + +// PodcastDescription returns the description of the podcast. +func (e *PodcastEntryElement) PodcastDescription() string { + description := "" + + switch { + case e.GooglePlayDescription != "": + description = e.GooglePlayDescription + case e.Summary != "": + description = e.Summary + case e.Subtitle != "": + description = e.Subtitle + } + return strings.TrimSpace(description) +} diff --git a/reader/rss/rss.go b/reader/rss/rss.go index fb0c5bf..fd120cb 100644 --- a/reader/rss/rss.go +++ b/reader/rss/rss.go @@ -20,16 +20,86 @@ import ( "miniflux.app/url" ) +// Specs: https://cyber.harvard.edu/rss/rss.html type rssFeed struct { - XMLName xml.Name `xml:"rss"` - Version string `xml:"version,attr"` - Title string `xml:"channel>title"` - Links []rssLink `xml:"channel>link"` - Language string `xml:"channel>language"` - Description string `xml:"channel>description"` - PubDate string `xml:"channel>pubDate"` - ItunesAuthor string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd channel>author"` - Items []rssItem `xml:"channel>item"` + XMLName xml.Name `xml:"rss"` + Version string `xml:"version,attr"` + Title string `xml:"channel>title"` + Links []rssLink `xml:"channel>link"` + Language string `xml:"channel>language"` + Description string `xml:"channel>description"` + PubDate string `xml:"channel>pubDate"` + ManagingEditor string `xml:"channel>managingEditor"` + Webmaster string `xml:"channel>webMaster"` + Items []rssItem `xml:"channel>item"` + PodcastFeedElement +} + +func (r *rssFeed) Transform() *model.Feed { + feed := new(model.Feed) + feed.SiteURL = r.siteURL() + feed.FeedURL = r.feedURL() + feed.Title = strings.TrimSpace(r.Title) + + if feed.Title == "" { + feed.Title = feed.SiteURL + } + + for _, item := range r.Items { + entry := item.Transform() + if entry.Author == "" { + entry.Author = r.feedAuthor() + } + entry.Author = sanitizer.StripTags(entry.Author) + + if entry.URL == "" { + entry.URL = feed.SiteURL + } else { + entryURL, err := url.AbsoluteURL(feed.SiteURL, entry.URL) + if err == nil { + entry.URL = entryURL + } + } + + if entry.Title == "" { + entry.Title = entry.URL + } + + feed.Entries = append(feed.Entries, entry) + } + + return feed +} + +func (r *rssFeed) siteURL() string { + for _, element := range r.Links { + if element.XMLName.Space == "" { + return strings.TrimSpace(element.Data) + } + } + + return "" +} + +func (r *rssFeed) feedURL() string { + for _, element := range r.Links { + if element.XMLName.Space == "http://www.w3.org/2005/Atom" { + return strings.TrimSpace(element.Href) + } + } + + return "" +} + +func (r rssFeed) feedAuthor() string { + author := r.PodcastAuthor() + switch { + case r.ManagingEditor != "": + author = r.ManagingEditor + case r.Webmaster != "": + author = r.Webmaster + } + return strings.TrimSpace(author) } type rssLink struct { @@ -48,6 +118,7 @@ type rssAuthor struct { XMLName xml.Name Data string `xml:",chardata"` Name string `xml:"name"` + Email string `xml:"email"` Inner string `xml:",innerxml"` } @@ -66,83 +137,37 @@ func (enclosure *rssEnclosure) Size() int64 { } type rssItem struct { - GUID string `xml:"guid"` - Title string `xml:"title"` - Links []rssLink `xml:"link"` - OriginalLink string `xml:"http://rssnamespace.org/feedburner/ext/1.0 origLink"` - CommentLinks []rssCommentLink `xml:"comments"` - Description string `xml:"description"` - EncodedContent string `xml:"http://purl.org/rss/1.0/modules/content/ encoded"` - PubDate string `xml:"pubDate"` - Date string `xml:"http://purl.org/dc/elements/1.1/ date"` - Authors []rssAuthor `xml:"author"` - Creator string `xml:"http://purl.org/dc/elements/1.1/ creator"` - EnclosureLinks []rssEnclosure `xml:"enclosure"` - OrigEnclosureLink string `xml:"http://rssnamespace.org/feedburner/ext/1.0 origEnclosureLink"` + GUID string `xml:"guid"` + Title string `xml:"title"` + Links []rssLink `xml:"link"` + Description string `xml:"description"` + PubDate string `xml:"pubDate"` + Authors []rssAuthor `xml:"author"` + CommentLinks []rssCommentLink `xml:"comments"` + EnclosureLinks []rssEnclosure `xml:"enclosure"` + DublinCoreElement + FeedBurnerElement + PodcastEntryElement media.Element } -func (r *rssFeed) SiteURL() string { - for _, element := range r.Links { - if element.XMLName.Space == "" { - return strings.TrimSpace(element.Data) - } - } - - return "" -} - -func (r *rssFeed) FeedURL() string { - for _, element := range r.Links { - if element.XMLName.Space == "http://www.w3.org/2005/Atom" { - return strings.TrimSpace(element.Href) - } - } - - return "" -} - -func (r *rssFeed) Transform() *model.Feed { - feed := new(model.Feed) - feed.SiteURL = r.SiteURL() - feed.FeedURL = r.FeedURL() - feed.Title = strings.TrimSpace(r.Title) - - if feed.Title == "" { - feed.Title = feed.SiteURL - } - - for _, item := range r.Items { - entry := item.Transform() - - if entry.Author == "" && r.ItunesAuthor != "" { - entry.Author = r.ItunesAuthor - } - entry.Author = strings.TrimSpace(sanitizer.StripTags(entry.Author)) - - if entry.URL == "" { - entry.URL = feed.SiteURL - } else { - entryURL, err := url.AbsoluteURL(feed.SiteURL, entry.URL) - if err == nil { - entry.URL = entryURL - } - } - - if entry.Title == "" { - entry.Title = entry.URL - } - - feed.Entries = append(feed.Entries, entry) - } - - return feed +func (r *rssItem) Transform() *model.Entry { + entry := new(model.Entry) + entry.URL = r.entryURL() + entry.CommentsURL = r.entryCommentsURL() + entry.Date = r.entryDate() + entry.Author = r.entryAuthor() + entry.Hash = r.entryHash() + entry.Content = r.entryContent() + entry.Title = r.entryTitle() + entry.Enclosures = r.entryEnclosures() + return entry } -func (r *rssItem) PublishedDate() time.Time { +func (r *rssItem) entryDate() time.Time { value := r.PubDate - if r.Date != "" { - value = r.Date + if r.DublinCoreDate != "" { + value = r.DublinCoreDate } if value != "" { @@ -158,22 +183,37 @@ func (r *rssItem) PublishedDate() time.Time { return time.Now() } -func (r *rssItem) Author() string { - for _, element := range r.Authors { - if element.Name != "" { - return element.Name +func (r *rssItem) entryAuthor() string { + author := "" + + for _, rssAuthor := range r.Authors { + switch rssAuthor.XMLName.Space { + case "http://www.itunes.com/dtds/podcast-1.0.dtd", "http://www.google.com/schemas/play-podcasts/1.0": + author = rssAuthor.Data + case "http://www.w3.org/2005/Atom": + if rssAuthor.Name != "" { + author = rssAuthor.Name + } else if rssAuthor.Email != "" { + author = rssAuthor.Email + } + default: + if rssAuthor.Name != "" { + author = rssAuthor.Name + } else { + author = rssAuthor.Inner + } } + } - if element.Inner != "" { - return element.Inner - } + if author == "" { + author = r.DublinCoreCreator } - return r.Creator + return strings.TrimSpace(author) } -func (r *rssItem) Hash() string { - for _, value := range []string{r.GUID, r.URL()} { +func (r *rssItem) entryHash() string { + for _, value := range []string{r.GUID, r.entryURL()} { if value != "" { return crypto.Hash(value) } @@ -182,17 +222,22 @@ func (r *rssItem) Hash() string { return "" } -func (r *rssItem) Content() string { - if r.EncodedContent != "" { - return r.EncodedContent - } +func (r *rssItem) entryTitle() string { + return strings.TrimSpace(sanitizer.StripTags(r.Title)) +} - return r.Description +func (r *rssItem) entryContent() string { + for _, value := range []string{r.DublinCoreContent, r.Description, r.PodcastDescription()} { + if value != "" { + return value + } + } + return "" } -func (r *rssItem) URL() string { - if r.OriginalLink != "" { - return r.OriginalLink +func (r *rssItem) entryURL() string { + if r.FeedBurnerLink != "" { + return r.FeedBurnerLink } for _, link := range r.Links { @@ -208,7 +253,7 @@ func (r *rssItem) URL() string { return "" } -func (r *rssItem) Enclosures() model.EnclosureList { +func (r *rssItem) entryEnclosures() model.EnclosureList { enclosures := make(model.EnclosureList, 0) duplicates := make(map[string]bool, 0) @@ -226,10 +271,10 @@ func (r *rssItem) Enclosures() model.EnclosureList { for _, enclosure := range r.EnclosureLinks { enclosureURL := enclosure.URL - if r.OrigEnclosureLink != "" { - filename := path.Base(r.OrigEnclosureLink) + if r.FeedBurnerEnclosureLink != "" { + filename := path.Base(r.FeedBurnerEnclosureLink) if strings.Contains(enclosureURL, filename) { - enclosureURL = r.OrigEnclosureLink + enclosureURL = r.FeedBurnerEnclosureLink } } @@ -269,7 +314,7 @@ func (r *rssItem) Enclosures() model.EnclosureList { return enclosures } -func (r *rssItem) CommentsURL() string { +func (r *rssItem) entryCommentsURL() string { for _, commentLink := range r.CommentLinks { if commentLink.XMLName.Space == "" { return strings.TrimSpace(commentLink.Data) @@ -279,19 +324,6 @@ func (r *rssItem) CommentsURL() string { return "" } -func (r *rssItem) Transform() *model.Entry { - entry := new(model.Entry) - entry.URL = r.URL() - entry.CommentsURL = r.CommentsURL() - entry.Date = r.PublishedDate() - entry.Author = r.Author() - entry.Hash = r.Hash() - entry.Content = r.Content() - entry.Title = strings.TrimSpace(r.Title) - entry.Enclosures = r.Enclosures() - return entry -} - func isValidLinkRelation(rel string) bool { switch rel { case "", "alternate", "enclosure", "related", "self", "via": -- cgit v1.2.3