From d5838b67340ca83dcc32f6d1775c183188ec5e7a Mon Sep 17 00:00:00 2001 From: Frédéric Guillot Date: Mon, 20 Nov 2017 19:17:04 -0800 Subject: Move feed parsers packages in reader package --- reader/rss/parser.go | 29 +++ reader/rss/parser_test.go | 550 ++++++++++++++++++++++++++++++++++++++++++++++ reader/rss/rss.go | 235 ++++++++++++++++++++ 3 files changed, 814 insertions(+) create mode 100644 reader/rss/parser.go create mode 100644 reader/rss/parser_test.go create mode 100644 reader/rss/rss.go (limited to 'reader/rss') diff --git a/reader/rss/parser.go b/reader/rss/parser.go new file mode 100644 index 0000000..a5b4434 --- /dev/null +++ b/reader/rss/parser.go @@ -0,0 +1,29 @@ +// Copyright 2017 Frédéric Guillot. All rights reserved. +// Use of this source code is governed by the Apache 2.0 +// license that can be found in the LICENSE file. + +package rss + +import ( + "encoding/xml" + "io" + + "github.com/miniflux/miniflux2/errors" + "github.com/miniflux/miniflux2/model" + + "golang.org/x/net/html/charset" +) + +// Parse returns a normalized feed struct from a RSS feed. +func Parse(data io.Reader) (*model.Feed, error) { + feed := new(rssFeed) + decoder := xml.NewDecoder(data) + decoder.CharsetReader = charset.NewReaderLabel + + err := decoder.Decode(feed) + if err != nil { + return nil, errors.NewLocalizedError("Unable to parse RSS feed: %v.", err) + } + + return feed.Transform(), nil +} diff --git a/reader/rss/parser_test.go b/reader/rss/parser_test.go new file mode 100644 index 0000000..9f1a557 --- /dev/null +++ b/reader/rss/parser_test.go @@ -0,0 +1,550 @@ +// Copyright 2017 Frédéric Guillot. All rights reserved. +// Use of this source code is governed by the Apache 2.0 +// license that can be found in the LICENSE file. + +package rss + +import ( + "bytes" + "testing" + "time" + + "github.com/miniflux/miniflux2/errors" +) + +func TestParseRss2Sample(t *testing.T) { + data := ` + + + + Liftoff News + http://liftoff.msfc.nasa.gov/ + Liftoff to Space Exploration. + en-us + Tue, 10 Jun 2003 04:00:00 GMT + Tue, 10 Jun 2003 09:41:01 GMT + http://blogs.law.harvard.edu/tech/rss + Weblog Editor 2.0 + editor@example.com + webmaster@example.com + + Star City + http://liftoff.msfc.nasa.gov/news/2003/news-starcity.asp + How do Americans get ready to work with Russians aboard the International Space Station? They take a crash course in culture, language and protocol at Russia's <a href="http://howe.iki.rssi.ru/GCTC/gctc_e.htm">Star City</a>. + Tue, 03 Jun 2003 09:39:21 GMT + http://liftoff.msfc.nasa.gov/2003/06/03.html#item573 + + + Sky watchers in Europe, Asia, and parts of Alaska and Canada will experience a <a href="http://science.nasa.gov/headlines/y2003/30may_solareclipse.htm">partial eclipse of the Sun</a> on Saturday, May 31st. + Fri, 30 May 2003 11:06:42 GMT + http://liftoff.msfc.nasa.gov/2003/05/30.html#item572 + + + The Engine That Does More + http://liftoff.msfc.nasa.gov/news/2003/news-VASIMR.asp + Before man travels to Mars, NASA hopes to design new engines that will let us fly through the Solar System more quickly. The proposed VASIMR engine would do that. + Tue, 27 May 2003 08:37:32 GMT + http://liftoff.msfc.nasa.gov/2003/05/27.html#item571 + + + Astronauts' Dirty Laundry + http://liftoff.msfc.nasa.gov/news/2003/news-laundry.asp + Compared to earlier spacecraft, the International Space Station has many luxuries, but laundry facilities are not one of them. Instead, astronauts have other options. + Tue, 20 May 2003 08:56:02 GMT + http://liftoff.msfc.nasa.gov/2003/05/20.html#item570 + + + ` + + feed, err := Parse(bytes.NewBufferString(data)) + if err != nil { + t.Error(err) + } + + if feed.Title != "Liftoff News" { + t.Errorf("Incorrect title, got: %s", feed.Title) + } + + if feed.FeedURL != "" { + t.Errorf("Incorrect feed URL, got: %s", feed.FeedURL) + } + + if feed.SiteURL != "http://liftoff.msfc.nasa.gov/" { + t.Errorf("Incorrect site URL, got: %s", feed.SiteURL) + } + + if len(feed.Entries) != 4 { + t.Errorf("Incorrect number of entries, got: %d", len(feed.Entries)) + } + + expectedDate := time.Date(2003, time.June, 3, 9, 39, 21, 0, time.UTC) + if !feed.Entries[0].Date.Equal(expectedDate) { + t.Errorf("Incorrect entry date, got: %v, want: %v", feed.Entries[0].Date, expectedDate) + } + + if feed.Entries[0].Hash != "5b2b4ac2fe1786ddf0fd2da2f1b07f64e691264f41f2db3ea360f31bb6d9152b" { + t.Errorf("Incorrect entry hash, got: %s", feed.Entries[0].Hash) + } + + if feed.Entries[0].URL != "http://liftoff.msfc.nasa.gov/news/2003/news-starcity.asp" { + t.Errorf("Incorrect entry URL, got: %s", feed.Entries[0].URL) + } + + if feed.Entries[0].Title != "Star City" { + t.Errorf("Incorrect entry title, got: %s", feed.Entries[0].Title) + } + + if feed.Entries[0].Content != `How do Americans get ready to work with Russians aboard the International Space Station? They take a crash course in culture, language and protocol at Russia's Star City.` { + t.Errorf("Incorrect entry content, got: %s", feed.Entries[0].Content) + } +} + +func TestParseFeedWithoutTitle(t *testing.T) { + data := ` + + + https://example.org/ + + ` + + feed, err := Parse(bytes.NewBufferString(data)) + if err != nil { + t.Error(err) + } + + if feed.Title != "https://example.org/" { + t.Errorf("Incorrect feed title, got: %s", feed.Title) + } +} + +func TestParseEntryWithoutTitle(t *testing.T) { + data := ` + + + https://example.org/ + + https://example.org/item + + + ` + + feed, err := Parse(bytes.NewBufferString(data)) + if err != nil { + t.Error(err) + } + + if feed.Entries[0].Title != "https://example.org/item" { + t.Errorf("Incorrect entry title, got: %s", feed.Entries[0].Title) + } +} + +func TestParseEntryWithoutLink(t *testing.T) { + data := ` + + + https://example.org/ + + 1234 + + + ` + + feed, err := Parse(bytes.NewBufferString(data)) + if err != nil { + t.Error(err) + } + + if feed.Entries[0].URL != "https://example.org/" { + t.Errorf("Incorrect entry link, got: %s", feed.Entries[0].URL) + } + + if feed.Entries[0].Hash != "03ac674216f3e15c761ee1a5e255f067953623c8b388b4459e13f978d7c846f4" { + t.Errorf("Incorrect entry hash, got: %s", feed.Entries[0].Hash) + } +} + +func TestParseEntryWithAtomLink(t *testing.T) { + data := ` + + + https://example.org/ + + Test + + + + ` + + feed, err := Parse(bytes.NewBufferString(data)) + if err != nil { + t.Error(err) + } + + if feed.Entries[0].URL != "https://example.org/item" { + t.Errorf("Incorrect entry link, got: %s", feed.Entries[0].URL) + } +} + +func TestParseEntryWithMultipleAtomLinks(t *testing.T) { + data := ` + + + https://example.org/ + + Test + + + + + ` + + feed, err := Parse(bytes.NewBufferString(data)) + if err != nil { + t.Error(err) + } + + if feed.Entries[0].URL != "https://example.org/b" { + t.Errorf("Incorrect entry link, got: %s", feed.Entries[0].URL) + } +} + +func TestParseFeedURLWithAtomLink(t *testing.T) { + data := ` + + + Example + https://example.org/ + + + ` + + feed, err := Parse(bytes.NewBufferString(data)) + if err != nil { + t.Error(err) + } + + if feed.FeedURL != "https://example.org/rss" { + t.Errorf("Incorrect feed URL, got: %s", feed.FeedURL) + } + + if feed.SiteURL != "https://example.org/" { + t.Errorf("Incorrect site URL, got: %s", feed.SiteURL) + } +} + +func TestParseEntryWithAtomAuthor(t *testing.T) { + data := ` + + + Example + https://example.org/ + + + Test + https://example.org/item + + Foo Bar + Vice President + + FooBar Inc. + + + + ` + + feed, err := Parse(bytes.NewBufferString(data)) + if err != nil { + t.Error(err) + } + + if feed.Entries[0].Author != "Foo Bar" { + t.Errorf("Incorrect entry author, got: %s", feed.Entries[0].Author) + } +} + +func TestParseEntryWithDublinCoreAuthor(t *testing.T) { + data := ` + + + Example + https://example.org/ + + Test + https://example.org/item + Me (me@example.com) + + + ` + + feed, err := Parse(bytes.NewBufferString(data)) + if err != nil { + t.Error(err) + } + + if feed.Entries[0].Author != "Me (me@example.com)" { + t.Errorf("Incorrect entry author, got: %s", feed.Entries[0].Author) + } +} + +func TestParseEntryWithItunesAuthor(t *testing.T) { + data := ` + + + Example + https://example.org/ + + Test + https://example.org/item + Someone + + + ` + + feed, err := Parse(bytes.NewBufferString(data)) + if err != nil { + t.Error(err) + } + + if feed.Entries[0].Author != "Someone" { + t.Errorf("Incorrect entry author, got: %s", feed.Entries[0].Author) + } +} + +func TestParseFeedWithItunesAuthor(t *testing.T) { + data := ` + + + Example + https://example.org/ + Someone + + Test + https://example.org/item + + + ` + + feed, err := Parse(bytes.NewBufferString(data)) + if err != nil { + t.Error(err) + } + + if feed.Entries[0].Author != "Someone" { + t.Errorf("Incorrect entry author, got: %s", feed.Entries[0].Author) + } +} + +func TestParseEntryWithDublinCoreDate(t *testing.T) { + data := ` + + + Example + http://example.org/ + + Item 1 + http://example.org/item1 + Description. + UUID + 2002-09-29T23:40:06-05:00 + + + ` + + feed, err := Parse(bytes.NewBufferString(data)) + if err != nil { + t.Error(err) + } + + location, _ := time.LoadLocation("EST") + expectedDate := time.Date(2002, time.September, 29, 23, 40, 06, 0, location) + if !feed.Entries[0].Date.Equal(expectedDate) { + t.Errorf("Incorrect entry date, got: %v, want: %v", feed.Entries[0].Date, expectedDate) + } +} + +func TestParseEntryWithContentEncoded(t *testing.T) { + data := ` + + + Example + http://example.org/ + + Item 1 + http://example.org/item1 + Description. + UUID + Example.

]]>
+
+
+
` + + feed, err := Parse(bytes.NewBufferString(data)) + if err != nil { + t.Error(err) + } + + if feed.Entries[0].Content != `

Example.

` { + t.Errorf("Incorrect entry content, got: %s", feed.Entries[0].Content) + } +} + +func TestParseEntryWithFeedBurnerLink(t *testing.T) { + data := ` + + + Example + http://example.org/ + + Item 1 + http://example.org/item1 + http://example.org/original + + + ` + + feed, err := Parse(bytes.NewBufferString(data)) + if err != nil { + t.Error(err) + } + + if feed.Entries[0].URL != "http://example.org/original" { + t.Errorf("Incorrect entry content, got: %s", feed.Entries[0].URL) + } +} + +func TestParseEntryTitleWithWhitespaces(t *testing.T) { + data := ` + + + Example + http://example.org + + + Some Title + + http://www.example.org/entries/1 + Fri, 15 Jul 2005 00:00:00 -0500 + + + ` + + feed, err := Parse(bytes.NewBufferString(data)) + if err != nil { + t.Error(err) + } + + if feed.Entries[0].Title != "Some Title" { + t.Errorf("Incorrect entry title, got: %s", feed.Entries[0].Title) + } +} + +func TestParseEntryWithEnclosures(t *testing.T) { + data := ` + + + My Podcast Feed + http://example.org + some.email@example.org + + Podcasting with RSS + http://www.example.org/entries/1 + An overview of RSS podcasting + Fri, 15 Jul 2005 00:00:00 -0500 + http://www.example.org/entries/1 + + + + ` + + feed, err := Parse(bytes.NewBufferString(data)) + if err != nil { + t.Error(err) + } + + if len(feed.Entries) != 1 { + t.Errorf("Incorrect number of entries, got: %d", len(feed.Entries)) + } + + if feed.Entries[0].URL != "http://www.example.org/entries/1" { + t.Errorf("Incorrect entry URL, got: %s", feed.Entries[0].URL) + } + + if len(feed.Entries[0].Enclosures) != 1 { + t.Errorf("Incorrect number of enclosures, got: %d", len(feed.Entries[0].Enclosures)) + } + + if feed.Entries[0].Enclosures[0].URL != "http://www.example.org/myaudiofile.mp3" { + t.Errorf("Incorrect enclosure URL, got: %s", feed.Entries[0].Enclosures[0].URL) + } + + if feed.Entries[0].Enclosures[0].MimeType != "audio/mpeg" { + t.Errorf("Incorrect enclosure type, got: %s", feed.Entries[0].Enclosures[0].MimeType) + } + + if feed.Entries[0].Enclosures[0].Size != 12345 { + t.Errorf("Incorrect enclosure length, got: %d", feed.Entries[0].Enclosures[0].Size) + } +} + +func TestParseEntryWithFeedBurnerEnclosures(t *testing.T) { + data := ` + + + My Example Feed + http://example.org + some.email@example.org + + Example Item + http://www.example.org/entries/1 + + http://example.org/67ca416c-f22a-4228-a681-68fc9998ec10/File.mp3 + + + ` + + feed, err := Parse(bytes.NewBufferString(data)) + if err != nil { + t.Error(err) + } + + if len(feed.Entries) != 1 { + t.Errorf("Incorrect number of entries, got: %d", len(feed.Entries)) + } + + if feed.Entries[0].URL != "http://www.example.org/entries/1" { + t.Errorf("Incorrect entry URL, got: %s", feed.Entries[0].URL) + } + + if len(feed.Entries[0].Enclosures) != 1 { + t.Errorf("Incorrect number of enclosures, got: %d", len(feed.Entries[0].Enclosures)) + } + + if feed.Entries[0].Enclosures[0].URL != "http://example.org/67ca416c-f22a-4228-a681-68fc9998ec10/File.mp3" { + t.Errorf("Incorrect enclosure URL, got: %s", feed.Entries[0].Enclosures[0].URL) + } + + if feed.Entries[0].Enclosures[0].MimeType != "audio/mpeg" { + t.Errorf("Incorrect enclosure type, got: %s", feed.Entries[0].Enclosures[0].MimeType) + } + + if feed.Entries[0].Enclosures[0].Size != 76192460 { + t.Errorf("Incorrect enclosure length, got: %d", feed.Entries[0].Enclosures[0].Size) + } +} + +func TestParseInvalidXml(t *testing.T) { + data := `garbage` + _, err := Parse(bytes.NewBufferString(data)) + if err == nil { + t.Error("Parse should returns an error") + } + + if _, ok := err.(errors.LocalizedError); !ok { + t.Error("The error returned must be a LocalizedError") + } +} diff --git a/reader/rss/rss.go b/reader/rss/rss.go new file mode 100644 index 0000000..2c63514 --- /dev/null +++ b/reader/rss/rss.go @@ -0,0 +1,235 @@ +// Copyright 2017 Frédéric Guillot. All rights reserved. +// Use of this source code is governed by the Apache 2.0 +// license that can be found in the LICENSE file. + +package rss + +import ( + "encoding/xml" + "log" + "path" + "strconv" + "strings" + "time" + + "github.com/miniflux/miniflux2/helper" + "github.com/miniflux/miniflux2/model" + "github.com/miniflux/miniflux2/reader/date" + "github.com/miniflux/miniflux2/reader/processor" + "github.com/miniflux/miniflux2/reader/sanitizer" +) + +type rssFeed struct { + XMLName xml.Name `xml:"rss"` + Version string `xml:"version,attr"` + Title string `xml:"channel>title"` + Links []rssLink `xml:"channel>link"` + Language string `xml:"channel>language"` + Description string `xml:"channel>description"` + PubDate string `xml:"channel>pubDate"` + ItunesAuthor string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd channel>author"` + Items []rssItem `xml:"channel>item"` +} + +type rssLink struct { + XMLName xml.Name + Data string `xml:",chardata"` + Href string `xml:"href,attr"` + Rel string `xml:"rel,attr"` +} + +type rssItem struct { + GUID string `xml:"guid"` + Title string `xml:"title"` + Links []rssLink `xml:"link"` + OriginalLink string `xml:"http://rssnamespace.org/feedburner/ext/1.0 origLink"` + Description string `xml:"description"` + Content string `xml:"http://purl.org/rss/1.0/modules/content/ encoded"` + PubDate string `xml:"pubDate"` + Date string `xml:"http://purl.org/dc/elements/1.1/ date"` + Authors []rssAuthor `xml:"author"` + Creator string `xml:"http://purl.org/dc/elements/1.1/ creator"` + Enclosures []rssEnclosure `xml:"enclosure"` + OrigEnclosureLink string `xml:"http://rssnamespace.org/feedburner/ext/1.0 origEnclosureLink"` +} + +type rssAuthor struct { + XMLName xml.Name + Data string `xml:",chardata"` + Name string `xml:"name"` +} + +type rssEnclosure struct { + URL string `xml:"url,attr"` + Type string `xml:"type,attr"` + Length string `xml:"length,attr"` +} + +func (r *rssFeed) GetSiteURL() string { + for _, element := range r.Links { + if element.XMLName.Space == "" { + return element.Data + } + } + + return "" +} + +func (r *rssFeed) GetFeedURL() string { + for _, element := range r.Links { + if element.XMLName.Space == "http://www.w3.org/2005/Atom" { + return element.Href + } + } + + return "" +} + +func (r *rssFeed) Transform() *model.Feed { + feed := new(model.Feed) + feed.SiteURL = r.GetSiteURL() + feed.FeedURL = r.GetFeedURL() + feed.Title = sanitizer.StripTags(r.Title) + + if feed.Title == "" { + feed.Title = feed.SiteURL + } + + for _, item := range r.Items { + entry := item.Transform() + + if entry.Author == "" && r.ItunesAuthor != "" { + entry.Author = r.ItunesAuthor + } + entry.Author = sanitizer.StripTags(entry.Author) + + if entry.URL == "" { + entry.URL = feed.SiteURL + } + + feed.Entries = append(feed.Entries, entry) + } + + return feed +} +func (r *rssItem) GetDate() time.Time { + value := r.PubDate + if r.Date != "" { + value = r.Date + } + + if value != "" { + result, err := date.Parse(value) + if err != nil { + log.Println(err) + return time.Now() + } + + return result + } + + return time.Now() +} + +func (r *rssItem) GetAuthor() string { + for _, element := range r.Authors { + if element.Name != "" { + return element.Name + } + + if element.Data != "" { + return element.Data + } + } + + return r.Creator +} + +func (r *rssItem) GetHash() string { + for _, value := range []string{r.GUID, r.GetURL()} { + if value != "" { + return helper.Hash(value) + } + } + + return "" +} + +func (r *rssItem) GetContent() string { + if r.Content != "" { + return r.Content + } + + return r.Description +} + +func (r *rssItem) GetURL() string { + if r.OriginalLink != "" { + return r.OriginalLink + } + + for _, link := range r.Links { + if link.XMLName.Space == "http://www.w3.org/2005/Atom" && link.Href != "" && isValidLinkRelation(link.Rel) { + return link.Href + } + + if link.Data != "" { + return link.Data + } + } + + return "" +} + +func (r *rssItem) GetEnclosures() model.EnclosureList { + enclosures := make(model.EnclosureList, 0) + + for _, enclosure := range r.Enclosures { + length, _ := strconv.Atoi(enclosure.Length) + enclosureURL := enclosure.URL + + if r.OrigEnclosureLink != "" { + filename := path.Base(r.OrigEnclosureLink) + if strings.Contains(enclosureURL, filename) { + enclosureURL = r.OrigEnclosureLink + } + } + + enclosures = append(enclosures, &model.Enclosure{ + URL: enclosureURL, + MimeType: enclosure.Type, + Size: length, + }) + } + + return enclosures +} + +func (r *rssItem) Transform() *model.Entry { + entry := new(model.Entry) + entry.URL = r.GetURL() + entry.Date = r.GetDate() + entry.Author = r.GetAuthor() + entry.Hash = r.GetHash() + entry.Content = processor.ItemContentProcessor(entry.URL, r.GetContent()) + entry.Title = sanitizer.StripTags(strings.Trim(r.Title, " \n\t")) + entry.Enclosures = r.GetEnclosures() + + if entry.Title == "" { + entry.Title = entry.URL + } + + return entry +} + +func isValidLinkRelation(rel string) bool { + switch rel { + case "", "alternate", "enclosure", "related", "self", "via": + return true + default: + if strings.HasPrefix(rel, "http") { + return true + } + return false + } +} -- cgit v1.2.3