diff options
author | Frédéric Guillot <fred@miniflux.net> | 2017-11-20 19:17:04 -0800 |
---|---|---|
committer | Frédéric Guillot <fred@miniflux.net> | 2017-11-20 19:17:04 -0800 |
commit | d5838b67340ca83dcc32f6d1775c183188ec5e7a (patch) | |
tree | a71705463458e5d2a6f74b0323119c3376f89b97 /reader/rss/rss.go | |
parent | c26787f47667f69a2d3e85db01fe20b9bc70bc9a (diff) |
Move feed parsers packages in reader package
Diffstat (limited to 'reader/rss/rss.go')
-rw-r--r-- | reader/rss/rss.go | 235 |
1 files changed, 235 insertions, 0 deletions
diff --git a/reader/rss/rss.go b/reader/rss/rss.go new file mode 100644 index 0000000..2c63514 --- /dev/null +++ b/reader/rss/rss.go @@ -0,0 +1,235 @@ +// Copyright 2017 Frédéric Guillot. All rights reserved. +// Use of this source code is governed by the Apache 2.0 +// license that can be found in the LICENSE file. + +package rss + +import ( + "encoding/xml" + "log" + "path" + "strconv" + "strings" + "time" + + "github.com/miniflux/miniflux2/helper" + "github.com/miniflux/miniflux2/model" + "github.com/miniflux/miniflux2/reader/date" + "github.com/miniflux/miniflux2/reader/processor" + "github.com/miniflux/miniflux2/reader/sanitizer" +) + +type rssFeed struct { + XMLName xml.Name `xml:"rss"` + Version string `xml:"version,attr"` + Title string `xml:"channel>title"` + Links []rssLink `xml:"channel>link"` + Language string `xml:"channel>language"` + Description string `xml:"channel>description"` + PubDate string `xml:"channel>pubDate"` + ItunesAuthor string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd channel>author"` + Items []rssItem `xml:"channel>item"` +} + +type rssLink struct { + XMLName xml.Name + Data string `xml:",chardata"` + Href string `xml:"href,attr"` + Rel string `xml:"rel,attr"` +} + +type rssItem struct { + GUID string `xml:"guid"` + Title string `xml:"title"` + Links []rssLink `xml:"link"` + OriginalLink string `xml:"http://rssnamespace.org/feedburner/ext/1.0 origLink"` + Description string `xml:"description"` + Content string `xml:"http://purl.org/rss/1.0/modules/content/ encoded"` + PubDate string `xml:"pubDate"` + Date string `xml:"http://purl.org/dc/elements/1.1/ date"` + Authors []rssAuthor `xml:"author"` + Creator string `xml:"http://purl.org/dc/elements/1.1/ creator"` + Enclosures []rssEnclosure `xml:"enclosure"` + OrigEnclosureLink string `xml:"http://rssnamespace.org/feedburner/ext/1.0 origEnclosureLink"` +} + +type rssAuthor struct { + XMLName xml.Name + Data string `xml:",chardata"` + Name string `xml:"name"` +} + +type rssEnclosure struct { + URL string `xml:"url,attr"` + Type string `xml:"type,attr"` + Length string `xml:"length,attr"` +} + +func (r *rssFeed) GetSiteURL() string { + for _, element := range r.Links { + if element.XMLName.Space == "" { + return element.Data + } + } + + return "" +} + +func (r *rssFeed) GetFeedURL() string { + for _, element := range r.Links { + if element.XMLName.Space == "http://www.w3.org/2005/Atom" { + return element.Href + } + } + + return "" +} + +func (r *rssFeed) Transform() *model.Feed { + feed := new(model.Feed) + feed.SiteURL = r.GetSiteURL() + feed.FeedURL = r.GetFeedURL() + feed.Title = sanitizer.StripTags(r.Title) + + if feed.Title == "" { + feed.Title = feed.SiteURL + } + + for _, item := range r.Items { + entry := item.Transform() + + if entry.Author == "" && r.ItunesAuthor != "" { + entry.Author = r.ItunesAuthor + } + entry.Author = sanitizer.StripTags(entry.Author) + + if entry.URL == "" { + entry.URL = feed.SiteURL + } + + feed.Entries = append(feed.Entries, entry) + } + + return feed +} +func (r *rssItem) GetDate() time.Time { + value := r.PubDate + if r.Date != "" { + value = r.Date + } + + if value != "" { + result, err := date.Parse(value) + if err != nil { + log.Println(err) + return time.Now() + } + + return result + } + + return time.Now() +} + +func (r *rssItem) GetAuthor() string { + for _, element := range r.Authors { + if element.Name != "" { + return element.Name + } + + if element.Data != "" { + return element.Data + } + } + + return r.Creator +} + +func (r *rssItem) GetHash() string { + for _, value := range []string{r.GUID, r.GetURL()} { + if value != "" { + return helper.Hash(value) + } + } + + return "" +} + +func (r *rssItem) GetContent() string { + if r.Content != "" { + return r.Content + } + + return r.Description +} + +func (r *rssItem) GetURL() string { + if r.OriginalLink != "" { + return r.OriginalLink + } + + for _, link := range r.Links { + if link.XMLName.Space == "http://www.w3.org/2005/Atom" && link.Href != "" && isValidLinkRelation(link.Rel) { + return link.Href + } + + if link.Data != "" { + return link.Data + } + } + + return "" +} + +func (r *rssItem) GetEnclosures() model.EnclosureList { + enclosures := make(model.EnclosureList, 0) + + for _, enclosure := range r.Enclosures { + length, _ := strconv.Atoi(enclosure.Length) + enclosureURL := enclosure.URL + + if r.OrigEnclosureLink != "" { + filename := path.Base(r.OrigEnclosureLink) + if strings.Contains(enclosureURL, filename) { + enclosureURL = r.OrigEnclosureLink + } + } + + enclosures = append(enclosures, &model.Enclosure{ + URL: enclosureURL, + MimeType: enclosure.Type, + Size: length, + }) + } + + return enclosures +} + +func (r *rssItem) Transform() *model.Entry { + entry := new(model.Entry) + entry.URL = r.GetURL() + entry.Date = r.GetDate() + entry.Author = r.GetAuthor() + entry.Hash = r.GetHash() + entry.Content = processor.ItemContentProcessor(entry.URL, r.GetContent()) + entry.Title = sanitizer.StripTags(strings.Trim(r.Title, " \n\t")) + entry.Enclosures = r.GetEnclosures() + + if entry.Title == "" { + entry.Title = entry.URL + } + + return entry +} + +func isValidLinkRelation(rel string) bool { + switch rel { + case "", "alternate", "enclosure", "related", "self", "via": + return true + default: + if strings.HasPrefix(rel, "http") { + return true + } + return false + } +} |