Move feed parsers packages in reader package

author: Frédéric Guillot <fred@miniflux.net> 2017-11-20 19:17:04 -0800
committer: Frédéric Guillot <fred@miniflux.net> 2017-11-20 19:17:04 -0800
commit: d5838b67340ca83dcc32f6d1775c183188ec5e7a (patch)
tree: a71705463458e5d2a6f74b0323119c3376f89b97 /reader/rss/rss.go
parent: c26787f47667f69a2d3e85db01fe20b9bc70bc9a (diff)
1 files changed, 235 insertions, 0 deletions
diff --git a/reader/rss/rss.go b/reader/rss/rss.go
new file mode 100644
index 0000000..2c63514
--- /dev/null
+++ b/reader/rss/rss.go
@@ -0,0 +1,235 @@
+// Copyright 2017 Frédéric Guillot. All rights reserved.
+// Use of this source code is governed by the Apache 2.0
+// license that can be found in the LICENSE file.
+
+package rss
+
+import (
+	"encoding/xml"
+	"log"
+	"path"
+	"strconv"
+	"strings"
+	"time"
+
+	"github.com/miniflux/miniflux2/helper"
+	"github.com/miniflux/miniflux2/model"
+	"github.com/miniflux/miniflux2/reader/date"
+	"github.com/miniflux/miniflux2/reader/processor"
+	"github.com/miniflux/miniflux2/reader/sanitizer"
+)
+
+// rssFeed carries the encoding/xml struct tags for an <rss> document.
+//
+// Version is read from an attribute of the root element; every other field is
+// read from below <channel> through a "channel>name" path. Links is a slice so
+// that more than one <link> element can be kept; GetSiteURL and GetFeedURL
+// tell them apart by XML namespace. ItunesAuthor is tagged with the iTunes
+// podcast DTD namespace URI.
+type rssFeed struct {
+	XMLName      xml.Name  `xml:"rss"`
+	Version      string    `xml:"version,attr"`
+	Title        string    `xml:"channel>title"`
+	Links        []rssLink `xml:"channel>link"`
+	Language     string    `xml:"channel>language"`
+	Description  string    `xml:"channel>description"`
+	PubDate      string    `xml:"channel>pubDate"`
+	ItunesAuthor string    `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd channel>author"`
+	Items        []rssItem `xml:"channel>item"`
+}
+
+// rssLink holds one <link> element: its qualified name (XMLName, which the
+// URL getters inspect to check the namespace), its text content (Data) and
+// its href and rel attributes.
+type rssLink struct {
+	XMLName xml.Name
+	Data    string `xml:",chardata"`
+	Href    string `xml:"href,attr"`
+	Rel     string `xml:"rel,attr"`
+}
+
+// rssItem carries the encoding/xml struct tags for a single <item>.
+//
+// Besides the un-namespaced RSS children, it also has fields tagged with the
+// FeedBurner (origLink, origEnclosureLink), content module (encoded) and
+// Dublin Core (date, creator) namespace URIs shown in the tags below.
+type rssItem struct {
+	GUID              string         `xml:"guid"`
+	Title             string         `xml:"title"`
+	Links             []rssLink      `xml:"link"`
+	OriginalLink      string         `xml:"http://rssnamespace.org/feedburner/ext/1.0 origLink"`
+	Description       string         `xml:"description"`
+	Content           string         `xml:"http://purl.org/rss/1.0/modules/content/ encoded"`
+	PubDate           string         `xml:"pubDate"`
+	Date              string         `xml:"http://purl.org/dc/elements/1.1/ date"`
+	Authors           []rssAuthor    `xml:"author"`
+	Creator           string         `xml:"http://purl.org/dc/elements/1.1/ creator"`
+	Enclosures        []rssEnclosure `xml:"enclosure"`
+	OrigEnclosureLink string         `xml:"http://rssnamespace.org/feedburner/ext/1.0 origEnclosureLink"`
+}
+
+// rssAuthor holds one <author> element of an item: Data is the element's own
+// text and Name is the text of a nested <name> child. GetAuthor prefers Name
+// and falls back to Data.
+type rssAuthor struct {
+	XMLName xml.Name
+	Data    string `xml:",chardata"`
+	Name    string `xml:"name"`
+}
+
+// rssEnclosure holds the url, type and length attributes of an <enclosure>
+// element. Length is kept as a string here; GetEnclosures converts it to an
+// int.
+type rssEnclosure struct {
+	URL    string `xml:"url,attr"`
+	Type   string `xml:"type,attr"`
+	Length string `xml:"length,attr"`
+}
+
+// GetSiteURL returns the text of the first channel-level <link> element that
+// has no XML namespace, with surrounding whitespace removed. It returns an
+// empty string when no such element exists.
+//
+// The text is trimmed because pretty-printed feeds commonly place the URL on
+// its own line, and that whitespace is part of the element's character data.
+func (r *rssFeed) GetSiteURL() string {
+	for _, element := range r.Links {
+		if element.XMLName.Space == "" {
+			return strings.TrimSpace(element.Data)
+		}
+	}
+
+	return ""
+}
+
+func (r *rssFeed) GetFeedURL() string {
+	for _, element := range r.Links {
+		if element.XMLName.Space == "http://www.w3.org/2005/Atom" {
+			return element.Href
+		}
+	}
+
+	return ""
+}
+
+// Transform converts the parsed RSS document into a model.Feed.
+//
+// The feed title is passed through sanitizer.StripTags and falls back to the
+// site URL when the result is empty. Every item is converted with
+// rssItem.Transform; an entry without an author inherits the channel's iTunes
+// author (if any), and an entry without a URL inherits the site URL.
+func (r *rssFeed) Transform() *model.Feed {
+	siteURL := r.GetSiteURL()
+
+	feed := &model.Feed{}
+	feed.SiteURL = siteURL
+	feed.FeedURL = r.GetFeedURL()
+
+	if title := sanitizer.StripTags(r.Title); title != "" {
+		feed.Title = title
+	} else {
+		feed.Title = siteURL
+	}
+
+	for i := range r.Items {
+		entry := r.Items[i].Transform()
+
+		// The author is sanitized whether it came from the item or the channel.
+		author := entry.Author
+		if author == "" && r.ItunesAuthor != "" {
+			author = r.ItunesAuthor
+		}
+		entry.Author = sanitizer.StripTags(author)
+
+		if entry.URL == "" {
+			entry.URL = siteURL
+		}
+
+		feed.Entries = append(feed.Entries, entry)
+	}
+
+	return feed
+}
+
+// GetDate returns the publication time of the item.
+//
+// The Dublin Core date element takes precedence over pubDate. When neither is
+// set, or when date.Parse rejects the value (the error is logged), the current
+// time is returned instead.
+func (r *rssItem) GetDate() time.Time {
+	raw := r.Date
+	if raw == "" {
+		raw = r.PubDate
+	}
+
+	if raw == "" {
+		return time.Now()
+	}
+
+	parsed, err := date.Parse(raw)
+	if err != nil {
+		log.Println(err)
+		return time.Now()
+	}
+
+	return parsed
+}
+
+// GetAuthor returns the first non-empty author value found in the item's
+// <author> elements, checking each element's nested name before its own text.
+// When no element provides a value, the Dublin Core creator is returned, which
+// may itself be empty.
+func (r *rssItem) GetAuthor() string {
+	for i := range r.Authors {
+		author := &r.Authors[i]
+
+		switch {
+		case author.Name != "":
+			return author.Name
+		case author.Data != "":
+			return author.Data
+		}
+	}
+
+	return r.Creator
+}
+
+// GetHash returns helper.Hash of the item's GUID, or of its URL when the GUID
+// is empty. It returns an empty string when both are empty.
+func (r *rssItem) GetHash() string {
+	if r.GUID != "" {
+		return helper.Hash(r.GUID)
+	}
+
+	if link := r.GetURL(); link != "" {
+		return helper.Hash(link)
+	}
+
+	return ""
+}
+
+// GetContent returns the item's content module "encoded" value, falling back
+// to the plain description when that value is empty.
+func (r *rssItem) GetContent() string {
+	if r.Content == "" {
+		return r.Description
+	}
+
+	return r.Content
+}
+
+// GetURL returns the best URL available for the item, or an empty string when
+// none is found.
+//
+// Candidates are tried in this order:
+//  1. the FeedBurner origLink value;
+//  2. for each <link> element in document order, the href of an Atom link
+//     whose rel passes isValidLinkRelation, otherwise the element's text.
+//
+// Element text is trimmed before it is tested and returned. Pretty-printed
+// feeds put whitespace around the URL, and a link element that contains only
+// whitespace must not be mistaken for a usable URL and hide later links.
+func (r *rssItem) GetURL() string {
+	if original := strings.TrimSpace(r.OriginalLink); original != "" {
+		return original
+	}
+
+	for _, link := range r.Links {
+		if link.XMLName.Space == "http://www.w3.org/2005/Atom" && link.Href != "" && isValidLinkRelation(link.Rel) {
+			return link.Href
+		}
+
+		if data := strings.TrimSpace(link.Data); data != "" {
+			return data
+		}
+	}
+
+	return ""
+}
+
+func (r *rssItem) GetEnclosures() model.EnclosureList {
+	enclosures := make(model.EnclosureList, 0)
+
+	for _, enclosure := range r.Enclosures {
+		length, _ := strconv.Atoi(enclosure.Length)
+		enclosureURL := enclosure.URL
+
+		if r.OrigEnclosureLink != "" {
+			filename := path.Base(r.OrigEnclosureLink)
+			if strings.Contains(enclosureURL, filename) {
+				enclosureURL = r.OrigEnclosureLink
+			}
+		}
+
+		enclosures = append(enclosures, &model.Enclosure{
+			URL:      enclosureURL,
+			MimeType: enclosure.Type,
+			Size:     length,
+		})
+	}
+
+	return enclosures
+}
+
+// Transform converts the parsed item into a model.Entry.
+//
+// URL, date, author, hash and enclosures come from the corresponding getters.
+// The content is handed to processor.ItemContentProcessor together with the
+// entry URL. The title is trimmed of spaces, newlines and tabs, passed through
+// sanitizer.StripTags, and replaced by the entry URL when it ends up empty.
+func (r *rssItem) Transform() *model.Entry {
+	link := r.GetURL()
+
+	entry := &model.Entry{}
+	entry.URL = link
+	entry.Date = r.GetDate()
+	entry.Author = r.GetAuthor()
+	entry.Hash = r.GetHash()
+	entry.Content = processor.ItemContentProcessor(link, r.GetContent())
+
+	title := sanitizer.StripTags(strings.Trim(r.Title, " \n\t"))
+	if title == "" {
+		title = link
+	}
+	entry.Title = title
+
+	entry.Enclosures = r.GetEnclosures()
+
+	return entry
+}
+
+// isValidLinkRelation reports whether rel is acceptable for an item link: an
+// empty value, one of "alternate", "enclosure", "related", "self" or "via", or
+// any value starting with "http".
+func isValidLinkRelation(rel string) bool {
+	switch rel {
+	case "", "alternate", "enclosure", "related", "self", "via":
+		return true
+	}
+
+	return strings.HasPrefix(rel, "http")
+}
author	Frédéric Guillot <fred@miniflux.net>	2017-11-20 19:17:04 -0800
committer	Frédéric Guillot <fred@miniflux.net>	2017-11-20 19:17:04 -0800
commit	d5838b67340ca83dcc32f6d1775c183188ec5e7a (patch)
tree	a71705463458e5d2a6f74b0323119c3376f89b97 /reader/rss/rss.go
parent	c26787f47667f69a2d3e85db01fe20b9bc70bc9a (diff)