diff options
Diffstat (limited to 'reader')
34 files changed, 4026 insertions, 0 deletions
diff --git a/reader/feed/atom/atom.go b/reader/feed/atom/atom.go new file mode 100644 index 0000000..db6172f --- /dev/null +++ b/reader/feed/atom/atom.go @@ -0,0 +1,214 @@ +// Copyright 2017 Frédéric Guillot. All rights reserved. +// Use of this source code is governed by the Apache 2.0 +// license that can be found in the LICENSE file. + +package atom + +import ( + "encoding/xml" + "github.com/miniflux/miniflux2/helper" + "github.com/miniflux/miniflux2/model" + "github.com/miniflux/miniflux2/reader/feed/date" + "github.com/miniflux/miniflux2/reader/processor" + "github.com/miniflux/miniflux2/reader/sanitizer" + "log" + "strconv" + "strings" + "time" +) + +type AtomFeed struct { + XMLName xml.Name `xml:"http://www.w3.org/2005/Atom feed"` + ID string `xml:"id"` + Title string `xml:"title"` + Author Author `xml:"author"` + Links []Link `xml:"link"` + Entries []AtomEntry `xml:"entry"` +} + +type AtomEntry struct { + ID string `xml:"id"` + Title string `xml:"title"` + Updated string `xml:"updated"` + Links []Link `xml:"link"` + Summary string `xml:"summary"` + Content Content `xml:"content"` + MediaGroup MediaGroup `xml:"http://search.yahoo.com/mrss/ group"` + Author Author `xml:"author"` +} + +type Author struct { + Name string `xml:"name"` + Email string `xml:"email"` +} + +type Link struct { + Url string `xml:"href,attr"` + Type string `xml:"type,attr"` + Rel string `xml:"rel,attr"` + Length string `xml:"length,attr"` +} + +type Content struct { + Type string `xml:"type,attr"` + Data string `xml:",chardata"` + Xml string `xml:",innerxml"` +} + +type MediaGroup struct { + Description string `xml:"http://search.yahoo.com/mrss/ description"` +} + +func (a *AtomFeed) getSiteURL() string { + for _, link := range a.Links { + if strings.ToLower(link.Rel) == "alternate" { + return link.Url + } + + if link.Rel == "" && link.Type == "" { + return link.Url + } + } + + return "" +} + +func (a *AtomFeed) getFeedURL() string { + for _, link := range a.Links { + if strings.ToLower(link.Rel) == "self" { + return link.Url + } + } + + return "" +} + +func (a *AtomFeed) Transform() *model.Feed { + feed := new(model.Feed) + feed.FeedURL = a.getFeedURL() + feed.SiteURL = a.getSiteURL() + feed.Title = sanitizer.StripTags(a.Title) + + if feed.Title == "" { + feed.Title = feed.SiteURL + } + + for _, entry := range a.Entries { + item := entry.Transform() + if item.Author == "" { + item.Author = a.GetAuthor() + } + + feed.Entries = append(feed.Entries, item) + } + + return feed +} + +func (a *AtomFeed) GetAuthor() string { + return getAuthor(a.Author) +} + +func (e *AtomEntry) GetDate() time.Time { + if e.Updated != "" { + result, err := date.Parse(e.Updated) + if err != nil { + log.Println(err) + return time.Now() + } + + return result + } + + return time.Now() +} + +func (e *AtomEntry) GetURL() string { + for _, link := range e.Links { + if strings.ToLower(link.Rel) == "alternate" { + return link.Url + } + + if link.Rel == "" && link.Type == "" { + return link.Url + } + } + + return "" +} + +func (e *AtomEntry) GetAuthor() string { + return getAuthor(e.Author) +} + +func (e *AtomEntry) GetHash() string { + for _, value := range []string{e.ID, e.GetURL()} { + if value != "" { + return helper.Hash(value) + } + } + + return "" +} + +func (e *AtomEntry) GetContent() string { + if e.Content.Type == "html" || e.Content.Type == "text" { + return e.Content.Data + } + + if e.Content.Type == "xhtml" { + return e.Content.Xml + } + + if e.Summary != "" { + return e.Summary + } + + if e.MediaGroup.Description != "" { + return e.MediaGroup.Description + } + + return "" +} + +func (e *AtomEntry) GetEnclosures() model.EnclosureList { + enclosures := make(model.EnclosureList, 0) + + for _, link := range e.Links { + if strings.ToLower(link.Rel) == "enclosure" { + length, _ := strconv.Atoi(link.Length) + enclosures = append(enclosures, &model.Enclosure{URL: link.Url, MimeType: link.Type, Size: length}) + } + } + + return enclosures +} + +func (e *AtomEntry) Transform() *model.Entry { + entry := new(model.Entry) + entry.URL = e.GetURL() + entry.Date = e.GetDate() + entry.Author = sanitizer.StripTags(e.GetAuthor()) + entry.Hash = e.GetHash() + entry.Content = processor.ItemContentProcessor(entry.URL, e.GetContent()) + entry.Title = sanitizer.StripTags(strings.Trim(e.Title, " \n\t")) + entry.Enclosures = e.GetEnclosures() + + if entry.Title == "" { + entry.Title = entry.URL + } + + return entry +} + +func getAuthor(author Author) string { + if author.Name != "" { + return author.Name + } + + if author.Email != "" { + return author.Email + } + + return "" +} diff --git a/reader/feed/atom/parser.go b/reader/feed/atom/parser.go new file mode 100644 index 0000000..0b8f0d7 --- /dev/null +++ b/reader/feed/atom/parser.go @@ -0,0 +1,28 @@ +// Copyright 2017 Frédéric Guillot. All rights reserved. +// Use of this source code is governed by the Apache 2.0 +// license that can be found in the LICENSE file. + +package atom + +import ( + "encoding/xml" + "fmt" + "github.com/miniflux/miniflux2/model" + "io" + + "golang.org/x/net/html/charset" +) + +// Parse returns a normalized feed struct. +func Parse(data io.Reader) (*model.Feed, error) { + atomFeed := new(AtomFeed) + decoder := xml.NewDecoder(data) + decoder.CharsetReader = charset.NewReaderLabel + + err := decoder.Decode(atomFeed) + if err != nil { + return nil, fmt.Errorf("Unable to parse Atom feed: %v\n", err) + } + + return atomFeed.Transform(), nil +} diff --git a/reader/feed/atom/parser_test.go b/reader/feed/atom/parser_test.go new file mode 100644 index 0000000..39d9420 --- /dev/null +++ b/reader/feed/atom/parser_test.go @@ -0,0 +1,319 @@ +// Copyright 2017 Frédéric Guillot. All rights reserved. +// Use of this source code is governed by the Apache 2.0 +// license that can be found in the LICENSE file. + +package atom + +import ( + "bytes" + "testing" + "time" +) + +func TestParseAtomSample(t *testing.T) { + data := `<?xml version="1.0" encoding="utf-8"?> + <feed xmlns="http://www.w3.org/2005/Atom"> + + <title>Example Feed</title> + <link href="http://example.org/"/> + <updated>2003-12-13T18:30:02Z</updated> + <author> + <name>John Doe</name> + </author> + <id>urn:uuid:60a76c80-d399-11d9-b93C-0003939e0af6</id> + + <entry> + <title>Atom-Powered Robots Run Amok</title> + <link href="http://example.org/2003/12/13/atom03"/> + <id>urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a</id> + <updated>2003-12-13T18:30:02Z</updated> + <summary>Some text.</summary> + </entry> + + </feed>` + + feed, err := Parse(bytes.NewBufferString(data)) + if err != nil { + t.Error(err) + } + + if feed.Title != "Example Feed" { + t.Errorf("Incorrect title, got: %s", feed.Title) + } + + if feed.FeedURL != "" { + t.Errorf("Incorrect feed URL, got: %s", feed.FeedURL) + } + + if feed.SiteURL != "http://example.org/" { + t.Errorf("Incorrect site URL, got: %s", feed.SiteURL) + } + + if len(feed.Entries) != 1 { + t.Errorf("Incorrect number of entries, got: %d", len(feed.Entries)) + } + + if !feed.Entries[0].Date.Equal(time.Date(2003, time.December, 13, 18, 30, 2, 0, time.UTC)) { + t.Errorf("Incorrect entry date, got: %v", feed.Entries[0].Date) + } + + if feed.Entries[0].Hash != "3841e5cf232f5111fc5841e9eba5f4b26d95e7d7124902e0f7272729d65601a6" { + t.Errorf("Incorrect entry hash, got: %s", feed.Entries[0].Hash) + } + + if feed.Entries[0].URL != "http://example.org/2003/12/13/atom03" { + t.Errorf("Incorrect entry URL, got: %s", feed.Entries[0].URL) + } + + if feed.Entries[0].Title != "Atom-Powered Robots Run Amok" { + t.Errorf("Incorrect entry title, got: %s", feed.Entries[0].Title) + } + + if feed.Entries[0].Content != "Some text." { + t.Errorf("Incorrect entry content, got: %s", feed.Entries[0].Content) + } + + if feed.Entries[0].Author != "John Doe" { + t.Errorf("Incorrect entry author, got: %s", feed.Entries[0].Author) + } +} + +func TestParseFeedWithoutTitle(t *testing.T) { + data := `<?xml version="1.0" encoding="utf-8"?> + <feed xmlns="http://www.w3.org/2005/Atom"> + <link rel="alternate" type="text/html" href="https://example.org/"/> + <link rel="self" type="application/atom+xml" href="https://example.org/feed"/> + <updated>2003-12-13T18:30:02Z</updated> + </feed>` + + feed, err := Parse(bytes.NewBufferString(data)) + if err != nil { + t.Error(err) + } + + if feed.Title != "https://example.org/" { + t.Errorf("Incorrect feed title, got: %s", feed.Title) + } +} + +func TestParseEntryWithoutTitle(t *testing.T) { + data := `<?xml version="1.0" encoding="utf-8"?> + <feed xmlns="http://www.w3.org/2005/Atom"> + + <title>Example Feed</title> + <link href="http://example.org/"/> + <updated>2003-12-13T18:30:02Z</updated> + <author> + <name>John Doe</name> + </author> + <id>urn:uuid:60a76c80-d399-11d9-b93C-0003939e0af6</id> + + <entry> + <link href="http://example.org/2003/12/13/atom03"/> + <id>urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a</id> + <updated>2003-12-13T18:30:02Z</updated> + <summary>Some text.</summary> + </entry> + + </feed>` + + feed, err := Parse(bytes.NewBufferString(data)) + if err != nil { + t.Error(err) + } + + if feed.Entries[0].Title != "http://example.org/2003/12/13/atom03" { + t.Errorf("Incorrect entry title, got: %s", feed.Entries[0].Title) + } +} + +func TestParseFeedURL(t *testing.T) { + data := `<?xml version="1.0" encoding="utf-8"?> + <feed xmlns="http://www.w3.org/2005/Atom"> + <title>Example Feed</title> + <link rel="alternate" type="text/html" href="https://example.org/"/> + <link rel="self" type="application/atom+xml" href="https://example.org/feed"/> + <updated>2003-12-13T18:30:02Z</updated> + </feed>` + + feed, err := Parse(bytes.NewBufferString(data)) + if err != nil { + t.Error(err) + } + + if feed.SiteURL != "https://example.org/" { + t.Errorf("Incorrect site URL, got: %s", feed.SiteURL) + } + + if feed.FeedURL != "https://example.org/feed" { + t.Errorf("Incorrect feed URL, got: %s", feed.FeedURL) + } +} + +func TestParseEntryTitleWithWhitespaces(t *testing.T) { + data := `<?xml version="1.0" encoding="utf-8"?> + <feed xmlns="http://www.w3.org/2005/Atom"> + <title>Example Feed</title> + <link href="http://example.org/"/> + + <entry> + <title> + Some Title + </title> + <link href="http://example.org/2003/12/13/atom03"/> + <id>urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a</id> + <updated>2003-12-13T18:30:02Z</updated> + <summary>Some text.</summary> + </entry> + + </feed>` + + feed, err := Parse(bytes.NewBufferString(data)) + if err != nil { + t.Error(err) + } + + if feed.Entries[0].Title != "Some Title" { + t.Errorf("Incorrect entry title, got: %s", feed.Entries[0].Title) + } +} + +func TestParseEntryWithAuthorName(t *testing.T) { + data := `<?xml version="1.0" encoding="utf-8"?> + <feed xmlns="http://www.w3.org/2005/Atom"> + <title>Example Feed</title> + <link href="http://example.org/"/> + + <entry> + <link href="http://example.org/2003/12/13/atom03"/> + <id>urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a</id> + <updated>2003-12-13T18:30:02Z</updated> + <summary>Some text.</summary> + <author> + <name>Me</name> + <email>me@localhost</email> + </author> + </entry> + + </feed>` + + feed, err := Parse(bytes.NewBufferString(data)) + if err != nil { + t.Error(err) + } + + if feed.Entries[0].Author != "Me" { + t.Errorf("Incorrect entry author, got: %s", feed.Entries[0].Author) + } +} + +func TestParseEntryWithoutAuthorName(t *testing.T) { + data := `<?xml version="1.0" encoding="utf-8"?> + <feed xmlns="http://www.w3.org/2005/Atom"> + <title>Example Feed</title> + <link href="http://example.org/"/> + + <entry> + <link href="http://example.org/2003/12/13/atom03"/> + <id>urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a</id> + <updated>2003-12-13T18:30:02Z</updated> + <summary>Some text.</summary> + <author> + <name/> + <email>me@localhost</email> + </author> + </entry> + + </feed>` + + feed, err := Parse(bytes.NewBufferString(data)) + if err != nil { + t.Error(err) + } + + if feed.Entries[0].Author != "me@localhost" { + t.Errorf("Incorrect entry author, got: %s", feed.Entries[0].Author) + } +} + +func TestParseEntryWithEnclosures(t *testing.T) { + data := `<?xml version="1.0" encoding="utf-8"?> + <feed xmlns="http://www.w3.org/2005/Atom"> + <id>http://www.example.org/myfeed</id> + <title>My Podcast Feed</title> + <updated>2005-07-15T12:00:00Z</updated> + <author> + <name>John Doe</name> + </author> + <link href="http://example.org" /> + <link rel="self" href="http://example.org/myfeed" /> + <entry> + <id>http://www.example.org/entries/1</id> + <title>Atom 1.0</title> + <updated>2005-07-15T12:00:00Z</updated> + <link href="http://www.example.org/entries/1" /> + <summary>An overview of Atom 1.0</summary> + <link rel="enclosure" + type="audio/mpeg" + title="MP3" + href="http://www.example.org/myaudiofile.mp3" + length="1234" /> + <link rel="enclosure" + type="application/x-bittorrent" + title="BitTorrent" + href="http://www.example.org/myaudiofile.torrent" + length="4567" /> + <content type="xhtml"> + <div xmlns="http://www.w3.org/1999/xhtml"> + <h1>Show Notes</h1> + <ul> + <li>00:01:00 -- Introduction</li> + <li>00:15:00 -- Talking about Atom 1.0</li> + <li>00:30:00 -- Wrapping up</li> + </ul> + </div> + </content> + </entry> + </feed>` + + feed, err := Parse(bytes.NewBufferString(data)) + if err != nil { + t.Error(err) + } + + if len(feed.Entries) != 1 { + t.Errorf("Incorrect number of entries, got: %d", len(feed.Entries)) + } + + if feed.Entries[0].URL != "http://www.example.org/entries/1" { + t.Errorf("Incorrect entry URL, got: %s", feed.Entries[0].URL) + } + + if len(feed.Entries[0].Enclosures) != 2 { + t.Errorf("Incorrect number of enclosures, got: %d", len(feed.Entries[0].Enclosures)) + } + + if feed.Entries[0].Enclosures[0].URL != "http://www.example.org/myaudiofile.mp3" { + t.Errorf("Incorrect enclosure URL, got: %s", feed.Entries[0].Enclosures[0].URL) + } + + if feed.Entries[0].Enclosures[0].MimeType != "audio/mpeg" { + t.Errorf("Incorrect enclosure type, got: %s", feed.Entries[0].Enclosures[0].MimeType) + } + + if feed.Entries[0].Enclosures[0].Size != 1234 { + t.Errorf("Incorrect enclosure length, got: %d", feed.Entries[0].Enclosures[0].Size) + } + + if feed.Entries[0].Enclosures[1].URL != "http://www.example.org/myaudiofile.torrent" { + t.Errorf("Incorrect enclosure URL, got: %s", feed.Entries[0].Enclosures[1].URL) + } + + if feed.Entries[0].Enclosures[1].MimeType != "application/x-bittorrent" { + t.Errorf("Incorrect enclosure type, got: %s", feed.Entries[0].Enclosures[1].MimeType) + } + + if feed.Entries[0].Enclosures[1].Size != 4567 { + t.Errorf("Incorrect enclosure length, got: %d", feed.Entries[0].Enclosures[1].Size) + } +} diff --git a/reader/feed/date/parser.go b/reader/feed/date/parser.go new file mode 100644 index 0000000..e573ff8 --- /dev/null +++ b/reader/feed/date/parser.go @@ -0,0 +1,203 @@ +// Copyright 2017 Frédéric Guillot. All rights reserved. +// Use of this source code is governed by the Apache 2.0 +// license that can be found in the LICENSE file. + +package date + +import ( + "fmt" + "strings" + "time" +) + +// DateFormats taken from github.com/mjibson/goread +var dateFormats = []string{ + time.RFC822, // RSS + time.RFC822Z, // RSS + time.RFC3339, // Atom + time.UnixDate, + time.RubyDate, + time.RFC850, + time.RFC1123Z, + time.RFC1123, + time.ANSIC, + "Mon, January 2 2006 15:04:05 -0700", + "Mon, January 02, 2006, 15:04:05 MST", + "Mon, January 02, 2006 15:04:05 MST", + "Mon, Jan 2, 2006 15:04 MST", + "Mon, Jan 2 2006 15:04 MST", + "Mon, Jan 2, 2006 15:04:05 MST", + "Mon, Jan 2 2006 15:04:05 -700", + "Mon, Jan 2 2006 15:04:05 -0700", + "Mon Jan 2 15:04 2006", + "Mon Jan 2 15:04:05 2006 MST", + "Mon Jan 02, 2006 3:04 pm", + "Mon, Jan 02,2006 15:04:05 MST", + "Mon Jan 02 2006 15:04:05 -0700", + "Monday, January 2, 2006 15:04:05 MST", + "Monday, January 2, 2006 03:04 PM", + "Monday, January 2, 2006", + "Monday, January 02, 2006", + "Monday, 2 January 2006 15:04:05 MST", + "Monday, 2 January 2006 15:04:05 -0700", + "Monday, 2 Jan 2006 15:04:05 MST", + "Monday, 2 Jan 2006 15:04:05 -0700", + "Monday, 02 January 2006 15:04:05 MST", + "Monday, 02 January 2006 15:04:05 -0700", + "Monday, 02 January 2006 15:04:05", + "Mon, 2 January 2006 15:04 MST", + "Mon, 2 January 2006, 15:04 -0700", + "Mon, 2 January 2006, 15:04:05 MST", + "Mon, 2 January 2006 15:04:05 MST", + "Mon, 2 January 2006 15:04:05 -0700", + "Mon, 2 January 2006", + "Mon, 2 Jan 2006 3:04:05 PM -0700", + "Mon, 2 Jan 2006 15:4:5 MST", + "Mon, 2 Jan 2006 15:4:5 -0700 GMT", + "Mon, 2, Jan 2006 15:4", + "Mon, 2 Jan 2006 15:04 MST", + "Mon, 2 Jan 2006, 15:04 -0700", + "Mon, 2 Jan 2006 15:04 -0700", + "Mon, 2 Jan 2006 15:04:05 UT", + "Mon, 2 Jan 2006 15:04:05MST", + "Mon, 2 Jan 2006 15:04:05 MST", + "Mon 2 Jan 2006 15:04:05 MST", + "mon,2 Jan 2006 15:04:05 MST", + "Mon, 2 Jan 2006 15:04:05 -0700 MST", + "Mon, 2 Jan 2006 15:04:05-0700", + "Mon, 2 Jan 2006 15:04:05 -0700", + "Mon, 2 Jan 2006 15:04:05", + "Mon, 2 Jan 2006 15:04", + "Mon,2 Jan 2006", + "Mon, 2 Jan 2006", + "Mon, 2 Jan 15:04:05 MST", + "Mon, 2 Jan 06 15:04:05 MST", + "Mon, 2 Jan 06 15:04:05 -0700", + "Mon, 2006-01-02 15:04", + "Mon,02 January 2006 14:04:05 MST", + "Mon, 02 January 2006", + "Mon, 02 Jan 2006 3:04:05 PM MST", + "Mon, 02 Jan 2006 15 -0700", + "Mon,02 Jan 2006 15:04 MST", + "Mon, 02 Jan 2006 15:04 MST", + "Mon, 02 Jan 2006 15:04 -0700", + "Mon, 02 Jan 2006 15:04:05 Z", + "Mon, 02 Jan 2006 15:04:05 UT", + "Mon, 02 Jan 2006 15:04:05 MST-07:00", + "Mon, 02 Jan 2006 15:04:05 MST -0700", + "Mon, 02 Jan 2006, 15:04:05 MST", + "Mon, 02 Jan 2006 15:04:05MST", + "Mon, 02 Jan 2006 15:04:05 MST", + "Mon , 02 Jan 2006 15:04:05 MST", + "Mon, 02 Jan 2006 15:04:05 GMT-0700", + "Mon,02 Jan 2006 15:04:05 -0700", + "Mon, 02 Jan 2006 15:04:05 -0700", + "Mon, 02 Jan 2006 15:04:05 -07:00", + "Mon, 02 Jan 2006 15:04:05 --0700", + "Mon 02 Jan 2006 15:04:05 -0700", + "Mon, 02 Jan 2006 15:04:05 -07", + "Mon, 02 Jan 2006 15:04:05 00", + "Mon, 02 Jan 2006 15:04:05", + "Mon, 02 Jan 2006", + "Mon, 02 Jan 06 15:04:05 MST", + "January 2, 2006 3:04 PM", + "January 2, 2006, 3:04 p.m.", + "January 2, 2006 15:04:05 MST", + "January 2, 2006 15:04:05", + "January 2, 2006 03:04 PM", + "January 2, 2006", + "January 02, 2006 15:04:05 MST", + "January 02, 2006 15:04", + "January 02, 2006 03:04 PM", + "January 02, 2006", + "Jan 2, 2006 3:04:05 PM MST", + "Jan 2, 2006 3:04:05 PM", + "Jan 2, 2006 15:04:05 MST", + "Jan 2, 2006", + "Jan 02 2006 03:04:05PM", + "Jan 02, 2006", + "6/1/2 15:04", + "6-1-2 15:04", + "2 January 2006 15:04:05 MST", + "2 January 2006 15:04:05 -0700", + "2 January 2006", + "2 Jan 2006 15:04:05 Z", + "2 Jan 2006 15:04:05 MST", + "2 Jan 2006 15:04:05 -0700", + "2 Jan 2006", + "2.1.2006 15:04:05", + "2/1/2006", + "2-1-2006", + "2006 January 02", + "2006-1-2T15:04:05Z", + "2006-1-2 15:04:05", + "2006-1-2", + "2006-1-02T15:04:05Z", + "2006-01-02T15:04Z", + "2006-01-02T15:04-07:00", + "2006-01-02T15:04:05Z", + "2006-01-02T15:04:05-07:00:00", + "2006-01-02T15:04:05:-0700", + "2006-01-02T15:04:05-0700", + "2006-01-02T15:04:05-07:00", + "2006-01-02T15:04:05 -0700", + "2006-01-02T15:04:05:00", + "2006-01-02T15:04:05", + "2006-01-02 at 15:04:05", + "2006-01-02 15:04:05Z", + "2006-01-02 15:04:05 MST", + "2006-01-02 15:04:05-0700", + "2006-01-02 15:04:05-07:00", + "2006-01-02 15:04:05 -0700", + "2006-01-02 15:04", + "2006-01-02 00:00:00.0 15:04:05.0 -0700", + "2006/01/02", + "2006-01-02", + "15:04 02.01.2006 -0700", + "1/2/2006 3:04 PM MST", + "1/2/2006 3:04:05 PM MST", + "1/2/2006 3:04:05 PM", + "1/2/2006 15:04:05 MST", + "1/2/2006", + "06/1/2 15:04", + "06-1-2 15:04", + "02 Monday, Jan 2006 15:04", + "02 Jan 2006 15:04 MST", + "02 Jan 2006 15:04:05 UT", + "02 Jan 2006 15:04:05 MST", + "02 Jan 2006 15:04:05 -0700", + "02 Jan 2006 15:04:05", + "02 Jan 2006", + "02/01/2006 15:04 MST", + "02-01-2006 15:04:05 MST", + "02.01.2006 15:04:05", + "02/01/2006 15:04:05", + "02.01.2006 15:04", + "02/01/2006 - 15:04", + "02.01.2006 -0700", + "02/01/2006", + "02-01-2006", + "01/02/2006 3:04 PM", + "01/02/2006 15:04:05 MST", + "01/02/2006 - 15:04", + "01/02/2006", + "01-02-2006", +} + +// Parse parses a given date string using a large +// list of commonly found feed date formats. +func Parse(ds string) (t time.Time, err error) { + d := strings.TrimSpace(ds) + if d == "" { + return t, fmt.Errorf("Date string is empty") + } + + for _, f := range dateFormats { + if t, err = time.Parse(f, d); err == nil { + return + } + } + + err = fmt.Errorf("Failed to parse date: %s", ds) + return +} diff --git a/reader/feed/handler.go b/reader/feed/handler.go new file mode 100644 index 0000000..27ff126 --- /dev/null +++ b/reader/feed/handler.go @@ -0,0 +1,152 @@ +// Copyright 2017 Frédéric Guillot. All rights reserved. +// Use of this source code is governed by the Apache 2.0 +// license that can be found in the LICENSE file. + +package feed + +import ( + "fmt" + "github.com/miniflux/miniflux2/errors" + "github.com/miniflux/miniflux2/helper" + "github.com/miniflux/miniflux2/model" + "github.com/miniflux/miniflux2/reader/http" + "github.com/miniflux/miniflux2/reader/icon" + "github.com/miniflux/miniflux2/storage" + "log" + "time" +) + +var ( + errRequestFailed = "Unable to execute request: %v" + errServerFailure = "Unable to fetch feed (statusCode=%d)." + errDuplicate = "This feed already exists (%s)." + errNotFound = "Feed %d not found" +) + +// Handler contains all the logic to create and refresh feeds. +type Handler struct { + store *storage.Storage +} + +// CreateFeed fetch, parse and store a new feed. +func (h *Handler) CreateFeed(userID, categoryID int64, url string) (*model.Feed, error) { + defer helper.ExecutionTime(time.Now(), fmt.Sprintf("[Handler:CreateFeed] feedUrl=%s", url)) + + client := http.NewHttpClient(url) + response, err := client.Get() + if err != nil { + return nil, errors.NewLocalizedError(errRequestFailed, err) + } + + if response.HasServerFailure() { + return nil, errors.NewLocalizedError(errServerFailure, response.StatusCode) + } + + if h.store.FeedURLExists(userID, response.EffectiveURL) { + return nil, errors.NewLocalizedError(errDuplicate, response.EffectiveURL) + } + + subscription, err := parseFeed(response.Body) + if err != nil { + return nil, err + } + + subscription.Category = &model.Category{ID: categoryID} + subscription.EtagHeader = response.ETag + subscription.LastModifiedHeader = response.LastModified + subscription.FeedURL = response.EffectiveURL + subscription.UserID = userID + + err = h.store.CreateFeed(subscription) + if err != nil { + return nil, err + } + + log.Println("[Handler:CreateFeed] Feed saved with ID:", subscription.ID) + + icon, err := icon.FindIcon(subscription.SiteURL) + if err != nil { + log.Println(err) + } else if icon == nil { + log.Printf("No icon found for feedID=%d\n", subscription.ID) + } else { + h.store.CreateFeedIcon(subscription, icon) + } + + return subscription, nil +} + +// RefreshFeed fetch and update a feed if necessary. +func (h *Handler) RefreshFeed(userID, feedID int64) error { + defer helper.ExecutionTime(time.Now(), fmt.Sprintf("[Handler:RefreshFeed] feedID=%d", feedID)) + + originalFeed, err := h.store.GetFeedById(userID, feedID) + if err != nil { + return err + } + + if originalFeed == nil { + return errors.NewLocalizedError(errNotFound, feedID) + } + + client := http.NewHttpClientWithCacheHeaders(originalFeed.FeedURL, originalFeed.EtagHeader, originalFeed.LastModifiedHeader) + response, err := client.Get() + if err != nil { + customErr := errors.NewLocalizedError(errRequestFailed, err) + originalFeed.ParsingErrorCount++ + originalFeed.ParsingErrorMsg = customErr.Error() + h.store.UpdateFeed(originalFeed) + return customErr + } + + originalFeed.CheckedAt = time.Now() + + if response.HasServerFailure() { + err := errors.NewLocalizedError(errServerFailure, response.StatusCode) + originalFeed.ParsingErrorCount++ + originalFeed.ParsingErrorMsg = err.Error() + h.store.UpdateFeed(originalFeed) + return err + } + + if response.IsModified(originalFeed.EtagHeader, originalFeed.LastModifiedHeader) { + log.Printf("[Handler:RefreshFeed] Feed #%d has been modified\n", feedID) + + subscription, err := parseFeed(response.Body) + if err != nil { + originalFeed.ParsingErrorCount++ + originalFeed.ParsingErrorMsg = err.Error() + h.store.UpdateFeed(originalFeed) + return err + } + + originalFeed.EtagHeader = response.ETag + originalFeed.LastModifiedHeader = response.LastModified + + if err := h.store.UpdateEntries(originalFeed.UserID, originalFeed.ID, subscription.Entries); err != nil { + return err + } + + if !h.store.HasIcon(originalFeed.ID) { + log.Println("[Handler:RefreshFeed] Looking for feed icon") + icon, err := icon.FindIcon(originalFeed.SiteURL) + if err != nil { + log.Println("[Handler:RefreshFeed]", err) + } else { + h.store.CreateFeedIcon(originalFeed, icon) + } + } + } else { + log.Printf("[Handler:RefreshFeed] Feed #%d not modified\n", feedID) + } + + originalFeed.ParsingErrorCount = 0 + originalFeed.ParsingErrorMsg = "" + + return h.store.UpdateFeed(originalFeed) +} + +// NewFeedHandler returns a feed handler. +func NewFeedHandler(store *storage.Storage) *Handler { + return &Handler{store: store} +} diff --git a/reader/feed/json/json.go b/reader/feed/json/json.go new file mode 100644 index 0000000..e7d53c4 --- /dev/null +++ b/reader/feed/json/json.go @@ -0,0 +1,170 @@ +// Copyright 2017 Frédéric Guillot. All rights reserved. +// Use of this source code is governed by the Apache 2.0 +// license that can be found in the LICENSE file. + +package json + +import ( + "github.com/miniflux/miniflux2/helper" + "github.com/miniflux/miniflux2/model" + "github.com/miniflux/miniflux2/reader/feed/date" + "github.com/miniflux/miniflux2/reader/processor" + "github.com/miniflux/miniflux2/reader/sanitizer" + "log" + "strings" + "time" +) + +type JsonFeed struct { + Version string `json:"version"` + Title string `json:"title"` + SiteURL string `json:"home_page_url"` + FeedURL string `json:"feed_url"` + Author JsonAuthor `json:"author"` + Items []JsonItem `json:"items"` +} + +type JsonAuthor struct { + Name string `json:"name"` + URL string `json:"url"` +} + +type JsonItem struct { + ID string `json:"id"` + URL string `json:"url"` + Title string `json:"title"` + Summary string `json:"summary"` + Text string `json:"content_text"` + Html string `json:"content_html"` + DatePublished string `json:"date_published"` + DateModified string `json:"date_modified"` + Author JsonAuthor `json:"author"` + Attachments []JsonAttachment `json:"attachments"` +} + +type JsonAttachment struct { + URL string `json:"url"` + MimeType string `json:"mime_type"` + Title string `json:"title"` + Size int `json:"size_in_bytes"` + Duration int `json:"duration_in_seconds"` +} + +func (j *JsonFeed) GetAuthor() string { + return getAuthor(j.Author) +} + +func (j *JsonFeed) Transform() *model.Feed { + feed := new(model.Feed) + feed.FeedURL = j.FeedURL + feed.SiteURL = j.SiteURL + feed.Title = sanitizer.StripTags(j.Title) + + if feed.Title == "" { + feed.Title = feed.SiteURL + } + + for _, item := range j.Items { + entry := item.Transform() + if entry.Author == "" { + entry.Author = j.GetAuthor() + } + + feed.Entries = append(feed.Entries, entry) + } + + return feed +} + +func (j *JsonItem) GetDate() time.Time { + for _, value := range []string{j.DatePublished, j.DateModified} { + if value != "" { + d, err := date.Parse(value) + if err != nil { + log.Println(err) + return time.Now() + } + + return d + } + } + + return time.Now() +} + +func (j *JsonItem) GetAuthor() string { + return getAuthor(j.Author) +} + +func (j *JsonItem) GetHash() string { + for _, value := range []string{j.ID, j.URL, j.Text + j.Html + j.Summary} { + if value != "" { + return helper.Hash(value) + } + } + + return "" +} + +func (j *JsonItem) GetTitle() string { + for _, value := range []string{j.Title, j.Summary, j.Text, j.Html} { + if value != "" { + return truncate(value) + } + } + + return j.URL +} + +func (j *JsonItem) GetContent() string { + for _, value := range []string{j.Html, j.Text, j.Summary} { + if value != "" { + return value + } + } + + return "" +} + +func (j *JsonItem) GetEnclosures() model.EnclosureList { + enclosures := make(model.EnclosureList, 0) + + for _, attachment := range j.Attachments { + enclosures = append(enclosures, &model.Enclosure{ + URL: attachment.URL, + MimeType: attachment.MimeType, + Size: attachment.Size, + }) + } + + return enclosures +} + +func (j *JsonItem) Transform() *model.Entry { + entry := new(model.Entry) + entry.URL = j.URL + entry.Date = j.GetDate() + entry.Author = sanitizer.StripTags(j.GetAuthor()) + entry.Hash = j.GetHash() + entry.Content = processor.ItemContentProcessor(entry.URL, j.GetContent()) + entry.Title = sanitizer.StripTags(strings.Trim(j.GetTitle(), " \n\t")) + entry.Enclosures = j.GetEnclosures() + return entry +} + +func getAuthor(author JsonAuthor) string { + if author.Name != "" { + return author.Name + } + + return "" +} + +func truncate(str string) string { + max := 100 + if len(str) > max { + return str[:max] + "..." + } + + return str +} diff --git a/reader/feed/json/parser.go b/reader/feed/json/parser.go new file mode 100644 index 0000000..816d28e --- /dev/null +++ b/reader/feed/json/parser.go @@ -0,0 +1,23 @@ +// Copyright 2017 Frédéric Guillot. All rights reserved. +// Use of this source code is governed by the Apache 2.0 +// license that can be found in the LICENSE file. + +package json + +import ( + "encoding/json" + "fmt" + "github.com/miniflux/miniflux2/model" + "io" +) + +// Parse returns a normalized feed struct. +func Parse(data io.Reader) (*model.Feed, error) { + jsonFeed := new(JsonFeed) + decoder := json.NewDecoder(data) + if err := decoder.Decode(&jsonFeed); err != nil { + return nil, fmt.Errorf("Unable to parse JSON Feed: %v", err) + } + + return jsonFeed.Transform(), nil +} diff --git a/reader/feed/json/parser_test.go b/reader/feed/json/parser_test.go new file mode 100644 index 0000000..f2c97fc --- /dev/null +++ b/reader/feed/json/parser_test.go @@ -0,0 +1,345 @@ +// Copyright 2017 Frédéric Guillot. All rights reserved. +// Use of this source code is governed by the Apache 2.0 +// license that can be found in the LICENSE file. + +package json + +import ( + "bytes" + "strings" + "testing" + "time" +) + +func TestParseJsonFeed(t *testing.T) { + data := `{ + "version": "https://jsonfeed.org/version/1", + "title": "My Example Feed", + "home_page_url": "https://example.org/", + "feed_url": "https://example.org/feed.json", + "items": [ + { + "id": "2", + "content_text": "This is a second item.", + "url": "https://example.org/second-item" + }, + { + "id": "1", + "content_html": "<p>Hello, world!</p>", + "url": "https://example.org/initial-post" + } + ] + }` + + feed, err := Parse(bytes.NewBufferString(data)) + if err != nil { + t.Error(err) + } + + if feed.Title != "My Example Feed" { + t.Errorf("Incorrect title, got: %s", feed.Title) + } + + if feed.FeedURL != "https://example.org/feed.json" { + t.Errorf("Incorrect feed URL, got: %s", feed.FeedURL) + } + + if feed.SiteURL != "https://example.org/" { + t.Errorf("Incorrect site URL, got: %s", feed.SiteURL) + } + + if len(feed.Entries) != 2 { + t.Errorf("Incorrect number of entries, got: %d", len(feed.Entries)) + } + + if feed.Entries[0].Hash != "d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f90da3a666eec13ab35" { + t.Errorf("Incorrect entry hash, got: %s", feed.Entries[0].Hash) + } + + if feed.Entries[0].URL != "https://example.org/second-item" { + t.Errorf("Incorrect entry URL, got: %s", feed.Entries[0].URL) + } + + if feed.Entries[0].Title != "This is a second item." { + t.Errorf(`Incorrect entry title, got: "%s"`, feed.Entries[0].Title) + } + + if feed.Entries[0].Content != "This is a second item." { + t.Errorf("Incorrect entry content, got: %s", feed.Entries[0].Content) + } + + if feed.Entries[1].Hash != "6b86b273ff34fce19d6b804eff5a3f5747ada4eaa22f1d49c01e52ddb7875b4b" { + t.Errorf("Incorrect entry hash, got: %s", feed.Entries[1].Hash) + } + + if feed.Entries[1].URL != "https://example.org/initial-post" { + t.Errorf("Incorrect entry URL, got: %s", feed.Entries[1].URL) + } + + if feed.Entries[1].Title != "Hello, world!" { + t.Errorf(`Incorrect entry title, got: "%s"`, feed.Entries[1].Title) + } + + if feed.Entries[1].Content != "<p>Hello, world!</p>" { + t.Errorf("Incorrect entry content, got: %s", feed.Entries[1].Content) + } +} + +func TestParsePodcast(t *testing.T) { + data := `{ + "version": "https://jsonfeed.org/version/1", + "user_comment": "This is a podcast feed. You can add this feed to your podcast client using the following URL: http://therecord.co/feed.json", + "title": "The Record", + "home_page_url": "http://therecord.co/", + "feed_url": "http://therecord.co/feed.json", + "items": [ + { + "id": "http://therecord.co/chris-parrish", + "title": "Special #1 - Chris Parrish", + "url": "http://therecord.co/chris-parrish", + "content_text": "Chris has worked at Adobe and as a founder of Rogue Sheep, which won an Apple Design Award for Postage. Chris’s new company is Aged & Distilled with Guy English — which shipped Napkin, a Mac app for visual collaboration. Chris is also the co-host of The Record. He lives on Bainbridge Island, a quick ferry ride from Seattle.", + "content_html": "Chris has worked at <a href=\"http://adobe.com/\">Adobe</a> and as a founder of Rogue Sheep, which won an Apple Design Award for Postage. Chris’s new company is Aged & Distilled with Guy English — which shipped <a href=\"http://aged-and-distilled.com/napkin/\">Napkin</a>, a Mac app for visual collaboration. Chris is also the co-host of The Record. He lives on <a href=\"http://www.ci.bainbridge-isl.wa.us/\">Bainbridge Island</a>, a quick ferry ride from Seattle.", + "summary": "Brent interviews Chris Parrish, co-host of The Record and one-half of Aged & Distilled.", + "date_published": "2014-05-09T14:04:00-07:00", + "attachments": [ + { + "url": "http://therecord.co/downloads/The-Record-sp1e1-ChrisParrish.m4a", + "mime_type": "audio/x-m4a", + "size_in_bytes": 89970236, + "duration_in_seconds": 6629 + } + ] + } + ] + }` + + feed, err := Parse(bytes.NewBufferString(data)) + if err != nil { + t.Error(err) + } + + if feed.Title != "The Record" { + t.Errorf("Incorrect title, got: %s", feed.Title) + } + + if feed.FeedURL != "http://therecord.co/feed.json" { + t.Errorf("Incorrect feed URL, got: %s", feed.FeedURL) + } + + if feed.SiteURL != "http://therecord.co/" { + t.Errorf("Incorrect site URL, got: %s", feed.SiteURL) + } + + if len(feed.Entries) != 1 { + t.Errorf("Incorrect number of entries, got: %d", len(feed.Entries)) + } + + if feed.Entries[0].Hash != "6b678e57962a1b001e4e873756563cdc08bbd06ca561e764e0baa9a382485797" { + t.Errorf("Incorrect entry hash, got: %s", feed.Entries[0].Hash) + } + + if feed.Entries[0].URL != "http://therecord.co/chris-parrish" { + t.Errorf("Incorrect entry URL, got: %s", feed.Entries[0].URL) + } + + if feed.Entries[0].Title != "Special #1 - Chris Parrish" { + t.Errorf(`Incorrect entry title, got: "%s"`, feed.Entries[0].Title) + } + + if feed.Entries[0].Content != `Chris has worked at <a href="http://adobe.com/" rel="noopener noreferrer" target="_blank" referrerpolicy="no-referrer">Adobe</a> and as a founder of Rogue Sheep, which won an Apple Design Award for Postage. Chris’s new company is Aged & Distilled with Guy English — which shipped <a href="http://aged-and-distilled.com/napkin/" rel="noopener noreferrer" target="_blank" referrerpolicy="no-referrer">Napkin</a>, a Mac app for visual collaboration. Chris is also the co-host of The Record. He lives on <a href="http://www.ci.bainbridge-isl.wa.us/" rel="noopener noreferrer" target="_blank" referrerpolicy="no-referrer">Bainbridge Island</a>, a quick ferry ride from Seattle.` { + t.Errorf(`Incorrect entry content, got: "%s"`, feed.Entries[0].Content) + } + + location, _ := time.LoadLocation("America/Vancouver") + if !feed.Entries[0].Date.Equal(time.Date(2014, time.May, 9, 14, 4, 0, 0, location)) { + t.Errorf("Incorrect entry date, got: %v", feed.Entries[0].Date) + } + + if len(feed.Entries[0].Enclosures) != 1 { + t.Errorf("Incorrect number of enclosures, got: %d", len(feed.Entries[0].Enclosures)) + } + + if feed.Entries[0].Enclosures[0].URL != "http://therecord.co/downloads/The-Record-sp1e1-ChrisParrish.m4a" { + t.Errorf("Incorrect enclosure URL, got: %s", feed.Entries[0].Enclosures[0].URL) + } + + if feed.Entries[0].Enclosures[0].MimeType != "audio/x-m4a" { + t.Errorf("Incorrect enclosure type, got: %s", feed.Entries[0].Enclosures[0].MimeType) + } + + if feed.Entries[0].Enclosures[0].Size != 89970236 { + t.Errorf("Incorrect enclosure length, got: %d", feed.Entries[0].Enclosures[0].Size) + } +} + +func TestParseAuthor(t *testing.T) { + data := `{ + "version": "https://jsonfeed.org/version/1", + "user_comment": "This is a microblog feed. You can add this to your feed reader using the following URL: https://example.org/feed.json", + "title": "Brent Simmons’s Microblog", + "home_page_url": "https://example.org/", + "feed_url": "https://example.org/feed.json", + "author": { + "name": "Brent Simmons", + "url": "http://example.org/", + "avatar": "https://example.org/avatar.png" + }, + "items": [ + { + "id": "2347259", + "url": "https://example.org/2347259", + "content_text": "Cats are neat. \n\nhttps://example.org/cats", + "date_published": "2016-02-09T14:22:00-07:00" + } + ] + }` + + feed, err := Parse(bytes.NewBufferString(data)) + if err != nil { + t.Error(err) + } + + if len(feed.Entries) != 1 { + t.Errorf("Incorrect number of entries, got: %d", len(feed.Entries)) + } + + if feed.Entries[0].Author != "Brent Simmons" { + t.Errorf("Incorrect entry author, got: %s", feed.Entries[0].Author) + } +} + +func TestParseFeedWithoutTitle(t *testing.T) { + data := `{ + "version": "https://jsonfeed.org/version/1", + "home_page_url": "https://example.org/", + "feed_url": "https://example.org/feed.json", + "items": [ + { + "id": "2347259", + "url": "https://example.org/2347259", + "content_text": "Cats are neat. \n\nhttps://example.org/cats", + "date_published": "2016-02-09T14:22:00-07:00" + } + ] + }` + + feed, err := Parse(bytes.NewBufferString(data)) + if err != nil { + t.Error(err) + } + + if feed.Title != "https://example.org/" { + t.Errorf("Incorrect title, got: %s", feed.Title) + } +} + +func TestParseFeedItemWithInvalidDate(t *testing.T) { + data := `{ + "version": "https://jsonfeed.org/version/1", + "title": "My Example Feed", + "home_page_url": "https://example.org/", + "feed_url": "https://example.org/feed.json", + "items": [ + { + "id": "2347259", + "url": "https://example.org/2347259", + "content_text": "Cats are neat. \n\nhttps://example.org/cats", + "date_published": "Tomorrow" + } + ] + }` + + feed, err := Parse(bytes.NewBufferString(data)) + if err != nil { + t.Error(err) + } + + if len(feed.Entries) != 1 { + t.Errorf("Incorrect number of entries, got: %d", len(feed.Entries)) + } + + if !feed.Entries[0].Date.Before(time.Now()) { + t.Errorf("Incorrect entry date, got: %v", feed.Entries[0].Date) + } +} + +func TestParseFeedItemWithoutID(t *testing.T) { + data := `{ + "version": "https://jsonfeed.org/version/1", + "title": "My Example Feed", + "home_page_url": "https://example.org/", + "feed_url": "https://example.org/feed.json", + "items": [ + { + "content_text": "Some text." + } + ] + }` + + feed, err := Parse(bytes.NewBufferString(data)) + if err != nil { + t.Error(err) + } + + if len(feed.Entries) != 1 { + t.Errorf("Incorrect number of entries, got: %d", len(feed.Entries)) + } + + if feed.Entries[0].Hash != "13b4c5aecd1b6d749afcee968fbf9c80f1ed1bbdbe1aaf25cb34ebd01144bbe9" { + t.Errorf("Incorrect entry hash, got: %s", feed.Entries[0].Hash) + } +} + +func TestParseFeedItemWithoutTitle(t *testing.T) { + data := `{ + "version": "https://jsonfeed.org/version/1", + "title": "My Example Feed", + "home_page_url": "https://example.org/", + "feed_url": "https://example.org/feed.json", + "items": [ + { + "url": "https://example.org/item" + } + ] + }` + + feed, err := Parse(bytes.NewBufferString(data)) + if err != nil { + t.Error(err) + } + + if len(feed.Entries) != 1 { + t.Errorf("Incorrect number of entries, got: %d", len(feed.Entries)) + } + + if feed.Entries[0].Title != "https://example.org/item" { + t.Errorf("Incorrect entry title, got: %s", feed.Entries[0].Title) + } +} + +func TestParseTruncateItemTitle(t *testing.T) { + data := `{ + "version": "https://jsonfeed.org/version/1", + "title": "My Example Feed", + "home_page_url": "https://example.org/", + "feed_url": "https://example.org/feed.json", + "items": [ + { + "title": "` + strings.Repeat("a", 200) + `" + } + ] + }` + + feed, err := Parse(bytes.NewBufferString(data)) + if err != nil { + t.Error(err) + } + + if len(feed.Entries) != 1 { + t.Errorf("Incorrect number of entries, got: %d", len(feed.Entries)) + } + + if len(feed.Entries[0].Title) != 103 { + t.Errorf("Incorrect entry title, got: %s", feed.Entries[0].Title) + } +} diff --git a/reader/feed/parser.go b/reader/feed/parser.go new file mode 100644 index 0000000..8df6b46 --- /dev/null +++ b/reader/feed/parser.go @@ -0,0 +1,82 @@ +// Copyright 2017 Frédéric Guillot. All rights reserved. +// Use of this source code is governed by the Apache 2.0 +// license that can be found in the LICENSE file. + +package feed + +import ( + "bytes" + "encoding/xml" + "errors" + "github.com/miniflux/miniflux2/helper" + "github.com/miniflux/miniflux2/model" + "github.com/miniflux/miniflux2/reader/feed/atom" + "github.com/miniflux/miniflux2/reader/feed/json" + "github.com/miniflux/miniflux2/reader/feed/rss" + "io" + "strings" + "time" + + "golang.org/x/net/html/charset" +) + +const ( + FormatRss = "rss" + FormatAtom = "atom" + FormatJson = "json" + FormatUnknown = "unknown" +) + +func DetectFeedFormat(data io.Reader) string { + defer helper.ExecutionTime(time.Now(), "[Feed:DetectFeedFormat]") + + var buffer bytes.Buffer + tee := io.TeeReader(data, &buffer) + + decoder := xml.NewDecoder(tee) + decoder.CharsetReader = charset.NewReaderLabel + + for { + token, _ := decoder.Token() + if token == nil { + break + } + + if element, ok := token.(xml.StartElement); ok { + switch element.Name.Local { + case "rss": + return FormatRss + case "feed": + return FormatAtom + } + } + } + + if strings.HasPrefix(strings.TrimSpace(buffer.String()), "{") { + return FormatJson + } + + return FormatUnknown +} + +func parseFeed(data io.Reader) (*model.Feed, error) { + defer helper.ExecutionTime(time.Now(), "[Feed:ParseFeed]") + + var buffer bytes.Buffer + io.Copy(&buffer, data) + + reader := bytes.NewReader(buffer.Bytes()) + format := DetectFeedFormat(reader) + reader.Seek(0, io.SeekStart) + + switch format { + case FormatAtom: + return atom.Parse(reader) + case FormatRss: + return rss.Parse(reader) + case FormatJson: + return json.Parse(reader) + default: + return nil, errors.New("Unsupported feed format") + } +} diff --git a/reader/feed/parser_test.go b/reader/feed/parser_test.go new file mode 100644 index 0000000..0dd8dd6 --- /dev/null +++ b/reader/feed/parser_test.go @@ -0,0 +1,169 @@ +// Copyright 2017 Frédéric Guillot. All rights reserved. +// Use of this source code is governed by the Apache 2.0 +// license that can be found in the LICENSE file. + +package feed + +import ( + "bytes" + "testing" +) + +func TestDetectRSS(t *testing.T) { + data := `<?xml version="1.0"?><rss version="2.0"><channel></channel></rss>` + format := DetectFeedFormat(bytes.NewBufferString(data)) + + if format != FormatRss { + t.Errorf("Wrong format detected: %s instead of %s", format, FormatRss) + } +} + +func TestDetectAtom(t *testing.T) { + data := `<?xml version="1.0" encoding="utf-8"?><feed xmlns="http://www.w3.org/2005/Atom"></feed>` + format := DetectFeedFormat(bytes.NewBufferString(data)) + + if format != FormatAtom { + t.Errorf("Wrong format detected: %s instead of %s", format, FormatAtom) + } +} + +func TestDetectAtomWithISOCharset(t *testing.T) { + data := `<?xml version="1.0" encoding="ISO-8859-15"?><feed xmlns="http://www.w3.org/2005/Atom"></feed>` + format := DetectFeedFormat(bytes.NewBufferString(data)) + + if format != FormatAtom { + t.Errorf("Wrong format detected: %s instead of %s", format, FormatAtom) + } +} + +func TestDetectJSON(t *testing.T) { + data := ` + { + "version" : "https://jsonfeed.org/version/1", + "title" : "Example" + } + ` + format := DetectFeedFormat(bytes.NewBufferString(data)) + + if format != FormatJson { + t.Errorf("Wrong format detected: %s instead of %s", format, FormatJson) + } +} + +func TestDetectUnknown(t *testing.T) { + data := ` + <!DOCTYPE html> <html> </html> + ` + format := DetectFeedFormat(bytes.NewBufferString(data)) + + if format != FormatUnknown { + t.Errorf("Wrong format detected: %s instead of %s", format, FormatUnknown) + } +} + +func TestParseAtom(t *testing.T) { + data := `<?xml version="1.0" encoding="utf-8"?> + <feed xmlns="http://www.w3.org/2005/Atom"> + + <title>Example Feed</title> + <link href="http://example.org/"/> + <updated>2003-12-13T18:30:02Z</updated> + <author> + <name>John Doe</name> + </author> + <id>urn:uuid:60a76c80-d399-11d9-b93C-0003939e0af6</id> + + <entry> + <title>Atom-Powered Robots Run Amok</title> + <link href="http://example.org/2003/12/13/atom03"/> + <id>urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a</id> + <updated>2003-12-13T18:30:02Z</updated> + <summary>Some text.</summary> + </entry> + + </feed>` + + feed, err := parseFeed(bytes.NewBufferString(data)) + if err != nil { + t.Error(err) + } + + if feed.Title != "Example Feed" { + t.Errorf("Incorrect title, got: %s", feed.Title) + } +} + +func TestParseRss(t *testing.T) { + data := `<?xml version="1.0"?> + <rss version="2.0"> + <channel> + <title>Liftoff News</title> + <link>http://liftoff.msfc.nasa.gov/</link> + <item> + <title>Star City</title> + <link>http://liftoff.msfc.nasa.gov/news/2003/news-starcity.asp</link> + <description>How do Americans get ready to work with Russians aboard the International Space Station? They take a crash course in culture, language and protocol at Russia's <a href="http://howe.iki.rssi.ru/GCTC/gctc_e.htm">Star City</a>.</description> + <pubDate>Tue, 03 Jun 2003 09:39:21 GMT</pubDate> + <guid>http://liftoff.msfc.nasa.gov/2003/06/03.html#item573</guid> + </item> + </channel> + </rss>` + + feed, err := parseFeed(bytes.NewBufferString(data)) + if err != nil { + t.Error(err) + } + + if feed.Title != "Liftoff News" { + t.Errorf("Incorrect title, got: %s", feed.Title) + } +} + +func TestParseJson(t *testing.T) { + data := `{ + "version": "https://jsonfeed.org/version/1", + "title": "My Example Feed", + "home_page_url": "https://example.org/", + "feed_url": "https://example.org/feed.json", + "items": [ + { + "id": "2", + "content_text": "This is a second item.", + "url": "https://example.org/second-item" + }, + { + "id": "1", + "content_html": "<p>Hello, world!</p>", + "url": "https://example.org/initial-post" + } + ] + }` + + feed, err := parseFeed(bytes.NewBufferString(data)) + if err != nil { + t.Error(err) + } + + if feed.Title != "My Example Feed" { + t.Errorf("Incorrect title, got: %s", feed.Title) + } +} + +func TestParseUnknownFeed(t *testing.T) { + data := ` + <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"> + <html xmlns="http://www.w3.org/1999/xhtml"> + <head> + <title>Title of document</title> + </head> + <body> + some content + </body> + </html> + ` + + _, err := parseFeed(bytes.NewBufferString(data)) + if err == nil { + t.Error("ParseFeed must returns an error") + } +} diff --git a/reader/feed/rss/parser.go b/reader/feed/rss/parser.go new file mode 100644 index 0000000..52310e4 --- /dev/null +++ b/reader/feed/rss/parser.go @@ -0,0 +1,28 @@ +// Copyright 2017 Frédéric Guillot. All rights reserved. +// Use of this source code is governed by the Apache 2.0 +// license that can be found in the LICENSE file. + +package rss + +import ( + "encoding/xml" + "fmt" + "github.com/miniflux/miniflux2/model" + "io" + + "golang.org/x/net/html/charset" +) + +// Parse returns a normalized feed struct. +func Parse(data io.Reader) (*model.Feed, error) { + rssFeed := new(RssFeed) + decoder := xml.NewDecoder(data) + decoder.CharsetReader = charset.NewReaderLabel + + err := decoder.Decode(rssFeed) + if err != nil { + return nil, fmt.Errorf("Unable to parse RSS feed: %v", err) + } + + return rssFeed.Transform(), nil +} diff --git a/reader/feed/rss/parser_test.go b/reader/feed/rss/parser_test.go new file mode 100644 index 0000000..42dc965 --- /dev/null +++ b/reader/feed/rss/parser_test.go @@ -0,0 +1,466 @@ +// Copyright 2017 Frédéric Guillot. All rights reserved. +// Use of this source code is governed by the Apache 2.0 +// license that can be found in the LICENSE file. + +package rss + +import ( + "bytes" + "testing" + "time" +) + +func TestParseRss2Sample(t *testing.T) { + data := ` + <?xml version="1.0"?> + <rss version="2.0"> + <channel> + <title>Liftoff News</title> + <link>http://liftoff.msfc.nasa.gov/</link> + <description>Liftoff to Space Exploration.</description> + <language>en-us</language> + <pubDate>Tue, 10 Jun 2003 04:00:00 GMT</pubDate> + <lastBuildDate>Tue, 10 Jun 2003 09:41:01 GMT</lastBuildDate> + <docs>http://blogs.law.harvard.edu/tech/rss</docs> + <generator>Weblog Editor 2.0</generator> + <managingEditor>editor@example.com</managingEditor> + <webMaster>webmaster@example.com</webMaster> + <item> + <title>Star City</title> + <link>http://liftoff.msfc.nasa.gov/news/2003/news-starcity.asp</link> + <description>How do Americans get ready to work with Russians aboard the International Space Station? They take a crash course in culture, language and protocol at Russia's <a href="http://howe.iki.rssi.ru/GCTC/gctc_e.htm">Star City</a>.</description> + <pubDate>Tue, 03 Jun 2003 09:39:21 GMT</pubDate> + <guid>http://liftoff.msfc.nasa.gov/2003/06/03.html#item573</guid> + </item> + <item> + <description>Sky watchers in Europe, Asia, and parts of Alaska and Canada will experience a <a href="http://science.nasa.gov/headlines/y2003/30may_solareclipse.htm">partial eclipse of the Sun</a> on Saturday, May 31st.</description> + <pubDate>Fri, 30 May 2003 11:06:42 GMT</pubDate> + <guid>http://liftoff.msfc.nasa.gov/2003/05/30.html#item572</guid> + </item> + <item> + <title>The Engine That Does More</title> + <link>http://liftoff.msfc.nasa.gov/news/2003/news-VASIMR.asp</link> + <description>Before man travels to Mars, NASA hopes to design new engines that will let us fly through the Solar System more quickly. The proposed VASIMR engine would do that.</description> + <pubDate>Tue, 27 May 2003 08:37:32 GMT</pubDate> + <guid>http://liftoff.msfc.nasa.gov/2003/05/27.html#item571</guid> + </item> + <item> + <title>Astronauts' Dirty Laundry</title> + <link>http://liftoff.msfc.nasa.gov/news/2003/news-laundry.asp</link> + <description>Compared to earlier spacecraft, the International Space Station has many luxuries, but laundry facilities are not one of them. Instead, astronauts have other options.</description> + <pubDate>Tue, 20 May 2003 08:56:02 GMT</pubDate> + <guid>http://liftoff.msfc.nasa.gov/2003/05/20.html#item570</guid> + </item> + </channel> + </rss>` + + feed, err := Parse(bytes.NewBufferString(data)) + if err != nil { + t.Error(err) + } + + if feed.Title != "Liftoff News" { + t.Errorf("Incorrect title, got: %s", feed.Title) + } + + if feed.FeedURL != "" { + t.Errorf("Incorrect feed URL, got: %s", feed.FeedURL) + } + + if feed.SiteURL != "http://liftoff.msfc.nasa.gov/" { + t.Errorf("Incorrect site URL, got: %s", feed.SiteURL) + } + + if len(feed.Entries) != 4 { + t.Errorf("Incorrect number of entries, got: %d", len(feed.Entries)) + } + + expectedDate := time.Date(2003, time.June, 3, 9, 39, 21, 0, time.UTC) + if !feed.Entries[0].Date.Equal(expectedDate) { + t.Errorf("Incorrect entry date, got: %v, want: %v", feed.Entries[0].Date, expectedDate) + } + + if feed.Entries[0].Hash != "5b2b4ac2fe1786ddf0fd2da2f1b07f64e691264f41f2db3ea360f31bb6d9152b" { + t.Errorf("Incorrect entry hash, got: %s", feed.Entries[0].Hash) + } + + if feed.Entries[0].URL != "http://liftoff.msfc.nasa.gov/news/2003/news-starcity.asp" { + t.Errorf("Incorrect entry URL, got: %s", feed.Entries[0].URL) + } + + if feed.Entries[0].Title != "Star City" { + t.Errorf("Incorrect entry title, got: %s", feed.Entries[0].Title) + } + + if feed.Entries[0].Content != `How do Americans get ready to work with Russians aboard the International Space Station? They take a crash course in culture, language and protocol at Russia's <a href="http://howe.iki.rssi.ru/GCTC/gctc_e.htm" rel="noopener noreferrer" target="_blank" referrerpolicy="no-referrer">Star City</a>.` { + t.Errorf("Incorrect entry content, got: %s", feed.Entries[0].Content) + } +} + +func TestParseFeedWithoutTitle(t *testing.T) { + data := `<?xml version="1.0" encoding="utf-8"?> + <rss version="2.0"> + <channel> + <link>https://example.org/</link> + </channel> + </rss>` + + feed, err := Parse(bytes.NewBufferString(data)) + if err != nil { + t.Error(err) + } + + if feed.Title != "https://example.org/" { + t.Errorf("Incorrect feed title, got: %s", feed.Title) + } +} + +func TestParseEntryWithoutTitle(t *testing.T) { + data := `<?xml version="1.0" encoding="utf-8"?> + <rss version="2.0"> + <channel> + <link>https://example.org/</link> + <item> + <link>https://example.org/item</link> + </item> + </channel> + </rss>` + + feed, err := Parse(bytes.NewBufferString(data)) + if err != nil { + t.Error(err) + } + + if feed.Entries[0].Title != "https://example.org/item" { + t.Errorf("Incorrect entry title, got: %s", feed.Entries[0].Title) + } +} + +func TestParseFeedURLWithAtomLink(t *testing.T) { + data := `<?xml version="1.0" encoding="utf-8"?> + <rss xmlns:atom="http://www.w3.org/2005/Atom" version="2.0"> + <channel> + <title>Example</title> + <link>https://example.org/</link> + <atom:link href="https://example.org/rss" type="application/rss+xml" rel="self"></atom:link> + </channel> + </rss>` + + feed, err := Parse(bytes.NewBufferString(data)) + if err != nil { + t.Error(err) + } + + if feed.FeedURL != "https://example.org/rss" { + t.Errorf("Incorrect feed URL, got: %s", feed.FeedURL) + } + + if feed.SiteURL != "https://example.org/" { + t.Errorf("Incorrect site URL, got: %s", feed.SiteURL) + } +} + +func TestParseEntryWithAtomAuthor(t *testing.T) { + data := `<?xml version="1.0" encoding="utf-8"?> + <rss xmlns:atom="http://www.w3.org/2005/Atom" version="2.0"> + <channel> + <title>Example</title> + <link>https://example.org/</link> + <atom:link href="https://example.org/rss" type="application/rss+xml" rel="self"></atom:link> + <item> + <title>Test</title> + <link>https://example.org/item</link> + <author xmlns:author="http://www.w3.org/2005/Atom"> + <name>Foo Bar</name> + <title>Vice President</title> + <department/> + <company>FooBar Inc.</company> + </author> + </item> + </channel> + </rss>` + + feed, err := Parse(bytes.NewBufferString(data)) + if err != nil { + t.Error(err) + } + + if feed.Entries[0].Author != "Foo Bar" { + t.Errorf("Incorrect entry author, got: %s", feed.Entries[0].Author) + } +} + +func TestParseEntryWithDublinCoreAuthor(t *testing.T) { + data := `<?xml version="1.0" encoding="utf-8"?> + <rss version="2.0" xmlns:dc="http://purl.org/dc/elements/1.1/"> + <channel> + <title>Example</title> + <link>https://example.org/</link> + <item> + <title>Test</title> + <link>https://example.org/item</link> + <dc:creator>Me (me@example.com)</dc:creator> + </item> + </channel> + </rss>` + + feed, err := Parse(bytes.NewBufferString(data)) + if err != nil { + t.Error(err) + } + + if feed.Entries[0].Author != "Me (me@example.com)" { + t.Errorf("Incorrect entry author, got: %s", feed.Entries[0].Author) + } +} + +func TestParseEntryWithItunesAuthor(t *testing.T) { + data := `<?xml version="1.0" encoding="utf-8"?> + <rss version="2.0" xmlns:itunes="http://www.itunes.com/dtds/podcast-1.0.dtd"> + <channel> + <title>Example</title> + <link>https://example.org/</link> + <item> + <title>Test</title> + <link>https://example.org/item</link> + <itunes:author>Someone</itunes:author> + </item> + </channel> + </rss>` + + feed, err := Parse(bytes.NewBufferString(data)) + if err != nil { + t.Error(err) + } + + if feed.Entries[0].Author != "Someone" { + t.Errorf("Incorrect entry author, got: %s", feed.Entries[0].Author) + } +} + +func TestParseFeedWithItunesAuthor(t *testing.T) { + data := `<?xml version="1.0" encoding="utf-8"?> + <rss version="2.0" xmlns:itunes="http://www.itunes.com/dtds/podcast-1.0.dtd"> + <channel> + <title>Example</title> + <link>https://example.org/</link> + <itunes:author>Someone</itunes:author> + <item> + <title>Test</title> + <link>https://example.org/item</link> + </item> + </channel> + </rss>` + + feed, err := Parse(bytes.NewBufferString(data)) + if err != nil { + t.Error(err) + } + + if feed.Entries[0].Author != "Someone" { + t.Errorf("Incorrect entry author, got: %s", feed.Entries[0].Author) + } +} + +func TestParseEntryWithDublinCoreDate(t *testing.T) { + data := `<?xml version="1.0" encoding="utf-8"?> + <rss version="2.0" xmlns:dc="http://purl.org/dc/elements/1.1/"> + <channel> + <title>Example</title> + <link>http://example.org/</link> + <item> + <title>Item 1</title> + <link>http://example.org/item1</link> + <description>Description.</description> + <guid isPermaLink="false">UUID</guid> + <dc:date>2002-09-29T23:40:06-05:00</dc:date> + </item> + </channel> + </rss>` + + feed, err := Parse(bytes.NewBufferString(data)) + if err != nil { + t.Error(err) + } + + location, _ := time.LoadLocation("EST") + expectedDate := time.Date(2002, time.September, 29, 23, 40, 06, 0, location) + if !feed.Entries[0].Date.Equal(expectedDate) { + t.Errorf("Incorrect entry date, got: %v, want: %v", feed.Entries[0].Date, expectedDate) + } +} + +func TestParseEntryWithContentEncoded(t *testing.T) { + data := `<?xml version="1.0" encoding="utf-8"?> + <rss version="2.0" xmlns:content="http://purl.org/rss/1.0/modules/content/"> + <channel> + <title>Example</title> + <link>http://example.org/</link> + <item> + <title>Item 1</title> + <link>http://example.org/item1</link> + <description>Description.</description> + <guid isPermaLink="false">UUID</guid> + <content:encoded><![CDATA[<p><a href="http://www.example.org/">Example</a>.</p>]]></content:encoded> + </item> + </channel> + </rss>` + + feed, err := Parse(bytes.NewBufferString(data)) + if err != nil { + t.Error(err) + } + + if feed.Entries[0].Content != `<p><a href="http://www.example.org/" rel="noopener noreferrer" target="_blank" referrerpolicy="no-referrer">Example</a>.</p>` { + t.Errorf("Incorrect entry content, got: %s", feed.Entries[0].Content) + } +} + +func TestParseEntryWithFeedBurnerLink(t *testing.T) { + data := `<?xml version="1.0" encoding="utf-8"?> + <rss version="2.0" xmlns:feedburner="http://rssnamespace.org/feedburner/ext/1.0"> + <channel> + <title>Example</title> + <link>http://example.org/</link> + <item> + <title>Item 1</title> + <link>http://example.org/item1</link> + <feedburner:origLink>http://example.org/original</feedburner:origLink> + </item> + </channel> + </rss>` + + feed, err := Parse(bytes.NewBufferString(data)) + if err != nil { + t.Error(err) + } + + if feed.Entries[0].URL != "http://example.org/original" { + t.Errorf("Incorrect entry content, got: %s", feed.Entries[0].URL) + } +} + +func TestParseEntryTitleWithWhitespaces(t *testing.T) { + data := `<?xml version="1.0" encoding="utf-8"?> + <rss version="2.0"> + <channel> + <title>Example</title> + <link>http://example.org</link> + <item> + <title> + Some Title + </title> + <link>http://www.example.org/entries/1</link> + <pubDate>Fri, 15 Jul 2005 00:00:00 -0500</pubDate> + </item> + </channel> + </rss>` + + feed, err := Parse(bytes.NewBufferString(data)) + if err != nil { + t.Error(err) + } + + if feed.Entries[0].Title != "Some Title" { + t.Errorf("Incorrect entry title, got: %s", feed.Entries[0].Title) + } +} + +func TestParseEntryWithEnclosures(t *testing.T) { + data := `<?xml version="1.0" encoding="utf-8"?> + <rss version="2.0"> + <channel> + <title>My Podcast Feed</title> + <link>http://example.org</link> + <author>some.email@example.org</author> + <item> + <title>Podcasting with RSS</title> + <link>http://www.example.org/entries/1</link> + <description>An overview of RSS podcasting</description> + <pubDate>Fri, 15 Jul 2005 00:00:00 -0500</pubDate> + <guid isPermaLink="true">http://www.example.org/entries/1</guid> + <enclosure url="http://www.example.org/myaudiofile.mp3" + length="12345" + type="audio/mpeg" /> + </item> + </channel> + </rss>` + + feed, err := Parse(bytes.NewBufferString(data)) + if err != nil { + t.Error(err) + } + + if len(feed.Entries) != 1 { + t.Errorf("Incorrect number of entries, got: %d", len(feed.Entries)) + } + + if feed.Entries[0].URL != "http://www.example.org/entries/1" { + t.Errorf("Incorrect entry URL, got: %s", feed.Entries[0].URL) + } + + if len(feed.Entries[0].Enclosures) != 1 { + t.Errorf("Incorrect number of enclosures, got: %d", len(feed.Entries[0].Enclosures)) + } + + if feed.Entries[0].Enclosures[0].URL != "http://www.example.org/myaudiofile.mp3" { + t.Errorf("Incorrect enclosure URL, got: %s", feed.Entries[0].Enclosures[0].URL) + } + + if feed.Entries[0].Enclosures[0].MimeType != "audio/mpeg" { + t.Errorf("Incorrect enclosure type, got: %s", feed.Entries[0].Enclosures[0].MimeType) + } + + if feed.Entries[0].Enclosures[0].Size != 12345 { + t.Errorf("Incorrect enclosure length, got: %d", feed.Entries[0].Enclosures[0].Size) + } +} + +func TestParseEntryWithFeedBurnerEnclosures(t *testing.T) { + data := `<?xml version="1.0" encoding="utf-8"?> + <rss version="2.0" xmlns:feedburner="http://rssnamespace.org/feedburner/ext/1.0"> + <channel> + <title>My Example Feed</title> + <link>http://example.org</link> + <author>some.email@example.org</author> + <item> + <title>Example Item</title> + <link>http://www.example.org/entries/1</link> + <enclosure + url="http://feedproxy.google.com/~r/example/~5/lpMyFSCvubs/File.mp3" + length="76192460" + type="audio/mpeg" /> + <feedburner:origEnclosureLink>http://example.org/67ca416c-f22a-4228-a681-68fc9998ec10/File.mp3</feedburner:origEnclosureLink> + </item> + </channel> + </rss>` + + feed, err := Parse(bytes.NewBufferString(data)) + if err != nil { + t.Error(err) + } + + if len(feed.Entries) != 1 { + t.Errorf("Incorrect number of entries, got: %d", len(feed.Entries)) + } + + if feed.Entries[0].URL != "http://www.example.org/entries/1" { + t.Errorf("Incorrect entry URL, got: %s", feed.Entries[0].URL) + } + + if len(feed.Entries[0].Enclosures) != 1 { + t.Errorf("Incorrect number of enclosures, got: %d", len(feed.Entries[0].Enclosures)) + } + + if feed.Entries[0].Enclosures[0].URL != "http://example.org/67ca416c-f22a-4228-a681-68fc9998ec10/File.mp3" { + t.Errorf("Incorrect enclosure URL, got: %s", feed.Entries[0].Enclosures[0].URL) + } + + if feed.Entries[0].Enclosures[0].MimeType != "audio/mpeg" { + t.Errorf("Incorrect enclosure type, got: %s", feed.Entries[0].Enclosures[0].MimeType) + } + + if feed.Entries[0].Enclosures[0].Size != 76192460 { + t.Errorf("Incorrect enclosure length, got: %d", feed.Entries[0].Enclosures[0].Size) + } +} diff --git a/reader/feed/rss/rss.go b/reader/feed/rss/rss.go new file mode 100644 index 0000000..d88ed6b --- /dev/null +++ b/reader/feed/rss/rss.go @@ -0,0 +1,207 @@ +// Copyright 2017 Frédéric Guillot. All rights reserved. +// Use of this source code is governed by the Apache 2.0 +// license that can be found in the LICENSE file. + +package rss + +import ( + "encoding/xml" + "github.com/miniflux/miniflux2/helper" + "github.com/miniflux/miniflux2/model" + "github.com/miniflux/miniflux2/reader/feed/date" + "github.com/miniflux/miniflux2/reader/processor" + "github.com/miniflux/miniflux2/reader/sanitizer" + "log" + "path" + "strconv" + "strings" + "time" +) + +type RssLink struct { + XMLName xml.Name + Data string `xml:",chardata"` + Href string `xml:"href,attr"` +} + +type RssFeed struct { + XMLName xml.Name `xml:"rss"` + Version string `xml:"version,attr"` + Title string `xml:"channel>title"` + Links []RssLink `xml:"channel>link"` + Language string `xml:"channel>language"` + Description string `xml:"channel>description"` + PubDate string `xml:"channel>pubDate"` + ItunesAuthor string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd channel>author"` + Items []RssItem `xml:"channel>item"` +} + +type RssItem struct { + Guid string `xml:"guid"` + Title string `xml:"title"` + Link string `xml:"link"` + OriginalLink string `xml:"http://rssnamespace.org/feedburner/ext/1.0 origLink"` + Description string `xml:"description"` + Content string `xml:"http://purl.org/rss/1.0/modules/content/ encoded"` + PubDate string `xml:"pubDate"` + Date string `xml:"http://purl.org/dc/elements/1.1/ date"` + Authors []RssAuthor `xml:"author"` + Creator string `xml:"http://purl.org/dc/elements/1.1/ creator"` + Enclosures []RssEnclosure `xml:"enclosure"` + OrigEnclosureLink string `xml:"http://rssnamespace.org/feedburner/ext/1.0 origEnclosureLink"` +} + +type RssAuthor struct { + XMLName xml.Name + Data string `xml:",chardata"` + Name string `xml:"name"` +} + +type RssEnclosure struct { + Url string `xml:"url,attr"` + Type string `xml:"type,attr"` + Length string `xml:"length,attr"` +} + +func (r *RssFeed) GetSiteURL() string { + for _, elem := range r.Links { + if elem.XMLName.Space == "" { + return elem.Data + } + } + + return "" +} + +func (r *RssFeed) GetFeedURL() string { + for _, elem := range r.Links { + if elem.XMLName.Space == "http://www.w3.org/2005/Atom" { + return elem.Href + } + } + + return "" +} + +func (r *RssFeed) Transform() *model.Feed { + feed := new(model.Feed) + feed.SiteURL = r.GetSiteURL() + feed.FeedURL = r.GetFeedURL() + feed.Title = sanitizer.StripTags(r.Title) + + if feed.Title == "" { + feed.Title = feed.SiteURL + } + + for _, item := range r.Items { + entry := item.Transform() + + if entry.Author == "" && r.ItunesAuthor != "" { + entry.Author = r.ItunesAuthor + } + entry.Author = sanitizer.StripTags(entry.Author) + + feed.Entries = append(feed.Entries, entry) + } + + return feed +} +func (i *RssItem) GetDate() time.Time { + value := i.PubDate + if i.Date != "" { + value = i.Date + } + + if value != "" { + result, err := date.Parse(value) + if err != nil { + log.Println(err) + return time.Now() + } + + return result + } + + return time.Now() +} + +func (i *RssItem) GetAuthor() string { + for _, element := range i.Authors { + if element.Name != "" { + return element.Name + } + + if element.Data != "" { + return element.Data + } + } + + return i.Creator +} + +func (i *RssItem) GetHash() string { + for _, value := range []string{i.Guid, i.Link} { + if value != "" { + return helper.Hash(value) + } + } + + return "" +} + +func (i *RssItem) GetContent() string { + if i.Content != "" { + return i.Content + } + + return i.Description +} + +func (i *RssItem) GetURL() string { + if i.OriginalLink != "" { + return i.OriginalLink + } + + return i.Link +} + +func (i *RssItem) GetEnclosures() model.EnclosureList { + enclosures := make(model.EnclosureList, 0) + + for _, enclosure := range i.Enclosures { + length, _ := strconv.Atoi(enclosure.Length) + enclosureURL := enclosure.Url + + if i.OrigEnclosureLink != "" { + filename := path.Base(i.OrigEnclosureLink) + if strings.Contains(enclosureURL, filename) { + enclosureURL = i.OrigEnclosureLink + } + } + + enclosures = append(enclosures, &model.Enclosure{ + URL: enclosureURL, + MimeType: enclosure.Type, + Size: length, + }) + } + + return enclosures +} + +func (i *RssItem) Transform() *model.Entry { + entry := new(model.Entry) + entry.URL = i.GetURL() + entry.Date = i.GetDate() + entry.Author = i.GetAuthor() + entry.Hash = i.GetHash() + entry.Content = processor.ItemContentProcessor(entry.URL, i.GetContent()) + entry.Title = sanitizer.StripTags(strings.Trim(i.Title, " \n\t")) + entry.Enclosures = i.GetEnclosures() + + if entry.Title == "" { + entry.Title = entry.URL + } + + return entry +} diff --git a/reader/http/client.go b/reader/http/client.go new file mode 100644 index 0000000..745ff0d --- /dev/null +++ b/reader/http/client.go @@ -0,0 +1,95 @@ +// Copyright 2017 Frédéric Guillot. All rights reserved. +// Use of this source code is governed by the Apache 2.0 +// license that can be found in the LICENSE file. + +package http + +import ( + "crypto/tls" + "fmt" + "github.com/miniflux/miniflux2/helper" + "log" + "net/http" + "net/url" + "time" +) + +const HTTP_USER_AGENT = "Miniflux <https://miniflux.net/>" + +type HttpClient struct { + url string + etagHeader string + lastModifiedHeader string + Insecure bool +} + +func (h *HttpClient) Get() (*ServerResponse, error) { + defer helper.ExecutionTime(time.Now(), fmt.Sprintf("[HttpClient:Get] url=%s", h.url)) + u, _ := url.Parse(h.url) + + req := &http.Request{ + URL: u, + Method: "GET", + Header: h.buildHeaders(), + } + + client := h.buildClient() + resp, err := client.Do(req) + if err != nil { + return nil, err + } + + response := &ServerResponse{ + Body: resp.Body, + StatusCode: resp.StatusCode, + EffectiveURL: resp.Request.URL.String(), + LastModified: resp.Header.Get("Last-Modified"), + ETag: resp.Header.Get("ETag"), + ContentType: resp.Header.Get("Content-Type"), + } + + log.Println("[HttpClient:Get]", + "OriginalURL:", h.url, + "StatusCode:", response.StatusCode, + "ETag:", response.ETag, + "LastModified:", response.LastModified, + "EffectiveURL:", response.EffectiveURL, + ) + + return response, err +} + +func (h *HttpClient) buildClient() http.Client { + if h.Insecure { + transport := &http.Transport{ + TLSClientConfig: &tls.Config{InsecureSkipVerify: true}, + } + + return http.Client{Transport: transport} + } + + return http.Client{} +} + +func (h *HttpClient) buildHeaders() http.Header { + headers := make(http.Header) + headers.Add("User-Agent", HTTP_USER_AGENT) + + if h.etagHeader != "" { + headers.Add("If-None-Match", h.etagHeader) + } + + if h.lastModifiedHeader != "" { + headers.Add("If-Modified-Since", h.lastModifiedHeader) + } + + return headers +} + +func NewHttpClient(url string) *HttpClient { + return &HttpClient{url: url, Insecure: false} +} + +func NewHttpClientWithCacheHeaders(url, etagHeader, lastModifiedHeader string) *HttpClient { + return &HttpClient{url: url, etagHeader: etagHeader, lastModifiedHeader: lastModifiedHeader, Insecure: false} +} diff --git a/reader/http/response.go b/reader/http/response.go new file mode 100644 index 0000000..49e9f19 --- /dev/null +++ b/reader/http/response.go @@ -0,0 +1,32 @@ +// Copyright 2017 Frédéric Guillot. All rights reserved. +// Use of this source code is governed by the Apache 2.0 +// license that can be found in the LICENSE file. + +package http + +import "io" + +type ServerResponse struct { + Body io.Reader + StatusCode int + EffectiveURL string + LastModified string + ETag string + ContentType string +} + +func (s *ServerResponse) HasServerFailure() bool { + return s.StatusCode >= 400 +} + +func (s *ServerResponse) IsModified(etag, lastModified string) bool { + if s.StatusCode == 304 { + return false + } + + if s.ETag != "" && s.LastModified != "" && (s.ETag == etag || s.LastModified == lastModified) { + return false + } + + return true +} diff --git a/reader/icon/finder.go b/reader/icon/finder.go new file mode 100644 index 0000000..54d509f --- /dev/null +++ b/reader/icon/finder.go @@ -0,0 +1,109 @@ +// Copyright 2017 Frédéric Guillot. All rights reserved. +// Use of this source code is governed by the Apache 2.0 +// license that can be found in the LICENSE file. + +package icon + +import ( + "fmt" + "github.com/miniflux/miniflux2/helper" + "github.com/miniflux/miniflux2/model" + "github.com/miniflux/miniflux2/reader/http" + "github.com/miniflux/miniflux2/reader/url" + "io" + "io/ioutil" + "log" + + "github.com/PuerkitoBio/goquery" +) + +// FindIcon try to find the website's icon. +func FindIcon(websiteURL string) (*model.Icon, error) { + rootURL := url.GetRootURL(websiteURL) + client := http.NewHttpClient(rootURL) + response, err := client.Get() + if err != nil { + return nil, fmt.Errorf("unable to download website index page: %v", err) + } + + if response.HasServerFailure() { + return nil, fmt.Errorf("unable to download website index page: status=%d", response.StatusCode) + } + + iconURL, err := parseDocument(rootURL, response.Body) + if err != nil { + return nil, err + } + + log.Println("[FindIcon] Fetching icon =>", iconURL) + icon, err := downloadIcon(iconURL) + if err != nil { + return nil, err + } + + return icon, nil +} + +func parseDocument(websiteURL string, data io.Reader) (string, error) { + queries := []string{ + "link[rel='shortcut icon']", + "link[rel='Shortcut Icon']", + "link[rel='icon shortcut']", + "link[rel='icon']", + } + + doc, err := goquery.NewDocumentFromReader(data) + if err != nil { + return "", fmt.Errorf("unable to read document: %v", err) + } + + var iconURL string + for _, query := range queries { + doc.Find(query).Each(func(i int, s *goquery.Selection) { + if href, exists := s.Attr("href"); exists { + iconURL = href + } + }) + + if iconURL != "" { + break + } + } + + if iconURL == "" { + iconURL = url.GetRootURL(websiteURL) + "favicon.ico" + } else { + iconURL, _ = url.GetAbsoluteURL(websiteURL, iconURL) + } + + return iconURL, nil +} + +func downloadIcon(iconURL string) (*model.Icon, error) { + client := http.NewHttpClient(iconURL) + response, err := client.Get() + if err != nil { + return nil, fmt.Errorf("unable to download iconURL: %v", err) + } + + if response.HasServerFailure() { + return nil, fmt.Errorf("unable to download icon: status=%d", response.StatusCode) + } + + body, err := ioutil.ReadAll(response.Body) + if err != nil { + return nil, fmt.Errorf("unable to read downloaded icon: %v", err) + } + + if len(body) == 0 { + return nil, fmt.Errorf("downloaded icon is empty, iconURL=%s", iconURL) + } + + icon := &model.Icon{ + Hash: helper.HashFromBytes(body), + MimeType: response.ContentType, + Content: body, + } + + return icon, nil +} diff --git a/reader/opml/handler.go b/reader/opml/handler.go new file mode 100644 index 0000000..6150d91 --- /dev/null +++ b/reader/opml/handler.go @@ -0,0 +1,94 @@ +// Copyright 2017 Frédéric Guillot. All rights reserved. +// Use of this source code is governed by the Apache 2.0 +// license that can be found in the LICENSE file. + +package opml + +import ( + "errors" + "fmt" + "github.com/miniflux/miniflux2/model" + "github.com/miniflux/miniflux2/storage" + "io" + "log" +) + +type OpmlHandler struct { + store *storage.Storage +} + +func (o *OpmlHandler) Export(userID int64) (string, error) { + feeds, err := o.store.GetFeeds(userID) + if err != nil { + log.Println(err) + return "", errors.New("Unable to fetch feeds.") + } + + var subscriptions SubcriptionList + for _, feed := range feeds { + subscriptions = append(subscriptions, &Subcription{ + Title: feed.Title, + FeedURL: feed.FeedURL, + SiteURL: feed.SiteURL, + CategoryName: feed.Category.Title, + }) + } + + return Serialize(subscriptions), nil +} + +func (o *OpmlHandler) Import(userID int64, data io.Reader) (err error) { + subscriptions, err := Parse(data) + if err != nil { + return err + } + + for _, subscription := range subscriptions { + if !o.store.FeedURLExists(userID, subscription.FeedURL) { + var category *model.Category + + if subscription.CategoryName == "" { + category, err = o.store.GetFirstCategory(userID) + if err != nil { + log.Println(err) + return errors.New("Unable to find first category.") + } + } else { + category, err = o.store.GetCategoryByTitle(userID, subscription.CategoryName) + if err != nil { + log.Println(err) + return errors.New("Unable to search category by title.") + } + + if category == nil { + category = &model.Category{ + UserID: userID, + Title: subscription.CategoryName, + } + + err := o.store.CreateCategory(category) + if err != nil { + log.Println(err) + return fmt.Errorf(`Unable to create this category: "%s".`, subscription.CategoryName) + } + } + } + + feed := &model.Feed{ + UserID: userID, + Title: subscription.Title, + FeedURL: subscription.FeedURL, + SiteURL: subscription.SiteURL, + Category: category, + } + + o.store.CreateFeed(feed) + } + } + + return nil +} + +func NewOpmlHandler(store *storage.Storage) *OpmlHandler { + return &OpmlHandler{store: store} +} diff --git a/reader/opml/opml.go b/reader/opml/opml.go new file mode 100644 index 0000000..d5278a7 --- /dev/null +++ b/reader/opml/opml.go @@ -0,0 +1,82 @@ +// Copyright 2017 Frédéric Guillot. All rights reserved. +// Use of this source code is governed by the Apache 2.0 +// license that can be found in the LICENSE file. + +package opml + +import "encoding/xml" + +type Opml struct { + XMLName xml.Name `xml:"opml"` + Version string `xml:"version,attr"` + Outlines []Outline `xml:"body>outline"` +} + +type Outline struct { + Title string `xml:"title,attr,omitempty"` + Text string `xml:"text,attr"` + FeedURL string `xml:"xmlUrl,attr,omitempty"` + SiteURL string `xml:"htmlUrl,attr,omitempty"` + Outlines []Outline `xml:"outline,omitempty"` +} + +func (o *Outline) GetTitle() string { + if o.Title != "" { + return o.Title + } + + if o.Text != "" { + return o.Text + } + + if o.SiteURL != "" { + return o.SiteURL + } + + if o.FeedURL != "" { + return o.FeedURL + } + + return "" +} + +func (o *Outline) GetSiteURL() string { + if o.SiteURL != "" { + return o.SiteURL + } + + return o.FeedURL +} + +func (o *Outline) IsCategory() bool { + return o.Text != "" && o.SiteURL == "" && o.FeedURL == "" +} + +func (o *Outline) Append(subscriptions SubcriptionList, category string) SubcriptionList { + if o.FeedURL != "" { + subscriptions = append(subscriptions, &Subcription{ + Title: o.GetTitle(), + FeedURL: o.FeedURL, + SiteURL: o.GetSiteURL(), + CategoryName: category, + }) + } + + return subscriptions +} + +func (o *Opml) Transform() SubcriptionList { + var subscriptions SubcriptionList + + for _, outline := range o.Outlines { + if outline.IsCategory() { + for _, element := range outline.Outlines { + subscriptions = element.Append(subscriptions, outline.Text) + } + } else { + subscriptions = outline.Append(subscriptions, "") + } + } + + return subscriptions +} diff --git a/reader/opml/parser.go b/reader/opml/parser.go new file mode 100644 index 0000000..5d8babd --- /dev/null +++ b/reader/opml/parser.go @@ -0,0 +1,26 @@ +// Copyright 2017 Frédéric Guillot. All rights reserved. +// Use of this source code is governed by the Apache 2.0 +// license that can be found in the LICENSE file. + +package opml + +import ( + "encoding/xml" + "fmt" + "io" + + "golang.org/x/net/html/charset" +) + +func Parse(data io.Reader) (SubcriptionList, error) { + opml := new(Opml) + decoder := xml.NewDecoder(data) + decoder.CharsetReader = charset.NewReaderLabel + + err := decoder.Decode(opml) + if err != nil { + return nil, fmt.Errorf("Unable to parse OPML file: %v\n", err) + } + + return opml.Transform(), nil +} diff --git a/reader/opml/parser_test.go b/reader/opml/parser_test.go new file mode 100644 index 0000000..02543df --- /dev/null +++ b/reader/opml/parser_test.go @@ -0,0 +1,138 @@ +// Copyright 2017 Frédéric Guillot. All rights reserved. +// Use of this source code is governed by the Apache 2.0 +// license that can be found in the LICENSE file. + +package opml + +import "testing" +import "bytes" + +func TestParseOpmlWithoutCategories(t *testing.T) { + data := `<?xml version="1.0" encoding="ISO-8859-1"?> + <opml version="2.0"> + <head> + <title>mySubscriptions.opml</title> + </head> + <body> + <outline text="CNET News.com" description="Tech news and business reports by CNET News.com. Focused on information technology, core topics include computers, hardware, software, networking, and Internet media." htmlUrl="http://news.com.com/" language="unknown" title="CNET News.com" type="rss" version="RSS2" xmlUrl="http://news.com.com/2547-1_3-0-5.xml"/> + <outline text="washingtonpost.com - Politics" description="Politics" htmlUrl="http://www.washingtonpost.com/wp-dyn/politics?nav=rss_politics" language="unknown" title="washingtonpost.com - Politics" type="rss" version="RSS2" xmlUrl="http://www.washingtonpost.com/wp-srv/politics/rssheadlines.xml"/> + <outline text="Scobleizer: Microsoft Geek Blogger" description="Robert Scoble's look at geek and Microsoft life." htmlUrl="http://radio.weblogs.com/0001011/" language="unknown" title="Scobleizer: Microsoft Geek Blogger" type="rss" version="RSS2" xmlUrl="http://radio.weblogs.com/0001011/rss.xml"/> + <outline text="Yahoo! News: Technology" description="Technology" htmlUrl="http://news.yahoo.com/news?tmpl=index&cid=738" language="unknown" title="Yahoo! News: Technology" type="rss" version="RSS2" xmlUrl="http://rss.news.yahoo.com/rss/tech"/> + <outline text="Workbench" description="Programming and publishing news and comment" htmlUrl="http://www.cadenhead.org/workbench/" language="unknown" title="Workbench" type="rss" version="RSS2" xmlUrl="http://www.cadenhead.org/workbench/rss.xml"/> + <outline text="Christian Science Monitor | Top Stories" description="Read the front page stories of csmonitor.com." htmlUrl="http://csmonitor.com" language="unknown" title="Christian Science Monitor | Top Stories" type="rss" version="RSS" xmlUrl="http://www.csmonitor.com/rss/top.rss"/> + <outline text="Dictionary.com Word of the Day" description="A new word is presented every day with its definition and example sentences from actual published works." htmlUrl="http://dictionary.reference.com/wordoftheday/" language="unknown" title="Dictionary.com Word of the Day" type="rss" version="RSS" xmlUrl="http://www.dictionary.com/wordoftheday/wotd.rss"/> + <outline text="The Motley Fool" description="To Educate, Amuse, and Enrich" htmlUrl="http://www.fool.com" language="unknown" title="The Motley Fool" type="rss" version="RSS" xmlUrl="http://www.fool.com/xml/foolnews_rss091.xml"/> + <outline text="InfoWorld: Top News" description="The latest on Top News from InfoWorld" htmlUrl="http://www.infoworld.com/news/index.html" language="unknown" title="InfoWorld: Top News" type="rss" version="RSS2" xmlUrl="http://www.infoworld.com/rss/news.xml"/> + <outline text="NYT > Business" description="Find breaking news & business news on Wall Street, media & advertising, international business, banking, interest rates, the stock market, currencies & funds." htmlUrl="http://www.nytimes.com/pages/business/index.html?partner=rssnyt" language="unknown" title="NYT > Business" type="rss" version="RSS2" xmlUrl="http://www.nytimes.com/services/xml/rss/nyt/Business.xml"/> + <outline text="NYT > Technology" description="" htmlUrl="http://www.nytimes.com/pages/technology/index.html?partner=rssnyt" language="unknown" title="NYT > Technology" type="rss" version="RSS2" xmlUrl="http://www.nytimes.com/services/xml/rss/nyt/Technology.xml"/> + <outline text="Scripting News" description="It's even worse than it appears." htmlUrl="http://www.scripting.com/" language="unknown" title="Scripting News" type="rss" version="RSS2" xmlUrl="http://www.scripting.com/rss.xml"/> + <outline text="Wired News" description="Technology, and the way we do business, is changing the world we know. Wired News is a technology - and business-oriented news service feeding an intelligent, discerning audience. What role does technology play in the day-to-day living of your life? Wired News tells you. How has evolving technology changed the face of the international business world? Wired News puts you in the picture." htmlUrl="http://www.wired.com/" language="unknown" title="Wired News" type="rss" version="RSS" xmlUrl="http://www.wired.com/news_drop/netcenter/netcenter.rdf"/> + </body> + </opml> + ` + + var expected SubcriptionList + expected = append(expected, &Subcription{Title: "CNET News.com", FeedURL: "http://news.com.com/2547-1_3-0-5.xml", SiteURL: "http://news.com.com/"}) + + subscriptions, err := Parse(bytes.NewBufferString(data)) + if err != nil { + t.Error(err) + } + + if len(subscriptions) != 13 { + t.Errorf("Wrong number of subscriptions: %d instead of %d", len(subscriptions), 13) + } + + if !subscriptions[0].Equals(expected[0]) { + t.Errorf(`Subscription are different: "%v" vs "%v"`, subscriptions[0], expected[0]) + } +} + +func TestParseOpmlWithCategories(t *testing.T) { + data := `<?xml version="1.0" encoding="utf-8"?> + <opml version="2.0"> + <head> + <title>mySubscriptions.opml</title> + </head> + <body> + <outline text="My Category 1"> + <outline text="Feed 1" xmlUrl="http://example.org/feed1/" htmlUrl="http://example.org/1"/> + <outline text="Feed 2" xmlUrl="http://example.org/feed2/" htmlUrl="http://example.org/2"/> + </outline> + <outline text="My Category 2"> + <outline text="Feed 3" xmlUrl="http://example.org/feed3/" htmlUrl="http://example.org/3"/> + </outline> + </body> + </opml> + ` + + var expected SubcriptionList + expected = append(expected, &Subcription{Title: "Feed 1", FeedURL: "http://example.org/feed1/", SiteURL: "http://example.org/1", CategoryName: "My Category 1"}) + expected = append(expected, &Subcription{Title: "Feed 2", FeedURL: "http://example.org/feed2/", SiteURL: "http://example.org/2", CategoryName: "My Category 1"}) + expected = append(expected, &Subcription{Title: "Feed 3", FeedURL: "http://example.org/feed3/", SiteURL: "http://example.org/3", CategoryName: "My Category 2"}) + + subscriptions, err := Parse(bytes.NewBufferString(data)) + if err != nil { + t.Error(err) + } + + if len(subscriptions) != 3 { + t.Errorf("Wrong number of subscriptions: %d instead of %d", len(subscriptions), 3) + } + + for i := 0; i < len(subscriptions); i++ { + if !subscriptions[i].Equals(expected[i]) { + t.Errorf(`Subscription are different: "%v" vs "%v"`, subscriptions[i], expected[i]) + } + } +} + +func TestParseOpmlWithEmptyTitleAndEmptySiteURL(t *testing.T) { + data := `<?xml version="1.0" encoding="ISO-8859-1"?> + <opml version="2.0"> + <head> + <title>mySubscriptions.opml</title> + </head> + <body> + <outline xmlUrl="http://example.org/feed1/" htmlUrl="http://example.org/1"/> + <outline xmlUrl="http://example.org/feed2/"/> + </body> + </opml> + ` + + var expected SubcriptionList + expected = append(expected, &Subcription{Title: "http://example.org/1", FeedURL: "http://example.org/feed1/", SiteURL: "http://example.org/1", CategoryName: ""}) + expected = append(expected, &Subcription{Title: "http://example.org/feed2/", FeedURL: "http://example.org/feed2/", SiteURL: "http://example.org/feed2/", CategoryName: ""}) + + subscriptions, err := Parse(bytes.NewBufferString(data)) + if err != nil { + t.Error(err) + } + + if len(subscriptions) != 2 { + t.Errorf("Wrong number of subscriptions: %d instead of %d", len(subscriptions), 2) + } + + for i := 0; i < len(subscriptions); i++ { + if !subscriptions[i].Equals(expected[i]) { + t.Errorf(`Subscription are different: "%v" vs "%v"`, subscriptions[i], expected[i]) + } + } +} + +func TestParseInvalidXML(t *testing.T) { + data := `<?xml version="1.0" encoding="ISO-8859-1"?> + <opml version="2.0"> + <head> + </head> + <body> + <outline + </body> + </opml> + ` + + _, err := Parse(bytes.NewBufferString(data)) + if err == nil { + t.Error(err) + } +} diff --git a/reader/opml/serializer.go b/reader/opml/serializer.go new file mode 100644 index 0000000..20c7046 --- /dev/null +++ b/reader/opml/serializer.go @@ -0,0 +1,58 @@ +// Copyright 2017 Frédéric Guillot. All rights reserved. +// Use of this source code is governed by the Apache 2.0 +// license that can be found in the LICENSE file. + +package opml + +import ( + "bufio" + "bytes" + "encoding/xml" + "log" +) + +func Serialize(subscriptions SubcriptionList) string { + var b bytes.Buffer + writer := bufio.NewWriter(&b) + writer.WriteString(xml.Header) + + opml := new(Opml) + opml.Version = "2.0" + for categoryName, subs := range groupSubscriptionsByFeed(subscriptions) { + outline := Outline{Text: categoryName} + + for _, subscription := range subs { + outline.Outlines = append(outline.Outlines, Outline{ + Title: subscription.Title, + Text: subscription.Title, + FeedURL: subscription.FeedURL, + SiteURL: subscription.SiteURL, + }) + } + + opml.Outlines = append(opml.Outlines, outline) + } + + encoder := xml.NewEncoder(writer) + encoder.Indent(" ", " ") + if err := encoder.Encode(opml); err != nil { + log.Println(err) + return "" + } + + return b.String() +} + +func groupSubscriptionsByFeed(subscriptions SubcriptionList) map[string]SubcriptionList { + groups := make(map[string]SubcriptionList) + + for _, subscription := range subscriptions { + // if subs, ok := groups[subscription.CategoryName]; !ok { + // groups[subscription.CategoryName] = SubcriptionList{} + // } + + groups[subscription.CategoryName] = append(groups[subscription.CategoryName], subscription) + } + + return groups +} diff --git a/reader/opml/serializer_test.go b/reader/opml/serializer_test.go new file mode 100644 index 0000000..b1ef2a6 --- /dev/null +++ b/reader/opml/serializer_test.go @@ -0,0 +1,31 @@ +// Copyright 2017 Frédéric Guillot. All rights reserved. +// Use of this source code is governed by the Apache 2.0 +// license that can be found in the LICENSE file. + +package opml + +import "testing" +import "bytes" + +func TestSerialize(t *testing.T) { + var subscriptions SubcriptionList + subscriptions = append(subscriptions, &Subcription{Title: "Feed 1", FeedURL: "http://example.org/feed/1", SiteURL: "http://example.org/1", CategoryName: "Category 1"}) + subscriptions = append(subscriptions, &Subcription{Title: "Feed 2", FeedURL: "http://example.org/feed/2", SiteURL: "http://example.org/2", CategoryName: "Category 1"}) + subscriptions = append(subscriptions, &Subcription{Title: "Feed 3", FeedURL: "http://example.org/feed/3", SiteURL: "http://example.org/3", CategoryName: "Category 2"}) + + output := Serialize(subscriptions) + feeds, err := Parse(bytes.NewBufferString(output)) + if err != nil { + t.Error(err) + } + + if len(feeds) != 3 { + t.Errorf("Wrong number of subscriptions: %d instead of %d", len(feeds), 3) + } + + for i := 0; i < len(feeds); i++ { + if !feeds[i].Equals(subscriptions[i]) { + t.Errorf(`Subscription are different: "%v" vs "%v"`, subscriptions[i], feeds[i]) + } + } +} diff --git a/reader/opml/subscription.go b/reader/opml/subscription.go new file mode 100644 index 0000000..b968bb0 --- /dev/null +++ b/reader/opml/subscription.go @@ -0,0 +1,18 @@ +// Copyright 2017 Frédéric Guillot. All rights reserved. +// Use of this source code is governed by the Apache 2.0 +// license that can be found in the LICENSE file. + +package opml + +type Subcription struct { + Title string + SiteURL string + FeedURL string + CategoryName string +} + +func (s Subcription) Equals(subscription *Subcription) bool { + return s.Title == subscription.Title && s.SiteURL == subscription.SiteURL && s.FeedURL == subscription.FeedURL && s.CategoryName == subscription.CategoryName +} + +type SubcriptionList []*Subcription diff --git a/reader/processor/processor.go b/reader/processor/processor.go new file mode 100644 index 0000000..a4f4de1 --- /dev/null +++ b/reader/processor/processor.go @@ -0,0 +1,15 @@ +// Copyright 2017 Frédéric Guillot. All rights reserved. +// Use of this source code is governed by the Apache 2.0 +// license that can be found in the LICENSE file. + +package processor + +import ( + "github.com/miniflux/miniflux2/reader/rewrite" + "github.com/miniflux/miniflux2/reader/sanitizer" +) + +func ItemContentProcessor(url, content string) string { + content = sanitizer.Sanitize(url, content) + return rewrite.Rewriter(url, content) +} diff --git a/reader/rewrite/rewriter.go b/reader/rewrite/rewriter.go new file mode 100644 index 0000000..f23c0af --- /dev/null +++ b/reader/rewrite/rewriter.go @@ -0,0 +1,47 @@ +// Copyright 2017 Frédéric Guillot. All rights reserved. +// Use of this source code is governed by the Apache 2.0 +// license that can be found in the LICENSE file. + +package rewrite + +import ( + "regexp" + "strings" + + "github.com/PuerkitoBio/goquery" +) + +var rewriteRules = []func(string, string) string{ + func(url, content string) string { + re := regexp.MustCompile(`youtube\.com/watch\?v=(.*)`) + matches := re.FindStringSubmatch(url) + + if len(matches) == 2 { + video := `<iframe width="650" height="350" frameborder="0" src="https://www.youtube-nocookie.com/embed/` + matches[1] + `" allowfullscreen></iframe>` + return video + "<p>" + content + "</p>" + } + return content + }, + func(url, content string) string { + if strings.HasPrefix(url, "https://xkcd.com") { + doc, err := goquery.NewDocumentFromReader(strings.NewReader(content)) + if err != nil { + return content + } + + imgTag := doc.Find("img").First() + if titleAttr, found := imgTag.Attr("title"); found { + return content + `<blockquote cite="` + url + `">` + titleAttr + "</blockquote>" + } + } + return content + }, +} + +func Rewriter(url, content string) string { + for _, rewriteRule := range rewriteRules { + content = rewriteRule(url, content) + } + + return content +} diff --git a/reader/rewrite/rewriter_test.go b/reader/rewrite/rewriter_test.go new file mode 100644 index 0000000..a666406 --- /dev/null +++ b/reader/rewrite/rewriter_test.go @@ -0,0 +1,34 @@ +// Copyright 2017 Frédéric Guillot. All rights reserved. +// Use of this source code is governed by the Apache 2.0 +// license that can be found in the LICENSE file. + +package rewrite + +import "testing" + +func TestRewriteWithNoMatchingRule(t *testing.T) { + output := Rewriter("https://example.org/article", `Some text.`) + expected := `Some text.` + + if expected != output { + t.Errorf(`Not expected output: got "%s" instead of "%s"`, output, expected) + } +} + +func TestRewriteWithYoutubeLink(t *testing.T) { + output := Rewriter("https://www.youtube.com/watch?v=1234", `Video Description`) + expected := `<iframe width="650" height="350" frameborder="0" src="https://www.youtube-nocookie.com/embed/1234" allowfullscreen></iframe><p>Video Description</p>` + + if expected != output { + t.Errorf(`Not expected output: got "%s" instead of "%s"`, output, expected) + } +} + +func TestRewriteWithXkcdLink(t *testing.T) { + description := `<img src="https://imgs.xkcd.com/comics/thermostat.png" title="Your problem is so terrible, I worry that, if I help you, I risk drawing the attention of whatever god of technology inflicted it on you." alt="Your problem is so terrible, I worry that, if I help you, I risk drawing the attention of whatever god of technology inflicted it on you." />` + output := Rewriter("https://xkcd.com/1912/", description) + expected := description + `<blockquote cite="https://xkcd.com/1912/">Your problem is so terrible, I worry that, if I help you, I risk drawing the attention of whatever god of technology inflicted it on you.</blockquote>` + if expected != output { + t.Errorf(`Not expected output: got "%s" instead of "%s"`, output, expected) + } +} diff --git a/reader/sanitizer/sanitizer.go b/reader/sanitizer/sanitizer.go new file mode 100644 index 0000000..6af034c --- /dev/null +++ b/reader/sanitizer/sanitizer.go @@ -0,0 +1,360 @@ +// Copyright 2017 Frédéric Guillot. All rights reserved. +// Use of this source code is governed by the Apache 2.0 +// license that can be found in the LICENSE file. + +package sanitizer + +import ( + "bytes" + "fmt" + "github.com/miniflux/miniflux2/reader/url" + "io" + "strings" + + "golang.org/x/net/html" +) + +// Sanitize returns safe HTML. +func Sanitize(baseURL, input string) string { + tokenizer := html.NewTokenizer(bytes.NewBufferString(input)) + var buffer bytes.Buffer + var tagStack []string + + for { + if tokenizer.Next() == html.ErrorToken { + err := tokenizer.Err() + if err == io.EOF { + return buffer.String() + } + + return "" + } + + token := tokenizer.Token() + switch token.Type { + case html.TextToken: + buffer.WriteString(token.Data) + case html.StartTagToken: + tagName := token.DataAtom.String() + + if !isPixelTracker(tagName, token.Attr) && isValidTag(tagName) { + attrNames, htmlAttributes := sanitizeAttributes(baseURL, tagName, token.Attr) + + if hasRequiredAttributes(tagName, attrNames) { + if len(attrNames) > 0 { + buffer.WriteString("<" + tagName + " " + htmlAttributes + ">") + } else { + buffer.WriteString("<" + tagName + ">") + } + + tagStack = append(tagStack, tagName) + } + } + case html.EndTagToken: + tagName := token.DataAtom.String() + if isValidTag(tagName) && inList(tagName, tagStack) { + buffer.WriteString(fmt.Sprintf("</%s>", tagName)) + } + case html.SelfClosingTagToken: + tagName := token.DataAtom.String() + if !isPixelTracker(tagName, token.Attr) && isValidTag(tagName) { + attrNames, htmlAttributes := sanitizeAttributes(baseURL, tagName, token.Attr) + + if hasRequiredAttributes(tagName, attrNames) { + if len(attrNames) > 0 { + buffer.WriteString("<" + tagName + " " + htmlAttributes + "/>") + } else { + buffer.WriteString("<" + tagName + "/>") + } + } + } + } + } +} + +func sanitizeAttributes(baseURL, tagName string, attributes []html.Attribute) (attrNames []string, html string) { + var htmlAttrs []string + var err error + + for _, attribute := range attributes { + value := attribute.Val + + if !isValidAttribute(tagName, attribute.Key) { + continue + } + + if isExternalResourceAttribute(attribute.Key) { + if tagName == "iframe" && !isValidIframeSource(attribute.Val) { + continue + } else { + value, err = url.GetAbsoluteURL(baseURL, value) + if err != nil { + continue + } + + if !hasValidScheme(value) || isBlacklistedResource(value) { + continue + } + } + } + + attrNames = append(attrNames, attribute.Key) + htmlAttrs = append(htmlAttrs, fmt.Sprintf(`%s="%s"`, attribute.Key, value)) + } + + extraAttrNames, extraHTMLAttributes := getExtraAttributes(tagName) + if len(extraAttrNames) > 0 { + attrNames = append(attrNames, extraAttrNames...) + htmlAttrs = append(htmlAttrs, extraHTMLAttributes...) + } + + return attrNames, strings.Join(htmlAttrs, " ") +} + +func getExtraAttributes(tagName string) ([]string, []string) { + if tagName == "a" { + return []string{"rel", "target", "referrerpolicy"}, []string{`rel="noopener noreferrer"`, `target="_blank"`, `referrerpolicy="no-referrer"`} + } + + if tagName == "video" || tagName == "audio" { + return []string{"controls"}, []string{"controls"} + } + + return nil, nil +} + +func isValidTag(tagName string) bool { + for element := range getTagWhitelist() { + if tagName == element { + return true + } + } + + return false +} + +func isValidAttribute(tagName, attributeName string) bool { + for element, attributes := range getTagWhitelist() { + if tagName == element { + if inList(attributeName, attributes) { + return true + } + } + } + + return false +} + +func isExternalResourceAttribute(attribute string) bool { + switch attribute { + case "src", "href", "poster", "cite": + return true + default: + return false + } +} + +func isPixelTracker(tagName string, attributes []html.Attribute) bool { + if tagName == "img" { + hasHeight := false + hasWidth := false + + for _, attribute := range attributes { + if attribute.Key == "height" && attribute.Val == "1" { + hasHeight = true + } + + if attribute.Key == "width" && attribute.Val == "1" { + hasWidth = true + } + } + + return hasHeight && hasWidth + } + + return false +} + +func hasRequiredAttributes(tagName string, attributes []string) bool { + elements := make(map[string][]string) + elements["a"] = []string{"href"} + elements["iframe"] = []string{"src"} + elements["img"] = []string{"src"} + elements["source"] = []string{"src"} + + for element, attrs := range elements { + if tagName == element { + for _, attribute := range attributes { + for _, attr := range attrs { + if attr == attribute { + return true + } + } + } + + return false + } + } + + return true +} + +func hasValidScheme(src string) bool { + // See https://www.iana.org/assignments/uri-schemes/uri-schemes.xhtml + whitelist := []string{ + "apt://", + "bitcoin://", + "callto://", + "ed2k://", + "facetime://", + "feed://", + "ftp://", + "geo://", + "gopher://", + "git://", + "http://", + "https://", + "irc://", + "irc6://", + "ircs://", + "itms://", + "jabber://", + "magnet://", + "mailto://", + "maps://", + "news://", + "nfs://", + "nntp://", + "rtmp://", + "sip://", + "sips://", + "skype://", + "smb://", + "sms://", + "spotify://", + "ssh://", + "sftp://", + "steam://", + "svn://", + "tel://", + "webcal://", + "xmpp://", + } + + for _, prefix := range whitelist { + if strings.HasPrefix(src, prefix) { + return true + } + } + + return false +} + +func isBlacklistedResource(src string) bool { + blacklist := []string{ + "feedsportal.com", + "api.flattr.com", + "stats.wordpress.com", + "plus.google.com/share", + "twitter.com/share", + "feeds.feedburner.com", + } + + for _, element := range blacklist { + if strings.Contains(src, element) { + return true + } + } + + return false +} + +func isValidIframeSource(src string) bool { + whitelist := []string{ + "http://www.youtube.com", + "https://www.youtube.com", + "http://player.vimeo.com", + "https://player.vimeo.com", + "http://www.dailymotion.com", + "https://www.dailymotion.com", + "http://vk.com", + "https://vk.com", + } + + for _, prefix := range whitelist { + if strings.HasPrefix(src, prefix) { + return true + } + } + + return false +} + +func getTagWhitelist() map[string][]string { + whitelist := make(map[string][]string) + whitelist["img"] = []string{"alt", "title", "src"} + whitelist["audio"] = []string{"src"} + whitelist["video"] = []string{"poster", "height", "width", "src"} + whitelist["source"] = []string{"src", "type"} + whitelist["dt"] = []string{} + whitelist["dd"] = []string{} + whitelist["dl"] = []string{} + whitelist["table"] = []string{} + whitelist["caption"] = []string{} + whitelist["thead"] = []string{} + whitelist["tfooter"] = []string{} + whitelist["tr"] = []string{} + whitelist["td"] = []string{"rowspan", "colspan"} + whitelist["th"] = []string{"rowspan", "colspan"} + whitelist["h1"] = []string{} + whitelist["h2"] = []string{} + whitelist["h3"] = []string{} + whitelist["h4"] = []string{} + whitelist["h5"] = []string{} + whitelist["h6"] = []string{} + whitelist["strong"] = []string{} + whitelist["em"] = []string{} + whitelist["code"] = []string{} + whitelist["pre"] = []string{} + whitelist["blockquote"] = []string{} + whitelist["q"] = []string{"cite"} + whitelist["p"] = []string{} + whitelist["ul"] = []string{} + whitelist["li"] = []string{} + whitelist["ol"] = []string{} + whitelist["br"] = []string{} + whitelist["del"] = []string{} + whitelist["a"] = []string{"href", "title"} + whitelist["figure"] = []string{} + whitelist["figcaption"] = []string{} + whitelist["cite"] = []string{} + whitelist["time"] = []string{"datetime"} + whitelist["abbr"] = []string{"title"} + whitelist["acronym"] = []string{"title"} + whitelist["wbr"] = []string{} + whitelist["dfn"] = []string{} + whitelist["sub"] = []string{} + whitelist["sup"] = []string{} + whitelist["var"] = []string{} + whitelist["samp"] = []string{} + whitelist["s"] = []string{} + whitelist["del"] = []string{} + whitelist["ins"] = []string{} + whitelist["kbd"] = []string{} + whitelist["rp"] = []string{} + whitelist["rt"] = []string{} + whitelist["rtc"] = []string{} + whitelist["ruby"] = []string{} + whitelist["iframe"] = []string{"width", "height", "frameborder", "src", "allowfullscreen"} + return whitelist +} + +func inList(needle string, haystack []string) bool { + for _, element := range haystack { + if element == needle { + return true + } + } + + return false +} diff --git a/reader/sanitizer/sanitizer_test.go b/reader/sanitizer/sanitizer_test.go new file mode 100644 index 0000000..73862d3 --- /dev/null +++ b/reader/sanitizer/sanitizer_test.go @@ -0,0 +1,144 @@ +// Copyright 2017 Frédéric Guillot. All rights reserved. +// Use of this source code is governed by the Apache 2.0 +// license that can be found in the LICENSE file. + +package sanitizer + +import "testing" + +func TestValidInput(t *testing.T) { + input := `<p>This is a <strong>text</strong> with an image: <img src="http://example.org/" alt="Test">.</p>` + output := Sanitize("http://example.org/", input) + + if input != output { + t.Errorf(`Wrong output: "%s" != "%s"`, input, output) + } +} + +func TestSelfClosingTags(t *testing.T) { + input := `<p>This <br> is a <strong>text</strong> <br/>with an image: <img src="http://example.org/" alt="Test"/>.</p>` + output := Sanitize("http://example.org/", input) + + if input != output { + t.Errorf(`Wrong output: "%s" != "%s"`, input, output) + } +} + +func TestTable(t *testing.T) { + input := `<table><tr><th>A</th><th colspan="2">B</th></tr><tr><td>C</td><td>D</td><td>E</td></tr></table>` + output := Sanitize("http://example.org/", input) + + if input != output { + t.Errorf(`Wrong output: "%s" != "%s"`, input, output) + } +} + +func TestRelativeURL(t *testing.T) { + input := `This <a href="/test.html">link is relative</a> and this image: <img src="../folder/image.png"/>` + expected := `This <a href="http://example.org/test.html" rel="noopener noreferrer" target="_blank" referrerpolicy="no-referrer">link is relative</a> and this image: <img src="http://example.org/folder/image.png"/>` + output := Sanitize("http://example.org/", input) + + if expected != output { + t.Errorf(`Wrong output: "%s" != "%s"`, expected, output) + } +} + +func TestProtocolRelativeURL(t *testing.T) { + input := `This <a href="//static.example.org/index.html">link is relative</a>.` + expected := `This <a href="https://static.example.org/index.html" rel="noopener noreferrer" target="_blank" referrerpolicy="no-referrer">link is relative</a>.` + output := Sanitize("http://example.org/", input) + + if expected != output { + t.Errorf(`Wrong output: "%s" != "%s"`, expected, output) + } +} + +func TestInvalidTag(t *testing.T) { + input := `<p>My invalid <b>tag</b>.</p>` + expected := `<p>My invalid tag.</p>` + output := Sanitize("http://example.org/", input) + + if expected != output { + t.Errorf(`Wrong output: "%s" != "%s"`, expected, output) + } +} + +func TestVideoTag(t *testing.T) { + input := `<p>My valid <video src="videofile.webm" autoplay poster="posterimage.jpg">fallback</video>.</p>` + expected := `<p>My valid <video src="http://example.org/videofile.webm" poster="http://example.org/posterimage.jpg" controls>fallback</video>.</p>` + output := Sanitize("http://example.org/", input) + + if expected != output { + t.Errorf(`Wrong output: "%s" != "%s"`, expected, output) + } +} + +func TestAudioAndSourceTag(t *testing.T) { + input := `<p>My music <audio controls="controls"><source src="foo.wav" type="audio/wav"></audio>.</p>` + expected := `<p>My music <audio controls><source src="http://example.org/foo.wav" type="audio/wav"></audio>.</p>` + output := Sanitize("http://example.org/", input) + + if expected != output { + t.Errorf(`Wrong output: "%s" != "%s"`, expected, output) + } +} + +func TestUnknownTag(t *testing.T) { + input := `<p>My invalid <unknown>tag</unknown>.</p>` + expected := `<p>My invalid tag.</p>` + output := Sanitize("http://example.org/", input) + + if expected != output { + t.Errorf(`Wrong output: "%s" != "%s"`, expected, output) + } +} + +func TestInvalidNestedTag(t *testing.T) { + input := `<p>My invalid <b>tag with some <em>valid</em> tag</b>.</p>` + expected := `<p>My invalid tag with some <em>valid</em> tag.</p>` + output := Sanitize("http://example.org/", input) + + if expected != output { + t.Errorf(`Wrong output: "%s" != "%s"`, expected, output) + } +} + +func TestInvalidIFrame(t *testing.T) { + input := `<iframe src="http://example.org/"></iframe>` + expected := `` + output := Sanitize("http://example.org/", input) + + if expected != output { + t.Errorf(`Wrong output: "%s" != "%s"`, expected, output) + } +} + +func TestInvalidURLScheme(t *testing.T) { + input := `<p>This link is <a src="file:///etc/passwd">not valid</a></p>` + expected := `<p>This link is not valid</p>` + output := Sanitize("http://example.org/", input) + + if expected != output { + t.Errorf(`Wrong output: "%s" != "%s"`, expected, output) + } +} + +func TestBlacklistedLink(t *testing.T) { + input := `<p>This image is not valid <img src="https://stats.wordpress.com/some-tracker"></p>` + expected := `<p>This image is not valid </p>` + output := Sanitize("http://example.org/", input) + + if expected != output { + t.Errorf(`Wrong output: "%s" != "%s"`, expected, output) + } +} + +func TestPixelTracker(t *testing.T) { + input := `<p><img src="https://tracker1.example.org/" height="1" width="1"> and <img src="https://tracker2.example.org/" height="1" width="1"/></p>` + expected := `<p> and </p>` + output := Sanitize("http://example.org/", input) + + if expected != output { + t.Errorf(`Wrong output: "%s" != "%s"`, expected, output) + } +} diff --git a/reader/sanitizer/strip_tags.go b/reader/sanitizer/strip_tags.go new file mode 100644 index 0000000..f3cecd1 --- /dev/null +++ b/reader/sanitizer/strip_tags.go @@ -0,0 +1,35 @@ +// Copyright 2017 Frédéric Guillot. All rights reserved. +// Use of this source code is governed by the Apache 2.0 +// license that can be found in the LICENSE file. + +package sanitizer + +import ( + "bytes" + "io" + + "golang.org/x/net/html" +) + +// StripTags removes all HTML/XML tags from the input string. +func StripTags(input string) string { + tokenizer := html.NewTokenizer(bytes.NewBufferString(input)) + var buffer bytes.Buffer + + for { + if tokenizer.Next() == html.ErrorToken { + err := tokenizer.Err() + if err == io.EOF { + return buffer.String() + } + + return "" + } + + token := tokenizer.Token() + switch token.Type { + case html.TextToken: + buffer.WriteString(token.Data) + } + } +} diff --git a/reader/sanitizer/strip_tags_test.go b/reader/sanitizer/strip_tags_test.go new file mode 100644 index 0000000..763ccc9 --- /dev/null +++ b/reader/sanitizer/strip_tags_test.go @@ -0,0 +1,17 @@ +// Copyright 2017 Frédéric Guillot. All rights reserved. +// Use of this source code is governed by the Apache 2.0 +// license that can be found in the LICENSE file. + +package sanitizer + +import "testing" + +func TestStripTags(t *testing.T) { + input := `This <a href="/test.html">link is relative</a> and <strong>this</strong> image: <img src="../folder/image.png"/>` + expected := `This link is relative and this image: ` + output := StripTags(input) + + if expected != output { + t.Errorf(`Wrong output: "%s" != "%s"`, expected, output) + } +} diff --git a/reader/subscription/finder.go b/reader/subscription/finder.go new file mode 100644 index 0000000..7314644 --- /dev/null +++ b/reader/subscription/finder.go @@ -0,0 +1,96 @@ +// Copyright 2017 Frédéric Guillot. All rights reserved. +// Use of this source code is governed by the Apache 2.0 +// license that can be found in the LICENSE file. + +package subscription + +import ( + "bytes" + "fmt" + "github.com/miniflux/miniflux2/errors" + "github.com/miniflux/miniflux2/helper" + "github.com/miniflux/miniflux2/reader/feed" + "github.com/miniflux/miniflux2/reader/http" + "github.com/miniflux/miniflux2/reader/url" + "io" + "log" + "time" + + "github.com/PuerkitoBio/goquery" +) + +var ( + errConnectionFailure = "Unable to open this link: %v" + errUnreadableDoc = "Unable to analyze this page: %v" +) + +// FindSubscriptions downloads and try to find one or more subscriptions from an URL. +func FindSubscriptions(websiteURL string) (Subscriptions, error) { + defer helper.ExecutionTime(time.Now(), fmt.Sprintf("[FindSubscriptions] url=%s", websiteURL)) + + client := http.NewHttpClient(websiteURL) + response, err := client.Get() + if err != nil { + return nil, errors.NewLocalizedError(errConnectionFailure, err) + } + + var buffer bytes.Buffer + io.Copy(&buffer, response.Body) + reader := bytes.NewReader(buffer.Bytes()) + + if format := feed.DetectFeedFormat(reader); format != feed.FormatUnknown { + var subscriptions Subscriptions + subscriptions = append(subscriptions, &Subscription{ + Title: response.EffectiveURL, + URL: response.EffectiveURL, + Type: format, + }) + + return subscriptions, nil + } + + reader.Seek(0, io.SeekStart) + return parseDocument(response.EffectiveURL, bytes.NewReader(buffer.Bytes())) +} + +func parseDocument(websiteURL string, data io.Reader) (Subscriptions, error) { + var subscriptions Subscriptions + queries := map[string]string{ + "link[type='application/rss+xml']": "rss", + "link[type='application/atom+xml']": "atom", + "link[type='application/json']": "json", + } + + doc, err := goquery.NewDocumentFromReader(data) + if err != nil { + return nil, errors.NewLocalizedError(errUnreadableDoc, err) + } + + for query, kind := range queries { + doc.Find(query).Each(func(i int, s *goquery.Selection) { + subscription := new(Subscription) + subscription.Type = kind + + if title, exists := s.Attr("title"); exists { + subscription.Title = title + } else { + subscription.Title = "Feed" + } + + if feedURL, exists := s.Attr("href"); exists { + subscription.URL, _ = url.GetAbsoluteURL(websiteURL, feedURL) + } + + if subscription.Title == "" { + subscription.Title = subscription.URL + } + + if subscription.URL != "" { + log.Println("[FindSubscriptions]", subscription) + subscriptions = append(subscriptions, subscription) + } + }) + } + + return subscriptions, nil +} diff --git a/reader/subscription/subscription.go b/reader/subscription/subscription.go new file mode 100644 index 0000000..f619f73 --- /dev/null +++ b/reader/subscription/subscription.go @@ -0,0 +1,21 @@ +// Copyright 2017 Frédéric Guillot. All rights reserved. +// Use of this source code is governed by the Apache 2.0 +// license that can be found in the LICENSE file. + +package subscription + +import "fmt" + +// Subscription represents a feed subscription. +type Subscription struct { + Title string `json:"title"` + URL string `json:"url"` + Type string `json:"type"` +} + +func (s Subscription) String() string { + return fmt.Sprintf(`Title="%s", URL="%s", Type="%s"`, s.Title, s.URL, s.Type) +} + +// Subscriptions represents a list of subscription. +type Subscriptions []*Subscription diff --git a/reader/url/url.go b/reader/url/url.go new file mode 100644 index 0000000..eea658b --- /dev/null +++ b/reader/url/url.go @@ -0,0 +1,61 @@ +// Copyright 2017 Frédéric Guillot. All rights reserved. +// Use of this source code is governed by the Apache 2.0 +// license that can be found in the LICENSE file. + +package url + +import "net/url" +import "fmt" +import "strings" + +// GetAbsoluteURL converts the input URL as absolute URL if necessary. +func GetAbsoluteURL(baseURL, input string) (string, error) { + if strings.HasPrefix(input, "//") { + input = "https://" + input[2:] + } + + u, err := url.Parse(input) + if err != nil { + return "", fmt.Errorf("unable to parse input URL: %v", err) + } + + if u.IsAbs() { + return u.String(), nil + } + + base, err := url.Parse(baseURL) + if err != nil { + return "", fmt.Errorf("unable to parse base URL: %v", err) + } + + return base.ResolveReference(u).String(), nil +} + +// GetRootURL returns absolute URL without the path. +func GetRootURL(websiteURL string) string { + if strings.HasPrefix(websiteURL, "//") { + websiteURL = "https://" + websiteURL[2:] + } + + absoluteURL, err := GetAbsoluteURL(websiteURL, "") + if err != nil { + return websiteURL + } + + u, err := url.Parse(absoluteURL) + if err != nil { + return absoluteURL + } + + return u.Scheme + "://" + u.Host + "/" +} + +// IsHTTPS returns true if the URL is using HTTPS. +func IsHTTPS(websiteURL string) bool { + parsedURL, err := url.Parse(websiteURL) + if err != nil { + return false + } + + return strings.ToLower(parsedURL.Scheme) == "https" +} diff --git a/reader/url/url_test.go b/reader/url/url_test.go new file mode 100644 index 0000000..0a97086 --- /dev/null +++ b/reader/url/url_test.go @@ -0,0 +1,107 @@ +package url + +import "testing" + +func TestGetAbsoluteURLWithAbsolutePath(t *testing.T) { + expected := `https://example.org/path/file.ext` + input := `/path/file.ext` + output, err := GetAbsoluteURL("https://example.org/folder/", input) + + if err != nil { + t.Error(err) + } + + if expected != output { + t.Errorf(`Unexpected output, got "%s" instead of "%s"`, output, expected) + } +} + +func TestGetAbsoluteURLWithRelativePath(t *testing.T) { + expected := `https://example.org/folder/path/file.ext` + input := `path/file.ext` + output, err := GetAbsoluteURL("https://example.org/folder/", input) + + if err != nil { + t.Error(err) + } + + if expected != output { + t.Errorf(`Unexpected output, got "%s" instead of "%s"`, output, expected) + } +} + +func TestGetAbsoluteURLWithRelativePaths(t *testing.T) { + expected := `https://example.org/path/file.ext` + input := `path/file.ext` + output, err := GetAbsoluteURL("https://example.org/folder", input) + + if err != nil { + t.Error(err) + } + + if expected != output { + t.Errorf(`Unexpected output, got "%s" instead of "%s"`, output, expected) + } +} + +func TestWhenInputIsAlreadyAbsolute(t *testing.T) { + expected := `https://example.org/path/file.ext` + input := `https://example.org/path/file.ext` + output, err := GetAbsoluteURL("https://example.org/folder/", input) + + if err != nil { + t.Error(err) + } + + if expected != output { + t.Errorf(`Unexpected output, got "%s" instead of "%s"`, output, expected) + } +} + +func TestGetAbsoluteURLWithProtocolRelative(t *testing.T) { + expected := `https://static.example.org/path/file.ext` + input := `//static.example.org/path/file.ext` + output, err := GetAbsoluteURL("https://www.example.org/", input) + + if err != nil { + t.Error(err) + } + + if expected != output { + t.Errorf(`Unexpected output, got "%s" instead of "%s"`, output, expected) + } +} + +func TestGetRootURL(t *testing.T) { + expected := `https://example.org/` + input := `https://example.org/path/file.ext` + output := GetRootURL(input) + + if expected != output { + t.Errorf(`Unexpected output, got "%s" instead of "%s"`, output, expected) + } +} + +func TestGetRootURLWithProtocolRelativePath(t *testing.T) { + expected := `https://static.example.org/` + input := `//static.example.org/path/file.ext` + output := GetRootURL(input) + + if expected != output { + t.Errorf(`Unexpected output, got "%s" instead of "%s"`, output, expected) + } +} + +func TestIsHTTPS(t *testing.T) { + if !IsHTTPS("https://example.org/") { + t.Error("Unable to recognize HTTPS URL") + } + + if IsHTTPS("http://example.org/") { + t.Error("Unable to recognize HTTP URL") + } + + if IsHTTPS("") { + t.Error("Unable to recognize malformed URL") + } +} |