From d5838b67340ca83dcc32f6d1775c183188ec5e7a Mon Sep 17 00:00:00 2001 From: Frédéric Guillot Date: Mon, 20 Nov 2017 19:17:04 -0800 Subject: Move feed parsers packages in reader package --- reader/json/json.go | 171 +++++++++++++++++++++ reader/json/parser.go | 24 +++ reader/json/parser_test.go | 359 +++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 554 insertions(+) create mode 100644 reader/json/json.go create mode 100644 reader/json/parser.go create mode 100644 reader/json/parser_test.go (limited to 'reader/json') diff --git a/reader/json/json.go b/reader/json/json.go new file mode 100644 index 0000000..cd6a1c8 --- /dev/null +++ b/reader/json/json.go @@ -0,0 +1,171 @@ +// Copyright 2017 Frédéric Guillot. All rights reserved. +// Use of this source code is governed by the Apache 2.0 +// license that can be found in the LICENSE file. + +package json + +import ( + "log" + "strings" + "time" + + "github.com/miniflux/miniflux2/helper" + "github.com/miniflux/miniflux2/model" + "github.com/miniflux/miniflux2/reader/date" + "github.com/miniflux/miniflux2/reader/processor" + "github.com/miniflux/miniflux2/reader/sanitizer" +) + +type jsonFeed struct { + Version string `json:"version"` + Title string `json:"title"` + SiteURL string `json:"home_page_url"` + FeedURL string `json:"feed_url"` + Author jsonAuthor `json:"author"` + Items []jsonItem `json:"items"` +} + +type jsonAuthor struct { + Name string `json:"name"` + URL string `json:"url"` +} + +type jsonItem struct { + ID string `json:"id"` + URL string `json:"url"` + Title string `json:"title"` + Summary string `json:"summary"` + Text string `json:"content_text"` + HTML string `json:"content_html"` + DatePublished string `json:"date_published"` + DateModified string `json:"date_modified"` + Author jsonAuthor `json:"author"` + Attachments []jsonAttachment `json:"attachments"` +} + +type jsonAttachment struct { + URL string `json:"url"` + MimeType string `json:"mime_type"` + Title string `json:"title"` + Size int `json:"size_in_bytes"` + Duration int `json:"duration_in_seconds"` +} + +func (j *jsonFeed) GetAuthor() string { + return getAuthor(j.Author) +} + +func (j *jsonFeed) Transform() *model.Feed { + feed := new(model.Feed) + feed.FeedURL = j.FeedURL + feed.SiteURL = j.SiteURL + feed.Title = sanitizer.StripTags(j.Title) + + if feed.Title == "" { + feed.Title = feed.SiteURL + } + + for _, item := range j.Items { + entry := item.Transform() + if entry.Author == "" { + entry.Author = j.GetAuthor() + } + + feed.Entries = append(feed.Entries, entry) + } + + return feed +} + +func (j *jsonItem) GetDate() time.Time { + for _, value := range []string{j.DatePublished, j.DateModified} { + if value != "" { + d, err := date.Parse(value) + if err != nil { + log.Println(err) + return time.Now() + } + + return d + } + } + + return time.Now() +} + +func (j *jsonItem) GetAuthor() string { + return getAuthor(j.Author) +} + +func (j *jsonItem) GetHash() string { + for _, value := range []string{j.ID, j.URL, j.Text + j.HTML + j.Summary} { + if value != "" { + return helper.Hash(value) + } + } + + return "" +} + +func (j *jsonItem) GetTitle() string { + for _, value := range []string{j.Title, j.Summary, j.Text, j.HTML} { + if value != "" { + return truncate(value) + } + } + + return j.URL +} + +func (j *jsonItem) GetContent() string { + for _, value := range []string{j.HTML, j.Text, j.Summary} { + if value != "" { + return value + } + } + + return "" +} + +func (j *jsonItem) GetEnclosures() model.EnclosureList { + enclosures := make(model.EnclosureList, 0) + + for _, attachment := range j.Attachments { + enclosures = append(enclosures, &model.Enclosure{ + URL: attachment.URL, + MimeType: attachment.MimeType, + Size: attachment.Size, + }) + } + + return enclosures +} + +func (j *jsonItem) Transform() *model.Entry { + entry := new(model.Entry) + entry.URL = j.URL + entry.Date = j.GetDate() + entry.Author = sanitizer.StripTags(j.GetAuthor()) + entry.Hash = j.GetHash() + entry.Content = processor.ItemContentProcessor(entry.URL, j.GetContent()) + entry.Title = sanitizer.StripTags(strings.Trim(j.GetTitle(), " \n\t")) + entry.Enclosures = j.GetEnclosures() + return entry +} + +func getAuthor(author jsonAuthor) string { + if author.Name != "" { + return author.Name + } + + return "" +} + +func truncate(str string) string { + max := 100 + if len(str) > max { + return str[:max] + "..." + } + + return str +} diff --git a/reader/json/parser.go b/reader/json/parser.go new file mode 100644 index 0000000..18329e7 --- /dev/null +++ b/reader/json/parser.go @@ -0,0 +1,24 @@ +// Copyright 2017 Frédéric Guillot. All rights reserved. +// Use of this source code is governed by the Apache 2.0 +// license that can be found in the LICENSE file. + +package json + +import ( + "encoding/json" + "io" + + "github.com/miniflux/miniflux2/errors" + "github.com/miniflux/miniflux2/model" +) + +// Parse returns a normalized feed struct from a JON feed. +func Parse(data io.Reader) (*model.Feed, error) { + feed := new(jsonFeed) + decoder := json.NewDecoder(data) + if err := decoder.Decode(&feed); err != nil { + return nil, errors.NewLocalizedError("Unable to parse JSON Feed: %v", err) + } + + return feed.Transform(), nil +} diff --git a/reader/json/parser_test.go b/reader/json/parser_test.go new file mode 100644 index 0000000..ecb11a1 --- /dev/null +++ b/reader/json/parser_test.go @@ -0,0 +1,359 @@ +// Copyright 2017 Frédéric Guillot. All rights reserved. +// Use of this source code is governed by the Apache 2.0 +// license that can be found in the LICENSE file. + +package json + +import ( + "bytes" + "strings" + "testing" + "time" + + "github.com/miniflux/miniflux2/errors" +) + +func TestParseJsonFeed(t *testing.T) { + data := `{ + "version": "https://jsonfeed.org/version/1", + "title": "My Example Feed", + "home_page_url": "https://example.org/", + "feed_url": "https://example.org/feed.json", + "items": [ + { + "id": "2", + "content_text": "This is a second item.", + "url": "https://example.org/second-item" + }, + { + "id": "1", + "content_html": "

Hello, world!

", + "url": "https://example.org/initial-post" + } + ] + }` + + feed, err := Parse(bytes.NewBufferString(data)) + if err != nil { + t.Error(err) + } + + if feed.Title != "My Example Feed" { + t.Errorf("Incorrect title, got: %s", feed.Title) + } + + if feed.FeedURL != "https://example.org/feed.json" { + t.Errorf("Incorrect feed URL, got: %s", feed.FeedURL) + } + + if feed.SiteURL != "https://example.org/" { + t.Errorf("Incorrect site URL, got: %s", feed.SiteURL) + } + + if len(feed.Entries) != 2 { + t.Errorf("Incorrect number of entries, got: %d", len(feed.Entries)) + } + + if feed.Entries[0].Hash != "d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f90da3a666eec13ab35" { + t.Errorf("Incorrect entry hash, got: %s", feed.Entries[0].Hash) + } + + if feed.Entries[0].URL != "https://example.org/second-item" { + t.Errorf("Incorrect entry URL, got: %s", feed.Entries[0].URL) + } + + if feed.Entries[0].Title != "This is a second item." { + t.Errorf(`Incorrect entry title, got: "%s"`, feed.Entries[0].Title) + } + + if feed.Entries[0].Content != "This is a second item." { + t.Errorf("Incorrect entry content, got: %s", feed.Entries[0].Content) + } + + if feed.Entries[1].Hash != "6b86b273ff34fce19d6b804eff5a3f5747ada4eaa22f1d49c01e52ddb7875b4b" { + t.Errorf("Incorrect entry hash, got: %s", feed.Entries[1].Hash) + } + + if feed.Entries[1].URL != "https://example.org/initial-post" { + t.Errorf("Incorrect entry URL, got: %s", feed.Entries[1].URL) + } + + if feed.Entries[1].Title != "Hello, world!" { + t.Errorf(`Incorrect entry title, got: "%s"`, feed.Entries[1].Title) + } + + if feed.Entries[1].Content != "

Hello, world!

" { + t.Errorf("Incorrect entry content, got: %s", feed.Entries[1].Content) + } +} + +func TestParsePodcast(t *testing.T) { + data := `{ + "version": "https://jsonfeed.org/version/1", + "user_comment": "This is a podcast feed. You can add this feed to your podcast client using the following URL: http://therecord.co/feed.json", + "title": "The Record", + "home_page_url": "http://therecord.co/", + "feed_url": "http://therecord.co/feed.json", + "items": [ + { + "id": "http://therecord.co/chris-parrish", + "title": "Special #1 - Chris Parrish", + "url": "http://therecord.co/chris-parrish", + "content_text": "Chris has worked at Adobe and as a founder of Rogue Sheep, which won an Apple Design Award for Postage. Chris’s new company is Aged & Distilled with Guy English — which shipped Napkin, a Mac app for visual collaboration. Chris is also the co-host of The Record. He lives on Bainbridge Island, a quick ferry ride from Seattle.", + "content_html": "Chris has worked at Adobe and as a founder of Rogue Sheep, which won an Apple Design Award for Postage. Chris’s new company is Aged & Distilled with Guy English — which shipped Napkin, a Mac app for visual collaboration. Chris is also the co-host of The Record. He lives on Bainbridge Island, a quick ferry ride from Seattle.", + "summary": "Brent interviews Chris Parrish, co-host of The Record and one-half of Aged & Distilled.", + "date_published": "2014-05-09T14:04:00-07:00", + "attachments": [ + { + "url": "http://therecord.co/downloads/The-Record-sp1e1-ChrisParrish.m4a", + "mime_type": "audio/x-m4a", + "size_in_bytes": 89970236, + "duration_in_seconds": 6629 + } + ] + } + ] + }` + + feed, err := Parse(bytes.NewBufferString(data)) + if err != nil { + t.Error(err) + } + + if feed.Title != "The Record" { + t.Errorf("Incorrect title, got: %s", feed.Title) + } + + if feed.FeedURL != "http://therecord.co/feed.json" { + t.Errorf("Incorrect feed URL, got: %s", feed.FeedURL) + } + + if feed.SiteURL != "http://therecord.co/" { + t.Errorf("Incorrect site URL, got: %s", feed.SiteURL) + } + + if len(feed.Entries) != 1 { + t.Errorf("Incorrect number of entries, got: %d", len(feed.Entries)) + } + + if feed.Entries[0].Hash != "6b678e57962a1b001e4e873756563cdc08bbd06ca561e764e0baa9a382485797" { + t.Errorf("Incorrect entry hash, got: %s", feed.Entries[0].Hash) + } + + if feed.Entries[0].URL != "http://therecord.co/chris-parrish" { + t.Errorf("Incorrect entry URL, got: %s", feed.Entries[0].URL) + } + + if feed.Entries[0].Title != "Special #1 - Chris Parrish" { + t.Errorf(`Incorrect entry title, got: "%s"`, feed.Entries[0].Title) + } + + if feed.Entries[0].Content != `Chris has worked at Adobe and as a founder of Rogue Sheep, which won an Apple Design Award for Postage. Chris’s new company is Aged & Distilled with Guy English — which shipped Napkin, a Mac app for visual collaboration. Chris is also the co-host of The Record. He lives on Bainbridge Island, a quick ferry ride from Seattle.` { + t.Errorf(`Incorrect entry content, got: "%s"`, feed.Entries[0].Content) + } + + location, _ := time.LoadLocation("America/Vancouver") + if !feed.Entries[0].Date.Equal(time.Date(2014, time.May, 9, 14, 4, 0, 0, location)) { + t.Errorf("Incorrect entry date, got: %v", feed.Entries[0].Date) + } + + if len(feed.Entries[0].Enclosures) != 1 { + t.Errorf("Incorrect number of enclosures, got: %d", len(feed.Entries[0].Enclosures)) + } + + if feed.Entries[0].Enclosures[0].URL != "http://therecord.co/downloads/The-Record-sp1e1-ChrisParrish.m4a" { + t.Errorf("Incorrect enclosure URL, got: %s", feed.Entries[0].Enclosures[0].URL) + } + + if feed.Entries[0].Enclosures[0].MimeType != "audio/x-m4a" { + t.Errorf("Incorrect enclosure type, got: %s", feed.Entries[0].Enclosures[0].MimeType) + } + + if feed.Entries[0].Enclosures[0].Size != 89970236 { + t.Errorf("Incorrect enclosure length, got: %d", feed.Entries[0].Enclosures[0].Size) + } +} + +func TestParseAuthor(t *testing.T) { + data := `{ + "version": "https://jsonfeed.org/version/1", + "user_comment": "This is a microblog feed. You can add this to your feed reader using the following URL: https://example.org/feed.json", + "title": "Brent Simmons’s Microblog", + "home_page_url": "https://example.org/", + "feed_url": "https://example.org/feed.json", + "author": { + "name": "Brent Simmons", + "url": "http://example.org/", + "avatar": "https://example.org/avatar.png" + }, + "items": [ + { + "id": "2347259", + "url": "https://example.org/2347259", + "content_text": "Cats are neat. \n\nhttps://example.org/cats", + "date_published": "2016-02-09T14:22:00-07:00" + } + ] + }` + + feed, err := Parse(bytes.NewBufferString(data)) + if err != nil { + t.Error(err) + } + + if len(feed.Entries) != 1 { + t.Errorf("Incorrect number of entries, got: %d", len(feed.Entries)) + } + + if feed.Entries[0].Author != "Brent Simmons" { + t.Errorf("Incorrect entry author, got: %s", feed.Entries[0].Author) + } +} + +func TestParseFeedWithoutTitle(t *testing.T) { + data := `{ + "version": "https://jsonfeed.org/version/1", + "home_page_url": "https://example.org/", + "feed_url": "https://example.org/feed.json", + "items": [ + { + "id": "2347259", + "url": "https://example.org/2347259", + "content_text": "Cats are neat. \n\nhttps://example.org/cats", + "date_published": "2016-02-09T14:22:00-07:00" + } + ] + }` + + feed, err := Parse(bytes.NewBufferString(data)) + if err != nil { + t.Error(err) + } + + if feed.Title != "https://example.org/" { + t.Errorf("Incorrect title, got: %s", feed.Title) + } +} + +func TestParseFeedItemWithInvalidDate(t *testing.T) { + data := `{ + "version": "https://jsonfeed.org/version/1", + "title": "My Example Feed", + "home_page_url": "https://example.org/", + "feed_url": "https://example.org/feed.json", + "items": [ + { + "id": "2347259", + "url": "https://example.org/2347259", + "content_text": "Cats are neat. \n\nhttps://example.org/cats", + "date_published": "Tomorrow" + } + ] + }` + + feed, err := Parse(bytes.NewBufferString(data)) + if err != nil { + t.Error(err) + } + + if len(feed.Entries) != 1 { + t.Errorf("Incorrect number of entries, got: %d", len(feed.Entries)) + } + + if !feed.Entries[0].Date.Before(time.Now()) { + t.Errorf("Incorrect entry date, got: %v", feed.Entries[0].Date) + } +} + +func TestParseFeedItemWithoutID(t *testing.T) { + data := `{ + "version": "https://jsonfeed.org/version/1", + "title": "My Example Feed", + "home_page_url": "https://example.org/", + "feed_url": "https://example.org/feed.json", + "items": [ + { + "content_text": "Some text." + } + ] + }` + + feed, err := Parse(bytes.NewBufferString(data)) + if err != nil { + t.Error(err) + } + + if len(feed.Entries) != 1 { + t.Errorf("Incorrect number of entries, got: %d", len(feed.Entries)) + } + + if feed.Entries[0].Hash != "13b4c5aecd1b6d749afcee968fbf9c80f1ed1bbdbe1aaf25cb34ebd01144bbe9" { + t.Errorf("Incorrect entry hash, got: %s", feed.Entries[0].Hash) + } +} + +func TestParseFeedItemWithoutTitle(t *testing.T) { + data := `{ + "version": "https://jsonfeed.org/version/1", + "title": "My Example Feed", + "home_page_url": "https://example.org/", + "feed_url": "https://example.org/feed.json", + "items": [ + { + "url": "https://example.org/item" + } + ] + }` + + feed, err := Parse(bytes.NewBufferString(data)) + if err != nil { + t.Error(err) + } + + if len(feed.Entries) != 1 { + t.Errorf("Incorrect number of entries, got: %d", len(feed.Entries)) + } + + if feed.Entries[0].Title != "https://example.org/item" { + t.Errorf("Incorrect entry title, got: %s", feed.Entries[0].Title) + } +} + +func TestParseTruncateItemTitle(t *testing.T) { + data := `{ + "version": "https://jsonfeed.org/version/1", + "title": "My Example Feed", + "home_page_url": "https://example.org/", + "feed_url": "https://example.org/feed.json", + "items": [ + { + "title": "` + strings.Repeat("a", 200) + `" + } + ] + }` + + feed, err := Parse(bytes.NewBufferString(data)) + if err != nil { + t.Error(err) + } + + if len(feed.Entries) != 1 { + t.Errorf("Incorrect number of entries, got: %d", len(feed.Entries)) + } + + if len(feed.Entries[0].Title) != 103 { + t.Errorf("Incorrect entry title, got: %s", feed.Entries[0].Title) + } +} + +func TestParseInvalidJSON(t *testing.T) { + data := `garbage` + _, err := Parse(bytes.NewBufferString(data)) + if err == nil { + t.Error("Parse should returns an error") + } + + if _, ok := err.(errors.LocalizedError); !ok { + t.Error("The error returned must be a LocalizedError") + } +} -- cgit v1.2.3