From 89307010adab9c1b0c8b963f50898ee449bbe42f Mon Sep 17 00:00:00 2001 From: Frédéric Guillot Date: Mon, 20 Nov 2017 18:34:11 -0800 Subject: Add parser for RDF feeds --- reader/feed/atom/parser.go | 4 +- reader/feed/parser.go | 27 ++-- reader/feed/parser_test.go | 48 ++++++- reader/feed/rdf/parser.go | 28 ++++ reader/feed/rdf/parser_test.go | 307 +++++++++++++++++++++++++++++++++++++++++ reader/feed/rdf/rdf.go | 71 ++++++++++ reader/feed/rss/parser.go | 2 +- 7 files changed, 470 insertions(+), 17 deletions(-) create mode 100644 reader/feed/rdf/parser.go create mode 100644 reader/feed/rdf/parser_test.go create mode 100644 reader/feed/rdf/rdf.go (limited to 'reader/feed') diff --git a/reader/feed/atom/parser.go b/reader/feed/atom/parser.go index cb21c04..ec0d6b4 100644 --- a/reader/feed/atom/parser.go +++ b/reader/feed/atom/parser.go @@ -14,7 +14,7 @@ import ( "golang.org/x/net/html/charset" ) -// Parse returns a normalized feed struct. +// Parse returns a normalized feed struct from a Atom feed. func Parse(data io.Reader) (*model.Feed, error) { atomFeed := new(AtomFeed) decoder := xml.NewDecoder(data) @@ -22,7 +22,7 @@ func Parse(data io.Reader) (*model.Feed, error) { err := decoder.Decode(atomFeed) if err != nil { - return nil, errors.NewLocalizedError("Unable to parse Atom feed: %v", err) + return nil, errors.NewLocalizedError("Unable to parse Atom feed: %v.", err) } return atomFeed.Transform(), nil diff --git a/reader/feed/parser.go b/reader/feed/parser.go index 8df6b46..d94d72b 100644 --- a/reader/feed/parser.go +++ b/reader/feed/parser.go @@ -8,25 +8,30 @@ import ( "bytes" "encoding/xml" "errors" + "io" + "strings" + "time" + "github.com/miniflux/miniflux2/helper" "github.com/miniflux/miniflux2/model" "github.com/miniflux/miniflux2/reader/feed/atom" "github.com/miniflux/miniflux2/reader/feed/json" + "github.com/miniflux/miniflux2/reader/feed/rdf" "github.com/miniflux/miniflux2/reader/feed/rss" - "io" - "strings" - "time" "golang.org/x/net/html/charset" ) +// List of feed formats. const ( - FormatRss = "rss" + FormatRDF = "rdf" + FormatRSS = "rss" FormatAtom = "atom" - FormatJson = "json" + FormatJSON = "json" FormatUnknown = "unknown" ) +// DetectFeedFormat detect feed format from input data. func DetectFeedFormat(data io.Reader) string { defer helper.ExecutionTime(time.Now(), "[Feed:DetectFeedFormat]") @@ -45,15 +50,17 @@ func DetectFeedFormat(data io.Reader) string { if element, ok := token.(xml.StartElement); ok { switch element.Name.Local { case "rss": - return FormatRss + return FormatRSS case "feed": return FormatAtom + case "RDF": + return FormatRDF } } } if strings.HasPrefix(strings.TrimSpace(buffer.String()), "{") { - return FormatJson + return FormatJSON } return FormatUnknown @@ -72,10 +79,12 @@ func parseFeed(data io.Reader) (*model.Feed, error) { switch format { case FormatAtom: return atom.Parse(reader) - case FormatRss: + case FormatRSS: return rss.Parse(reader) - case FormatJson: + case FormatJSON: return json.Parse(reader) + case FormatRDF: + return rdf.Parse(reader) default: return nil, errors.New("Unsupported feed format") } diff --git a/reader/feed/parser_test.go b/reader/feed/parser_test.go index 0dd8dd6..b201caf 100644 --- a/reader/feed/parser_test.go +++ b/reader/feed/parser_test.go @@ -9,12 +9,21 @@ import ( "testing" ) +func TestDetectRDF(t *testing.T) { + data := `` + format := DetectFeedFormat(bytes.NewBufferString(data)) + + if format != FormatRDF { + t.Errorf("Wrong format detected: %s instead of %s", format, FormatRDF) + } +} + func TestDetectRSS(t *testing.T) { data := `` format := DetectFeedFormat(bytes.NewBufferString(data)) - if format != FormatRss { - t.Errorf("Wrong format detected: %s instead of %s", format, FormatRss) + if format != FormatRSS { + t.Errorf("Wrong format detected: %s instead of %s", format, FormatRSS) } } @@ -45,8 +54,8 @@ func TestDetectJSON(t *testing.T) { ` format := DetectFeedFormat(bytes.NewBufferString(data)) - if format != FormatJson { - t.Errorf("Wrong format detected: %s instead of %s", format, FormatJson) + if format != FormatJSON { + t.Errorf("Wrong format detected: %s instead of %s", format, FormatJSON) } } @@ -93,7 +102,7 @@ func TestParseAtom(t *testing.T) { } } -func TestParseRss(t *testing.T) { +func TestParseRSS(t *testing.T) { data := ` @@ -119,6 +128,35 @@ func TestParseRss(t *testing.T) { } } +func TestParseRDF(t *testing.T) { + data := ` + + + + RDF Example + http://example.org/ + + + + Title + http://example.org/item + Test + + ` + + feed, err := parseFeed(bytes.NewBufferString(data)) + if err != nil { + t.Error(err) + } + + if feed.Title != "RDF Example" { + t.Errorf("Incorrect title, got: %s", feed.Title) + } +} + func TestParseJson(t *testing.T) { data := `{ "version": "https://jsonfeed.org/version/1", diff --git a/reader/feed/rdf/parser.go b/reader/feed/rdf/parser.go new file mode 100644 index 0000000..f854a97 --- /dev/null +++ b/reader/feed/rdf/parser.go @@ -0,0 +1,28 @@ +// Copyright 2017 Frédéric Guillot. All rights reserved. +// Use of this source code is governed by the Apache 2.0 +// license that can be found in the LICENSE file. + +package rdf + +import ( + "encoding/xml" + "io" + + "github.com/miniflux/miniflux2/errors" + "github.com/miniflux/miniflux2/model" + "golang.org/x/net/html/charset" +) + +// Parse returns a normalized feed struct from a RDF feed. +func Parse(data io.Reader) (*model.Feed, error) { + feed := new(rdfFeed) + decoder := xml.NewDecoder(data) + decoder.CharsetReader = charset.NewReaderLabel + + err := decoder.Decode(feed) + if err != nil { + return nil, errors.NewLocalizedError("Unable to parse RDF feed: %v.", err) + } + + return feed.Transform(), nil +} diff --git a/reader/feed/rdf/parser_test.go b/reader/feed/rdf/parser_test.go new file mode 100644 index 0000000..dadca6f --- /dev/null +++ b/reader/feed/rdf/parser_test.go @@ -0,0 +1,307 @@ +// Copyright 2017 Frédéric Guillot. All rights reserved. +// Use of this source code is governed by the Apache 2.0 +// license that can be found in the LICENSE file. + +package rdf + +import ( + "bytes" + "strings" + "testing" + + "github.com/miniflux/miniflux2/errors" +) + +func TestParseRDFSample(t *testing.T) { + data := ` + + + + + + XML.com + http://xml.com/pub + + XML.com features a rich mix of information and services + for the XML community. + + + + + + + + + + + + + + + + + XML.com + http://www.xml.com + http://xml.com/universal/images/xml_tiny.gif + + + + Processing Inclusions with XSLT + http://xml.com/pub/2000/08/09/xslt/xslt.html + + Processing document inclusions with general XML tools can be + problematic. This article proposes a way of preserving inclusion + information through SAX-based processing. + + + + + Putting RDF to Work + http://xml.com/pub/2000/08/09/rdfdb/index.html + + Tool and API support for the Resource Description Framework + is slowly coming of age. Edd Dumbill takes a look at RDFDB, + one of the most exciting new RDF toolkits. + + + + + Search XML.com + Search XML.com's XML collection + s + http://search.xml.com + + + ` + + feed, err := Parse(bytes.NewBufferString(data)) + if err != nil { + t.Error(err) + } + + if feed.Title != "XML.com" { + t.Errorf("Incorrect title, got: %s", feed.Title) + } + + if feed.FeedURL != "" { + t.Errorf("Incorrect feed URL, got: %s", feed.FeedURL) + } + + if feed.SiteURL != "http://xml.com/pub" { + t.Errorf("Incorrect site URL, got: %s", feed.SiteURL) + } + + if len(feed.Entries) != 2 { + t.Errorf("Incorrect number of entries, got: %d", len(feed.Entries)) + } + + if feed.Entries[1].Hash != "8aaeee5d3ab50351422fbded41078ee88c73bf1441085b16a8c09fd90a7db321" { + t.Errorf("Incorrect entry hash, got: %s", feed.Entries[0].Hash) + } + + if feed.Entries[1].URL != "http://xml.com/pub/2000/08/09/rdfdb/index.html" { + t.Errorf("Incorrect entry URL, got: %s", feed.Entries[0].URL) + } + + if feed.Entries[1].Title != "Putting RDF to Work" { + t.Errorf("Incorrect entry title, got: %s", feed.Entries[0].Title) + } + + if strings.HasSuffix(feed.Entries[1].Content, "Tool and API support") { + t.Errorf("Incorrect entry content, got: %s", feed.Entries[0].Content) + } +} + +func TestParseRDFSampleWithDublinCore(t *testing.T) { + data := ` + + + + + Meerkat + http://meerkat.oreillynet.com + Meerkat: An Open Wire Service + The O'Reilly Network + Rael Dornfest (mailto:rael@oreilly.com) + Copyright © 2000 O'Reilly & Associates, Inc. + 2000-01-01T12:00+00:00 + hourly + 2 + 2000-01-01T12:00+00:00 + + + + + + + + + + + + + + + Meerkat Powered! + http://meerkat.oreillynet.com/icons/meerkat-powered.jpg + http://meerkat.oreillynet.com + + + + XML: A Disruptive Technology + http://c.moreover.com/click/here.pl?r123 + + XML is placing increasingly heavy loads on the existing technical + infrastructure of the Internet. + + The O'Reilly Network + Simon St.Laurent (mailto:simonstl@simonstl.com) + Copyright © 2000 O'Reilly & Associates, Inc. + XML + XML.com + NASDAQ + XML + + + + Search Meerkat + Search Meerkat's RSS Database... + s + http://meerkat.oreillynet.com/ + search + regex + + + ` + + feed, err := Parse(bytes.NewBufferString(data)) + if err != nil { + t.Error(err) + } + + if feed.Title != "Meerkat" { + t.Errorf("Incorrect title, got: %s", feed.Title) + } + + if feed.FeedURL != "" { + t.Errorf("Incorrect feed URL, got: %s", feed.FeedURL) + } + + if feed.SiteURL != "http://meerkat.oreillynet.com" { + t.Errorf("Incorrect site URL, got: %s", feed.SiteURL) + } + + if len(feed.Entries) != 1 { + t.Errorf("Incorrect number of entries, got: %d", len(feed.Entries)) + } + + if feed.Entries[0].Hash != "fa4ef7c300b175ca66f92f226b5dba5caa2a9619f031101bf56e5b884b02cd97" { + t.Errorf("Incorrect entry hash, got: %s", feed.Entries[0].Hash) + } + + if feed.Entries[0].URL != "http://c.moreover.com/click/here.pl?r123" { + t.Errorf("Incorrect entry URL, got: %s", feed.Entries[0].URL) + } + + if feed.Entries[0].Title != "XML: A Disruptive Technology" { + t.Errorf("Incorrect entry title, got: %s", feed.Entries[0].Title) + } + + if strings.HasSuffix(feed.Entries[0].Content, "XML is placing increasingly") { + t.Errorf("Incorrect entry content, got: %s", feed.Entries[0].Content) + } + + if feed.Entries[0].Author != "Simon St.Laurent (mailto:simonstl@simonstl.com)" { + t.Errorf("Incorrect entry author, got: %s", feed.Entries[0].Author) + } +} + +func TestParseItemWithOnlyFeedAuthor(t *testing.T) { + data := ` + + + + + Meerkat + http://meerkat.oreillynet.com + Rael Dornfest (mailto:rael@oreilly.com) + + + + XML: A Disruptive Technology + http://c.moreover.com/click/here.pl?r123 + + XML is placing increasingly heavy loads on the existing technical + infrastructure of the Internet. + + + ` + + feed, err := Parse(bytes.NewBufferString(data)) + if err != nil { + t.Error(err) + } + + if feed.Entries[0].Author != "Rael Dornfest (mailto:rael@oreilly.com)" { + t.Errorf("Incorrect entry author, got: %s", feed.Entries[0].Author) + } +} + +func TestParseItemWithoutLink(t *testing.T) { + data := ` + + + + + Meerkat + http://meerkat.oreillynet.com + + + + Title + Test + + ` + + feed, err := Parse(bytes.NewBufferString(data)) + if err != nil { + t.Error(err) + } + + if feed.Entries[0].Hash != "37f5223ebd58639aa62a49afbb61df960efb7dc5db5181dfb3cedd9a49ad34c6" { + t.Errorf("Incorrect entry hash, got: %s", feed.Entries[0].Hash) + } + + if feed.Entries[0].URL != "http://meerkat.oreillynet.com" { + t.Errorf("Incorrect entry url, got: %s", feed.Entries[0].URL) + } +} + +func TestParseInvalidXml(t *testing.T) { + data := `garbage` + _, err := Parse(bytes.NewBufferString(data)) + if err == nil { + t.Error("Parse should returns an error") + } + + if _, ok := err.(errors.LocalizedError); !ok { + t.Error("The error returned must be a LocalizedError") + } +} diff --git a/reader/feed/rdf/rdf.go b/reader/feed/rdf/rdf.go new file mode 100644 index 0000000..05281ca --- /dev/null +++ b/reader/feed/rdf/rdf.go @@ -0,0 +1,71 @@ +// Copyright 2017 Frédéric Guillot. All rights reserved. +// Use of this source code is governed by the Apache 2.0 +// license that can be found in the LICENSE file. + +package rdf + +import ( + "encoding/xml" + + "github.com/miniflux/miniflux2/helper" + "github.com/miniflux/miniflux2/reader/processor" + "github.com/miniflux/miniflux2/reader/sanitizer" + + "github.com/miniflux/miniflux2/model" +) + +type rdfFeed struct { + XMLName xml.Name `xml:"RDF"` + Title string `xml:"channel>title"` + Link string `xml:"channel>link"` + Creator string `xml:"channel>creator"` + Items []rdfItem `xml:"item"` +} + +func (r *rdfFeed) Transform() *model.Feed { + feed := new(model.Feed) + feed.Title = sanitizer.StripTags(r.Title) + feed.SiteURL = r.Link + + for _, item := range r.Items { + entry := item.Transform() + + if entry.Author == "" && r.Creator != "" { + entry.Author = sanitizer.StripTags(r.Creator) + } + + if entry.URL == "" { + entry.URL = feed.SiteURL + } + + feed.Entries = append(feed.Entries, entry) + } + + return feed +} + +type rdfItem struct { + Title string `xml:"title"` + Link string `xml:"link"` + Description string `xml:"description"` + Creator string `xml:"creator"` +} + +func (r *rdfItem) Transform() *model.Entry { + entry := new(model.Entry) + entry.Title = sanitizer.StripTags(r.Title) + entry.Author = sanitizer.StripTags(r.Creator) + entry.URL = r.Link + entry.Content = processor.ItemContentProcessor(entry.URL, r.Description) + entry.Hash = getHash(r) + return entry +} + +func getHash(r *rdfItem) string { + value := r.Link + if value == "" { + value = r.Title + r.Description + } + + return helper.Hash(value) +} diff --git a/reader/feed/rss/parser.go b/reader/feed/rss/parser.go index 4eceb45..a5b4434 100644 --- a/reader/feed/rss/parser.go +++ b/reader/feed/rss/parser.go @@ -22,7 +22,7 @@ func Parse(data io.Reader) (*model.Feed, error) { err := decoder.Decode(feed) if err != nil { - return nil, errors.NewLocalizedError("Unable to parse RSS feed: %v", err) + return nil, errors.NewLocalizedError("Unable to parse RSS feed: %v.", err) } return feed.Transform(), nil -- cgit v1.2.3