Add parser for RDF feeds

author: Frédéric Guillot <fred@miniflux.net> 2017-11-20 18:34:11 -0800
committer: Frédéric Guillot <fred@miniflux.net> 2017-11-20 18:34:11 -0800
commit: 89307010adab9c1b0c8b963f50898ee449bbe42f (patch)
tree: 0d4035227103db256e12af492a026570ba206396 /reader/feed
parent: c5cd38de832fffb20cbf97a60babe8cbc42fde1e (diff)
7 files changed, 470 insertions, 17 deletions
diff --git a/reader/feed/atom/parser.go b/reader/feed/atom/parser.go
index cb21c04..ec0d6b4 100644
--- a/reader/feed/atom/parser.go
+++ b/reader/feed/atom/parser.go
@@ -14,7 +14,7 @@ import (
 	"golang.org/x/net/html/charset"
 )
 
-// Parse returns a normalized feed struct.
+// Parse returns a normalized feed struct from an Atom feed.
 func Parse(data io.Reader) (*model.Feed, error) {
 	atomFeed := new(AtomFeed)
 	decoder := xml.NewDecoder(data)
@@ -22,7 +22,7 @@ func Parse(data io.Reader) (*model.Feed, error) {
 
 	err := decoder.Decode(atomFeed)
 	if err != nil {
-		return nil, errors.NewLocalizedError("Unable to parse Atom feed: %v", err)
+		return nil, errors.NewLocalizedError("Unable to parse Atom feed: %v.", err)
 	}
 
 	return atomFeed.Transform(), nil
diff --git a/reader/feed/parser.go b/reader/feed/parser.go
index 8df6b46..d94d72b 100644
--- a/reader/feed/parser.go
+++ b/reader/feed/parser.go
@@ -8,25 +8,30 @@ import (
 	"bytes"
 	"encoding/xml"
 	"errors"
+	"io"
+	"strings"
+	"time"
+
 	"github.com/miniflux/miniflux2/helper"
 	"github.com/miniflux/miniflux2/model"
 	"github.com/miniflux/miniflux2/reader/feed/atom"
 	"github.com/miniflux/miniflux2/reader/feed/json"
+	"github.com/miniflux/miniflux2/reader/feed/rdf"
 	"github.com/miniflux/miniflux2/reader/feed/rss"
-	"io"
-	"strings"
-	"time"
 
 	"golang.org/x/net/html/charset"
 )
 
+// List of feed formats.
 const (
-	FormatRss     = "rss"
+	FormatRDF     = "rdf"
+	FormatRSS     = "rss"
 	FormatAtom    = "atom"
-	FormatJson    = "json"
+	FormatJSON    = "json"
 	FormatUnknown = "unknown"
 )
 
+// DetectFeedFormat detects the feed format from input data.
 func DetectFeedFormat(data io.Reader) string {
 	defer helper.ExecutionTime(time.Now(), "[Feed:DetectFeedFormat]")
 
@@ -45,15 +50,17 @@ func DetectFeedFormat(data io.Reader) string {
 		if element, ok := token.(xml.StartElement); ok {
 			switch element.Name.Local {
 			case "rss":
-				return FormatRss
+				return FormatRSS
 			case "feed":
 				return FormatAtom
+			case "RDF":
+				return FormatRDF
 			}
 		}
 	}
 
 	if strings.HasPrefix(strings.TrimSpace(buffer.String()), "{") {
-		return FormatJson
+		return FormatJSON
 	}
 
 	return FormatUnknown
@@ -72,10 +79,12 @@ func parseFeed(data io.Reader) (*model.Feed, error) {
 	switch format {
 	case FormatAtom:
 		return atom.Parse(reader)
-	case FormatRss:
+	case FormatRSS:
 		return rss.Parse(reader)
-	case FormatJson:
+	case FormatJSON:
 		return json.Parse(reader)
+	case FormatRDF:
+		return rdf.Parse(reader)
 	default:
 		return nil, errors.New("Unsupported feed format")
 	}
diff --git a/reader/feed/parser_test.go b/reader/feed/parser_test.go
index 0dd8dd6..b201caf 100644
--- a/reader/feed/parser_test.go
+++ b/reader/feed/parser_test.go
@@ -9,12 +9,21 @@ import (
 	"testing"
 )
 
+func TestDetectRDF(t *testing.T) {
+	data := `<?xml version="1.0"?><rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns="http://my.netscape.com/rdf/simple/0.9/"></rdf:RDF>`
+	format := DetectFeedFormat(bytes.NewBufferString(data))
+
+	if format != FormatRDF {
+		t.Errorf("Wrong format detected: %s instead of %s", format, FormatRDF)
+	}
+}
+
 func TestDetectRSS(t *testing.T) {
 	data := `<?xml version="1.0"?><rss version="2.0"><channel></channel></rss>`
 	format := DetectFeedFormat(bytes.NewBufferString(data))
 
-	if format != FormatRss {
-		t.Errorf("Wrong format detected: %s instead of %s", format, FormatRss)
+	if format != FormatRSS {
+		t.Errorf("Wrong format detected: %s instead of %s", format, FormatRSS)
 	}
 }
 
@@ -45,8 +54,8 @@ func TestDetectJSON(t *testing.T) {
 	`
 	format := DetectFeedFormat(bytes.NewBufferString(data))
 
-	if format != FormatJson {
-		t.Errorf("Wrong format detected: %s instead of %s", format, FormatJson)
+	if format != FormatJSON {
+		t.Errorf("Wrong format detected: %s instead of %s", format, FormatJSON)
 	}
 }
 
@@ -93,7 +102,7 @@ func TestParseAtom(t *testing.T) {
 	}
 }
 
-func TestParseRss(t *testing.T) {
+func TestParseRSS(t *testing.T) {
 	data := `<?xml version="1.0"?>
 	<rss version="2.0">
 	<channel>
@@ -119,6 +128,35 @@ func TestParseRss(t *testing.T) {
 	}
 }
 
+func TestParseRDF(t *testing.T) {
+	data := `<?xml version="1.0" encoding="utf-8"?>
+		<rdf:RDF
+		  xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
+		  xmlns="http://purl.org/rss/1.0/"
+		>
+
+		  <channel>
+			<title>RDF Example</title>
+			<link>http://example.org/</link>
+		  </channel>
+
+		  <item>
+			<title>Title</title>
+			<link>http://example.org/item</link>
+			<description>Test</description>
+		  </item>
+		</rdf:RDF>`
+
+	feed, err := parseFeed(bytes.NewBufferString(data))
+	if err != nil {
+		t.Fatal(err) // feed is nil on error: stop here instead of dereferencing it below
+	}
+
+	if feed.Title != "RDF Example" {
+		t.Errorf("Incorrect title, got: %s", feed.Title)
+	}
+}
+
 func TestParseJson(t *testing.T) {
 	data := `{
 		"version": "https://jsonfeed.org/version/1",
diff --git a/reader/feed/rdf/parser.go b/reader/feed/rdf/parser.go
new file mode 100644
index 0000000..f854a97
--- /dev/null
+++ b/reader/feed/rdf/parser.go
@@ -0,0 +1,28 @@
+// Copyright 2017 Frédéric Guillot. All rights reserved.
+// Use of this source code is governed by the Apache 2.0
+// license that can be found in the LICENSE file.
+
+package rdf
+
+import (
+	"encoding/xml"
+	"io"
+
+	"github.com/miniflux/miniflux2/errors"
+	"github.com/miniflux/miniflux2/model"
+	"golang.org/x/net/html/charset"
+)
+
+// Parse returns a normalized feed struct from an RDF feed.
+func Parse(data io.Reader) (*model.Feed, error) {
+	feed := new(rdfFeed)
+	decoder := xml.NewDecoder(data)
+	decoder.CharsetReader = charset.NewReaderLabel
+
+	err := decoder.Decode(feed)
+	if err != nil {
+		return nil, errors.NewLocalizedError("Unable to parse RDF feed: %v.", err)
+	}
+
+	return feed.Transform(), nil
+}
diff --git a/reader/feed/rdf/parser_test.go b/reader/feed/rdf/parser_test.go
new file mode 100644
index 0000000..dadca6f
--- /dev/null
+++ b/reader/feed/rdf/parser_test.go
@@ -0,0 +1,307 @@
+// Copyright 2017 Frédéric Guillot. All rights reserved.
+// Use of this source code is governed by the Apache 2.0
+// license that can be found in the LICENSE file.
+
+package rdf
+
+import (
+	"bytes"
+	"strings"
+	"testing"
+
+	"github.com/miniflux/miniflux2/errors"
+)
+
+func TestParseRDFSample(t *testing.T) {
+	data := `
+	<?xml version="1.0"?>
+
+	<rdf:RDF
+	  xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
+	  xmlns="http://purl.org/rss/1.0/"
+	>
+
+	  <channel rdf:about="http://www.xml.com/xml/news.rss">
+		<title>XML.com</title>
+		<link>http://xml.com/pub</link>
+		<description>
+		  XML.com features a rich mix of information and services
+		  for the XML community.
+		</description>
+
+		<image rdf:resource="http://xml.com/universal/images/xml_tiny.gif" />
+
+		<items>
+		  <rdf:Seq>
+			<rdf:li resource="http://xml.com/pub/2000/08/09/xslt/xslt.html" />
+			<rdf:li resource="http://xml.com/pub/2000/08/09/rdfdb/index.html" />
+		  </rdf:Seq>
+		</items>
+
+		<textinput rdf:resource="http://search.xml.com" />
+
+	  </channel>
+
+	  <image rdf:about="http://xml.com/universal/images/xml_tiny.gif">
+		<title>XML.com</title>
+		<link>http://www.xml.com</link>
+		<url>http://xml.com/universal/images/xml_tiny.gif</url>
+	  </image>
+
+	  <item rdf:about="http://xml.com/pub/2000/08/09/xslt/xslt.html">
+		<title>Processing Inclusions with XSLT</title>
+		<link>http://xml.com/pub/2000/08/09/xslt/xslt.html</link>
+		<description>
+		 Processing document inclusions with general XML tools can be
+		 problematic. This article proposes a way of preserving inclusion
+		 information through SAX-based processing.
+		</description>
+	  </item>
+
+	  <item rdf:about="http://xml.com/pub/2000/08/09/rdfdb/index.html">
+		<title>Putting RDF to Work</title>
+		<link>http://xml.com/pub/2000/08/09/rdfdb/index.html</link>
+		<description>
+		 Tool and API support for the Resource Description Framework
+		 is slowly coming of age. Edd Dumbill takes a look at RDFDB,
+		 one of the most exciting new RDF toolkits.
+		</description>
+	  </item>
+
+	  <textinput rdf:about="http://search.xml.com">
+		<title>Search XML.com</title>
+		<description>Search XML.com's XML collection</description>
+		<name>s</name>
+		<link>http://search.xml.com</link>
+	  </textinput>
+
+	</rdf:RDF>`
+
+	feed, err := Parse(bytes.NewBufferString(data))
+	if err != nil {
+		t.Fatal(err) // feed is nil on error: stop here instead of dereferencing it below
+	}
+
+	if feed.Title != "XML.com" {
+		t.Errorf("Incorrect title, got: %s", feed.Title)
+	}
+
+	if feed.FeedURL != "" {
+		t.Errorf("Incorrect feed URL, got: %s", feed.FeedURL)
+	}
+
+	if feed.SiteURL != "http://xml.com/pub" {
+		t.Errorf("Incorrect site URL, got: %s", feed.SiteURL)
+	}
+
+	if len(feed.Entries) != 2 {
+		t.Fatalf("Incorrect number of entries, got: %d", len(feed.Entries)) // Entries[1] is indexed below
+	}
+
+	if feed.Entries[1].Hash != "8aaeee5d3ab50351422fbded41078ee88c73bf1441085b16a8c09fd90a7db321" {
+		t.Errorf("Incorrect entry hash, got: %s", feed.Entries[1].Hash)
+	}
+
+	if feed.Entries[1].URL != "http://xml.com/pub/2000/08/09/rdfdb/index.html" {
+		t.Errorf("Incorrect entry URL, got: %s", feed.Entries[1].URL)
+	}
+
+	if feed.Entries[1].Title != "Putting RDF to Work" {
+		t.Errorf("Incorrect entry title, got: %s", feed.Entries[1].Title)
+	}
+
+	if strings.HasSuffix(feed.Entries[1].Content, "Tool and API support") {
+		t.Errorf("Incorrect entry content, got: %s", feed.Entries[1].Content)
+	}
+}
+
+func TestParseRDFSampleWithDublinCore(t *testing.T) {
+	data := `<?xml version="1.0" encoding="utf-8"?>
+
+	<rdf:RDF
+	  xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
+	  xmlns:dc="http://purl.org/dc/elements/1.1/"
+	  xmlns:sy="http://purl.org/rss/1.0/modules/syndication/"
+	  xmlns:co="http://purl.org/rss/1.0/modules/company/"
+	  xmlns:ti="http://purl.org/rss/1.0/modules/textinput/"
+	  xmlns="http://purl.org/rss/1.0/"
+	>
+
+	  <channel rdf:about="http://meerkat.oreillynet.com/?_fl=rss1.0">
+		<title>Meerkat</title>
+		<link>http://meerkat.oreillynet.com</link>
+		<description>Meerkat: An Open Wire Service</description>
+		<dc:publisher>The O'Reilly Network</dc:publisher>
+		<dc:creator>Rael Dornfest (mailto:rael@oreilly.com)</dc:creator>
+		<dc:rights>Copyright &#169; 2000 O'Reilly &amp; Associates, Inc.</dc:rights>
+		<dc:date>2000-01-01T12:00+00:00</dc:date>
+		<sy:updatePeriod>hourly</sy:updatePeriod>
+		<sy:updateFrequency>2</sy:updateFrequency>
+		<sy:updateBase>2000-01-01T12:00+00:00</sy:updateBase>
+
+		<image rdf:resource="http://meerkat.oreillynet.com/icons/meerkat-powered.jpg" />
+
+		<items>
+		  <rdf:Seq>
+			<rdf:li resource="http://c.moreover.com/click/here.pl?r123" />
+		  </rdf:Seq>
+		</items>
+
+		<textinput rdf:resource="http://meerkat.oreillynet.com" />
+
+	  </channel>
+
+	  <image rdf:about="http://meerkat.oreillynet.com/icons/meerkat-powered.jpg">
+		<title>Meerkat Powered!</title>
+		<url>http://meerkat.oreillynet.com/icons/meerkat-powered.jpg</url>
+		<link>http://meerkat.oreillynet.com</link>
+	  </image>
+
+	  <item rdf:about="http://c.moreover.com/click/here.pl?r123">
+		<title>XML: A Disruptive Technology</title>
+		<link>http://c.moreover.com/click/here.pl?r123</link>
+		<dc:description>
+		  XML is placing increasingly heavy loads on the existing technical
+		  infrastructure of the Internet.
+		</dc:description>
+		<dc:publisher>The O'Reilly Network</dc:publisher>
+		<dc:creator>Simon St.Laurent (mailto:simonstl@simonstl.com)</dc:creator>
+		<dc:rights>Copyright &#169; 2000 O'Reilly &amp; Associates, Inc.</dc:rights>
+		<dc:subject>XML</dc:subject>
+		<co:name>XML.com</co:name>
+		<co:market>NASDAQ</co:market>
+		<co:symbol>XML</co:symbol>
+	  </item>
+
+	  <textinput rdf:about="http://meerkat.oreillynet.com">
+		<title>Search Meerkat</title>
+		<description>Search Meerkat's RSS Database...</description>
+		<name>s</name>
+		<link>http://meerkat.oreillynet.com/</link>
+		<ti:function>search</ti:function>
+		<ti:inputType>regex</ti:inputType>
+	  </textinput>
+
+	</rdf:RDF>`
+
+	feed, err := Parse(bytes.NewBufferString(data))
+	if err != nil {
+		t.Fatal(err) // feed is nil on error: stop here instead of dereferencing it below
+	}
+
+	if feed.Title != "Meerkat" {
+		t.Errorf("Incorrect title, got: %s", feed.Title)
+	}
+
+	if feed.FeedURL != "" {
+		t.Errorf("Incorrect feed URL, got: %s", feed.FeedURL)
+	}
+
+	if feed.SiteURL != "http://meerkat.oreillynet.com" {
+		t.Errorf("Incorrect site URL, got: %s", feed.SiteURL)
+	}
+
+	if len(feed.Entries) != 1 {
+		t.Fatalf("Incorrect number of entries, got: %d", len(feed.Entries)) // Entries[0] is indexed below
+	}
+
+	if feed.Entries[0].Hash != "fa4ef7c300b175ca66f92f226b5dba5caa2a9619f031101bf56e5b884b02cd97" {
+		t.Errorf("Incorrect entry hash, got: %s", feed.Entries[0].Hash)
+	}
+
+	if feed.Entries[0].URL != "http://c.moreover.com/click/here.pl?r123" {
+		t.Errorf("Incorrect entry URL, got: %s", feed.Entries[0].URL)
+	}
+
+	if feed.Entries[0].Title != "XML: A Disruptive Technology" {
+		t.Errorf("Incorrect entry title, got: %s", feed.Entries[0].Title)
+	}
+
+	if strings.HasSuffix(feed.Entries[0].Content, "XML is placing increasingly") {
+		t.Errorf("Incorrect entry content, got: %s", feed.Entries[0].Content)
+	}
+
+	if feed.Entries[0].Author != "Simon St.Laurent (mailto:simonstl@simonstl.com)" {
+		t.Errorf("Incorrect entry author, got: %s", feed.Entries[0].Author)
+	}
+}
+
+func TestParseItemWithOnlyFeedAuthor(t *testing.T) {
+	data := `<?xml version="1.0" encoding="utf-8"?>
+
+	<rdf:RDF
+	  xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
+	  xmlns:dc="http://purl.org/dc/elements/1.1/"
+	  xmlns="http://purl.org/rss/1.0/"
+	>
+
+	  <channel rdf:about="http://meerkat.oreillynet.com/?_fl=rss1.0">
+		<title>Meerkat</title>
+		<link>http://meerkat.oreillynet.com</link>
+		<dc:creator>Rael Dornfest (mailto:rael@oreilly.com)</dc:creator>
+	  </channel>
+
+	  <item rdf:about="http://c.moreover.com/click/here.pl?r123">
+		<title>XML: A Disruptive Technology</title>
+		<link>http://c.moreover.com/click/here.pl?r123</link>
+		<dc:description>
+		  XML is placing increasingly heavy loads on the existing technical
+		  infrastructure of the Internet.
+		</dc:description>
+	  </item>
+	</rdf:RDF>`
+
+	feed, err := Parse(bytes.NewBufferString(data))
+	if err != nil {
+		t.Fatal(err) // feed is nil on error: stop here instead of dereferencing it below
+	}
+
+	if feed.Entries[0].Author != "Rael Dornfest (mailto:rael@oreilly.com)" {
+		t.Errorf("Incorrect entry author, got: %s", feed.Entries[0].Author)
+	}
+}
+
+func TestParseItemWithoutLink(t *testing.T) {
+	data := `<?xml version="1.0" encoding="utf-8"?>
+
+	<rdf:RDF
+	  xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
+	  xmlns="http://purl.org/rss/1.0/"
+	>
+
+	  <channel rdf:about="http://meerkat.oreillynet.com/?_fl=rss1.0">
+		<title>Meerkat</title>
+		<link>http://meerkat.oreillynet.com</link>
+	  </channel>
+
+	  <item rdf:about="http://c.moreover.com/click/here.pl?r123">
+		<title>Title</title>
+		<description>Test</description>
+	  </item>
+	</rdf:RDF>`
+
+	feed, err := Parse(bytes.NewBufferString(data))
+	if err != nil {
+		t.Fatal(err) // feed is nil on error: stop here instead of dereferencing it below
+	}
+
+	if feed.Entries[0].Hash != "37f5223ebd58639aa62a49afbb61df960efb7dc5db5181dfb3cedd9a49ad34c6" {
+		t.Errorf("Incorrect entry hash, got: %s", feed.Entries[0].Hash)
+	}
+
+	if feed.Entries[0].URL != "http://meerkat.oreillynet.com" {
+		t.Errorf("Incorrect entry url, got: %s", feed.Entries[0].URL)
+	}
+}
+
+func TestParseInvalidXml(t *testing.T) {
+	data := `garbage`
+	_, err := Parse(bytes.NewBufferString(data))
+	if err == nil {
+		t.Error("Parse should returns an error")
+	}
+
+	if _, ok := err.(errors.LocalizedError); !ok {
+		t.Error("The error returned must be a LocalizedError")
+	}
+}
diff --git a/reader/feed/rdf/rdf.go b/reader/feed/rdf/rdf.go
new file mode 100644
index 0000000..05281ca
--- /dev/null
+++ b/reader/feed/rdf/rdf.go
@@ -0,0 +1,71 @@
+// Copyright 2017 Frédéric Guillot. All rights reserved.
+// Use of this source code is governed by the Apache 2.0
+// license that can be found in the LICENSE file.
+
+package rdf
+
+import (
+	"encoding/xml"
+
+	"github.com/miniflux/miniflux2/helper"
+	"github.com/miniflux/miniflux2/reader/processor"
+	"github.com/miniflux/miniflux2/reader/sanitizer"
+
+	"github.com/miniflux/miniflux2/model"
+)
+
+type rdfFeed struct {
+	XMLName xml.Name  `xml:"RDF"`
+	Title   string    `xml:"channel>title"`
+	Link    string    `xml:"channel>link"`
+	Creator string    `xml:"channel>creator"`
+	Items   []rdfItem `xml:"item"`
+}
+
+func (r *rdfFeed) Transform() *model.Feed {
+	feed := new(model.Feed)
+	feed.Title = sanitizer.StripTags(r.Title)
+	feed.SiteURL = r.Link
+
+	for _, item := range r.Items {
+		entry := item.Transform()
+
+		if entry.Author == "" && r.Creator != "" {
+			entry.Author = sanitizer.StripTags(r.Creator)
+		}
+
+		if entry.URL == "" {
+			entry.URL = feed.SiteURL
+		}
+
+		feed.Entries = append(feed.Entries, entry)
+	}
+
+	return feed
+}
+
+type rdfItem struct {
+	Title       string `xml:"title"`
+	Link        string `xml:"link"`
+	Description string `xml:"description"`
+	Creator     string `xml:"creator"`
+}
+
+func (r *rdfItem) Transform() *model.Entry {
+	entry := new(model.Entry)
+	entry.Title = sanitizer.StripTags(r.Title)
+	entry.Author = sanitizer.StripTags(r.Creator)
+	entry.URL = r.Link
+	entry.Content = processor.ItemContentProcessor(entry.URL, r.Description)
+	entry.Hash = getHash(r)
+	return entry
+}
+
+func getHash(r *rdfItem) string {
+	value := r.Link
+	if value == "" {
+		value = r.Title + r.Description
+	}
+
+	return helper.Hash(value)
+}
diff --git a/reader/feed/rss/parser.go b/reader/feed/rss/parser.go
index 4eceb45..a5b4434 100644
--- a/reader/feed/rss/parser.go
+++ b/reader/feed/rss/parser.go
@@ -22,7 +22,7 @@ func Parse(data io.Reader) (*model.Feed, error) {
 
 	err := decoder.Decode(feed)
 	if err != nil {
-		return nil, errors.NewLocalizedError("Unable to parse RSS feed: %v", err)
+		return nil, errors.NewLocalizedError("Unable to parse RSS feed: %v.", err)
 	}
 
 	return feed.Transform(), nil
author	Frédéric Guillot <fred@miniflux.net>	2017-11-20 18:34:11 -0800
committer	Frédéric Guillot <fred@miniflux.net>	2017-11-20 18:34:11 -0800
commit	89307010adab9c1b0c8b963f50898ee449bbe42f (patch)
tree	0d4035227103db256e12af492a026570ba206396 /reader/feed
parent	c5cd38de832fffb20cbf97a60babe8cbc42fde1e (diff)