about | summary | refs | log | tree | commit | diff | homepage
path: root/reader/feed
diff options
context:
space:
mode:
author: Frédéric Guillot <fred@miniflux.net> 2018-10-14 11:46:41 -0700
committer: Frédéric Guillot <fred@miniflux.net> 2018-10-14 11:46:41 -0700
commit: 5870f0426002c8e26a9ff472b23e15d7bf1235f7 (patch)
tree: d303a45c49d0433a5d2f7bed1bcce25d85a8bc0f /reader/feed
parent: d5ff4191b6087b96b88221c6d939f73931001f2a (diff)
Simplify feed parser and format detection
- Avoid doing multiple buffer copies
- Move parser and format detection logic to its own package
Diffstat (limited to 'reader/feed')
-rw-r--r-- reader/feed/doc.go | 2
-rw-r--r-- reader/feed/handler.go | 11
-rw-r--r-- reader/feed/parser.go | 118
-rw-r--r-- reader/feed/parser_test.go | 214
4 files changed, 6 insertions, 339 deletions
diff --git a/reader/feed/doc.go b/reader/feed/doc.go
index de2e5ab..0b101fd 100644
--- a/reader/feed/doc.go
+++ b/reader/feed/doc.go
@@ -4,7 +4,7 @@
/*
-Package feed provides a generic feed parser that abstracts all different formats.
+Package feed handles feed updates and creation.
*/
package feed // import "miniflux.app/reader/feed"
diff --git a/reader/feed/handler.go b/reader/feed/handler.go
index 0b81c67..0945948 100644
--- a/reader/feed/handler.go
+++ b/reader/feed/handler.go
@@ -14,6 +14,7 @@ import (
"miniflux.app/logger"
"miniflux.app/model"
"miniflux.app/reader/icon"
+ "miniflux.app/reader/parser"
"miniflux.app/reader/processor"
"miniflux.app/storage"
"miniflux.app/timer"
@@ -67,12 +68,11 @@ func (h *Handler) CreateFeed(userID, categoryID int64, url string, crawler bool,
return nil, errors.NewLocalizedError(errDuplicate, response.EffectiveURL)
}
- body, err := response.NormalizeBodyEncoding()
- if err != nil {
+ if err := response.EnsureUnicodeBody(); err != nil {
return nil, errors.NewLocalizedError(errEncoding, err)
}
- subscription, feedErr := parseFeed(body)
+ subscription, feedErr := parser.ParseFeed(response.String())
if feedErr != nil {
return nil, feedErr
}
@@ -183,12 +183,11 @@ func (h *Handler) RefreshFeed(userID, feedID int64) error {
return err
}
- body, err := response.NormalizeBodyEncoding()
- if err != nil {
+ if err := response.EnsureUnicodeBody(); err != nil {
return errors.NewLocalizedError(errEncoding, err)
}
- subscription, parseErr := parseFeed(body)
+ subscription, parseErr := parser.ParseFeed(response.String())
if parseErr != nil {
originalFeed.ParsingErrorCount++
originalFeed.ParsingErrorMsg = parseErr.Localize(printer)
diff --git a/reader/feed/parser.go b/reader/feed/parser.go
deleted file mode 100644
index 0c7f51c..0000000
--- a/reader/feed/parser.go
+++ /dev/null
@@ -1,118 +0,0 @@
-// Copyright 2017 Frédéric Guillot. All rights reserved.
-// Use of this source code is governed by the Apache 2.0
-// license that can be found in the LICENSE file.
-
-package feed // import "miniflux.app/reader/feed"
-
-import (
- "bytes"
- "encoding/xml"
- "io"
- "strings"
- "time"
-
- "miniflux.app/errors"
- "miniflux.app/logger"
- "miniflux.app/model"
- "miniflux.app/reader/atom"
- "miniflux.app/reader/encoding"
- "miniflux.app/reader/json"
- "miniflux.app/reader/rdf"
- "miniflux.app/reader/rss"
- "miniflux.app/timer"
-)
-
-// List of feed formats.
-const (
- FormatRDF = "rdf"
- FormatRSS = "rss"
- FormatAtom = "atom"
- FormatJSON = "json"
- FormatUnknown = "unknown"
-)
-
-// DetectFeedFormat detect feed format from input data.
-func DetectFeedFormat(r io.Reader) string {
- defer timer.ExecutionTime(time.Now(), "[Feed:DetectFeedFormat]")
-
- var buffer bytes.Buffer
- tee := io.TeeReader(r, &buffer)
-
- decoder := xml.NewDecoder(tee)
- decoder.CharsetReader = encoding.CharsetReader
-
- for {
- token, _ := decoder.Token()
- if token == nil {
- break
- }
-
- if element, ok := token.(xml.StartElement); ok {
- switch element.Name.Local {
- case "rss":
- return FormatRSS
- case "feed":
- return FormatAtom
- case "RDF":
- return FormatRDF
- }
- }
- }
-
- if strings.HasPrefix(strings.TrimSpace(buffer.String()), "{") {
- return FormatJSON
- }
-
- return FormatUnknown
-}
-
-func parseFeed(r io.Reader) (*model.Feed, *errors.LocalizedError) {
- defer timer.ExecutionTime(time.Now(), "[Feed:ParseFeed]")
-
- var buffer bytes.Buffer
- size, _ := io.Copy(&buffer, r)
- if size == 0 {
- return nil, errors.NewLocalizedError(errEmptyFeed)
- }
-
- str := stripInvalidXMLCharacters(buffer.String())
- reader := strings.NewReader(str)
- format := DetectFeedFormat(reader)
- reader.Seek(0, io.SeekStart)
-
- switch format {
- case FormatAtom:
- return atom.Parse(reader)
- case FormatRSS:
- return rss.Parse(reader)
- case FormatJSON:
- return json.Parse(reader)
- case FormatRDF:
- return rdf.Parse(reader)
- default:
- return nil, errors.NewLocalizedError("Unsupported feed format")
- }
-}
-
-func stripInvalidXMLCharacters(input string) string {
- return strings.Map(func(r rune) rune {
- if isInCharacterRange(r) {
- return r
- }
-
- logger.Debug("Strip invalid XML characters: %U", r)
- return -1
- }, input)
-}
-
-// Decide whether the given rune is in the XML Character Range, per
-// the Char production of http://www.xml.com/axml/testaxml.htm,
-// Section 2.2 Characters.
-func isInCharacterRange(r rune) (inrange bool) {
- return r == 0x09 ||
- r == 0x0A ||
- r == 0x0D ||
- r >= 0x20 && r <= 0xDF77 ||
- r >= 0xE000 && r <= 0xFFFD ||
- r >= 0x10000 && r <= 0x10FFFF
-}
diff --git a/reader/feed/parser_test.go b/reader/feed/parser_test.go
deleted file mode 100644
index 46dc34e..0000000
--- a/reader/feed/parser_test.go
+++ /dev/null
@@ -1,214 +0,0 @@
-// Copyright 2017 Frédéric Guillot. All rights reserved.
-// Use of this source code is governed by the Apache 2.0
-// license that can be found in the LICENSE file.
-
-package feed // import "miniflux.app/reader/feed"
-
-import (
- "bytes"
- "testing"
-)
-
-func TestDetectRDF(t *testing.T) {
- data := `<?xml version="1.0"?><rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns="http://my.netscape.com/rdf/simple/0.9/"></rdf:RDF>`
- format := DetectFeedFormat(bytes.NewBufferString(data))
-
- if format != FormatRDF {
- t.Errorf("Wrong format detected: %s instead of %s", format, FormatRDF)
- }
-}
-
-func TestDetectRSS(t *testing.T) {
- data := `<?xml version="1.0"?><rss version="2.0"><channel></channel></rss>`
- format := DetectFeedFormat(bytes.NewBufferString(data))
-
- if format != FormatRSS {
- t.Errorf("Wrong format detected: %s instead of %s", format, FormatRSS)
- }
-}
-
-func TestDetectAtom(t *testing.T) {
- data := `<?xml version="1.0" encoding="utf-8"?><feed xmlns="http://www.w3.org/2005/Atom"></feed>`
- format := DetectFeedFormat(bytes.NewBufferString(data))
-
- if format != FormatAtom {
- t.Errorf("Wrong format detected: %s instead of %s", format, FormatAtom)
- }
-}
-
-func TestDetectAtomWithISOCharset(t *testing.T) {
- data := `<?xml version="1.0" encoding="ISO-8859-15"?><feed xmlns="http://www.w3.org/2005/Atom"></feed>`
- format := DetectFeedFormat(bytes.NewBufferString(data))
-
- if format != FormatAtom {
- t.Errorf("Wrong format detected: %s instead of %s", format, FormatAtom)
- }
-}
-
-func TestDetectJSON(t *testing.T) {
- data := `
- {
- "version" : "https://jsonfeed.org/version/1",
- "title" : "Example"
- }
- `
- format := DetectFeedFormat(bytes.NewBufferString(data))
-
- if format != FormatJSON {
- t.Errorf("Wrong format detected: %s instead of %s", format, FormatJSON)
- }
-}
-
-func TestDetectUnknown(t *testing.T) {
- data := `
- <!DOCTYPE html> <html> </html>
- `
- format := DetectFeedFormat(bytes.NewBufferString(data))
-
- if format != FormatUnknown {
- t.Errorf("Wrong format detected: %s instead of %s", format, FormatUnknown)
- }
-}
-
-func TestParseAtom(t *testing.T) {
- data := `<?xml version="1.0" encoding="utf-8"?>
- <feed xmlns="http://www.w3.org/2005/Atom">
-
- <title>Example Feed</title>
- <link href="http://example.org/"/>
- <updated>2003-12-13T18:30:02Z</updated>
- <author>
- <name>John Doe</name>
- </author>
- <id>urn:uuid:60a76c80-d399-11d9-b93C-0003939e0af6</id>
-
- <entry>
- <title>Atom-Powered Robots Run Amok</title>
- <link href="http://example.org/2003/12/13/atom03"/>
- <id>urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a</id>
- <updated>2003-12-13T18:30:02Z</updated>
- <summary>Some text.</summary>
- </entry>
-
- </feed>`
-
- feed, err := parseFeed(bytes.NewBufferString(data))
- if err != nil {
- t.Error(err)
- }
-
- if feed.Title != "Example Feed" {
- t.Errorf("Incorrect title, got: %s", feed.Title)
- }
-}
-
-func TestParseRSS(t *testing.T) {
- data := `<?xml version="1.0"?>
- <rss version="2.0">
- <channel>
- <title>Liftoff News</title>
- <link>http://liftoff.msfc.nasa.gov/</link>
- <item>
- <title>Star City</title>
- <link>http://liftoff.msfc.nasa.gov/news/2003/news-starcity.asp</link>
- <description>How do Americans get ready to work with Russians aboard the International Space Station? They take a crash course in culture, language and protocol at Russia's &lt;a href="http://howe.iki.rssi.ru/GCTC/gctc_e.htm"&gt;Star City&lt;/a&gt;.</description>
- <pubDate>Tue, 03 Jun 2003 09:39:21 GMT</pubDate>
- <guid>http://liftoff.msfc.nasa.gov/2003/06/03.html#item573</guid>
- </item>
- </channel>
- </rss>`
-
- feed, err := parseFeed(bytes.NewBufferString(data))
- if err != nil {
- t.Error(err)
- }
-
- if feed.Title != "Liftoff News" {
- t.Errorf("Incorrect title, got: %s", feed.Title)
- }
-}
-
-func TestParseRDF(t *testing.T) {
- data := `<?xml version="1.0" encoding="utf-8"?>
- <rdf:RDF
- xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
- xmlns="http://purl.org/rss/1.0/"
- >
-
- <channel>
- <title>RDF Example</title>
- <link>http://example.org/</link>
- </channel>
-
- <item>
- <title>Title</title>
- <link>http://example.org/item</link>
- <description>Test</description>
- </item>
- </rdf:RDF>`
-
- feed, err := parseFeed(bytes.NewBufferString(data))
- if err != nil {
- t.Error(err)
- }
-
- if feed.Title != "RDF Example" {
- t.Errorf("Incorrect title, got: %s", feed.Title)
- }
-}
-
-func TestParseJson(t *testing.T) {
- data := `{
- "version": "https://jsonfeed.org/version/1",
- "title": "My Example Feed",
- "home_page_url": "https://example.org/",
- "feed_url": "https://example.org/feed.json",
- "items": [
- {
- "id": "2",
- "content_text": "This is a second item.",
- "url": "https://example.org/second-item"
- },
- {
- "id": "1",
- "content_html": "<p>Hello, world!</p>",
- "url": "https://example.org/initial-post"
- }
- ]
- }`
-
- feed, err := parseFeed(bytes.NewBufferString(data))
- if err != nil {
- t.Error(err)
- }
-
- if feed.Title != "My Example Feed" {
- t.Errorf("Incorrect title, got: %s", feed.Title)
- }
-}
-
-func TestParseUnknownFeed(t *testing.T) {
- data := `
- <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
- <html xmlns="http://www.w3.org/1999/xhtml">
- <head>
- <title>Title of document</title>
- </head>
- <body>
- some content
- </body>
- </html>
- `
-
- _, err := parseFeed(bytes.NewBufferString(data))
- if err == nil {
- t.Error("ParseFeed must returns an error")
- }
-}
-
-func TestParseEmptyFeed(t *testing.T) {
- _, err := parseFeed(bytes.NewBufferString(""))
- if err == nil {
- t.Error("ParseFeed must returns an error")
- }
-}