diff options
author | Frédéric Guillot <fred@miniflux.net> | 2018-10-14 11:46:41 -0700 |
---|---|---|
committer | Frédéric Guillot <fred@miniflux.net> | 2018-10-14 11:46:41 -0700 |
commit | 5870f0426002c8e26a9ff472b23e15d7bf1235f7 (patch) | |
tree | d303a45c49d0433a5d2f7bed1bcce25d85a8bc0f /reader/feed | |
parent | d5ff4191b6087b96b88221c6d939f73931001f2a (diff) |
Simplify feed parser and format detection
- Avoid doing multiple buffer copies
- Move parser and format detection logic to its own package
Diffstat (limited to 'reader/feed')

mode | file | lines changed |
---|---|---|
-rw-r--r-- | reader/feed/doc.go | 2 |
-rw-r--r-- | reader/feed/handler.go | 11 |
-rw-r--r-- | reader/feed/parser.go | 118 |
-rw-r--r-- | reader/feed/parser_test.go | 214 |

4 files changed, 6 insertions, 339 deletions
diff --git a/reader/feed/doc.go b/reader/feed/doc.go index de2e5ab..0b101fd 100644 --- a/reader/feed/doc.go +++ b/reader/feed/doc.go @@ -4,7 +4,7 @@ /* -Package feed provides a generic feed parser that abstracts all different formats. +Package feed handles feed updates and creation. */ package feed // import "miniflux.app/reader/feed" diff --git a/reader/feed/handler.go b/reader/feed/handler.go index 0b81c67..0945948 100644 --- a/reader/feed/handler.go +++ b/reader/feed/handler.go @@ -14,6 +14,7 @@ import ( "miniflux.app/logger" "miniflux.app/model" "miniflux.app/reader/icon" + "miniflux.app/reader/parser" "miniflux.app/reader/processor" "miniflux.app/storage" "miniflux.app/timer" @@ -67,12 +68,11 @@ func (h *Handler) CreateFeed(userID, categoryID int64, url string, crawler bool, return nil, errors.NewLocalizedError(errDuplicate, response.EffectiveURL) } - body, err := response.NormalizeBodyEncoding() - if err != nil { + if err := response.EnsureUnicodeBody(); err != nil { return nil, errors.NewLocalizedError(errEncoding, err) } - subscription, feedErr := parseFeed(body) + subscription, feedErr := parser.ParseFeed(response.String()) if feedErr != nil { return nil, feedErr } @@ -183,12 +183,11 @@ func (h *Handler) RefreshFeed(userID, feedID int64) error { return err } - body, err := response.NormalizeBodyEncoding() - if err != nil { + if err := response.EnsureUnicodeBody(); err != nil { return errors.NewLocalizedError(errEncoding, err) } - subscription, parseErr := parseFeed(body) + subscription, parseErr := parser.ParseFeed(response.String()) if parseErr != nil { originalFeed.ParsingErrorCount++ originalFeed.ParsingErrorMsg = parseErr.Localize(printer) diff --git a/reader/feed/parser.go b/reader/feed/parser.go deleted file mode 100644 index 0c7f51c..0000000 --- a/reader/feed/parser.go +++ /dev/null @@ -1,118 +0,0 @@ -// Copyright 2017 Frédéric Guillot. All rights reserved. 
-// Use of this source code is governed by the Apache 2.0 -// license that can be found in the LICENSE file. - -package feed // import "miniflux.app/reader/feed" - -import ( - "bytes" - "encoding/xml" - "io" - "strings" - "time" - - "miniflux.app/errors" - "miniflux.app/logger" - "miniflux.app/model" - "miniflux.app/reader/atom" - "miniflux.app/reader/encoding" - "miniflux.app/reader/json" - "miniflux.app/reader/rdf" - "miniflux.app/reader/rss" - "miniflux.app/timer" -) - -// List of feed formats. -const ( - FormatRDF = "rdf" - FormatRSS = "rss" - FormatAtom = "atom" - FormatJSON = "json" - FormatUnknown = "unknown" -) - -// DetectFeedFormat detect feed format from input data. -func DetectFeedFormat(r io.Reader) string { - defer timer.ExecutionTime(time.Now(), "[Feed:DetectFeedFormat]") - - var buffer bytes.Buffer - tee := io.TeeReader(r, &buffer) - - decoder := xml.NewDecoder(tee) - decoder.CharsetReader = encoding.CharsetReader - - for { - token, _ := decoder.Token() - if token == nil { - break - } - - if element, ok := token.(xml.StartElement); ok { - switch element.Name.Local { - case "rss": - return FormatRSS - case "feed": - return FormatAtom - case "RDF": - return FormatRDF - } - } - } - - if strings.HasPrefix(strings.TrimSpace(buffer.String()), "{") { - return FormatJSON - } - - return FormatUnknown -} - -func parseFeed(r io.Reader) (*model.Feed, *errors.LocalizedError) { - defer timer.ExecutionTime(time.Now(), "[Feed:ParseFeed]") - - var buffer bytes.Buffer - size, _ := io.Copy(&buffer, r) - if size == 0 { - return nil, errors.NewLocalizedError(errEmptyFeed) - } - - str := stripInvalidXMLCharacters(buffer.String()) - reader := strings.NewReader(str) - format := DetectFeedFormat(reader) - reader.Seek(0, io.SeekStart) - - switch format { - case FormatAtom: - return atom.Parse(reader) - case FormatRSS: - return rss.Parse(reader) - case FormatJSON: - return json.Parse(reader) - case FormatRDF: - return rdf.Parse(reader) - default: - return nil, 
errors.NewLocalizedError("Unsupported feed format") - } -} - -func stripInvalidXMLCharacters(input string) string { - return strings.Map(func(r rune) rune { - if isInCharacterRange(r) { - return r - } - - logger.Debug("Strip invalid XML characters: %U", r) - return -1 - }, input) -} - -// Decide whether the given rune is in the XML Character Range, per -// the Char production of http://www.xml.com/axml/testaxml.htm, -// Section 2.2 Characters. -func isInCharacterRange(r rune) (inrange bool) { - return r == 0x09 || - r == 0x0A || - r == 0x0D || - r >= 0x20 && r <= 0xDF77 || - r >= 0xE000 && r <= 0xFFFD || - r >= 0x10000 && r <= 0x10FFFF -} diff --git a/reader/feed/parser_test.go b/reader/feed/parser_test.go deleted file mode 100644 index 46dc34e..0000000 --- a/reader/feed/parser_test.go +++ /dev/null @@ -1,214 +0,0 @@ -// Copyright 2017 Frédéric Guillot. All rights reserved. -// Use of this source code is governed by the Apache 2.0 -// license that can be found in the LICENSE file. - -package feed // import "miniflux.app/reader/feed" - -import ( - "bytes" - "testing" -) - -func TestDetectRDF(t *testing.T) { - data := `<?xml version="1.0"?><rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns="http://my.netscape.com/rdf/simple/0.9/"></rdf:RDF>` - format := DetectFeedFormat(bytes.NewBufferString(data)) - - if format != FormatRDF { - t.Errorf("Wrong format detected: %s instead of %s", format, FormatRDF) - } -} - -func TestDetectRSS(t *testing.T) { - data := `<?xml version="1.0"?><rss version="2.0"><channel></channel></rss>` - format := DetectFeedFormat(bytes.NewBufferString(data)) - - if format != FormatRSS { - t.Errorf("Wrong format detected: %s instead of %s", format, FormatRSS) - } -} - -func TestDetectAtom(t *testing.T) { - data := `<?xml version="1.0" encoding="utf-8"?><feed xmlns="http://www.w3.org/2005/Atom"></feed>` - format := DetectFeedFormat(bytes.NewBufferString(data)) - - if format != FormatAtom { - t.Errorf("Wrong format detected: %s 
instead of %s", format, FormatAtom) - } -} - -func TestDetectAtomWithISOCharset(t *testing.T) { - data := `<?xml version="1.0" encoding="ISO-8859-15"?><feed xmlns="http://www.w3.org/2005/Atom"></feed>` - format := DetectFeedFormat(bytes.NewBufferString(data)) - - if format != FormatAtom { - t.Errorf("Wrong format detected: %s instead of %s", format, FormatAtom) - } -} - -func TestDetectJSON(t *testing.T) { - data := ` - { - "version" : "https://jsonfeed.org/version/1", - "title" : "Example" - } - ` - format := DetectFeedFormat(bytes.NewBufferString(data)) - - if format != FormatJSON { - t.Errorf("Wrong format detected: %s instead of %s", format, FormatJSON) - } -} - -func TestDetectUnknown(t *testing.T) { - data := ` - <!DOCTYPE html> <html> </html> - ` - format := DetectFeedFormat(bytes.NewBufferString(data)) - - if format != FormatUnknown { - t.Errorf("Wrong format detected: %s instead of %s", format, FormatUnknown) - } -} - -func TestParseAtom(t *testing.T) { - data := `<?xml version="1.0" encoding="utf-8"?> - <feed xmlns="http://www.w3.org/2005/Atom"> - - <title>Example Feed</title> - <link href="http://example.org/"/> - <updated>2003-12-13T18:30:02Z</updated> - <author> - <name>John Doe</name> - </author> - <id>urn:uuid:60a76c80-d399-11d9-b93C-0003939e0af6</id> - - <entry> - <title>Atom-Powered Robots Run Amok</title> - <link href="http://example.org/2003/12/13/atom03"/> - <id>urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a</id> - <updated>2003-12-13T18:30:02Z</updated> - <summary>Some text.</summary> - </entry> - - </feed>` - - feed, err := parseFeed(bytes.NewBufferString(data)) - if err != nil { - t.Error(err) - } - - if feed.Title != "Example Feed" { - t.Errorf("Incorrect title, got: %s", feed.Title) - } -} - -func TestParseRSS(t *testing.T) { - data := `<?xml version="1.0"?> - <rss version="2.0"> - <channel> - <title>Liftoff News</title> - <link>http://liftoff.msfc.nasa.gov/</link> - <item> - <title>Star City</title> - 
<link>http://liftoff.msfc.nasa.gov/news/2003/news-starcity.asp</link> - <description>How do Americans get ready to work with Russians aboard the International Space Station? They take a crash course in culture, language and protocol at Russia's <a href="http://howe.iki.rssi.ru/GCTC/gctc_e.htm">Star City</a>.</description> - <pubDate>Tue, 03 Jun 2003 09:39:21 GMT</pubDate> - <guid>http://liftoff.msfc.nasa.gov/2003/06/03.html#item573</guid> - </item> - </channel> - </rss>` - - feed, err := parseFeed(bytes.NewBufferString(data)) - if err != nil { - t.Error(err) - } - - if feed.Title != "Liftoff News" { - t.Errorf("Incorrect title, got: %s", feed.Title) - } -} - -func TestParseRDF(t *testing.T) { - data := `<?xml version="1.0" encoding="utf-8"?> - <rdf:RDF - xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" - xmlns="http://purl.org/rss/1.0/" - > - - <channel> - <title>RDF Example</title> - <link>http://example.org/</link> - </channel> - - <item> - <title>Title</title> - <link>http://example.org/item</link> - <description>Test</description> - </item> - </rdf:RDF>` - - feed, err := parseFeed(bytes.NewBufferString(data)) - if err != nil { - t.Error(err) - } - - if feed.Title != "RDF Example" { - t.Errorf("Incorrect title, got: %s", feed.Title) - } -} - -func TestParseJson(t *testing.T) { - data := `{ - "version": "https://jsonfeed.org/version/1", - "title": "My Example Feed", - "home_page_url": "https://example.org/", - "feed_url": "https://example.org/feed.json", - "items": [ - { - "id": "2", - "content_text": "This is a second item.", - "url": "https://example.org/second-item" - }, - { - "id": "1", - "content_html": "<p>Hello, world!</p>", - "url": "https://example.org/initial-post" - } - ] - }` - - feed, err := parseFeed(bytes.NewBufferString(data)) - if err != nil { - t.Error(err) - } - - if feed.Title != "My Example Feed" { - t.Errorf("Incorrect title, got: %s", feed.Title) - } -} - -func TestParseUnknownFeed(t *testing.T) { - data := ` - <!DOCTYPE html PUBLIC 
"-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"> - <html xmlns="http://www.w3.org/1999/xhtml"> - <head> - <title>Title of document</title> - </head> - <body> - some content - </body> - </html> - ` - - _, err := parseFeed(bytes.NewBufferString(data)) - if err == nil { - t.Error("ParseFeed must returns an error") - } -} - -func TestParseEmptyFeed(t *testing.T) { - _, err := parseFeed(bytes.NewBufferString("")) - if err == nil { - t.Error("ParseFeed must returns an error") - } -} |