aboutsummaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
-rw-r--r--http/client/response.go20
-rw-r--r--reader/feed/doc.go2
-rw-r--r--reader/feed/handler.go11
-rw-r--r--reader/feed/parser.go118
-rw-r--r--reader/parser/doc.go10
-rw-r--r--reader/parser/format.go51
-rw-r--r--reader/parser/format_test.go70
-rw-r--r--reader/parser/parser.go58
-rw-r--r--reader/parser/parser_test.go (renamed from reader/feed/parser_test.go)76
-rw-r--r--reader/scraper/scraper.go13
-rw-r--r--reader/subscription/finder.go21
11 files changed, 229 insertions, 221 deletions
diff --git a/http/client/response.go b/http/client/response.go
index 0efd8e2..c084824 100644
--- a/http/client/response.go
+++ b/http/client/response.go
@@ -6,6 +6,7 @@ package client // import "miniflux.app/http/client"
import (
"io"
+ "io/ioutil"
"mime"
"strings"
@@ -56,23 +57,32 @@ func (r *Response) IsModified(etag, lastModified string) bool {
return true
}
-// NormalizeBodyEncoding make sure the body is encoded in UTF-8.
+// EnsureUnicodeBody makes sure the body is encoded in UTF-8.
//
// If a charset other than UTF-8 is detected, we convert the document to UTF-8.
// This is used by the scraper and feed readers.
//
// Do not forget edge cases:
// - Some non-utf8 feeds specify encoding only in Content-Type, not in XML document.
-func (r *Response) NormalizeBodyEncoding() (io.Reader, error) {
+func (r *Response) EnsureUnicodeBody() error {
_, params, err := mime.ParseMediaType(r.ContentType)
if err == nil {
if enc, found := params["charset"]; found {
enc = strings.ToLower(enc)
if enc != "utf-8" && enc != "utf8" && enc != "" {
- logger.Debug("[NormalizeBodyEncoding] Convert body to UTF-8 from %s", enc)
- return charset.NewReader(r.Body, r.ContentType)
+ logger.Debug("[EnsureUnicodeBody] Convert body to utf-8 from %s", enc)
+ r.Body, err = charset.NewReader(r.Body, r.ContentType)
+ if err != nil {
+ return err
+ }
}
}
}
- return r.Body, nil
+ return nil
+}
+
+// String returns the response body as string.
+func (r *Response) String() string {
+ bytes, _ := ioutil.ReadAll(r.Body)
+ return string(bytes)
}
diff --git a/reader/feed/doc.go b/reader/feed/doc.go
index de2e5ab..0b101fd 100644
--- a/reader/feed/doc.go
+++ b/reader/feed/doc.go
@@ -4,7 +4,7 @@
/*
-Package feed provides a generic feed parser that abstracts all different formats.
+Package feed handles feed updates and creation.
*/
package feed // import "miniflux.app/reader/feed"
diff --git a/reader/feed/handler.go b/reader/feed/handler.go
index 0b81c67..0945948 100644
--- a/reader/feed/handler.go
+++ b/reader/feed/handler.go
@@ -14,6 +14,7 @@ import (
"miniflux.app/logger"
"miniflux.app/model"
"miniflux.app/reader/icon"
+ "miniflux.app/reader/parser"
"miniflux.app/reader/processor"
"miniflux.app/storage"
"miniflux.app/timer"
@@ -67,12 +68,11 @@ func (h *Handler) CreateFeed(userID, categoryID int64, url string, crawler bool,
return nil, errors.NewLocalizedError(errDuplicate, response.EffectiveURL)
}
- body, err := response.NormalizeBodyEncoding()
- if err != nil {
+ if err := response.EnsureUnicodeBody(); err != nil {
return nil, errors.NewLocalizedError(errEncoding, err)
}
- subscription, feedErr := parseFeed(body)
+ subscription, feedErr := parser.ParseFeed(response.String())
if feedErr != nil {
return nil, feedErr
}
@@ -183,12 +183,11 @@ func (h *Handler) RefreshFeed(userID, feedID int64) error {
return err
}
- body, err := response.NormalizeBodyEncoding()
- if err != nil {
+ if err := response.EnsureUnicodeBody(); err != nil {
return errors.NewLocalizedError(errEncoding, err)
}
- subscription, parseErr := parseFeed(body)
+ subscription, parseErr := parser.ParseFeed(response.String())
if parseErr != nil {
originalFeed.ParsingErrorCount++
originalFeed.ParsingErrorMsg = parseErr.Localize(printer)
diff --git a/reader/feed/parser.go b/reader/feed/parser.go
deleted file mode 100644
index 0c7f51c..0000000
--- a/reader/feed/parser.go
+++ /dev/null
@@ -1,118 +0,0 @@
-// Copyright 2017 Frédéric Guillot. All rights reserved.
-// Use of this source code is governed by the Apache 2.0
-// license that can be found in the LICENSE file.
-
-package feed // import "miniflux.app/reader/feed"
-
-import (
- "bytes"
- "encoding/xml"
- "io"
- "strings"
- "time"
-
- "miniflux.app/errors"
- "miniflux.app/logger"
- "miniflux.app/model"
- "miniflux.app/reader/atom"
- "miniflux.app/reader/encoding"
- "miniflux.app/reader/json"
- "miniflux.app/reader/rdf"
- "miniflux.app/reader/rss"
- "miniflux.app/timer"
-)
-
-// List of feed formats.
-const (
- FormatRDF = "rdf"
- FormatRSS = "rss"
- FormatAtom = "atom"
- FormatJSON = "json"
- FormatUnknown = "unknown"
-)
-
-// DetectFeedFormat detect feed format from input data.
-func DetectFeedFormat(r io.Reader) string {
- defer timer.ExecutionTime(time.Now(), "[Feed:DetectFeedFormat]")
-
- var buffer bytes.Buffer
- tee := io.TeeReader(r, &buffer)
-
- decoder := xml.NewDecoder(tee)
- decoder.CharsetReader = encoding.CharsetReader
-
- for {
- token, _ := decoder.Token()
- if token == nil {
- break
- }
-
- if element, ok := token.(xml.StartElement); ok {
- switch element.Name.Local {
- case "rss":
- return FormatRSS
- case "feed":
- return FormatAtom
- case "RDF":
- return FormatRDF
- }
- }
- }
-
- if strings.HasPrefix(strings.TrimSpace(buffer.String()), "{") {
- return FormatJSON
- }
-
- return FormatUnknown
-}
-
-func parseFeed(r io.Reader) (*model.Feed, *errors.LocalizedError) {
- defer timer.ExecutionTime(time.Now(), "[Feed:ParseFeed]")
-
- var buffer bytes.Buffer
- size, _ := io.Copy(&buffer, r)
- if size == 0 {
- return nil, errors.NewLocalizedError(errEmptyFeed)
- }
-
- str := stripInvalidXMLCharacters(buffer.String())
- reader := strings.NewReader(str)
- format := DetectFeedFormat(reader)
- reader.Seek(0, io.SeekStart)
-
- switch format {
- case FormatAtom:
- return atom.Parse(reader)
- case FormatRSS:
- return rss.Parse(reader)
- case FormatJSON:
- return json.Parse(reader)
- case FormatRDF:
- return rdf.Parse(reader)
- default:
- return nil, errors.NewLocalizedError("Unsupported feed format")
- }
-}
-
-func stripInvalidXMLCharacters(input string) string {
- return strings.Map(func(r rune) rune {
- if isInCharacterRange(r) {
- return r
- }
-
- logger.Debug("Strip invalid XML characters: %U", r)
- return -1
- }, input)
-}
-
-// Decide whether the given rune is in the XML Character Range, per
-// the Char production of http://www.xml.com/axml/testaxml.htm,
-// Section 2.2 Characters.
-func isInCharacterRange(r rune) (inrange bool) {
- return r == 0x09 ||
- r == 0x0A ||
- r == 0x0D ||
- r >= 0x20 && r <= 0xDF77 ||
- r >= 0xE000 && r <= 0xFFFD ||
- r >= 0x10000 && r <= 0x10FFFF
-}
diff --git a/reader/parser/doc.go b/reader/parser/doc.go
new file mode 100644
index 0000000..11670fa
--- /dev/null
+++ b/reader/parser/doc.go
@@ -0,0 +1,10 @@
+// Copyright 2018 Frédéric Guillot. All rights reserved.
+// Use of this source code is governed by the Apache 2.0
+// license that can be found in the LICENSE file.
+
+/*
+
+Package parser provides a generic feed parser that abstract all different formats.
+
+*/
+package parser // import "miniflux.app/reader/parser"
diff --git a/reader/parser/format.go b/reader/parser/format.go
new file mode 100644
index 0000000..87963d3
--- /dev/null
+++ b/reader/parser/format.go
@@ -0,0 +1,51 @@
+// Copyright 2018 Frédéric Guillot. All rights reserved.
+// Use of this source code is governed by the Apache 2.0
+// license that can be found in the LICENSE file.
+
+package parser // import "miniflux.app/reader/parser"
+
+import (
+ "encoding/xml"
+ "strings"
+
+ "miniflux.app/reader/encoding"
+)
+
+// List of feed formats.
+const (
+ FormatRDF = "rdf"
+ FormatRSS = "rss"
+ FormatAtom = "atom"
+ FormatJSON = "json"
+ FormatUnknown = "unknown"
+)
+
+// DetectFeedFormat tries to guess the feed format from input data.
+func DetectFeedFormat(data string) string {
+ if strings.HasPrefix(strings.TrimSpace(data), "{") {
+ return FormatJSON
+ }
+
+ decoder := xml.NewDecoder(strings.NewReader(data))
+ decoder.CharsetReader = encoding.CharsetReader
+
+ for {
+ token, _ := decoder.Token()
+ if token == nil {
+ break
+ }
+
+ if element, ok := token.(xml.StartElement); ok {
+ switch element.Name.Local {
+ case "rss":
+ return FormatRSS
+ case "feed":
+ return FormatAtom
+ case "RDF":
+ return FormatRDF
+ }
+ }
+ }
+
+ return FormatUnknown
+}
diff --git a/reader/parser/format_test.go b/reader/parser/format_test.go
new file mode 100644
index 0000000..3795541
--- /dev/null
+++ b/reader/parser/format_test.go
@@ -0,0 +1,70 @@
+// Copyright 2018 Frédéric Guillot. All rights reserved.
+// Use of this source code is governed by the Apache 2.0
+// license that can be found in the LICENSE file.
+
+package parser // import "miniflux.app/reader/parser"
+
+import (
+ "testing"
+)
+
+func TestDetectRDF(t *testing.T) {
+ data := `<?xml version="1.0"?><rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns="http://my.netscape.com/rdf/simple/0.9/"></rdf:RDF>`
+ format := DetectFeedFormat(data)
+
+ if format != FormatRDF {
+ t.Errorf(`Wrong format detected: %q instead of %q`, format, FormatRDF)
+ }
+}
+
+func TestDetectRSS(t *testing.T) {
+ data := `<?xml version="1.0"?><rss version="2.0"><channel></channel></rss>`
+ format := DetectFeedFormat(data)
+
+ if format != FormatRSS {
+ t.Errorf(`Wrong format detected: %q instead of %q`, format, FormatRSS)
+ }
+}
+
+func TestDetectAtom(t *testing.T) {
+ data := `<?xml version="1.0" encoding="utf-8"?><feed xmlns="http://www.w3.org/2005/Atom"></feed>`
+ format := DetectFeedFormat(data)
+
+ if format != FormatAtom {
+ t.Errorf(`Wrong format detected: %q instead of %q`, format, FormatAtom)
+ }
+}
+
+func TestDetectAtomWithISOCharset(t *testing.T) {
+ data := `<?xml version="1.0" encoding="ISO-8859-15"?><feed xmlns="http://www.w3.org/2005/Atom"></feed>`
+ format := DetectFeedFormat(data)
+
+ if format != FormatAtom {
+ t.Errorf(`Wrong format detected: %q instead of %q`, format, FormatAtom)
+ }
+}
+
+func TestDetectJSON(t *testing.T) {
+ data := `
+ {
+ "version" : "https://jsonfeed.org/version/1",
+ "title" : "Example"
+ }
+ `
+ format := DetectFeedFormat(data)
+
+ if format != FormatJSON {
+ t.Errorf(`Wrong format detected: %q instead of %q`, format, FormatJSON)
+ }
+}
+
+func TestDetectUnknown(t *testing.T) {
+ data := `
+ <!DOCTYPE html> <html> </html>
+ `
+ format := DetectFeedFormat(data)
+
+ if format != FormatUnknown {
+ t.Errorf(`Wrong format detected: %q instead of %q`, format, FormatUnknown)
+ }
+}
diff --git a/reader/parser/parser.go b/reader/parser/parser.go
new file mode 100644
index 0000000..30fc603
--- /dev/null
+++ b/reader/parser/parser.go
@@ -0,0 +1,58 @@
+// Copyright 2018 Frédéric Guillot. All rights reserved.
+// Use of this source code is governed by the Apache 2.0
+// license that can be found in the LICENSE file.
+
+package parser // import "miniflux.app/reader/parser"
+
+import (
+ "strings"
+
+ "miniflux.app/errors"
+ "miniflux.app/logger"
+ "miniflux.app/model"
+ "miniflux.app/reader/atom"
+ "miniflux.app/reader/json"
+ "miniflux.app/reader/rdf"
+ "miniflux.app/reader/rss"
+)
+
+// ParseFeed analyzes the input data and returns a normalized feed object.
+func ParseFeed(data string) (*model.Feed, *errors.LocalizedError) {
+ data = stripInvalidXMLCharacters(data)
+
+ switch DetectFeedFormat(data) {
+ case FormatAtom:
+ return atom.Parse(strings.NewReader(data))
+ case FormatRSS:
+ return rss.Parse(strings.NewReader(data))
+ case FormatJSON:
+ return json.Parse(strings.NewReader(data))
+ case FormatRDF:
+ return rdf.Parse(strings.NewReader(data))
+ default:
+ return nil, errors.NewLocalizedError("Unsupported feed format")
+ }
+}
+
+func stripInvalidXMLCharacters(input string) string {
+ return strings.Map(func(r rune) rune {
+ if isInCharacterRange(r) {
+ return r
+ }
+
+ logger.Debug("Strip invalid XML characters: %U", r)
+ return -1
+ }, input)
+}
+
+// Decide whether the given rune is in the XML Character Range, per
+// the Char production of http://www.xml.com/axml/testaxml.htm,
+// Section 2.2 Characters.
+func isInCharacterRange(r rune) (inrange bool) {
+ return r == 0x09 ||
+ r == 0x0A ||
+ r == 0x0D ||
+ r >= 0x20 && r <= 0xDF77 ||
+ r >= 0xE000 && r <= 0xFFFD ||
+ r >= 0x10000 && r <= 0x10FFFF
+}
diff --git a/reader/feed/parser_test.go b/reader/parser/parser_test.go
index 46dc34e..7e3f900 100644
--- a/reader/feed/parser_test.go
+++ b/reader/parser/parser_test.go
@@ -2,74 +2,12 @@
// Use of this source code is governed by the Apache 2.0
// license that can be found in the LICENSE file.
-package feed // import "miniflux.app/reader/feed"
+package parser // import "miniflux.app/reader/parser"
import (
- "bytes"
"testing"
)
-func TestDetectRDF(t *testing.T) {
- data := `<?xml version="1.0"?><rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns="http://my.netscape.com/rdf/simple/0.9/"></rdf:RDF>`
- format := DetectFeedFormat(bytes.NewBufferString(data))
-
- if format != FormatRDF {
- t.Errorf("Wrong format detected: %s instead of %s", format, FormatRDF)
- }
-}
-
-func TestDetectRSS(t *testing.T) {
- data := `<?xml version="1.0"?><rss version="2.0"><channel></channel></rss>`
- format := DetectFeedFormat(bytes.NewBufferString(data))
-
- if format != FormatRSS {
- t.Errorf("Wrong format detected: %s instead of %s", format, FormatRSS)
- }
-}
-
-func TestDetectAtom(t *testing.T) {
- data := `<?xml version="1.0" encoding="utf-8"?><feed xmlns="http://www.w3.org/2005/Atom"></feed>`
- format := DetectFeedFormat(bytes.NewBufferString(data))
-
- if format != FormatAtom {
- t.Errorf("Wrong format detected: %s instead of %s", format, FormatAtom)
- }
-}
-
-func TestDetectAtomWithISOCharset(t *testing.T) {
- data := `<?xml version="1.0" encoding="ISO-8859-15"?><feed xmlns="http://www.w3.org/2005/Atom"></feed>`
- format := DetectFeedFormat(bytes.NewBufferString(data))
-
- if format != FormatAtom {
- t.Errorf("Wrong format detected: %s instead of %s", format, FormatAtom)
- }
-}
-
-func TestDetectJSON(t *testing.T) {
- data := `
- {
- "version" : "https://jsonfeed.org/version/1",
- "title" : "Example"
- }
- `
- format := DetectFeedFormat(bytes.NewBufferString(data))
-
- if format != FormatJSON {
- t.Errorf("Wrong format detected: %s instead of %s", format, FormatJSON)
- }
-}
-
-func TestDetectUnknown(t *testing.T) {
- data := `
- <!DOCTYPE html> <html> </html>
- `
- format := DetectFeedFormat(bytes.NewBufferString(data))
-
- if format != FormatUnknown {
- t.Errorf("Wrong format detected: %s instead of %s", format, FormatUnknown)
- }
-}
-
func TestParseAtom(t *testing.T) {
data := `<?xml version="1.0" encoding="utf-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">
@@ -92,7 +30,7 @@ func TestParseAtom(t *testing.T) {
</feed>`
- feed, err := parseFeed(bytes.NewBufferString(data))
+ feed, err := ParseFeed(data)
if err != nil {
t.Error(err)
}
@@ -118,7 +56,7 @@ func TestParseRSS(t *testing.T) {
</channel>
</rss>`
- feed, err := parseFeed(bytes.NewBufferString(data))
+ feed, err := ParseFeed(data)
if err != nil {
t.Error(err)
}
@@ -147,7 +85,7 @@ func TestParseRDF(t *testing.T) {
</item>
</rdf:RDF>`
- feed, err := parseFeed(bytes.NewBufferString(data))
+ feed, err := ParseFeed(data)
if err != nil {
t.Error(err)
}
@@ -177,7 +115,7 @@ func TestParseJson(t *testing.T) {
]
}`
- feed, err := parseFeed(bytes.NewBufferString(data))
+ feed, err := ParseFeed(data)
if err != nil {
t.Error(err)
}
@@ -200,14 +138,14 @@ func TestParseUnknownFeed(t *testing.T) {
</html>
`
- _, err := parseFeed(bytes.NewBufferString(data))
+ _, err := ParseFeed(data)
if err == nil {
t.Error("ParseFeed must returns an error")
}
}
func TestParseEmptyFeed(t *testing.T) {
- _, err := parseFeed(bytes.NewBufferString(""))
+ _, err := ParseFeed("")
if err == nil {
t.Error("ParseFeed must returns an error")
}
diff --git a/reader/scraper/scraper.go b/reader/scraper/scraper.go
index 7aa7084..b62d1ca 100644
--- a/reader/scraper/scraper.go
+++ b/reader/scraper/scraper.go
@@ -18,7 +18,7 @@ import (
"github.com/PuerkitoBio/goquery"
)
-// Fetch downloads a web page a returns relevant contents.
+// Fetch downloads a web page and returns relevant contents.
func Fetch(websiteURL, rules, userAgent string) (string, error) {
clt := client.New(websiteURL)
if userAgent != "" {
@@ -38,8 +38,7 @@ func Fetch(websiteURL, rules, userAgent string) (string, error) {
return "", fmt.Errorf("scraper: this resource is not a HTML document (%s)", response.ContentType)
}
- page, err := response.NormalizeBodyEncoding()
- if err != nil {
+ if err = response.EnsureUnicodeBody(); err != nil {
return "", err
}
@@ -52,11 +51,11 @@ func Fetch(websiteURL, rules, userAgent string) (string, error) {
var content string
if rules != "" {
- logger.Debug(`[Scraper] Using rules "%s" for "%s"`, rules, websiteURL)
- content, err = scrapContent(page, rules)
+ logger.Debug(`[Scraper] Using rules %q for %q`, rules, websiteURL)
+ content, err = scrapContent(response.Body, rules)
} else {
- logger.Debug(`[Scraper] Using readability for "%s"`, websiteURL)
- content, err = readability.ExtractContent(page)
+ logger.Debug(`[Scraper] Using readability for "%q`, websiteURL)
+ content, err = readability.ExtractContent(response.Body)
}
if err != nil {
diff --git a/reader/subscription/finder.go b/reader/subscription/finder.go
index 027e810..c59c002 100644
--- a/reader/subscription/finder.go
+++ b/reader/subscription/finder.go
@@ -5,15 +5,15 @@
package subscription // import "miniflux.app/reader/subscription"
import (
- "bytes"
"fmt"
"io"
+ "strings"
"time"
"miniflux.app/errors"
"miniflux.app/http/client"
"miniflux.app/logger"
- "miniflux.app/reader/feed"
+ "miniflux.app/reader/parser"
"miniflux.app/timer"
"miniflux.app/url"
@@ -56,20 +56,12 @@ func FindSubscriptions(websiteURL, userAgent, username, password string) (Subscr
return nil, errors.NewLocalizedError(errEmptyBody)
}
- body, err := response.NormalizeBodyEncoding()
- if err != nil {
+ if err := response.EnsureUnicodeBody(); err != nil {
return nil, err
}
- var buffer bytes.Buffer
- size, _ := io.Copy(&buffer, body)
- if size == 0 {
- return nil, errors.NewLocalizedError(errEmptyBody)
- }
-
- reader := bytes.NewReader(buffer.Bytes())
-
- if format := feed.DetectFeedFormat(reader); format != feed.FormatUnknown {
+ body := response.String()
+ if format := parser.DetectFeedFormat(body); format != parser.FormatUnknown {
var subscriptions Subscriptions
subscriptions = append(subscriptions, &Subscription{
Title: response.EffectiveURL,
@@ -80,8 +72,7 @@ func FindSubscriptions(websiteURL, userAgent, username, password string) (Subscr
return subscriptions, nil
}
- reader.Seek(0, io.SeekStart)
- return parseDocument(response.EffectiveURL, bytes.NewReader(buffer.Bytes()))
+ return parseDocument(response.EffectiveURL, strings.NewReader(body))
}
func parseDocument(websiteURL string, data io.Reader) (Subscriptions, error) {