aboutsummaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
authorGravatar Frédéric Guillot <fred@miniflux.net>2018-01-19 22:42:55 -0800
committerGravatar Frédéric Guillot <fred@miniflux.net>2018-01-20 13:25:21 -0800
commit713b38e34c6d4a232af871659793bb9674e020f7 (patch)
tree7f7797e140334d6ad69974cbce7e5f3746d31c6b
parent3b62f904d6d3fdad168d3b212ff7c465c01b50f4 (diff)
Handle more encoding edge cases
- Feeds with charset specified only in Content-Type header and not in XML document - Feeds with charset specified in both places - Feeds with charset specified only in XML document and not in HTTP header
-rw-r--r--http/client.go3
-rw-r--r--http/response.go19
-rw-r--r--reader/atom/parser.go5
-rw-r--r--reader/encoding/doc.go10
-rw-r--r--reader/encoding/encoding.go38
-rw-r--r--reader/feed/parser.go13
-rw-r--r--reader/opml/parser.go4
-rw-r--r--reader/rdf/parser.go4
-rw-r--r--reader/rss/parser.go5
-rw-r--r--reader/subscription/finder.go7
10 files changed, 87 insertions, 21 deletions
diff --git a/http/client.go b/http/client.go
index cdff0ce..b8541a2 100644
--- a/http/client.go
+++ b/http/client.go
@@ -95,11 +95,12 @@ func (c *Client) executeRequest(request *http.Request) (*Response, error) {
ContentLength: resp.ContentLength,
}
- logger.Debug("[HttpClient:%s] OriginalURL=%s, StatusCode=%d, ContentLength=%d, ETag=%s, LastModified=%s, EffectiveURL=%s",
+ logger.Debug("[HttpClient:%s] OriginalURL=%s, StatusCode=%d, ContentLength=%d, ContentType=%s, ETag=%s, LastModified=%s, EffectiveURL=%s",
request.Method,
c.url,
response.StatusCode,
resp.ContentLength,
+ response.ContentType,
response.ETag,
response.LastModified,
response.EffectiveURL,
diff --git a/http/response.go b/http/response.go
index d9e9db6..a0cfc3f 100644
--- a/http/response.go
+++ b/http/response.go
@@ -6,8 +6,10 @@ package http
import (
"io"
+ "mime"
"strings"
+ "github.com/miniflux/miniflux/logger"
"golang.org/x/net/html/charset"
)
@@ -45,9 +47,22 @@ func (r *Response) IsModified(etag, lastModified string) bool {
}
// NormalizeBodyEncoding make sure the body is encoded in UTF-8.
+//
+// If a charset other than UTF-8 is detected, we convert the document to UTF-8.
+// This is used by the scraper and feed readers.
+//
+// Do not forget edge cases:
+// - Some non-utf8 feeds specify encoding only in Content-Type, not in XML document.
func (r *Response) NormalizeBodyEncoding() (io.Reader, error) {
- if strings.Contains(r.ContentType, "charset=") {
- return charset.NewReader(r.Body, r.ContentType)
+ _, params, err := mime.ParseMediaType(r.ContentType)
+ if err == nil {
+ if enc, found := params["charset"]; found {
+ enc = strings.ToLower(enc)
+ if enc != "utf-8" && enc != "utf8" && enc != "" {
+ logger.Debug("[NormalizeBodyEncoding] Convert body to UTF-8 from %s", enc)
+ return charset.NewReader(r.Body, r.ContentType)
+ }
+ }
}
return r.Body, nil
}
diff --git a/reader/atom/parser.go b/reader/atom/parser.go
index 8e83bc9..94b50c7 100644
--- a/reader/atom/parser.go
+++ b/reader/atom/parser.go
@@ -10,15 +10,14 @@ import (
"github.com/miniflux/miniflux/errors"
"github.com/miniflux/miniflux/model"
-
- "golang.org/x/net/html/charset"
+ "github.com/miniflux/miniflux/reader/encoding"
)
// Parse returns a normalized feed struct from a Atom feed.
func Parse(data io.Reader) (*model.Feed, error) {
atomFeed := new(atomFeed)
decoder := xml.NewDecoder(data)
- decoder.CharsetReader = charset.NewReaderLabel
+ decoder.CharsetReader = encoding.CharsetReader
err := decoder.Decode(atomFeed)
if err != nil {
diff --git a/reader/encoding/doc.go b/reader/encoding/doc.go
new file mode 100644
index 0000000..bd3a42e
--- /dev/null
+++ b/reader/encoding/doc.go
@@ -0,0 +1,10 @@
+// Copyright 2018 Frédéric Guillot. All rights reserved.
+// Use of this source code is governed by the Apache 2.0
+// license that can be found in the LICENSE file.
+
+/*
+
+Package encoding provides workarounds for encoding edge cases found in the wild.
+
+*/
+package encoding
diff --git a/reader/encoding/encoding.go b/reader/encoding/encoding.go
new file mode 100644
index 0000000..7f726ad
--- /dev/null
+++ b/reader/encoding/encoding.go
@@ -0,0 +1,38 @@
+// Copyright 2018 Frédéric Guillot. All rights reserved.
+// Use of this source code is governed by the Apache 2.0
+// license that can be found in the LICENSE file.
+
+package encoding
+
+import (
+ "bytes"
+ "io"
+ "unicode/utf8"
+
+ "golang.org/x/net/html/charset"
+)
+
+// CharsetReader is used when the XML encoding is specified for the input document.
+//
+// The document is converted to UTF-8 only if a different encoding is specified
+// and the document is not already UTF-8.
+//
+// Several edge cases could exist:
+//
+// - Feeds with charset specified only in Content-Type header and not in XML document
+// - Feeds with charset specified in both places
+// - Feeds with charset specified only in XML document and not in HTTP header
+func CharsetReader(label string, input io.Reader) (io.Reader, error) {
+ var buf1, buf2 bytes.Buffer
+ w := io.MultiWriter(&buf1, &buf2)
+ io.Copy(w, input)
+ r := bytes.NewReader(buf2.Bytes())
+
+ if !utf8.Valid(buf1.Bytes()) {
+ // Transform document to UTF-8 from the specified XML encoding.
+ return charset.NewReaderLabel(label, r)
+ }
+
+ // The document is already UTF-8, do not do anything (avoid double-encoding)
+ return r, nil
+}
diff --git a/reader/feed/parser.go b/reader/feed/parser.go
index e612f39..a2b39ed 100644
--- a/reader/feed/parser.go
+++ b/reader/feed/parser.go
@@ -14,12 +14,11 @@ import (
"github.com/miniflux/miniflux/model"
"github.com/miniflux/miniflux/reader/atom"
+ "github.com/miniflux/miniflux/reader/encoding"
"github.com/miniflux/miniflux/reader/json"
"github.com/miniflux/miniflux/reader/rdf"
"github.com/miniflux/miniflux/reader/rss"
"github.com/miniflux/miniflux/timer"
-
- "golang.org/x/net/html/charset"
)
// List of feed formats.
@@ -32,14 +31,14 @@ const (
)
// DetectFeedFormat detect feed format from input data.
-func DetectFeedFormat(data io.Reader) string {
+func DetectFeedFormat(r io.Reader) string {
defer timer.ExecutionTime(time.Now(), "[Feed:DetectFeedFormat]")
var buffer bytes.Buffer
- tee := io.TeeReader(data, &buffer)
+ tee := io.TeeReader(r, &buffer)
decoder := xml.NewDecoder(tee)
- decoder.CharsetReader = charset.NewReaderLabel
+ decoder.CharsetReader = encoding.CharsetReader
for {
token, _ := decoder.Token()
@@ -66,11 +65,11 @@ func DetectFeedFormat(data io.Reader) string {
return FormatUnknown
}
-func parseFeed(data io.Reader) (*model.Feed, error) {
+func parseFeed(r io.Reader) (*model.Feed, error) {
defer timer.ExecutionTime(time.Now(), "[Feed:ParseFeed]")
var buffer bytes.Buffer
- io.Copy(&buffer, data)
+ io.Copy(&buffer, r)
reader := bytes.NewReader(buffer.Bytes())
format := DetectFeedFormat(reader)
diff --git a/reader/opml/parser.go b/reader/opml/parser.go
index 8d61aa8..140acc5 100644
--- a/reader/opml/parser.go
+++ b/reader/opml/parser.go
@@ -9,14 +9,14 @@ import (
"io"
"github.com/miniflux/miniflux/errors"
- "golang.org/x/net/html/charset"
+ "github.com/miniflux/miniflux/reader/encoding"
)
// Parse reads an OPML file and returns a SubcriptionList.
func Parse(data io.Reader) (SubcriptionList, error) {
feeds := new(opml)
decoder := xml.NewDecoder(data)
- decoder.CharsetReader = charset.NewReaderLabel
+ decoder.CharsetReader = encoding.CharsetReader
err := decoder.Decode(feeds)
if err != nil {
diff --git a/reader/rdf/parser.go b/reader/rdf/parser.go
index 7bd3092..da82639 100644
--- a/reader/rdf/parser.go
+++ b/reader/rdf/parser.go
@@ -10,14 +10,14 @@ import (
"github.com/miniflux/miniflux/errors"
"github.com/miniflux/miniflux/model"
- "golang.org/x/net/html/charset"
+ "github.com/miniflux/miniflux/reader/encoding"
)
// Parse returns a normalized feed struct from a RDF feed.
func Parse(data io.Reader) (*model.Feed, error) {
feed := new(rdfFeed)
decoder := xml.NewDecoder(data)
- decoder.CharsetReader = charset.NewReaderLabel
+ decoder.CharsetReader = encoding.CharsetReader
err := decoder.Decode(feed)
if err != nil {
diff --git a/reader/rss/parser.go b/reader/rss/parser.go
index 0e9738a..f5de561 100644
--- a/reader/rss/parser.go
+++ b/reader/rss/parser.go
@@ -10,15 +10,14 @@ import (
"github.com/miniflux/miniflux/errors"
"github.com/miniflux/miniflux/model"
-
- "golang.org/x/net/html/charset"
+ "github.com/miniflux/miniflux/reader/encoding"
)
// Parse returns a normalized feed struct from a RSS feed.
func Parse(data io.Reader) (*model.Feed, error) {
feed := new(rssFeed)
decoder := xml.NewDecoder(data)
- decoder.CharsetReader = charset.NewReaderLabel
+ decoder.CharsetReader = encoding.CharsetReader
err := decoder.Decode(feed)
if err != nil {
diff --git a/reader/subscription/finder.go b/reader/subscription/finder.go
index c3bf8bb..835991c 100644
--- a/reader/subscription/finder.go
+++ b/reader/subscription/finder.go
@@ -35,8 +35,13 @@ func FindSubscriptions(websiteURL string) (Subscriptions, error) {
return nil, errors.NewLocalizedError(errConnectionFailure, err)
}
+ body, err := response.NormalizeBodyEncoding()
+ if err != nil {
+ return nil, err
+ }
+
var buffer bytes.Buffer
- io.Copy(&buffer, response.Body)
+ io.Copy(&buffer, body)
reader := bytes.NewReader(buffer.Bytes())
if format := feed.DetectFeedFormat(reader); format != feed.FormatUnknown {