From 713b38e34c6d4a232af871659793bb9674e020f7 Mon Sep 17 00:00:00 2001 From: Frédéric Guillot Date: Fri, 19 Jan 2018 22:42:55 -0800 Subject: Handle more encoding edge cases - Feeds with charset specified only in Content-Type header and not in XML document - Feeds with charset specified in both places - Feeds with charset specified only in XML document and not in HTTP header --- reader/atom/parser.go | 5 ++--- reader/encoding/doc.go | 10 ++++++++++ reader/encoding/encoding.go | 38 ++++++++++++++++++++++++++++++++++++++ reader/feed/parser.go | 13 ++++++------- reader/opml/parser.go | 4 ++-- reader/rdf/parser.go | 4 ++-- reader/rss/parser.go | 5 ++--- reader/subscription/finder.go | 7 ++++++- 8 files changed, 68 insertions(+), 18 deletions(-) create mode 100644 reader/encoding/doc.go create mode 100644 reader/encoding/encoding.go (limited to 'reader') diff --git a/reader/atom/parser.go b/reader/atom/parser.go index 8e83bc9..94b50c7 100644 --- a/reader/atom/parser.go +++ b/reader/atom/parser.go @@ -10,15 +10,14 @@ import ( "github.com/miniflux/miniflux/errors" "github.com/miniflux/miniflux/model" - - "golang.org/x/net/html/charset" + "github.com/miniflux/miniflux/reader/encoding" ) // Parse returns a normalized feed struct from a Atom feed. func Parse(data io.Reader) (*model.Feed, error) { atomFeed := new(atomFeed) decoder := xml.NewDecoder(data) - decoder.CharsetReader = charset.NewReaderLabel + decoder.CharsetReader = encoding.CharsetReader err := decoder.Decode(atomFeed) if err != nil { diff --git a/reader/encoding/doc.go b/reader/encoding/doc.go new file mode 100644 index 0000000..bd3a42e --- /dev/null +++ b/reader/encoding/doc.go @@ -0,0 +1,10 @@ +// Copyright 2018 Frédéric Guillot. All rights reserved. +// Use of this source code is governed by the Apache 2.0 +// license that can be found in the LICENSE file. + +/* + +Package encoding handles workarounds to deal with encoding edge cases found in the wild. 
+ +*/ +package encoding diff --git a/reader/encoding/encoding.go b/reader/encoding/encoding.go new file mode 100644 index 0000000..7f726ad --- /dev/null +++ b/reader/encoding/encoding.go @@ -0,0 +1,38 @@ +// Copyright 2018 Frédéric Guillot. All rights reserved. +// Use of this source code is governed by the Apache 2.0 +// license that can be found in the LICENSE file. + +package encoding + +import ( + "bytes" + "io" + "unicode/utf8" + + "golang.org/x/net/html/charset" +) + +// CharsetReader is used when the XML encoding is specified for the input document. +// +// The document is converted to UTF-8 only if a different encoding is specified +// and the document is not already UTF-8. +// +// Several edge cases could exist: +// +// - Feeds with charset specified only in Content-Type header and not in XML document +// - Feeds with charset specified in both places +// - Feeds with charset specified only in XML document and not in HTTP header +func CharsetReader(label string, input io.Reader) (io.Reader, error) { + var buf1, buf2 bytes.Buffer + w := io.MultiWriter(&buf1, &buf2) + io.Copy(w, input) + r := bytes.NewReader(buf2.Bytes()) + + if !utf8.Valid(buf1.Bytes()) { + // Transform document to UTF-8 from the specified XML encoding. + return charset.NewReaderLabel(label, r) + } + + // The document is already UTF-8, do not do anything (avoid double-encoding) + return r, nil +} diff --git a/reader/feed/parser.go b/reader/feed/parser.go index e612f39..a2b39ed 100644 --- a/reader/feed/parser.go +++ b/reader/feed/parser.go @@ -14,12 +14,11 @@ import ( "github.com/miniflux/miniflux/model" "github.com/miniflux/miniflux/reader/atom" + "github.com/miniflux/miniflux/reader/encoding" "github.com/miniflux/miniflux/reader/json" "github.com/miniflux/miniflux/reader/rdf" "github.com/miniflux/miniflux/reader/rss" "github.com/miniflux/miniflux/timer" - - "golang.org/x/net/html/charset" ) // List of feed formats. 
@@ -32,14 +31,14 @@ const ( ) // DetectFeedFormat detect feed format from input data. -func DetectFeedFormat(data io.Reader) string { +func DetectFeedFormat(r io.Reader) string { defer timer.ExecutionTime(time.Now(), "[Feed:DetectFeedFormat]") var buffer bytes.Buffer - tee := io.TeeReader(data, &buffer) + tee := io.TeeReader(r, &buffer) decoder := xml.NewDecoder(tee) - decoder.CharsetReader = charset.NewReaderLabel + decoder.CharsetReader = encoding.CharsetReader for { token, _ := decoder.Token() @@ -66,11 +65,11 @@ func DetectFeedFormat(data io.Reader) string { return FormatUnknown } -func parseFeed(data io.Reader) (*model.Feed, error) { +func parseFeed(r io.Reader) (*model.Feed, error) { defer timer.ExecutionTime(time.Now(), "[Feed:ParseFeed]") var buffer bytes.Buffer - io.Copy(&buffer, data) + io.Copy(&buffer, r) reader := bytes.NewReader(buffer.Bytes()) format := DetectFeedFormat(reader) diff --git a/reader/opml/parser.go b/reader/opml/parser.go index 8d61aa8..140acc5 100644 --- a/reader/opml/parser.go +++ b/reader/opml/parser.go @@ -9,14 +9,14 @@ import ( "io" "github.com/miniflux/miniflux/errors" - "golang.org/x/net/html/charset" + "github.com/miniflux/miniflux/reader/encoding" ) // Parse reads an OPML file and returns a SubcriptionList. func Parse(data io.Reader) (SubcriptionList, error) { feeds := new(opml) decoder := xml.NewDecoder(data) - decoder.CharsetReader = charset.NewReaderLabel + decoder.CharsetReader = encoding.CharsetReader err := decoder.Decode(feeds) if err != nil { diff --git a/reader/rdf/parser.go b/reader/rdf/parser.go index 7bd3092..da82639 100644 --- a/reader/rdf/parser.go +++ b/reader/rdf/parser.go @@ -10,14 +10,14 @@ import ( "github.com/miniflux/miniflux/errors" "github.com/miniflux/miniflux/model" - "golang.org/x/net/html/charset" + "github.com/miniflux/miniflux/reader/encoding" ) // Parse returns a normalized feed struct from a RDF feed. 
func Parse(data io.Reader) (*model.Feed, error) { feed := new(rdfFeed) decoder := xml.NewDecoder(data) - decoder.CharsetReader = charset.NewReaderLabel + decoder.CharsetReader = encoding.CharsetReader err := decoder.Decode(feed) if err != nil { diff --git a/reader/rss/parser.go b/reader/rss/parser.go index 0e9738a..f5de561 100644 --- a/reader/rss/parser.go +++ b/reader/rss/parser.go @@ -10,15 +10,14 @@ import ( "github.com/miniflux/miniflux/errors" "github.com/miniflux/miniflux/model" - - "golang.org/x/net/html/charset" + "github.com/miniflux/miniflux/reader/encoding" ) // Parse returns a normalized feed struct from a RSS feed. func Parse(data io.Reader) (*model.Feed, error) { feed := new(rssFeed) decoder := xml.NewDecoder(data) - decoder.CharsetReader = charset.NewReaderLabel + decoder.CharsetReader = encoding.CharsetReader err := decoder.Decode(feed) if err != nil { diff --git a/reader/subscription/finder.go b/reader/subscription/finder.go index c3bf8bb..835991c 100644 --- a/reader/subscription/finder.go +++ b/reader/subscription/finder.go @@ -35,8 +35,13 @@ func FindSubscriptions(websiteURL string) (Subscriptions, error) { return nil, errors.NewLocalizedError(errConnectionFailure, err) } + body, err := response.NormalizeBodyEncoding() + if err != nil { + return nil, err + } + var buffer bytes.Buffer - io.Copy(&buffer, response.Body) + io.Copy(&buffer, body) reader := bytes.NewReader(buffer.Bytes()) if format := feed.DetectFeedFormat(reader); format != feed.FormatUnknown { -- cgit v1.2.3