From ae1dc1a91eea23be14f952efb130412fe6a7996b Mon Sep 17 00:00:00 2001 From: Frédéric Guillot Date: Mon, 29 Oct 2018 23:00:03 -0700 Subject: Handle more encoding conversion edge cases --- reader/encoding/encoding.go | 25 +++++++++++++------------ reader/parser/debug.test | Bin 0 -> 9543972 bytes reader/parser/parser_test.go | 2 +- 3 files changed, 14 insertions(+), 13 deletions(-) create mode 100755 reader/parser/debug.test (limited to 'reader') diff --git a/reader/encoding/encoding.go b/reader/encoding/encoding.go index 497b2d0..7f4abda 100644 --- a/reader/encoding/encoding.go +++ b/reader/encoding/encoding.go @@ -7,6 +7,7 @@ package encoding // import "miniflux.app/reader/encoding" import ( "bytes" "io" + "io/ioutil" "unicode/utf8" "golang.org/x/net/html/charset" @@ -19,20 +20,20 @@ import ( // // Several edge cases could exists: // -// - Feeds with charset specified only in Content-Type header and not in XML document -// - Feeds with charset specified in both places -// - Feeds with charset specified only in XML document and not in HTTP header +// - Feeds with encoding specified only in Content-Type header and not in XML document +// - Feeds with encoding specified in both places +// - Feeds with encoding specified only in XML document and not in HTTP header +// - Feeds with wrong encoding defined and already in UTF-8 func CharsetReader(label string, input io.Reader) (io.Reader, error) { - var buf1, buf2 bytes.Buffer - w := io.MultiWriter(&buf1, &buf2) - io.Copy(w, input) - r := bytes.NewReader(buf2.Bytes()) + buffer, _ := ioutil.ReadAll(input) + r := bytes.NewReader(buffer) - if !utf8.Valid(buf1.Bytes()) { - // Transform document to UTF-8 from the specified XML encoding. - return charset.NewReaderLabel(label, r) + // The document is already UTF-8, do not do anything (avoid double-encoding). + // That means the specified encoding in XML prolog is wrong. + if utf8.Valid(buffer) { + return r, nil } - // The document is already UTF-8, do not do anything (avoid double-encoding) - return r, nil + // Transform document to UTF-8 from the specified encoding in XML prolog. + return charset.NewReaderLabel(label, r) } diff --git a/reader/parser/debug.test b/reader/parser/debug.test new file mode 100755 index 0000000..f97b9b3 Binary files /dev/null and b/reader/parser/debug.test differ diff --git a/reader/parser/parser_test.go b/reader/parser/parser_test.go index 8f0f162..b7a93c5 100644 --- a/reader/parser/parser_test.go +++ b/reader/parser/parser_test.go @@ -190,7 +190,7 @@ func TestDifferentEncodingWithResponse(t *testing.T) { r.EnsureUnicodeBody() feed, parseErr := ParseFeed(r.String()) if parseErr != nil { - t.Errorf(`Parsing error for %q - %q: %v`, tc.filename, tc.contentType, parseErr) + t.Fatalf(`Parsing error for %q - %q: %v`, tc.filename, tc.contentType, parseErr) } if feed.Entries[tc.index].Title != tc.title { -- cgit v1.2.3