about | summary | refs | log | tree | commit | diff | homepage
path: root/reader
diff options
context:
space:
mode:
author: Frédéric Guillot <fred@miniflux.net> 2018-10-29 23:00:03 -0700
committer: Frédéric Guillot <fred@miniflux.net> 2018-10-29 23:00:03 -0700
commit: ae1dc1a91eea23be14f952efb130412fe6a7996b (patch)
tree: 21874d152599191e97f38b7dbf5fac24cfb95823 /reader
parent: 5ff06307265f773a39819d8229d7bc058f3bd7dc (diff)
Handle more encoding conversion edge cases
Diffstat (limited to 'reader')
-rw-r--r-- reader/encoding/encoding.go 25
-rwxr-xr-x reader/parser/debug.test bin 0 -> 9543972 bytes
-rw-r--r-- reader/parser/parser_test.go 2
3 files changed, 14 insertions, 13 deletions
diff --git a/reader/encoding/encoding.go b/reader/encoding/encoding.go
index 497b2d0..7f4abda 100644
--- a/reader/encoding/encoding.go
+++ b/reader/encoding/encoding.go
@@ -7,6 +7,7 @@ package encoding // import "miniflux.app/reader/encoding"
import (
"bytes"
"io"
+ "io/ioutil"
"unicode/utf8"
"golang.org/x/net/html/charset"
@@ -19,20 +20,20 @@ import (
//
// Several edge cases could exists:
//
-// - Feeds with charset specified only in Content-Type header and not in XML document
-// - Feeds with charset specified in both places
-// - Feeds with charset specified only in XML document and not in HTTP header
+// - Feeds with encoding specified only in Content-Type header and not in XML document
+// - Feeds with encoding specified in both places
+// - Feeds with encoding specified only in XML document and not in HTTP header
+// - Feeds with wrong encoding defined and already in UTF-8
func CharsetReader(label string, input io.Reader) (io.Reader, error) {
- var buf1, buf2 bytes.Buffer
- w := io.MultiWriter(&buf1, &buf2)
- io.Copy(w, input)
- r := bytes.NewReader(buf2.Bytes())
+ buffer, _ := ioutil.ReadAll(input)
+ r := bytes.NewReader(buffer)
- if !utf8.Valid(buf1.Bytes()) {
- // Transform document to UTF-8 from the specified XML encoding.
- return charset.NewReaderLabel(label, r)
+ // The document is already UTF-8, do not do anything (avoid double-encoding).
+ // That means the specified encoding in XML prolog is wrong.
+ if utf8.Valid(buffer) {
+ return r, nil
}
- // The document is already UTF-8, do not do anything (avoid double-encoding)
- return r, nil
+ // Transform document to UTF-8 from the specified encoding in XML prolog.
+ return charset.NewReaderLabel(label, r)
}
diff --git a/reader/parser/debug.test b/reader/parser/debug.test
new file mode 100755
index 0000000..f97b9b3
--- /dev/null
+++ b/reader/parser/debug.test
Binary files differ
diff --git a/reader/parser/parser_test.go b/reader/parser/parser_test.go
index 8f0f162..b7a93c5 100644
--- a/reader/parser/parser_test.go
+++ b/reader/parser/parser_test.go
@@ -190,7 +190,7 @@ func TestDifferentEncodingWithResponse(t *testing.T) {
r.EnsureUnicodeBody()
feed, parseErr := ParseFeed(r.String())
if parseErr != nil {
- t.Errorf(`Parsing error for %q - %q: %v`, tc.filename, tc.contentType, parseErr)
+ t.Fatalf(`Parsing error for %q - %q: %v`, tc.filename, tc.contentType, parseErr)
}
if feed.Entries[tc.index].Title != tc.title {