aboutsummaryrefslogtreecommitdiffhomepage
path: root/reader/encoding
diff options
context:
space:
mode:
authorGravatar Frédéric Guillot <fred@miniflux.net>2018-01-19 22:42:55 -0800
committerGravatar Frédéric Guillot <fred@miniflux.net>2018-01-20 13:25:21 -0800
commit713b38e34c6d4a232af871659793bb9674e020f7 (patch)
tree7f7797e140334d6ad69974cbce7e5f3746d31c6b /reader/encoding
parent3b62f904d6d3fdad168d3b212ff7c465c01b50f4 (diff)
Handle more encoding edge cases
- Feeds with charset specified only in Content-Type header and not in XML document - Feeds with charset specified in both places - Feeds with charset specified only in XML document and not in HTTP header
Diffstat (limited to 'reader/encoding')
-rw-r--r--reader/encoding/doc.go10
-rw-r--r--reader/encoding/encoding.go38
2 files changed, 48 insertions, 0 deletions
diff --git a/reader/encoding/doc.go b/reader/encoding/doc.go
new file mode 100644
index 0000000..bd3a42e
--- /dev/null
+++ b/reader/encoding/doc.go
@@ -0,0 +1,10 @@
+// Copyright 2018 Frédéric Guillot. All rights reserved.
+// Use of this source code is governed by the Apache 2.0
+// license that can be found in the LICENSE file.
+
+/*
+
+Package encoding handles workarounds to deal with encoding edge cases found in the wild.
+
+*/
+package encoding
diff --git a/reader/encoding/encoding.go b/reader/encoding/encoding.go
new file mode 100644
index 0000000..7f726ad
--- /dev/null
+++ b/reader/encoding/encoding.go
@@ -0,0 +1,38 @@
+// Copyright 2018 Frédéric Guillot. All rights reserved.
+// Use of this source code is governed by the Apache 2.0
+// license that can be found in the LICENSE file.
+
+package encoding
+
+import (
+ "bytes"
+ "io"
+ "unicode/utf8"
+
+ "golang.org/x/net/html/charset"
+)
+
+// CharsetReader is used when the XML encoding is specified for the input document.
+//
+// The input is read fully into memory and converted to UTF-8 from the encoding named
+// by label only if its bytes are not already valid UTF-8. A read error is returned as is.
+//
+// Several edge cases could exist:
+//
+// - Feeds with charset specified only in Content-Type header and not in XML document
+// - Feeds with charset specified in both places
+// - Feeds with charset specified only in XML document and not in HTTP header
+func CharsetReader(label string, input io.Reader) (io.Reader, error) {
+	var buf bytes.Buffer
+	if _, err := io.Copy(&buf, input); err != nil {
+		return nil, err
+	}
+	r := bytes.NewReader(buf.Bytes())
+
+	if !utf8.Valid(buf.Bytes()) {
+		// Transform document to UTF-8 from the specified XML encoding.
+		return charset.NewReaderLabel(label, r)
+	}
+	// The document is already UTF-8, do not do anything (avoid double-encoding)
+	return r, nil
+}