diff options
author | Frédéric Guillot <fred@miniflux.net> | 2018-01-19 22:42:55 -0800 |
---|---|---|
committer | Frédéric Guillot <fred@miniflux.net> | 2018-01-20 13:25:21 -0800 |
commit | 713b38e34c6d4a232af871659793bb9674e020f7 (patch) | |
tree | 7f7797e140334d6ad69974cbce7e5f3746d31c6b /reader/encoding | |
parent | 3b62f904d6d3fdad168d3b212ff7c465c01b50f4 (diff) |
Handle more encoding edge cases
- Feeds with charset specified only in Content-Type header and not in XML document
- Feeds with charset specified in both places
- Feeds with charset specified only in XML document and not in HTTP header
Diffstat (limited to 'reader/encoding')
-rw-r--r-- | reader/encoding/doc.go | 10 | ||||
-rw-r--r-- | reader/encoding/encoding.go | 38 |
2 files changed, 48 insertions, 0 deletions
```diff
diff --git a/reader/encoding/doc.go b/reader/encoding/doc.go
new file mode 100644
index 0000000..bd3a42e
--- /dev/null
+++ b/reader/encoding/doc.go
@@ -0,0 +1,10 @@
+// Copyright 2018 Frédéric Guillot. All rights reserved.
+// Use of this source code is governed by the Apache 2.0
+// license that can be found in the LICENSE file.
+
+/*
+
+Package encoding handles workarounds to deal with encoding edge cases found in the wild.
+
+*/
+package encoding
diff --git a/reader/encoding/encoding.go b/reader/encoding/encoding.go
new file mode 100644
index 0000000..7f726ad
--- /dev/null
+++ b/reader/encoding/encoding.go
@@ -0,0 +1,38 @@
+// Copyright 2018 Frédéric Guillot. All rights reserved.
+// Use of this source code is governed by the Apache 2.0
+// license that can be found in the LICENSE file.
+
+package encoding
+
+import (
+	"bytes"
+	"io"
+	"unicode/utf8"
+
+	"golang.org/x/net/html/charset"
+)
+
+// CharsetReader is used when the XML encoding is specified for the input document.
+//
+// The document is converted to UTF-8 only if a different encoding is specified
+// and the document is not already UTF-8.
+//
+// Several edge cases could exist:
+//
+// - Feeds with charset specified only in Content-Type header and not in XML document
+// - Feeds with charset specified in both places
+// - Feeds with charset specified only in XML document and not in HTTP header
+func CharsetReader(label string, input io.Reader) (io.Reader, error) {
+	var buf1, buf2 bytes.Buffer
+	w := io.MultiWriter(&buf1, &buf2)
+	io.Copy(w, input)
+	r := bytes.NewReader(buf2.Bytes())
+
+	if !utf8.Valid(buf1.Bytes()) {
+		// Transform document to UTF-8 from the specified XML encoding.
+		return charset.NewReaderLabel(label, r)
+	}
+
+	// The document is already UTF-8, do not do anything (avoid double-encoding)
+	return r, nil
+}
```