From 15505ee4a2bd4963d0cbc9d1820e9be641b221ca Mon Sep 17 00:00:00 2001 From: Peter De Wachter Date: Tue, 1 Jan 2019 22:06:58 +0100 Subject: Make UTF-8 the default encoding for XML feeds Consider the feed http://planet.haskell.org/atom.xml - This is a UTF-8 encoded XML file - No encoding declaration in the XML header - No Unicode byte order mark - Served with HTTP Content-Type "text/xml" (no charset parameter) Miniflux lets charset.NewReader handle this. The charset package implements the HTML5 character encoding algorithm, which, in this situation, defaults to windows-1252 encoding if there are no non-ASCII UTF-8 characters in the first 1024 bytes. So for this feed, we get the wrong encoding. I inserted an explicit "utf8.Valid()" check, which fixes this problem. --- http/client/response.go | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'http') diff --git a/http/client/response.go b/http/client/response.go index 61a9704..0e918b5 100644 --- a/http/client/response.go +++ b/http/client/response.go @@ -11,6 +11,7 @@ import ( "mime" "regexp" "strings" + "unicode/utf8" "golang.org/x/net/html/charset" ) @@ -97,6 +98,12 @@ func (r *Response) EnsureUnicodeBody() (err error) { if xmlEncodingRegex.Match(buffer[0:length]) { return } + + // If no encoding is specified in the XML prolog and + // the document is valid UTF-8, nothing needs to be done. + if utf8.Valid(buffer) { + return + } } } -- cgit v1.2.3