From 713b38e34c6d4a232af871659793bb9674e020f7 Mon Sep 17 00:00:00 2001
From: Frédéric Guillot
Date: Fri, 19 Jan 2018 22:42:55 -0800
Subject: Handle more encoding edge cases

- Feeds with charset specified only in Content-Type header and not in XML document
- Feeds with charset specified in both places
- Feeds with charset specified only in XML document and not in HTTP header
---
 http/client.go   |  3 ++-
 http/response.go | 19 +++++++++++++++++--
 2 files changed, 19 insertions(+), 3 deletions(-)

(limited to 'http')

diff --git a/http/client.go b/http/client.go
index cdff0ce..b8541a2 100644
--- a/http/client.go
+++ b/http/client.go
@@ -95,11 +95,12 @@ func (c *Client) executeRequest(request *http.Request) (*Response, error) {
 		ContentLength: resp.ContentLength,
 	}
 
-	logger.Debug("[HttpClient:%s] OriginalURL=%s, StatusCode=%d, ContentLength=%d, ETag=%s, LastModified=%s, EffectiveURL=%s",
+	logger.Debug("[HttpClient:%s] OriginalURL=%s, StatusCode=%d, ContentLength=%d, ContentType=%s, ETag=%s, LastModified=%s, EffectiveURL=%s",
 		request.Method,
 		c.url,
 		response.StatusCode,
 		resp.ContentLength,
+		response.ContentType,
 		response.ETag,
 		response.LastModified,
 		response.EffectiveURL,
diff --git a/http/response.go b/http/response.go
index d9e9db6..a0cfc3f 100644
--- a/http/response.go
+++ b/http/response.go
@@ -6,8 +6,10 @@ package http
 
 import (
 	"io"
+	"mime"
 	"strings"
 
+	"github.com/miniflux/miniflux/logger"
 	"golang.org/x/net/html/charset"
 )
 
@@ -45,9 +47,22 @@ func (r *Response) IsModified(etag, lastModified string) bool {
 }
 
 // NormalizeBodyEncoding make sure the body is encoded in UTF-8.
+//
+// If a charset other than UTF-8 is detected, we convert the document to UTF-8.
+// This is used by the scraper and feed readers.
+//
+// Do not forget edge cases:
+// - Some non-utf8 feeds specify encoding only in Content-Type, not in XML document.
 func (r *Response) NormalizeBodyEncoding() (io.Reader, error) {
-	if strings.Contains(r.ContentType, "charset=") {
-		return charset.NewReader(r.Body, r.ContentType)
+	_, params, err := mime.ParseMediaType(r.ContentType)
+	if err == nil {
+		if enc, found := params["charset"]; found {
+			enc = strings.ToLower(enc)
+			if enc != "utf-8" && enc != "utf8" && enc != "" {
+				logger.Debug("[NormalizeBodyEncoding] Convert body to UTF-8 from %s", enc)
+				return charset.NewReader(r.Body, r.ContentType)
+			}
+		}
 	}
 	return r.Body, nil
 }
-- 
cgit v1.2.3