aboutsummaryrefslogtreecommitdiffhomepage
path: root/http
diff options
context:
space:
mode:
authorGravatar Frédéric Guillot <fred@miniflux.net>2018-01-19 22:42:55 -0800
committerGravatar Frédéric Guillot <fred@miniflux.net>2018-01-20 13:25:21 -0800
commit 713b38e34c6d4a232af871659793bb9674e020f7 (patch)
tree 7f7797e140334d6ad69974cbce7e5f3746d31c6b /http
parent 3b62f904d6d3fdad168d3b212ff7c465c01b50f4 (diff)
Handle more encoding edge cases
- Feeds with charset specified only in Content-Type header and not in XML document
- Feeds with charset specified in both places
- Feeds with charset specified only in XML document and not in HTTP header
Diffstat (limited to 'http')
-rw-r--r-- http/client.go 3
-rw-r--r-- http/response.go 19
2 files changed, 19 insertions, 3 deletions
diff --git a/http/client.go b/http/client.go
index cdff0ce..b8541a2 100644
--- a/http/client.go
+++ b/http/client.go
@@ -95,11 +95,12 @@ func (c *Client) executeRequest(request *http.Request) (*Response, error) {
ContentLength: resp.ContentLength,
}
- logger.Debug("[HttpClient:%s] OriginalURL=%s, StatusCode=%d, ContentLength=%d, ETag=%s, LastModified=%s, EffectiveURL=%s",
+ logger.Debug("[HttpClient:%s] OriginalURL=%s, StatusCode=%d, ContentLength=%d, ContentType=%s, ETag=%s, LastModified=%s, EffectiveURL=%s",
request.Method,
c.url,
response.StatusCode,
resp.ContentLength,
+ response.ContentType,
response.ETag,
response.LastModified,
response.EffectiveURL,
diff --git a/http/response.go b/http/response.go
index d9e9db6..a0cfc3f 100644
--- a/http/response.go
+++ b/http/response.go
@@ -6,8 +6,10 @@ package http
import (
"io"
+ "mime"
"strings"
+ "github.com/miniflux/miniflux/logger"
"golang.org/x/net/html/charset"
)
@@ -45,9 +47,22 @@ func (r *Response) IsModified(etag, lastModified string) bool {
}
// NormalizeBodyEncoding make sure the body is encoded in UTF-8.
+//
+// If a charset other than UTF-8 is detected, we convert the document to UTF-8.
+// This is used by the scraper and feed readers.
+//
+// Do not forget edge cases:
+// - Some non-utf8 feeds specify encoding only in Content-Type, not in XML document.
func (r *Response) NormalizeBodyEncoding() (io.Reader, error) {
- if strings.Contains(r.ContentType, "charset=") {
- return charset.NewReader(r.Body, r.ContentType)
+ _, params, err := mime.ParseMediaType(r.ContentType)
+ if err == nil {
+ if enc, found := params["charset"]; found {
+ enc = strings.ToLower(enc)
+ if enc != "utf-8" && enc != "utf8" && enc != "" {
+ logger.Debug("[NormalizeBodyEncoding] Convert body to UTF-8 from %s", enc)
+ return charset.NewReader(r.Body, r.ContentType)
+ }
+ }
}
return r.Body, nil
}