about | summary | refs | log | tree | commit | diff | homepage
path: root/http/client/response.go
diff options
context:
space:
mode:
author: Frédéric Guillot <fred@miniflux.net> 2018-10-29 23:00:03 -0700
committer: Frédéric Guillot <fred@miniflux.net> 2018-10-29 23:00:03 -0700
commit: ae1dc1a91eea23be14f952efb130412fe6a7996b (patch)
tree: 21874d152599191e97f38b7dbf5fac24cfb95823 /http/client/response.go
parent: 5ff06307265f773a39819d8229d7bc058f3bd7dc (diff)
Handle more encoding conversion edge cases
Diffstat (limited to 'http/client/response.go')
-rw-r--r-- http/client/response.go 41
1 files changed, 27 insertions, 14 deletions
diff --git a/http/client/response.go b/http/client/response.go
index c084824..8fcaa26 100644
--- a/http/client/response.go
+++ b/http/client/response.go
@@ -5,15 +5,18 @@
package client // import "miniflux.app/http/client"
import (
+ "bytes"
"io"
"io/ioutil"
"mime"
+ "regexp"
"strings"
"golang.org/x/net/html/charset"
- "miniflux.app/logger"
)
+var xmlEncodingRegex = regexp.MustCompile(`<\?xml(.*)encoding="(.+)"(.*)\?>`)
+
// Response wraps a server response.
type Response struct {
Body io.Reader
@@ -63,22 +66,32 @@ func (r *Response) IsModified(etag, lastModified string) bool {
// This is used by the scraper and feed readers.
//
// Do not forget edge cases:
-// - Some non-utf8 feeds specify encoding only in Content-Type, not in XML document.
-func (r *Response) EnsureUnicodeBody() error {
- _, params, err := mime.ParseMediaType(r.ContentType)
- if err == nil {
- if enc, found := params["charset"]; found {
- enc = strings.ToLower(enc)
- if enc != "utf-8" && enc != "utf8" && enc != "" {
- logger.Debug("[EnsureUnicodeBody] Convert body to utf-8 from %s", enc)
- r.Body, err = charset.NewReader(r.Body, r.ContentType)
- if err != nil {
- return err
- }
+//
+// - Feeds with encoding specified only in Content-Type header and not in XML document
+// - Feeds with encoding specified in both places
+// - Feeds with encoding specified only in XML document and not in HTTP header
+// - Feeds with wrong encoding defined and already in UTF-8
+func (r *Response) EnsureUnicodeBody() (err error) {
+ if r.ContentType != "" {
+ mediaType, _, mediaErr := mime.ParseMediaType(r.ContentType)
+ if mediaErr != nil {
+ return mediaErr
+ }
+
+ if strings.Contains(mediaType, "xml") {
+ buffer, _ := ioutil.ReadAll(r.Body)
+ r.Body = bytes.NewReader(buffer)
+
+ // We ignore documents with encoding specified in XML prolog.
+ // This is going to be handled by the XML parser.
+ if xmlEncodingRegex.Match(buffer[0:1024]) {
+ return
}
}
}
- return nil
+
+ r.Body, err = charset.NewReader(r.Body, r.ContentType)
+ return err
}
// String returns the response body as string.