aboutsummaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
authorGravatar Jebbs <qjebbs@gmail.com>2019-12-20 10:31:52 +0800
committerGravatar Frédéric Guillot <fred@miniflux.net>2019-12-19 18:31:52 -0800
commita155ab6debe1f271f809cce2ff4a20b2626e7d69 (patch)
treef638666d77afdc0357f34a97d1357387e6cc93e1
parenta4ebb33cd5f61ae30b4925a89119f20d2ed02408 (diff)
Filter out invalid XML characters from UTF-8 XML documents before decoding
This change should reduce "illegal character code" XML errors.
-rw-r--r--reader/xml/decoder.go39
-rw-r--r--reader/xml/decoder_test.go61
2 files changed, 96 insertions, 4 deletions
diff --git a/reader/xml/decoder.go b/reader/xml/decoder.go
index d01f74e..398665b 100644
--- a/reader/xml/decoder.go
+++ b/reader/xml/decoder.go
@@ -10,13 +10,25 @@ import (
"fmt"
"io"
"io/ioutil"
+ "strings"
"miniflux.app/reader/encoding"
)
// NewDecoder returns an XML decoder that filters illegal characters.
func NewDecoder(data io.Reader) *xml.Decoder {
- decoder := xml.NewDecoder(data)
+ var decoder *xml.Decoder
+ buffer, _ := ioutil.ReadAll(data)
+ enc := procInst("encoding", string(buffer))
+ if enc != "" && enc != "utf-8" && enc != "UTF-8" && !strings.EqualFold(enc, "utf-8") {
+ // filter invalid chars later within decoder.CharsetReader
+ decoder = xml.NewDecoder(bytes.NewReader(buffer))
+ } else {
+ // filter invalid chars now, since decoder.CharsetReader not called for utf-8 content
+ filteredBytes := bytes.Map(filterValidXMLChar, buffer)
+ decoder = xml.NewDecoder(bytes.NewReader(filteredBytes))
+ }
+
decoder.Entity = xml.HTMLEntity
decoder.Strict = false
decoder.CharsetReader = func(charset string, input io.Reader) (io.Reader, error) {
@@ -48,3 +60,28 @@ func filterValidXMLChar(r rune) rune {
}
return -1
}
+
+// procInst (copied from the encoding/xml package) extracts the quoted value of
+// `param="..."` or `param='...'` from s, returning "" when param is absent, is
+// not directly followed by a quote, or the closing quote is missing.
+func procInst(param, s string) string {
+ // TODO: this parsing is somewhat lame and not exact (comment kept from upstream).
+ // NOTE(review): only the first "param=" occurrence anywhere in s is examined.
+ param = param + "="
+ idx := strings.Index(s, param)
+ if idx == -1 {
+ return ""
+ }
+ v := s[idx+len(param):] // text immediately after "param="
+ if v == "" {
+ return ""
+ }
+ if v[0] != '\'' && v[0] != '"' { // the value must open with a single or double quote
+ return ""
+ }
+ idx = strings.IndexRune(v[1:], rune(v[0])) // locate the matching closing quote
+ if idx == -1 {
+ return ""
+ }
+ return v[1 : idx+1] // idx is relative to v[1:], hence the +1
+}
diff --git a/reader/xml/decoder_test.go b/reader/xml/decoder_test.go
index ea24bf8..1208ef8 100644
--- a/reader/xml/decoder_test.go
+++ b/reader/xml/decoder_test.go
@@ -11,19 +11,74 @@ import (
"testing"
)
-func TestIllegalCharacters(t *testing.T) {
+func TestUTF8WithIllegalCharacters(t *testing.T) { // decodes a document declared as UTF-8 whose title contains "\x10"
type myxml struct {
XMLName xml.Name `xml:"rss"`
Version string `xml:"version,attr"`
Title string `xml:"title"`
}
- data := fmt.Sprintf(`<?xml version="1.0" encoding="windows-1251"?><rss version="2.0"><title>%s</title></rss>`, "\x10")
+ expected := "Title & 中文标题" // the title with the "\x10" byte removed
+ data := fmt.Sprintf(`<?xml version="1.0" encoding="UTF-8"?><rss version="2.0"><title>Title & 中文%s标题</title></rss>`, "\x10") // "\x10" is spliced into the middle of the title
+ reader := strings.NewReader(data)
+
+ var x myxml
+
+ decoder := NewDecoder(reader)
+ err := decoder.Decode(&x)
+ if err != nil {
+ t.Error(err)
+ return
+ }
+ if x.Title != expected {
+ t.Errorf("Incorrect entry title, expected: %s, got: %s", expected, x.Title)
+ }
+}
+
+func TestWindows251WithIllegalCharacters(t *testing.T) { // NOTE(review): name says "Windows251" but the declared encoding below is windows-1251
+ type myxml struct {
+ XMLName xml.Name `xml:"rss"`
+ Version string `xml:"version,attr"`
+ Title string `xml:"title"`
+ }
+
+ expected := "Title & 中文标题" // the title with the "\x10" byte removed
+ data := fmt.Sprintf(`<?xml version="1.0" encoding="windows-1251"?><rss version="2.0"><title>Title & 中文%s标题</title></rss>`, "\x10") // "\x10" is spliced into the middle of the title
+ reader := strings.NewReader(data)
+
var x myxml
- decoder := NewDecoder(strings.NewReader(data))
+ decoder := NewDecoder(reader)
err := decoder.Decode(&x)
if err != nil {
t.Error(err)
+ return
+ }
+ if x.Title != expected {
+ t.Errorf("Incorrect entry title, expected: %s, got: %s", expected, x.Title)
+ }
+}
+
+func TestIllegalEncodingField(t *testing.T) { // decodes a document whose XML declaration says encoding="invalid"
+ type myxml struct {
+ XMLName xml.Name `xml:"rss"`
+ Version string `xml:"version,attr"`
+ Title string `xml:"title"`
+ }
+
+ expected := "Title & 中文标题" // the title with the "\x10" byte removed
+ data := fmt.Sprintf(`<?xml version="1.0" encoding="invalid"?><rss version="2.0"><title>Title & 中文%s标题</title></rss>`, "\x10") // "\x10" is spliced into the middle of the title
+ reader := strings.NewReader(data)
+
+ var x myxml
+
+ decoder := NewDecoder(reader)
+ err := decoder.Decode(&x)
+ if err != nil {
+ t.Error(err)
+ return
+ }
+ if x.Title != expected {
+ t.Errorf("Incorrect entry title, expected: %s, got: %s", expected, x.Title)
}
}