diff options
author | Jebbs <qjebbs@gmail.com> | 2019-12-20 10:31:52 +0800 |
---|---|---|
committer | Frédéric Guillot <fred@miniflux.net> | 2019-12-19 18:31:52 -0800 |
commit | a155ab6debe1f271f809cce2ff4a20b2626e7d69 (patch) | |
tree | f638666d77afdc0357f34a97d1357387e6cc93e1 | |
parent | a4ebb33cd5f61ae30b4925a89119f20d2ed02408 (diff) |
Filter valid XML characters for UTF-8 XML documents before decoding
This change should reduce "illegal character code" XML errors.
-rw-r--r-- | reader/xml/decoder.go | 39 | ||||
-rw-r--r-- | reader/xml/decoder_test.go | 61 |
2 files changed, 96 insertions, 4 deletions
diff --git a/reader/xml/decoder.go b/reader/xml/decoder.go index d01f74e..398665b 100644 --- a/reader/xml/decoder.go +++ b/reader/xml/decoder.go @@ -10,13 +10,25 @@ import ( "fmt" "io" "io/ioutil" + "strings" "miniflux.app/reader/encoding" ) // NewDecoder returns a XML decoder that filters illegal characters. func NewDecoder(data io.Reader) *xml.Decoder { - decoder := xml.NewDecoder(data) + var decoder *xml.Decoder + buffer, _ := ioutil.ReadAll(data) + enc := procInst("encoding", string(buffer)) + if enc != "" && enc != "utf-8" && enc != "UTF-8" && !strings.EqualFold(enc, "utf-8") { + // filter invalid chars later within decoder.CharsetReader + decoder = xml.NewDecoder(bytes.NewReader(buffer)) + } else { + // filter invalid chars now, since decoder.CharsetReader not called for utf-8 content + filteredBytes := bytes.Map(filterValidXMLChar, buffer) + decoder = xml.NewDecoder(bytes.NewReader(filteredBytes)) + } + decoder.Entity = xml.HTMLEntity decoder.Strict = false decoder.CharsetReader = func(charset string, input io.Reader) (io.Reader, error) { @@ -48,3 +60,28 @@ func filterValidXMLChar(r rune) rune { } return -1 } + +// This function is copied from encoding/xml package, +// procInst parses the `param="..."` or `param='...'` +// value out of the provided string, returning "" if not found. +func procInst(param, s string) string { + // TODO: this parsing is somewhat lame and not exact. + // It works for all actual cases, though. + param = param + "=" + idx := strings.Index(s, param) + if idx == -1 { + return "" + } + v := s[idx+len(param):] + if v == "" { + return "" + } + if v[0] != '\'' && v[0] != '"' { + return "" + } + idx = strings.IndexRune(v[1:], rune(v[0])) + if idx == -1 { + return "" + } + return v[1 : idx+1] +} diff --git a/reader/xml/decoder_test.go b/reader/xml/decoder_test.go index ea24bf8..1208ef8 100644 --- a/reader/xml/decoder_test.go +++ b/reader/xml/decoder_test.go @@ -11,19 +11,74 @@ import ( "testing" ) -func TestIllegalCharacters(t *testing.T) { +func TestUTF8WithIllegalCharacters(t *testing.T) { type myxml struct { XMLName xml.Name `xml:"rss"` Version string `xml:"version,attr"` Title string `xml:"title"` } - data := fmt.Sprintf(`<?xml version="1.0" encoding="windows-1251"?><rss version="2.0"><title>%s</title></rss>`, "\x10") + expected := "Title & 中文标题" + data := fmt.Sprintf(`<?xml version="1.0" encoding="UTF-8"?><rss version="2.0"><title>Title & 中文%s标题</title></rss>`, "\x10") + reader := strings.NewReader(data) + + var x myxml + + decoder := NewDecoder(reader) + err := decoder.Decode(&x) + if err != nil { + t.Error(err) + return + } + if x.Title != expected { + t.Errorf("Incorrect entry title, expected: %s, got: %s", expected, x.Title) + } +} + +func TestWindows251WithIllegalCharacters(t *testing.T) { + type myxml struct { + XMLName xml.Name `xml:"rss"` + Version string `xml:"version,attr"` + Title string `xml:"title"` + } + + expected := "Title & 中文标题" + data := fmt.Sprintf(`<?xml version="1.0" encoding="windows-1251"?><rss version="2.0"><title>Title & 中文%s标题</title></rss>`, "\x10") + reader := strings.NewReader(data) + var x myxml - decoder := NewDecoder(strings.NewReader(data)) + decoder := NewDecoder(reader) err := decoder.Decode(&x) if err != nil { t.Error(err) + return + } + if x.Title != expected { + t.Errorf("Incorrect entry title, expected: %s, got: %s", expected, x.Title) + } +} + +func TestIllegalEncodingField(t *testing.T) { + type myxml struct { + XMLName xml.Name `xml:"rss"` + Version string `xml:"version,attr"` + Title string `xml:"title"` + } + + expected := "Title & 中文标题" + data := fmt.Sprintf(`<?xml version="1.0" encoding="invalid"?><rss version="2.0"><title>Title & 中文%s标题</title></rss>`, "\x10") + reader := strings.NewReader(data) + + var x myxml + + decoder := NewDecoder(reader) + err := decoder.Decode(&x) + if err != nil { + t.Error(err) + return + } + if x.Title != expected { + t.Errorf("Incorrect entry title, expected: %s, got: %s", expected, x.Title) } } |