diff options
author | Tony Wang <wwwjfy@gmail.com> | 2019-10-23 11:27:27 +0800 |
---|---|---|
committer | Frédéric Guillot <fred@miniflux.net> | 2019-10-22 20:32:35 -0700 |
commit | 2eb2441f2ba9fcb50d17d8f7deead756187b3586 (patch) | |
tree | 523c1dbfe2d87b5ca44cd1d71f79e7ced12bd85c /reader/xml | |
parent | 7409bba0d8c8ab74b57cb05cb1571a7e72a7073e (diff) |
Improve XML decoder to remove illegal characters
Diffstat (limited to 'reader/xml')
-rw-r--r-- | reader/xml/decoder.go | 50 | ||||
-rw-r--r-- | reader/xml/decoder_test.go | 29 |
2 files changed, 79 insertions, 0 deletions
diff --git a/reader/xml/decoder.go b/reader/xml/decoder.go new file mode 100644 index 0000000..d01f74e --- /dev/null +++ b/reader/xml/decoder.go @@ -0,0 +1,50 @@ +// Copyright 2019 Frédéric Guillot. All rights reserved. +// Use of this source code is governed by the Apache 2.0 +// license that can be found in the LICENSE file. + +package xml // import "miniflux.app/reader/xml" + +import ( + "bytes" + "encoding/xml" + "fmt" + "io" + "io/ioutil" + + "miniflux.app/reader/encoding" +) + +// NewDecoder returns a XML decoder that filters illegal characters. +func NewDecoder(data io.Reader) *xml.Decoder { + decoder := xml.NewDecoder(data) + decoder.Entity = xml.HTMLEntity + decoder.Strict = false + decoder.CharsetReader = func(charset string, input io.Reader) (io.Reader, error) { + utf8Reader, err := encoding.CharsetReader(charset, input) + if err != nil { + return nil, err + } + rawData, err := ioutil.ReadAll(utf8Reader) + if err != nil { + return nil, fmt.Errorf("Unable to read data: %q", err) + } + filteredBytes := bytes.Map(filterValidXMLChar, rawData) + return bytes.NewReader(filteredBytes), nil + } + + return decoder +} + +// This function is copied from encoding/xml package, +// and is used to check if all the characters are legal. +func filterValidXMLChar(r rune) rune { + if r == 0x09 || + r == 0x0A || + r == 0x0D || + r >= 0x20 && r <= 0xD7FF || + r >= 0xE000 && r <= 0xFFFD || + r >= 0x10000 && r <= 0x10FFFF { + return r + } + return -1 +} diff --git a/reader/xml/decoder_test.go b/reader/xml/decoder_test.go new file mode 100644 index 0000000..ea24bf8 --- /dev/null +++ b/reader/xml/decoder_test.go @@ -0,0 +1,29 @@ +// Copyright 2019 Frédéric Guillot. All rights reserved. +// Use of this source code is governed by the Apache 2.0 +// license that can be found in the LICENSE file. + +package xml // import "miniflux.app/reader/xml" + +import ( + "encoding/xml" + "fmt" + "strings" + "testing" +) + +func TestIllegalCharacters(t *testing.T) { + type myxml struct { + XMLName xml.Name `xml:"rss"` + Version string `xml:"version,attr"` + Title string `xml:"title"` + } + + data := fmt.Sprintf(`<?xml version="1.0" encoding="windows-1251"?><rss version="2.0"><title>%s</title></rss>`, "\x10") + var x myxml + + decoder := NewDecoder(strings.NewReader(data)) + err := decoder.Decode(&x) + if err != nil { + t.Error(err) + } +} |