diff options
author | Tony Wang <wwwjfy@gmail.com> | 2019-10-23 11:27:27 +0800 |
---|---|---|
committer | Frédéric Guillot <fred@miniflux.net> | 2019-10-22 20:32:35 -0700 |
commit | 2eb2441f2ba9fcb50d17d8f7deead756187b3586 (patch) | |
tree | 523c1dbfe2d87b5ca44cd1d71f79e7ced12bd85c | |
parent | 7409bba0d8c8ab74b57cb05cb1571a7e72a7073e (diff) |
Improve XML decoder to remove illegal characters
-rw-r--r-- | locale/translations.go | 3 |
-rw-r--r-- | locale/translations/zh_CN.json | 1 |
-rw-r--r-- | reader/atom/parser.go | 7 |
-rw-r--r-- | reader/rdf/parser.go | 7 |
-rw-r--r-- | reader/rss/parser.go | 7 |
-rw-r--r-- | reader/xml/decoder.go | 50 |
-rw-r--r-- | reader/xml/decoder_test.go | 29 |
7 files changed, 85 insertions, 19 deletions
diff --git a/locale/translations.go b/locale/translations.go index 794a114..8451de3 100644 --- a/locale/translations.go +++ b/locale/translations.go @@ -2624,6 +2624,7 @@ var translations = map[string]string{ "Unable to parse Atom feed: %q": "无法解析Atom源: %q", "Unable to parse JSON feed: %q": "无法解析JSON源: %q", "Unable to parse RDF feed: %q": "无法解析RDF源: %q", + "Unable to read data: %q": "无法读取数据: %q", "Unable to normalize encoding: %q": "无法正则化编码: %q", "Category not found for this user": "未找到该用户的这一分类", "This feed is empty": "该源是空的", @@ -2645,5 +2646,5 @@ var translationsChecksums = map[string]string{ "nl_NL": "a91e2195ac0731a3788405a51c4201e1a89dcce35ef792356e8c17adb57aee97", "pl_PL": "097bc9beac12f33d3a5e5ee98ccba0875e4d1c1bf13e38251a66ac450834c5b3", "ru_RU": "b253bf709a2f4bcac2f894bd1797247481fa7c6b70a0a0d8785d8680be83bac8", - "zh_CN": "cb974ad8c374278057db4ca58ff0e59314dc191e2ea59af0d1472a438a9ce3e0", + "zh_CN": "5004e07fa535ea56e7fbe1501bb8ff4191d1d214e51b4590110b660994c39f0d", } diff --git a/locale/translations/zh_CN.json b/locale/translations/zh_CN.json index 453cff9..a690187 100644 --- a/locale/translations/zh_CN.json +++ b/locale/translations/zh_CN.json @@ -279,6 +279,7 @@ "Unable to parse Atom feed: %q": "无法解析Atom源: %q", "Unable to parse JSON feed: %q": "无法解析JSON源: %q", "Unable to parse RDF feed: %q": "无法解析RDF源: %q", + "Unable to read data: %q": "无法读取数据: %q", "Unable to normalize encoding: %q": "无法正则化编码: %q", "Category not found for this user": "未找到该用户的这一分类", "This feed is empty": "该源是空的", diff --git a/reader/atom/parser.go b/reader/atom/parser.go index 4749c1a..90a84aa 100644 --- a/reader/atom/parser.go +++ b/reader/atom/parser.go @@ -5,22 +5,17 @@ package atom // import "miniflux.app/reader/atom" import ( - "encoding/xml" "io" "miniflux.app/errors" "miniflux.app/model" - "miniflux.app/reader/encoding" + "miniflux.app/reader/xml" ) // Parse returns a normalized feed struct from a Atom feed. 
func Parse(data io.Reader) (*model.Feed, *errors.LocalizedError) { atomFeed := new(atomFeed) decoder := xml.NewDecoder(data) - decoder.Entity = xml.HTMLEntity - decoder.Strict = false - decoder.CharsetReader = encoding.CharsetReader - err := decoder.Decode(atomFeed) if err != nil { return nil, errors.NewLocalizedError("Unable to parse Atom feed: %q", err) diff --git a/reader/rdf/parser.go b/reader/rdf/parser.go index 861ce8c..57a8e52 100644 --- a/reader/rdf/parser.go +++ b/reader/rdf/parser.go @@ -5,22 +5,17 @@ package rdf // import "miniflux.app/reader/rdf" import ( - "encoding/xml" "io" "miniflux.app/errors" "miniflux.app/model" - "miniflux.app/reader/encoding" + "miniflux.app/reader/xml" ) // Parse returns a normalized feed struct from a RDF feed. func Parse(data io.Reader) (*model.Feed, *errors.LocalizedError) { feed := new(rdfFeed) decoder := xml.NewDecoder(data) - decoder.Entity = xml.HTMLEntity - decoder.Strict = false - decoder.CharsetReader = encoding.CharsetReader - err := decoder.Decode(feed) if err != nil { return nil, errors.NewLocalizedError("Unable to parse RDF feed: %q", err) diff --git a/reader/rss/parser.go b/reader/rss/parser.go index 79bd144..9ed773d 100644 --- a/reader/rss/parser.go +++ b/reader/rss/parser.go @@ -5,22 +5,17 @@ package rss // import "miniflux.app/reader/rss" import ( - "encoding/xml" "io" "miniflux.app/errors" "miniflux.app/model" - "miniflux.app/reader/encoding" + "miniflux.app/reader/xml" ) // Parse returns a normalized feed struct from a RSS feed. 
func Parse(data io.Reader) (*model.Feed, *errors.LocalizedError) { feed := new(rssFeed) decoder := xml.NewDecoder(data) - decoder.Entity = xml.HTMLEntity - decoder.Strict = false - decoder.CharsetReader = encoding.CharsetReader - err := decoder.Decode(feed) if err != nil { return nil, errors.NewLocalizedError("Unable to parse RSS feed: %q", err) diff --git a/reader/xml/decoder.go b/reader/xml/decoder.go new file mode 100644 index 0000000..d01f74e --- /dev/null +++ b/reader/xml/decoder.go @@ -0,0 +1,50 @@ +// Copyright 2019 Frédéric Guillot. All rights reserved. +// Use of this source code is governed by the Apache 2.0 +// license that can be found in the LICENSE file. + +package xml // import "miniflux.app/reader/xml" + +import ( + "bytes" + "encoding/xml" + "fmt" + "io" + "io/ioutil" + + "miniflux.app/reader/encoding" +) + +// NewDecoder returns a XML decoder that filters illegal characters. +func NewDecoder(data io.Reader) *xml.Decoder { + decoder := xml.NewDecoder(data) + decoder.Entity = xml.HTMLEntity + decoder.Strict = false + decoder.CharsetReader = func(charset string, input io.Reader) (io.Reader, error) { + utf8Reader, err := encoding.CharsetReader(charset, input) + if err != nil { + return nil, err + } + rawData, err := ioutil.ReadAll(utf8Reader) + if err != nil { + return nil, fmt.Errorf("Unable to read data: %q", err) + } + filteredBytes := bytes.Map(filterValidXMLChar, rawData) + return bytes.NewReader(filteredBytes), nil + } + + return decoder +} + +// This function is copied from encoding/xml package, +// and is used to check if all the characters are legal. 
+func filterValidXMLChar(r rune) rune { + if r == 0x09 || + r == 0x0A || + r == 0x0D || + r >= 0x20 && r <= 0xD7FF || + r >= 0xE000 && r <= 0xFFFD || + r >= 0x10000 && r <= 0x10FFFF { + return r + } + return -1 +} diff --git a/reader/xml/decoder_test.go b/reader/xml/decoder_test.go new file mode 100644 index 0000000..ea24bf8 --- /dev/null +++ b/reader/xml/decoder_test.go @@ -0,0 +1,29 @@ +// Copyright 2019 Frédéric Guillot. All rights reserved. +// Use of this source code is governed by the Apache 2.0 +// license that can be found in the LICENSE file. + +package xml // import "miniflux.app/reader/xml" + +import ( + "encoding/xml" + "fmt" + "strings" + "testing" +) + +func TestIllegalCharacters(t *testing.T) { + type myxml struct { + XMLName xml.Name `xml:"rss"` + Version string `xml:"version,attr"` + Title string `xml:"title"` + } + + data := fmt.Sprintf(`<?xml version="1.0" encoding="windows-1251"?><rss version="2.0"><title>%s</title></rss>`, "\x10") + var x myxml + + decoder := NewDecoder(strings.NewReader(data)) + err := decoder.Decode(&x) + if err != nil { + t.Error(err) + } +} |