aboutsummaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
author: Tony Wang <wwwjfy@gmail.com> 2019-10-23 11:27:27 +0800
committer: Frédéric Guillot <fred@miniflux.net> 2019-10-22 20:32:35 -0700
commit: 2eb2441f2ba9fcb50d17d8f7deead756187b3586 (patch)
tree: 523c1dbfe2d87b5ca44cd1d71f79e7ced12bd85c
parent: 7409bba0d8c8ab74b57cb05cb1571a7e72a7073e (diff)
Improve XML decoder to remove illegal characters
-rw-r--r-- locale/translations.go 3
-rw-r--r-- locale/translations/zh_CN.json 1
-rw-r--r-- reader/atom/parser.go 7
-rw-r--r-- reader/rdf/parser.go 7
-rw-r--r-- reader/rss/parser.go 7
-rw-r--r-- reader/xml/decoder.go 50
-rw-r--r-- reader/xml/decoder_test.go 29
7 files changed, 85 insertions, 19 deletions
diff --git a/locale/translations.go b/locale/translations.go
index 794a114..8451de3 100644
--- a/locale/translations.go
+++ b/locale/translations.go
@@ -2624,6 +2624,7 @@ var translations = map[string]string{
"Unable to parse Atom feed: %q": "无法解析Atom源: %q",
"Unable to parse JSON feed: %q": "无法解析JSON源: %q",
"Unable to parse RDF feed: %q": "无法解析RDF源: %q",
+ "Unable to read data: %q": "无法读取数据: %q",
"Unable to normalize encoding: %q": "无法正则化编码: %q",
"Category not found for this user": "未找到该用户的这一分类",
"This feed is empty": "该源是空的",
@@ -2645,5 +2646,5 @@ var translationsChecksums = map[string]string{
"nl_NL": "a91e2195ac0731a3788405a51c4201e1a89dcce35ef792356e8c17adb57aee97",
"pl_PL": "097bc9beac12f33d3a5e5ee98ccba0875e4d1c1bf13e38251a66ac450834c5b3",
"ru_RU": "b253bf709a2f4bcac2f894bd1797247481fa7c6b70a0a0d8785d8680be83bac8",
- "zh_CN": "cb974ad8c374278057db4ca58ff0e59314dc191e2ea59af0d1472a438a9ce3e0",
+ "zh_CN": "5004e07fa535ea56e7fbe1501bb8ff4191d1d214e51b4590110b660994c39f0d",
}
diff --git a/locale/translations/zh_CN.json b/locale/translations/zh_CN.json
index 453cff9..a690187 100644
--- a/locale/translations/zh_CN.json
+++ b/locale/translations/zh_CN.json
@@ -279,6 +279,7 @@
"Unable to parse Atom feed: %q": "无法解析Atom源: %q",
"Unable to parse JSON feed: %q": "无法解析JSON源: %q",
"Unable to parse RDF feed: %q": "无法解析RDF源: %q",
+ "Unable to read data: %q": "无法读取数据: %q",
"Unable to normalize encoding: %q": "无法正则化编码: %q",
"Category not found for this user": "未找到该用户的这一分类",
"This feed is empty": "该源是空的",
diff --git a/reader/atom/parser.go b/reader/atom/parser.go
index 4749c1a..90a84aa 100644
--- a/reader/atom/parser.go
+++ b/reader/atom/parser.go
@@ -5,22 +5,17 @@
package atom // import "miniflux.app/reader/atom"
import (
- "encoding/xml"
"io"
"miniflux.app/errors"
"miniflux.app/model"
- "miniflux.app/reader/encoding"
+ "miniflux.app/reader/xml"
)
// Parse returns a normalized feed struct from a Atom feed.
func Parse(data io.Reader) (*model.Feed, *errors.LocalizedError) {
atomFeed := new(atomFeed)
decoder := xml.NewDecoder(data)
- decoder.Entity = xml.HTMLEntity
- decoder.Strict = false
- decoder.CharsetReader = encoding.CharsetReader
-
err := decoder.Decode(atomFeed)
if err != nil {
return nil, errors.NewLocalizedError("Unable to parse Atom feed: %q", err)
diff --git a/reader/rdf/parser.go b/reader/rdf/parser.go
index 861ce8c..57a8e52 100644
--- a/reader/rdf/parser.go
+++ b/reader/rdf/parser.go
@@ -5,22 +5,17 @@
package rdf // import "miniflux.app/reader/rdf"
import (
- "encoding/xml"
"io"
"miniflux.app/errors"
"miniflux.app/model"
- "miniflux.app/reader/encoding"
+ "miniflux.app/reader/xml"
)
// Parse returns a normalized feed struct from a RDF feed.
func Parse(data io.Reader) (*model.Feed, *errors.LocalizedError) {
feed := new(rdfFeed)
decoder := xml.NewDecoder(data)
- decoder.Entity = xml.HTMLEntity
- decoder.Strict = false
- decoder.CharsetReader = encoding.CharsetReader
-
err := decoder.Decode(feed)
if err != nil {
return nil, errors.NewLocalizedError("Unable to parse RDF feed: %q", err)
diff --git a/reader/rss/parser.go b/reader/rss/parser.go
index 79bd144..9ed773d 100644
--- a/reader/rss/parser.go
+++ b/reader/rss/parser.go
@@ -5,22 +5,17 @@
package rss // import "miniflux.app/reader/rss"
import (
- "encoding/xml"
"io"
"miniflux.app/errors"
"miniflux.app/model"
- "miniflux.app/reader/encoding"
+ "miniflux.app/reader/xml"
)
// Parse returns a normalized feed struct from a RSS feed.
func Parse(data io.Reader) (*model.Feed, *errors.LocalizedError) {
feed := new(rssFeed)
decoder := xml.NewDecoder(data)
- decoder.Entity = xml.HTMLEntity
- decoder.Strict = false
- decoder.CharsetReader = encoding.CharsetReader
-
err := decoder.Decode(feed)
if err != nil {
return nil, errors.NewLocalizedError("Unable to parse RSS feed: %q", err)
diff --git a/reader/xml/decoder.go b/reader/xml/decoder.go
new file mode 100644
index 0000000..d01f74e
--- /dev/null
+++ b/reader/xml/decoder.go
@@ -0,0 +1,50 @@
+// Copyright 2019 Frédéric Guillot. All rights reserved.
+// Use of this source code is governed by the Apache 2.0
+// license that can be found in the LICENSE file.
+
+package xml // import "miniflux.app/reader/xml"
+
+import (
+ "bytes"
+ "encoding/xml"
+ "fmt"
+ "io"
+ "io/ioutil"
+
+ "miniflux.app/reader/encoding"
+)
+
+// NewDecoder returns an XML decoder whose CharsetReader filters out illegal XML characters.
+func NewDecoder(data io.Reader) *xml.Decoder {
+ decoder := xml.NewDecoder(data)
+ decoder.Entity = xml.HTMLEntity
+ decoder.Strict = false
+ decoder.CharsetReader = func(charset string, input io.Reader) (io.Reader, error) {
+ utf8Reader, err := encoding.CharsetReader(charset, input)
+ if err != nil {
+ return nil, err
+ }
+ rawData, err := ioutil.ReadAll(utf8Reader)
+ if err != nil {
+ return nil, fmt.Errorf("Unable to read data: %q", err)
+ }
+ filteredBytes := bytes.Map(filterValidXMLChar, rawData)
+ return bytes.NewReader(filteredBytes), nil
+ }
+
+ return decoder
+}
+
+// filterValidXMLChar uses the range check copied from the encoding/xml package. It returns r
+// when r is a legal XML character and -1 otherwise, so that bytes.Map drops illegal characters.
+func filterValidXMLChar(r rune) rune {
+ if r == 0x09 ||
+ r == 0x0A ||
+ r == 0x0D ||
+ r >= 0x20 && r <= 0xD7FF ||
+ r >= 0xE000 && r <= 0xFFFD ||
+ r >= 0x10000 && r <= 0x10FFFF {
+ return r
+ }
+ return -1
+}
diff --git a/reader/xml/decoder_test.go b/reader/xml/decoder_test.go
new file mode 100644
index 0000000..ea24bf8
--- /dev/null
+++ b/reader/xml/decoder_test.go
@@ -0,0 +1,29 @@
+// Copyright 2019 Frédéric Guillot. All rights reserved.
+// Use of this source code is governed by the Apache 2.0
+// license that can be found in the LICENSE file.
+
+package xml // import "miniflux.app/reader/xml"
+
+import (
+ "encoding/xml"
+ "fmt"
+ "strings"
+ "testing"
+)
+
+func TestIllegalCharacters(t *testing.T) {
+ type myxml struct {
+ XMLName xml.Name `xml:"rss"`
+ Version string `xml:"version,attr"`
+ Title string `xml:"title"`
+ }
+
+ data := fmt.Sprintf(`<?xml version="1.0" encoding="windows-1251"?><rss version="2.0"><title>%s</title></rss>`, "\x10")
+ var x myxml
+
+ decoder := NewDecoder(strings.NewReader(data))
+ err := decoder.Decode(&x)
+ if err != nil {
+ t.Error(err)
+ }
+}