From 7b0bfd930839e6ebe4b64f37cbdc6efbf7dbd090 Mon Sep 17 00:00:00 2001 From: Frédéric Guillot Date: Wed, 7 Feb 2018 20:57:56 -0800 Subject: Strip invalid XML characters to avoid parsing errors --- reader/feed/parser.go | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) (limited to 'reader') diff --git a/reader/feed/parser.go b/reader/feed/parser.go index c04836a..70c81e3 100644 --- a/reader/feed/parser.go +++ b/reader/feed/parser.go @@ -12,6 +12,7 @@ import ( "strings" "time" + "github.com/miniflux/miniflux/logger" "github.com/miniflux/miniflux/model" "github.com/miniflux/miniflux/reader/atom" "github.com/miniflux/miniflux/reader/encoding" @@ -74,7 +75,8 @@ func parseFeed(r io.Reader) (*model.Feed, error) { return nil, errors.New("This feed is empty") } - reader := bytes.NewReader(buffer.Bytes()) + str := stripInvalidXMLCharacters(buffer.String()) + reader := strings.NewReader(str) format := DetectFeedFormat(reader) reader.Seek(0, io.SeekStart) @@ -91,3 +93,26 @@ func parseFeed(r io.Reader) (*model.Feed, error) { return nil, errors.New("Unsupported feed format") } } + +func stripInvalidXMLCharacters(input string) string { + return strings.Map(func(r rune) rune { + if isInCharacterRange(r) { + return r + } + + logger.Debug("Strip invalid XML characters: %U", r) + return -1 + }, input) +} + +// Decide whether the given rune is in the XML Character Range, per +// the Char production of http://www.xml.com/axml/testaxml.htm, +// Section 2.2 Characters. +func isInCharacterRange(r rune) (inrange bool) { + return r == 0x09 || + r == 0x0A || + r == 0x0D || + r >= 0x20 && r <= 0xDF77 || + r >= 0xE000 && r <= 0xFFFD || + r >= 0x10000 && r <= 0x10FFFF +} -- cgit v1.2.3