aboutsummaryrefslogtreecommitdiffhomepage
path: root/reader/encoding/encoding.go
blob: 7f4abda1beff2d6a9622b6d2ef900cab7b1ee939 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
// Copyright 2018 Frédéric Guillot. All rights reserved.
// Use of this source code is governed by the Apache 2.0
// license that can be found in the LICENSE file.

package encoding // import "miniflux.app/reader/encoding"

import (
	"bytes"
	"io"
	"io/ioutil"
	"unicode/utf8"

	"golang.org/x/net/html/charset"
)

// CharsetReader is used when the XML encoding is specified for the input document.
//
// The whole input is read into memory first. The document is converted to
// UTF-8 only if the buffered bytes are not already valid UTF-8; in that case
// label names the source encoding to convert from.
//
// Several edge cases could exist:
//
// - Feeds with encoding specified only in Content-Type header and not in XML document
// - Feeds with encoding specified in both places
// - Feeds with encoding specified only in XML document and not in HTTP header
// - Feeds with wrong encoding defined and already in UTF-8
//
// A nil reader and a non-nil error are returned when the input cannot be read
// completely. Otherwise the result is either a reader over the untouched
// bytes (already UTF-8) or whatever charset.NewReaderLabel returns for label.
func CharsetReader(label string, input io.Reader) (io.Reader, error) {
	buffer, err := ioutil.ReadAll(input)
	if err != nil {
		// A failed read leaves only a partial document in buffer; report the
		// failure instead of handing truncated data to the caller.
		return nil, err
	}
	r := bytes.NewReader(buffer)

	// The document is already UTF-8, do not do anything (avoid double-encoding).
	// That means the specified encoding in XML prolog is wrong.
	if utf8.Valid(buffer) {
		return r, nil
	}

	// Transform document to UTF-8 from the specified encoding in XML prolog.
	return charset.NewReaderLabel(label, r)
}