diff options
author | Frédéric Guillot <fred@miniflux.net> | 2017-12-10 19:01:38 -0800 |
---|---|---|
committer | Frédéric Guillot <fred@miniflux.net> | 2017-12-10 19:01:38 -0800 |
commit | 7a35c58f53d76356292e3e0ca9c91add3595a9e0 (patch) | |
tree | 99d2720d78049751d18033cb5ccafedc91f596c2 /reader/scraper | |
parent | b75a9987ba99047efe846b8f196bc5a28b7474c1 (diff) |
Add readability package to fetch original content
Diffstat (limited to 'reader/scraper')
-rw-r--r-- | reader/scraper/scraper.go | 38 |
1 files changed, 38 insertions, 0 deletions
diff --git a/reader/scraper/scraper.go b/reader/scraper/scraper.go new file mode 100644 index 0000000..6c51862 --- /dev/null +++ b/reader/scraper/scraper.go @@ -0,0 +1,38 @@ +// Copyright 2017 Frédéric Guillot. All rights reserved. +// Use of this source code is governed by the Apache 2.0 +// license that can be found in the LICENSE file. + +package scraper + +import ( + "errors" + + "github.com/miniflux/miniflux2/http" + "github.com/miniflux/miniflux2/reader/readability" + "github.com/miniflux/miniflux2/reader/sanitizer" +) + +// Fetch downloads the web page at websiteURL and returns its content as extracted by the readability package and passed through the sanitizer; it returns an error if the request fails, the response reports a server failure, or encoding normalization or content extraction fails. +func Fetch(websiteURL string) (string, error) { + client := http.NewClient(websiteURL) + response, err := client.Get() + if err != nil { + return "", err + } + + if response.HasServerFailure() { + return "", errors.New("unable to download web page") + } + + page, err := response.NormalizeBodyEncoding() + if err != nil { + return "", err + } + + content, err := readability.ExtractContent(page) + if err != nil { + return "", err + } + + return sanitizer.Sanitize(websiteURL, content), nil +} |