From 3debf75eb9229144a05701e03ba59408a75dd815 Mon Sep 17 00:00:00 2001 From: Frédéric Guillot Date: Thu, 26 Dec 2019 15:26:23 -0800 Subject: Normalize URL query string before executing HTTP requests - Make sure query strings parameters are encoded - As opposed to the standard library, do not append equal sign for query parameters with empty value - Strip URL fragments like Web browsers --- http/client/client.go | 42 ++++++++++++++++++++----------- http/client/response.go | 19 +++++++++++++-- http/client/response_test.go | 4 +-- reader/feed/handler.go | 4 +-- reader/parser/parser_test.go | 2 +- reader/subscription/finder.go | 4 +-- url/url.go | 57 ++++++++++++++++++++++++++++++++++++++++--- url/url_test.go | 22 +++++++++++++++++ 8 files changed, 128 insertions(+), 26 deletions(-) diff --git a/http/client/client.go b/http/client/client.go index 175fe9f..d38859a 100644 --- a/http/client/client.go +++ b/http/client/client.go @@ -22,6 +22,7 @@ import ( "miniflux.app/errors" "miniflux.app/logger" "miniflux.app/timer" + url_helper "miniflux.app/url" "miniflux.app/version" ) @@ -37,7 +38,8 @@ var ( // Client is a HTTP Client :) type Client struct { - url string + inputURL string + requestURL string etagHeader string lastModifiedHeader string authorizationHeader string @@ -47,6 +49,18 @@ type Client struct { Insecure bool } +func (c *Client) String() string { + return fmt.Sprintf( + `InputURL=%q RequestURL=%q ETag=%s LastModified=%q BasicAuth=%v UserAgent=%q`, + c.inputURL, + c.requestURL, + c.etagHeader, + c.lastModifiedHeader, + c.authorizationHeader != "" || (c.username != "" && c.password != ""), + c.userAgent, + ) +} + // WithCredentials defines the username/password for HTTP Basic authentication. func (c *Client) WithCredentials(username, password string) *Client { if username != "" && password != "" { @@ -115,7 +129,12 @@ func (c *Client) PostJSON(data interface{}) (*Response, error) { } func (c *Client) executeRequest(request *http.Request) (*Response, error) { - defer timer.ExecutionTime(time.Now(), fmt.Sprintf("[HttpClient] url=%s", c.url)) + defer timer.ExecutionTime(time.Now(), fmt.Sprintf("[HttpClient] inputURL=%s", c.inputURL)) + + logger.Debug("[HttpClient:Before] Method=%s %s", + request.Method, + c.String(), + ) client := c.buildClient() resp, err := client.Do(request) @@ -162,21 +181,15 @@ func (c *Client) executeRequest(request *http.Request) (*Response, error) { EffectiveURL: resp.Request.URL.String(), LastModified: resp.Header.Get("Last-Modified"), ETag: resp.Header.Get("ETag"), + Expires: resp.Header.Get("Expires"), ContentType: resp.Header.Get("Content-Type"), ContentLength: resp.ContentLength, } - logger.Debug("[HttpClient:%s] URL=%s, EffectiveURL=%s, Code=%d, Length=%d, Type=%s, ETag=%s, LastMod=%s, Expires=%s, Auth=%v", + logger.Debug("[HttpClient:After] Method=%s %s; Response => %s", request.Method, - c.url, - response.EffectiveURL, - response.StatusCode, - resp.ContentLength, - response.ContentType, - response.ETag, - response.LastModified, - resp.Header.Get("Expires"), - c.username != "", + c.String(), + response, ) // Ignore caching headers for feeds that do not want any cache. @@ -190,7 +203,8 @@ func (c *Client) executeRequest(request *http.Request) (*Response, error) { } func (c *Client) buildRequest(method string, body io.Reader) (*http.Request, error) { - request, err := http.NewRequest(method, c.url, body) + c.requestURL = url_helper.RequestURI(c.inputURL) + request, err := http.NewRequest(method, c.requestURL, body) if err != nil { return nil, err } @@ -238,5 +252,5 @@ func (c *Client) buildHeaders() http.Header { // New returns a new HTTP client. func New(url string) *Client { - return &Client{url: url, userAgent: DefaultUserAgent, Insecure: false} + return &Client{inputURL: url, userAgent: DefaultUserAgent, Insecure: false} } diff --git a/http/client/response.go b/http/client/response.go index e62b5ab..122c40c 100644 --- a/http/client/response.go +++ b/http/client/response.go @@ -6,6 +6,7 @@ package client // import "miniflux.app/http/client" import ( "bytes" + "fmt" "io" "io/ioutil" "regexp" @@ -24,10 +25,24 @@ type Response struct { EffectiveURL string LastModified string ETag string + Expires string ContentType string ContentLength int64 } +func (r *Response) String() string { + return fmt.Sprintf( + `StatusCode=%d EffectiveURL=%q LastModified=%q ETag=%s Expires=%s ContentType=%q ContentLength=%d`, + r.StatusCode, + r.EffectiveURL, + r.LastModified, + r.ETag, + r.Expires, + r.ContentType, + r.ContentLength, + ) +} + // IsNotFound returns true if the resource doesn't exists anymore. func (r *Response) IsNotFound() bool { return r.StatusCode == 404 || r.StatusCode == 410 @@ -105,8 +120,8 @@ func (r *Response) EnsureUnicodeBody() (err error) { return err } -// String returns the response body as string. -func (r *Response) String() string { +// BodyAsString returns the response body as string. +func (r *Response) BodyAsString() string { bytes, _ := ioutil.ReadAll(r.Body) return string(bytes) } diff --git a/http/client/response_test.go b/http/client/response_test.go index 818fd81..6a2d8c2 100644 --- a/http/client/response_test.go +++ b/http/client/response_test.go @@ -95,7 +95,7 @@ func TestToString(t *testing.T) { input := `test` r := &Response{Body: strings.NewReader(input)} - if r.String() != input { + if r.BodyAsString() != input { t.Error(`Unexpected ouput`) } } @@ -140,7 +140,7 @@ func TestEnsureUnicodeWithHTMLDocuments(t *testing.T) { t.Fatalf(`Unicode conversion error for %q - %q: %v`, tc.filename, tc.contentType, parseErr) } - isUnicode := utf8.ValidString(r.String()) + isUnicode := utf8.ValidString(r.BodyAsString()) if isUnicode != tc.convertedToUnicode { t.Errorf(`Unicode conversion %q - %q, got: %v, expected: %v`, tc.filename, tc.contentType, isUnicode, tc.convertedToUnicode) diff --git a/reader/feed/handler.go b/reader/feed/handler.go index b272ea1..7dd0bc7 100644 --- a/reader/feed/handler.go +++ b/reader/feed/handler.go @@ -52,7 +52,7 @@ func (h *Handler) CreateFeed(userID, categoryID int64, url string, crawler bool, return nil, errors.NewLocalizedError(errDuplicate, response.EffectiveURL) } - subscription, parseErr := parser.ParseFeed(response.String()) + subscription, parseErr := parser.ParseFeed(response.BodyAsString()) if parseErr != nil { return nil, parseErr } @@ -106,7 +106,7 @@ func (h *Handler) RefreshFeed(userID, feedID int64) error { if response.IsModified(originalFeed.EtagHeader, originalFeed.LastModifiedHeader) { logger.Debug("[Handler:RefreshFeed] Feed #%d has been modified", feedID) - updatedFeed, parseErr := parser.ParseFeed(response.String()) + updatedFeed, parseErr := parser.ParseFeed(response.BodyAsString()) if parseErr != nil { originalFeed.WithError(parseErr.Localize(printer)) h.store.UpdateFeedError(originalFeed) diff --git a/reader/parser/parser_test.go b/reader/parser/parser_test.go index a5a79a9..ddbacec 100644 --- a/reader/parser/parser_test.go +++ b/reader/parser/parser_test.go @@ -191,7 +191,7 @@ func TestDifferentEncodingWithResponse(t *testing.T) { t.Fatalf(`Encoding error for %q: %v`, tc.filename, encodingErr) } - feed, parseErr := ParseFeed(r.String()) + feed, parseErr := ParseFeed(r.BodyAsString()) if parseErr != nil { t.Fatalf(`Parsing error for %q - %q: %v`, tc.filename, tc.contentType, parseErr) } diff --git a/reader/subscription/finder.go b/reader/subscription/finder.go index fab8d93..66bbedd 100644 --- a/reader/subscription/finder.go +++ b/reader/subscription/finder.go @@ -18,7 +18,7 @@ import ( ) var ( - errUnreadableDoc = "Unable to analyze this page: %v" + errUnreadableDoc = "Unable to analyze this page: %v" ) // FindSubscriptions downloads and try to find one or more subscriptions from an URL. @@ -31,7 +31,7 @@ func FindSubscriptions(websiteURL, userAgent, username, password string) (Subscr return nil, err } - body := response.String() + body := response.BodyAsString() if format := parser.DetectFeedFormat(body); format != parser.FormatUnknown { var subscriptions Subscriptions subscriptions = append(subscriptions, &Subscription{ diff --git a/url/url.go b/url/url.go index cc35e2f..b02348f 100644 --- a/url/url.go +++ b/url/url.go @@ -4,9 +4,12 @@ package url // import "miniflux.app/url" -import "net/url" -import "fmt" -import "strings" +import ( + "fmt" + "net/url" + "sort" + "strings" +) // AbsoluteURL converts the input URL as absolute URL if necessary. func AbsoluteURL(baseURL, input string) (string, error) { @@ -69,3 +72,51 @@ func Domain(websiteURL string) string { return parsedURL.Host } + +// RequestURI returns the encoded URI to be used in HTTP requests. +func RequestURI(websiteURL string) string { + u, err := url.Parse(websiteURL) + if err != nil { + return websiteURL + } + + queryValues := u.Query() + u.RawQuery = "" // Clear RawQuery to make sure it's encoded properly. + u.Fragment = "" // Clear fragment because Web browsers strip #fragment before sending the URL to a web server. + + var buf strings.Builder + buf.WriteString(u.String()) + + if len(queryValues) > 0 { + buf.WriteByte('?') + + // Sort keys. + keys := make([]string, 0, len(queryValues)) + for k := range queryValues { + keys = append(keys, k) + } + sort.Strings(keys) + + i := 0 + for _, key := range keys { + keyEscaped := url.QueryEscape(key) + values := queryValues[key] + for _, value := range values { + if i > 0 { + buf.WriteByte('&') + } + buf.WriteString(keyEscaped) + + // As opposed to the standard library, we append the = only if the value is not empty. + if value != "" { + buf.WriteByte('=') + buf.WriteString(url.QueryEscape(value)) + } + + i++ + } + } + } + + return buf.String() +} diff --git a/url/url_test.go b/url/url_test.go index 9a14b20..54868a9 100644 --- a/url/url_test.go +++ b/url/url_test.go @@ -71,3 +71,25 @@ func TestDomain(t *testing.T) { } } } + +func TestRequestURI(t *testing.T) { + scenarios := map[string]string{ + "https://www.example.org": "https://www.example.org", + "https://user:password@www.example.org": "https://user:password@www.example.org", + "https://www.example.org/path with spaces": "https://www.example.org/path%20with%20spaces", + "https://www.example.org/path#test": "https://www.example.org/path", + "https://www.example.org/path?abc#test": "https://www.example.org/path?abc", + "https://www.example.org/path?a=b&a=c": "https://www.example.org/path?a=b&a=c", + "https://www.example.org/path?a=b&a=c&d": "https://www.example.org/path?a=b&a=c&d", + "https://www.example.org/path?atom": "https://www.example.org/path?atom", + "https://www.example.org/path?测试=测试": "https://www.example.org/path?%E6%B5%8B%E8%AF%95=%E6%B5%8B%E8%AF%95", + "https://www.example.org/url=http%3A%2F%2Fwww.example.com%2Ffeed%2F&max=20": "https://www.example.org/url=http%3A%2F%2Fwww.example.com%2Ffeed%2F&max=20", + } + + for input, expected := range scenarios { + actual := RequestURI(input) + if actual != expected { + t.Errorf(`Unexpected result, got %q instead of %q`, actual, expected) + } + } +} -- cgit v1.2.3