| author    | Joey Hess <joey@kitenet.net>             | 2013-09-28 14:35:21 -0400 |
|-----------|------------------------------------------|---------------------------|
| committer | Joey Hess <joey@kitenet.net>             | 2013-09-28 14:35:21 -0400 |
| commit    | a05cefbd7cdfc75109d8f55c4cb699352745841c |                           |
| tree      | a3d10d759b00a2c00340d352827fe9d287bed07c |                           |
| parent    | 309750f7588d7c9a6eadbdd30b630250f766311f |                           |
Send a git-annex user-agent when downloading urls.
Overridable with --user-agent option.
Not yet done for S3 or WebDAV due to limitations of the libraries used --
neither allows a user-agent header to be specified.
This commit sponsored by Michael Zehrer.
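
In day-to-day use the new flag is simply passed on the command line. A hypothetical invocation (the agent string and URL below are placeholders, not part of this commit):

```sh
# Illustrative only: fetch a url into the annex while overriding the
# default "git-annex/<version>" user agent introduced by this commit.
git annex addurl --user-agent="my-mirror-bot/1.0" http://example.com/podcast/episode1.ogg
```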
| mode       | file                                                                             | lines changed |
|------------|----------------------------------------------------------------------------------|---------------|
| -rw-r--r-- | Annex.hs                                                                         | 2             |
| -rw-r--r-- | Annex/Content.hs                                                                 | 4             |
| -rw-r--r-- | Annex/Url.hs                                                                     | 27            |
| -rw-r--r-- | Command/AddUrl.hs                                                                | 8             |
| -rw-r--r-- | Command/ImportFeed.hs                                                            | 5             |
| -rw-r--r-- | GitAnnex/Options.hs                                                              | 3             |
| -rw-r--r-- | Remote/Git.hs                                                                    | 7             |
| -rw-r--r-- | Remote/Web.hs                                                                    | 4             |
| -rw-r--r-- | Utility/Url.hs                                                                   | 48            |
| -rw-r--r-- | debian/changelog                                                                 | 3             |
| -rw-r--r-- | doc/bugs/importfeed_fails_when_using_the_option_--lazy_for_specific_podcast.mdwn | 4             |
| -rw-r--r-- | doc/git-annex.mdwn                                                               | 4             |
12 files changed, 86 insertions, 33 deletions
```diff
diff --git a/Annex.hs b/Annex.hs
--- a/Annex.hs
+++ b/Annex.hs
@@ -108,6 +108,7 @@ data AnnexState = AnnexState
 	, fields :: M.Map String String
 	, cleanup :: M.Map String (Annex ())
 	, inodeschanged :: Maybe Bool
+	, useragent :: Maybe String
 	}
 
 newState :: Git.Repo -> AnnexState
@@ -141,6 +142,7 @@ newState gitrepo = AnnexState
 	, fields = M.empty
 	, cleanup = M.empty
 	, inodeschanged = Nothing
+	, useragent = Nothing
 	}
 
 {- Makes an Annex state object for the specified git repo.
diff --git a/Annex/Content.hs b/Annex/Content.hs
index c7afa8070..da0189c74 100644
--- a/Annex/Content.hs
+++ b/Annex/Content.hs
@@ -43,7 +43,7 @@ import qualified Annex.Queue
 import qualified Annex.Branch
 import Utility.DiskFree
 import Utility.FileMode
-import qualified Utility.Url as Url
+import qualified Annex.Url as Url
 import Types.Key
 import Utility.DataUnits
 import Utility.CopyFile
@@ -458,7 +458,7 @@ downloadUrl urls file = go =<< annexWebDownloadCommand <$> Annex.getGitConfig
 	go Nothing = do
 		opts <- map Param . annexWebOptions <$> Annex.getGitConfig
 		headers <- getHttpHeaders
-		liftIO $ anyM (\u -> Url.download u headers opts file) urls
+		anyM (\u -> Url.withUserAgent $ Url.download u headers opts file) urls
 	go (Just basecmd) = liftIO $ anyM (downloadcmd basecmd) urls
 	downloadcmd basecmd url =
 		boolSystem "sh" [Param "-c", Param $ gencmd url basecmd]
diff --git a/Annex/Url.hs b/Annex/Url.hs
new file mode 100644
index 000000000..0401ffe07
--- /dev/null
+++ b/Annex/Url.hs
@@ -0,0 +1,27 @@
+{- Url downloading, with git-annex user agent.
+ -
+ - Copyright 2013 Joey Hess <joey@kitenet.net>
+ -
+ - Licensed under the GNU GPL version 3 or higher.
+ -}
+
+module Annex.Url (
+	module U,
+	withUserAgent,
+	getUserAgent,
+) where
+
+import Common.Annex
+import qualified Annex
+import Utility.Url as U
+import qualified Build.SysConfig as SysConfig
+
+defaultUserAgent :: U.UserAgent
+defaultUserAgent = "git-annex/" ++ SysConfig.packageversion
+
+getUserAgent :: Annex (Maybe U.UserAgent)
+getUserAgent = Annex.getState $
+	Just . fromMaybe defaultUserAgent . Annex.useragent
+
+withUserAgent :: (Maybe U.UserAgent -> IO a) -> Annex a
+withUserAgent a = liftIO . a =<< getUserAgent
diff --git a/Command/AddUrl.hs b/Command/AddUrl.hs
index e767a45e0..951bbdbe8 100644
--- a/Command/AddUrl.hs
+++ b/Command/AddUrl.hs
@@ -17,8 +17,8 @@ import Backend
 import qualified Command.Add
 import qualified Annex
 import qualified Annex.Queue
+import qualified Annex.Url as Url
 import qualified Backend.URL
-import qualified Utility.Url as Url
 import Annex.Content
 import Logs.Web
 import qualified Option
@@ -123,7 +123,7 @@ perform relaxed url file = ifAnnexed file addurl geturl
 		next $ return True
 	| otherwise = do
 		headers <- getHttpHeaders
-		ifM (liftIO $ Url.check url headers $ keySize key)
+		ifM (Url.withUserAgent $ Url.check url headers $ keySize key)
 			( do
 				setUrlPresent key url
 				next $ return True
@@ -174,7 +174,7 @@ download url file = do
 	size <- ifM (liftIO $ isJust <$> checkDaemon pidfile)
 		( do
 			headers <- getHttpHeaders
-			liftIO $ snd <$> Url.exists url headers
+			snd <$> Url.withUserAgent (Url.exists url headers)
 		, return Nothing
 		)
 	Backend.URL.fromUrl url size
@@ -203,7 +203,7 @@ nodownload relaxed url file = do
 	headers <- getHttpHeaders
 	(exists, size) <- if relaxed
 		then pure (True, Nothing)
-		else liftIO $ Url.exists url headers
+		else Url.withUserAgent $ Url.exists url headers
 	if exists
 		then do
 			key <- Backend.URL.fromUrl url size
diff --git a/Command/ImportFeed.hs b/Command/ImportFeed.hs
index d2f806402..7f54643c9 100644
--- a/Command/ImportFeed.hs
+++ b/Command/ImportFeed.hs
@@ -17,7 +17,7 @@ import Data.Time.Clock
 import Common.Annex
 import qualified Annex
 import Command
-import qualified Utility.Url as Url
+import qualified Annex.Url as Url
 import Logs.Web
 import qualified Option
 import qualified Utility.Format
@@ -102,9 +102,10 @@ findEnclosures url = extract <$> downloadFeed url
 downloadFeed :: URLString -> Annex (Maybe Feed)
 downloadFeed url = do
 	showOutput
+	ua <- Url.getUserAgent
 	liftIO $ withTmpFile "feed" $ \f h -> do
 		fileEncoding h
-		ifM (Url.download url [] [] f)
+		ifM (Url.download url [] [] f ua)
 			( liftIO $ parseFeedString <$> hGetContentsStrict h
 			, return Nothing
 			)
diff --git a/GitAnnex/Options.hs b/GitAnnex/Options.hs
index 459ee3bf4..596cc138f 100644
--- a/GitAnnex/Options.hs
+++ b/GitAnnex/Options.hs
@@ -48,6 +48,8 @@ options = Option.common ++
 		"skip files smaller than a size"
 	, Option ['T'] ["time-limit"] (ReqArg Limit.addTimeLimit paramTime)
 		"stop after the specified amount of time"
+	, Option [] ["user-agent"] (ReqArg setuseragent paramName)
+		"override default User-Agent"
 	, Option [] ["trust-glacier"] (NoArg (Annex.setFlag "trustglacier"))
 		"Trust Amazon Glacier inventory"
 	] ++ Option.matcher
@@ -55,6 +57,7 @@ options = Option.common ++
 	setnumcopies v = maybe noop
 		(\n -> Annex.changeState $ \s -> s { Annex.forcenumcopies = Just n })
 		(readish v)
+	setuseragent v = Annex.changeState $ \s -> s { Annex.useragent = Just v }
 	setgitconfig v = Annex.changeGitRepo =<< inRepo (Git.Config.store v)
 
 	trustArg t = ReqArg (Remote.forceTrust t) paramRemote
diff --git a/Remote/Git.hs b/Remote/Git.hs
index 7083de667..0f3f35811 100644
--- a/Remote/Git.hs
+++ b/Remote/Git.hs
@@ -30,7 +30,7 @@ import Annex.Exception
 import qualified Annex.Content
 import qualified Annex.BranchState
 import qualified Annex.Branch
-import qualified Utility.Url as Url
+import qualified Annex.Url as Url
 import Utility.Tmp
 import Config
 import Config.Cost
@@ -177,9 +177,10 @@ tryGitConfigRead r
 		Left l -> return $ Left l
 
 	geturlconfig headers = do
+		ua <- Url.getUserAgent
 		v <- liftIO $ withTmpFile "git-annex.tmp" $ \tmpfile h -> do
 			hClose h
-			ifM (Url.downloadQuiet (Git.repoLocation r ++ "/config") headers [] tmpfile)
+			ifM (Url.downloadQuiet (Git.repoLocation r ++ "/config") headers [] tmpfile ua)
 				( pipedconfig "git" [Param "config", Param "--null", Param "--list", Param "--file", File tmpfile]
 				, return $ Left undefined
 				)
@@ -240,7 +241,7 @@ inAnnex r key
   where
 	checkhttp headers = do
 		showChecking r
-		liftIO $ ifM (anyM (\u -> Url.check u headers (keySize key)) (keyUrls r key))
+		ifM (anyM (\u -> Url.withUserAgent $ Url.check u headers (keySize key)) (keyUrls r key))
 			( return $ Right True
 			, return $ Left "not found"
 			)
diff --git a/Remote/Web.hs b/Remote/Web.hs
index 789aab698..af60beee0 100644
--- a/Remote/Web.hs
+++ b/Remote/Web.hs
@@ -19,7 +19,7 @@ import Config.Cost
 import Logs.Web
 import Types.Key
 import Utility.Metered
-import qualified Utility.Url as Url
+import qualified Annex.Url as Url
 #ifdef WITH_QUVI
 import Annex.Quvi
 import qualified Utility.Quvi as Quvi
@@ -118,7 +118,7 @@ checkKey' key us = firsthit us (Right False) $ \u -> do
 #endif
 		DefaultDownloader -> do
 			headers <- getHttpHeaders
-			liftIO $ Right <$> Url.check u' headers (keySize key)
+			Right <$> Url.withUserAgent (Url.check u' headers $ keySize key)
   where
 	firsthit [] miss _ = return miss
 	firsthit (u:rest) _ a = do
diff --git a/Utility/Url.hs b/Utility/Url.hs
index 2f2ec1dc0..baea0fda1 100644
--- a/Utility/Url.hs
+++ b/Utility/Url.hs
@@ -9,6 +9,7 @@
 
 module Utility.Url (
 	URLString,
+	UserAgent,
 	check,
 	exists,
 	download,
@@ -27,10 +28,12 @@
 type URLString = String
 type Headers = [String]
 
+type UserAgent = String
+
 {- Checks that an url exists and could be successfully downloaded,
  - also checking that its size, if available, matches a specified size. -}
-check :: URLString -> Headers -> Maybe Integer -> IO Bool
-check url headers expected_size = handle <$> exists url headers
+check :: URLString -> Headers -> Maybe Integer -> Maybe UserAgent -> IO Bool
+check url headers expected_size = handle <$$> exists url headers
   where
 	handle (False, _) = False
 	handle (True, Nothing) = True
@@ -44,8 +47,8 @@ check url headers expected_size = handle <$> exists url headers
 - Uses curl otherwise, when available, since curl handles https better
 - than does Haskell's Network.Browser.
 -}
-exists :: URLString -> Headers -> IO (Bool, Maybe Integer)
-exists url headers = case parseURIRelaxed url of
+exists :: URLString -> Headers -> Maybe UserAgent -> IO (Bool, Maybe Integer)
+exists url headers ua = case parseURIRelaxed url of
 	Just u
 		| uriScheme u == "file:" -> do
 			s <- catchMaybeIO $ getFileStatus (unEscapeString $ uriPath u)
@@ -54,12 +57,12 @@ exists url headers = case parseURIRelaxed url of
 			Nothing -> dne
 		| otherwise -> if Build.SysConfig.curl
 			then do
-				output <- readProcess "curl" curlparams
+				output <- readProcess "curl" $ toCommand curlparams
 				case lastMaybe (lines output) of
 					Just ('2':_:_) -> return (True, extractsize output)
 					_ -> dne
 			else do
-				r <- request u headers HEAD
+				r <- request u headers HEAD ua
 				case rspCode r of
 					(2,_,_) -> return (True, size r)
 					_ -> return (False, Nothing)
@@ -67,13 +70,12 @@ exists url headers = case parseURIRelaxed url of
 
   where
 	dne = return (False, Nothing)
-	curlparams =
-		[ "-s"
-		, "--head"
-		, "-L"
-		, url
-		, "-w", "%{http_code}"
-		] ++ concatMap (\h -> ["-H", h]) headers
+	curlparams = addUserAgent ua $
+		[ Param "-s"
+		, Param "--head"
+		, Param "-L", Param url
+		, Param "-w", Param "%{http_code}"
+		] ++ concatMap (\h -> [Param "-H", Param h]) headers
 
 	extractsize s = case lastMaybe $ filter ("Content-Length:" `isPrefixOf`) (lines s) of
 		Just l -> case lastMaybe $ words l of
@@ -83,6 +85,11 @@ exists url headers = case parseURIRelaxed url of
 
 	size = liftM Prelude.read . lookupHeader HdrContentLength . rspHeaders
 
+-- works for both wget and curl commands
+addUserAgent :: Maybe UserAgent -> [CommandParam] -> [CommandParam]
+addUserAgent Nothing ps = ps
+addUserAgent (Just ua) ps = ps ++ [Param "--user-agent", Param ua]
+
 {- Used to download large files, such as the contents of keys.
 -
 - Uses wget or curl program for its progress bar. (Wget has a better one,
@@ -90,15 +97,15 @@
 - would not be appropriate to test at configure time and build support
 - for only one in.
 -}
-download :: URLString -> Headers -> [CommandParam] -> FilePath -> IO Bool
+download :: URLString -> Headers -> [CommandParam] -> FilePath -> Maybe UserAgent -> IO Bool
 download = download' False
 
 {- No output, even on error. -}
-downloadQuiet :: URLString -> Headers -> [CommandParam] -> FilePath -> IO Bool
+downloadQuiet :: URLString -> Headers -> [CommandParam] -> FilePath -> Maybe UserAgent -> IO Bool
 downloadQuiet = download' True
 
-download' :: Bool -> URLString -> Headers -> [CommandParam] -> FilePath -> IO Bool
-download' quiet url headers options file =
+download' :: Bool -> URLString -> Headers -> [CommandParam] -> FilePath -> Maybe UserAgent -> IO Bool
+download' quiet url headers options file ua =
 	case parseURIRelaxed url of
 		Just u
 			| uriScheme u == "file:" -> do
@@ -119,7 +126,7 @@ download' quiet url headers options file =
 	curl = go "curl" $ headerparams ++ quietopt "-s" ++
 		[Params "-f -L -C - -# -o"]
 	go cmd opts = boolSystem cmd $
-		options++opts++[File file, File url]
+		addUserAgent ua $ options++opts++[File file, File url]
 	quietopt s
 		| quiet = [Param s]
 		| otherwise = []
@@ -134,13 +141,14 @@ download' quiet url headers options file =
 - Unfortunately, does not handle https, so should only be used
 - when curl is not available.
 -}
-request :: URI -> Headers -> RequestMethod -> IO (Response String)
-request url headers requesttype = go 5 url
+request :: URI -> Headers -> RequestMethod -> Maybe UserAgent -> IO (Response String)
+request url headers requesttype ua = go 5 url
   where
 	go :: Int -> URI -> IO (Response String)
 	go 0 _ = error "Too many redirects "
 	go n u = do
 		rsp <- Browser.browse $ do
+			maybe noop Browser.setUserAgent ua
 			Browser.setErrHandler ignore
 			Browser.setOutHandler ignore
 			Browser.setAllowRedirects False
diff --git a/debian/changelog b/debian/changelog
index 7c05434e4..1379ffc01 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -18,6 +18,9 @@ git-annex (4.20130921) UNRELEASED; urgency=low
   * add, import, assistant: Better preserve the mtime of symlinks, when
     when adding content that gets deduplicated.
   * webapp: Support storing encrypted git repositories on rsync.net.
+  * Send a git-annex user-agent when downloading urls.
+    Overridable with --user-agent option.
+    (Not yet done for S3 or WebDAV due to limitations of libraries used.)
 
  -- Joey Hess <joeyh@debian.org>  Sun, 22 Sep 2013 19:42:29 -0400
 
diff --git a/doc/bugs/importfeed_fails_when_using_the_option_--lazy_for_specific_podcast.mdwn b/doc/bugs/importfeed_fails_when_using_the_option_--lazy_for_specific_podcast.mdwn
index c1ccf86c2..0d6bcb05c 100644
--- a/doc/bugs/importfeed_fails_when_using_the_option_--lazy_for_specific_podcast.mdwn
+++ b/doc/bugs/importfeed_fails_when_using_the_option_--lazy_for_specific_podcast.mdwn
@@ -71,3 +71,7 @@ HTTP request sent, awaiting response... 200 OK
 > switch, and/or to make git-annex set a default user agent header
 > of "git-annex", rather than relying on the curl/wget defaults.
 > --[[Joey]]
+
+> I've [[done]] what's discussed above, and verified it fixes
+> behavior for this specific server too.
+> --[[Joey]]
diff --git a/doc/git-annex.mdwn b/doc/git-annex.mdwn
index f0c81f07f..c06d1ffe9 100644
--- a/doc/git-annex.mdwn
+++ b/doc/git-annex.mdwn
@@ -824,6 +824,10 @@ subdirectories).
 
   Also, '\\n' is a newline, '\\000' is a NULL, etc.
 
+* `--user-agent=value`
+
+  Overrides the User-Agent to use when downloading files from the web.
+
 * `-c name=value`
 
   Used to override git configuration settings. May be specified multiple times.
```
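
The heart of the change is the small `addUserAgent` helper in Utility/Url.hs, which appends a `--user-agent` parameter understood by both wget and curl. A minimal standalone sketch of that behavior, with git-annex's `CommandParam` type simplified to plain `String` for illustration:

```haskell
-- Standalone sketch, not the git-annex module itself: CommandParam is
-- simplified to String here (the real type lives in git-annex's source).
type UserAgent = String

addUserAgent :: Maybe UserAgent -> [String] -> [String]
addUserAgent Nothing ps = ps                            -- no agent configured: leave parameters untouched
addUserAgent (Just ua) ps = ps ++ ["--user-agent", ua]  -- append the flag both wget and curl accept

main :: IO ()
main = do
    print (addUserAgent Nothing ["-s", "--head", "-L"])
    -- the default agent is "git-annex/" ++ packageversion; the version string below is illustrative
    print (addUserAgent (Just "git-annex/4.20130921") ["-s", "--head", "-L"])
```

Callers in the Annex monad never build the agent value by hand; they go through the new Annex.Url.withUserAgent wrapper, which falls back to the git-annex/version default unless --user-agent was given on the command line.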