summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorGravatar Joey Hess <joey@kitenet.net>2013-09-28 14:35:21 -0400
committerGravatar Joey Hess <joey@kitenet.net>2013-09-28 14:35:21 -0400
commita05cefbd7cdfc75109d8f55c4cb699352745841c (patch)
treea3d10d759b00a2c00340d352827fe9d287bed07c
parent309750f7588d7c9a6eadbdd30b630250f766311f (diff)
Send a git-annex user-agent when downloading urls.
Overridable with --user-agent option. Not yet done for S3 or WebDAV due to limitations of libraries used -- nether allows a user-agent header to be specified. This commit sponsored by Michael Zehrer.
-rw-r--r--Annex.hs2
-rw-r--r--Annex/Content.hs4
-rw-r--r--Annex/Url.hs27
-rw-r--r--Command/AddUrl.hs8
-rw-r--r--Command/ImportFeed.hs5
-rw-r--r--GitAnnex/Options.hs3
-rw-r--r--Remote/Git.hs7
-rw-r--r--Remote/Web.hs4
-rw-r--r--Utility/Url.hs48
-rw-r--r--debian/changelog3
-rw-r--r--doc/bugs/importfeed_fails_when_using_the_option_--lazy_for_specific_podcast.mdwn4
-rw-r--r--doc/git-annex.mdwn4
12 files changed, 86 insertions, 33 deletions
diff --git a/Annex.hs b/Annex.hs
index 7625fa8b6..940b69e4b 100644
--- a/Annex.hs
+++ b/Annex.hs
@@ -108,6 +108,7 @@ data AnnexState = AnnexState
, fields :: M.Map String String
, cleanup :: M.Map String (Annex ())
, inodeschanged :: Maybe Bool
+ , useragent :: Maybe String
}
newState :: Git.Repo -> AnnexState
@@ -141,6 +142,7 @@ newState gitrepo = AnnexState
, fields = M.empty
, cleanup = M.empty
, inodeschanged = Nothing
+ , useragent = Nothing
}
{- Makes an Annex state object for the specified git repo.
diff --git a/Annex/Content.hs b/Annex/Content.hs
index c7afa8070..da0189c74 100644
--- a/Annex/Content.hs
+++ b/Annex/Content.hs
@@ -43,7 +43,7 @@ import qualified Annex.Queue
import qualified Annex.Branch
import Utility.DiskFree
import Utility.FileMode
-import qualified Utility.Url as Url
+import qualified Annex.Url as Url
import Types.Key
import Utility.DataUnits
import Utility.CopyFile
@@ -458,7 +458,7 @@ downloadUrl urls file = go =<< annexWebDownloadCommand <$> Annex.getGitConfig
go Nothing = do
opts <- map Param . annexWebOptions <$> Annex.getGitConfig
headers <- getHttpHeaders
- liftIO $ anyM (\u -> Url.download u headers opts file) urls
+ anyM (\u -> Url.withUserAgent $ Url.download u headers opts file) urls
go (Just basecmd) = liftIO $ anyM (downloadcmd basecmd) urls
downloadcmd basecmd url =
boolSystem "sh" [Param "-c", Param $ gencmd url basecmd]
diff --git a/Annex/Url.hs b/Annex/Url.hs
new file mode 100644
index 000000000..0401ffe07
--- /dev/null
+++ b/Annex/Url.hs
@@ -0,0 +1,27 @@
+{- Url downloading, with git-annex user agent.
+ -
+ - Copyright 2013 Joey Hess <joey@kitenet.net>
+ -
+ - Licensed under the GNU GPL version 3 or higher.
+ -}
+
+module Annex.Url (
+ module U,
+ withUserAgent,
+ getUserAgent,
+) where
+
+import Common.Annex
+import qualified Annex
+import Utility.Url as U
+import qualified Build.SysConfig as SysConfig
+
+defaultUserAgent :: U.UserAgent
+defaultUserAgent = "git-annex/" ++ SysConfig.packageversion
+
+getUserAgent :: Annex (Maybe U.UserAgent)
+getUserAgent = Annex.getState $
+ Just . fromMaybe defaultUserAgent . Annex.useragent
+
+withUserAgent :: (Maybe U.UserAgent -> IO a) -> Annex a
+withUserAgent a = liftIO . a =<< getUserAgent
diff --git a/Command/AddUrl.hs b/Command/AddUrl.hs
index e767a45e0..951bbdbe8 100644
--- a/Command/AddUrl.hs
+++ b/Command/AddUrl.hs
@@ -17,8 +17,8 @@ import Backend
import qualified Command.Add
import qualified Annex
import qualified Annex.Queue
+import qualified Annex.Url as Url
import qualified Backend.URL
-import qualified Utility.Url as Url
import Annex.Content
import Logs.Web
import qualified Option
@@ -123,7 +123,7 @@ perform relaxed url file = ifAnnexed file addurl geturl
next $ return True
| otherwise = do
headers <- getHttpHeaders
- ifM (liftIO $ Url.check url headers $ keySize key)
+ ifM (Url.withUserAgent $ Url.check url headers $ keySize key)
( do
setUrlPresent key url
next $ return True
@@ -174,7 +174,7 @@ download url file = do
size <- ifM (liftIO $ isJust <$> checkDaemon pidfile)
( do
headers <- getHttpHeaders
- liftIO $ snd <$> Url.exists url headers
+ snd <$> Url.withUserAgent (Url.exists url headers)
, return Nothing
)
Backend.URL.fromUrl url size
@@ -203,7 +203,7 @@ nodownload relaxed url file = do
headers <- getHttpHeaders
(exists, size) <- if relaxed
then pure (True, Nothing)
- else liftIO $ Url.exists url headers
+ else Url.withUserAgent $ Url.exists url headers
if exists
then do
key <- Backend.URL.fromUrl url size
diff --git a/Command/ImportFeed.hs b/Command/ImportFeed.hs
index d2f806402..7f54643c9 100644
--- a/Command/ImportFeed.hs
+++ b/Command/ImportFeed.hs
@@ -17,7 +17,7 @@ import Data.Time.Clock
import Common.Annex
import qualified Annex
import Command
-import qualified Utility.Url as Url
+import qualified Annex.Url as Url
import Logs.Web
import qualified Option
import qualified Utility.Format
@@ -102,9 +102,10 @@ findEnclosures url = extract <$> downloadFeed url
downloadFeed :: URLString -> Annex (Maybe Feed)
downloadFeed url = do
showOutput
+ ua <- Url.getUserAgent
liftIO $ withTmpFile "feed" $ \f h -> do
fileEncoding h
- ifM (Url.download url [] [] f)
+ ifM (Url.download url [] [] f ua)
( liftIO $ parseFeedString <$> hGetContentsStrict h
, return Nothing
)
diff --git a/GitAnnex/Options.hs b/GitAnnex/Options.hs
index 459ee3bf4..596cc138f 100644
--- a/GitAnnex/Options.hs
+++ b/GitAnnex/Options.hs
@@ -48,6 +48,8 @@ options = Option.common ++
"skip files smaller than a size"
, Option ['T'] ["time-limit"] (ReqArg Limit.addTimeLimit paramTime)
"stop after the specified amount of time"
+ , Option [] ["user-agent"] (ReqArg setuseragent paramName)
+ "override default User-Agent"
, Option [] ["trust-glacier"] (NoArg (Annex.setFlag "trustglacier"))
"Trust Amazon Glacier inventory"
] ++ Option.matcher
@@ -55,6 +57,7 @@ options = Option.common ++
setnumcopies v = maybe noop
(\n -> Annex.changeState $ \s -> s { Annex.forcenumcopies = Just n })
(readish v)
+ setuseragent v = Annex.changeState $ \s -> s { Annex.useragent = Just v }
setgitconfig v = Annex.changeGitRepo =<< inRepo (Git.Config.store v)
trustArg t = ReqArg (Remote.forceTrust t) paramRemote
diff --git a/Remote/Git.hs b/Remote/Git.hs
index 7083de667..0f3f35811 100644
--- a/Remote/Git.hs
+++ b/Remote/Git.hs
@@ -30,7 +30,7 @@ import Annex.Exception
import qualified Annex.Content
import qualified Annex.BranchState
import qualified Annex.Branch
-import qualified Utility.Url as Url
+import qualified Annex.Url as Url
import Utility.Tmp
import Config
import Config.Cost
@@ -177,9 +177,10 @@ tryGitConfigRead r
Left l -> return $ Left l
geturlconfig headers = do
+ ua <- Url.getUserAgent
v <- liftIO $ withTmpFile "git-annex.tmp" $ \tmpfile h -> do
hClose h
- ifM (Url.downloadQuiet (Git.repoLocation r ++ "/config") headers [] tmpfile)
+ ifM (Url.downloadQuiet (Git.repoLocation r ++ "/config") headers [] tmpfile ua)
( pipedconfig "git" [Param "config", Param "--null", Param "--list", Param "--file", File tmpfile]
, return $ Left undefined
)
@@ -240,7 +241,7 @@ inAnnex r key
where
checkhttp headers = do
showChecking r
- liftIO $ ifM (anyM (\u -> Url.check u headers (keySize key)) (keyUrls r key))
+ ifM (anyM (\u -> Url.withUserAgent $ Url.check u headers (keySize key)) (keyUrls r key))
( return $ Right True
, return $ Left "not found"
)
diff --git a/Remote/Web.hs b/Remote/Web.hs
index 789aab698..af60beee0 100644
--- a/Remote/Web.hs
+++ b/Remote/Web.hs
@@ -19,7 +19,7 @@ import Config.Cost
import Logs.Web
import Types.Key
import Utility.Metered
-import qualified Utility.Url as Url
+import qualified Annex.Url as Url
#ifdef WITH_QUVI
import Annex.Quvi
import qualified Utility.Quvi as Quvi
@@ -118,7 +118,7 @@ checkKey' key us = firsthit us (Right False) $ \u -> do
#endif
DefaultDownloader -> do
headers <- getHttpHeaders
- liftIO $ Right <$> Url.check u' headers (keySize key)
+ Right <$> Url.withUserAgent (Url.check u' headers $ keySize key)
where
firsthit [] miss _ = return miss
firsthit (u:rest) _ a = do
diff --git a/Utility/Url.hs b/Utility/Url.hs
index 2f2ec1dc0..baea0fda1 100644
--- a/Utility/Url.hs
+++ b/Utility/Url.hs
@@ -9,6 +9,7 @@
module Utility.Url (
URLString,
+ UserAgent,
check,
exists,
download,
@@ -27,10 +28,12 @@ type URLString = String
type Headers = [String]
+type UserAgent = String
+
{- Checks that an url exists and could be successfully downloaded,
- also checking that its size, if available, matches a specified size. -}
-check :: URLString -> Headers -> Maybe Integer -> IO Bool
-check url headers expected_size = handle <$> exists url headers
+check :: URLString -> Headers -> Maybe Integer -> Maybe UserAgent -> IO Bool
+check url headers expected_size = handle <$$> exists url headers
where
handle (False, _) = False
handle (True, Nothing) = True
@@ -44,8 +47,8 @@ check url headers expected_size = handle <$> exists url headers
- Uses curl otherwise, when available, since curl handles https better
- than does Haskell's Network.Browser.
-}
-exists :: URLString -> Headers -> IO (Bool, Maybe Integer)
-exists url headers = case parseURIRelaxed url of
+exists :: URLString -> Headers -> Maybe UserAgent -> IO (Bool, Maybe Integer)
+exists url headers ua = case parseURIRelaxed url of
Just u
| uriScheme u == "file:" -> do
s <- catchMaybeIO $ getFileStatus (unEscapeString $ uriPath u)
@@ -54,12 +57,12 @@ exists url headers = case parseURIRelaxed url of
Nothing -> dne
| otherwise -> if Build.SysConfig.curl
then do
- output <- readProcess "curl" curlparams
+ output <- readProcess "curl" $ toCommand curlparams
case lastMaybe (lines output) of
Just ('2':_:_) -> return (True, extractsize output)
_ -> dne
else do
- r <- request u headers HEAD
+ r <- request u headers HEAD ua
case rspCode r of
(2,_,_) -> return (True, size r)
_ -> return (False, Nothing)
@@ -67,13 +70,12 @@ exists url headers = case parseURIRelaxed url of
where
dne = return (False, Nothing)
- curlparams =
- [ "-s"
- , "--head"
- , "-L"
- , url
- , "-w", "%{http_code}"
- ] ++ concatMap (\h -> ["-H", h]) headers
+ curlparams = addUserAgent ua $
+ [ Param "-s"
+ , Param "--head"
+ , Param "-L", Param url
+ , Param "-w", Param "%{http_code}"
+ ] ++ concatMap (\h -> [Param "-H", Param h]) headers
extractsize s = case lastMaybe $ filter ("Content-Length:" `isPrefixOf`) (lines s) of
Just l -> case lastMaybe $ words l of
@@ -83,6 +85,11 @@ exists url headers = case parseURIRelaxed url of
size = liftM Prelude.read . lookupHeader HdrContentLength . rspHeaders
+-- works for both wget and curl commands
+addUserAgent :: Maybe UserAgent -> [CommandParam] -> [CommandParam]
+addUserAgent Nothing ps = ps
+addUserAgent (Just ua) ps = ps ++ [Param "--user-agent", Param ua]
+
{- Used to download large files, such as the contents of keys.
-
- Uses wget or curl program for its progress bar. (Wget has a better one,
@@ -90,15 +97,15 @@ exists url headers = case parseURIRelaxed url of
- would not be appropriate to test at configure time and build support
- for only one in.
-}
-download :: URLString -> Headers -> [CommandParam] -> FilePath -> IO Bool
+download :: URLString -> Headers -> [CommandParam] -> FilePath -> Maybe UserAgent -> IO Bool
download = download' False
{- No output, even on error. -}
-downloadQuiet :: URLString -> Headers -> [CommandParam] -> FilePath -> IO Bool
+downloadQuiet :: URLString -> Headers -> [CommandParam] -> FilePath -> Maybe UserAgent -> IO Bool
downloadQuiet = download' True
-download' :: Bool -> URLString -> Headers -> [CommandParam] -> FilePath -> IO Bool
-download' quiet url headers options file =
+download' :: Bool -> URLString -> Headers -> [CommandParam] -> FilePath -> Maybe UserAgent -> IO Bool
+download' quiet url headers options file ua =
case parseURIRelaxed url of
Just u
| uriScheme u == "file:" -> do
@@ -119,7 +126,7 @@ download' quiet url headers options file =
curl = go "curl" $ headerparams ++ quietopt "-s" ++
[Params "-f -L -C - -# -o"]
go cmd opts = boolSystem cmd $
- options++opts++[File file, File url]
+ addUserAgent ua $ options++opts++[File file, File url]
quietopt s
| quiet = [Param s]
| otherwise = []
@@ -134,13 +141,14 @@ download' quiet url headers options file =
- Unfortunately, does not handle https, so should only be used
- when curl is not available.
-}
-request :: URI -> Headers -> RequestMethod -> IO (Response String)
-request url headers requesttype = go 5 url
+request :: URI -> Headers -> RequestMethod -> Maybe UserAgent -> IO (Response String)
+request url headers requesttype ua = go 5 url
where
go :: Int -> URI -> IO (Response String)
go 0 _ = error "Too many redirects "
go n u = do
rsp <- Browser.browse $ do
+ maybe noop Browser.setUserAgent ua
Browser.setErrHandler ignore
Browser.setOutHandler ignore
Browser.setAllowRedirects False
diff --git a/debian/changelog b/debian/changelog
index 7c05434e4..1379ffc01 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -18,6 +18,9 @@ git-annex (4.20130921) UNRELEASED; urgency=low
* add, import, assistant: Better preserve the mtime of symlinks,
when when adding content that gets deduplicated.
* webapp: Support storing encrypted git repositories on rsync.net.
+ * Send a git-annex user-agent when downloading urls.
+ Overridable with --user-agent option.
+ (Not yet done for S3 or WebDAV due to limitations of libraries used.)
-- Joey Hess <joeyh@debian.org> Sun, 22 Sep 2013 19:42:29 -0400
diff --git a/doc/bugs/importfeed_fails_when_using_the_option_--lazy_for_specific_podcast.mdwn b/doc/bugs/importfeed_fails_when_using_the_option_--lazy_for_specific_podcast.mdwn
index c1ccf86c2..0d6bcb05c 100644
--- a/doc/bugs/importfeed_fails_when_using_the_option_--lazy_for_specific_podcast.mdwn
+++ b/doc/bugs/importfeed_fails_when_using_the_option_--lazy_for_specific_podcast.mdwn
@@ -71,3 +71,7 @@ HTTP request sent, awaiting response... 200 OK
> switch, and/or to make git-annex set a default user agent header
> of "git-annex", rather than relying on the curl/wget defaults.
> --[[Joey]]
+
+> I've [[done]] what's discussed above, and verified it fixes
+> behavior for this specific server too.
+> --[[Joey]]
diff --git a/doc/git-annex.mdwn b/doc/git-annex.mdwn
index f0c81f07f..c06d1ffe9 100644
--- a/doc/git-annex.mdwn
+++ b/doc/git-annex.mdwn
@@ -824,6 +824,10 @@ subdirectories).
Also, '\\n' is a newline, '\\000' is a NULL, etc.
+* `--user-agent=value`
+
+ Overrides the User-Agent to use when downloading files from the web.
+
* `-c name=value`
Used to override git configuration settings. May be specified multiple times.