author    Joey Hess <joeyh@joeyh.name>  2015-01-22 14:52:52 -0400
committer Joey Hess <joeyh@joeyh.name>  2015-01-22 14:52:52 -0400
commit    07211ad1d80d2be9693c5f37f1bf104c6ce6baa0 (patch)
tree      43fedb306be22459b548589afa08efd5c64f636a
parent    c222a65270a95a59307377f81522a46d95db6e9a (diff)
addurl: When a Content-Disposition header suggests a filename to use, addurl will consider using it, if it's reasonable and doesn't conflict with an existing file. (--file overrides this)
-rw-r--r--  Command/AddUrl.hs     | 100
-rw-r--r--  Command/ImportFeed.hs |   4
-rw-r--r--  Utility/Url.hs        |  68
-rw-r--r--  debian/changelog      |   3
-rw-r--r--  doc/git-annex.mdwn    |   6
5 files changed, 111 insertions, 70 deletions
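
Before reading the hunks: the net effect on filename selection in startWeb is a
three-step precedence. An explicit --file always wins; otherwise a sanitized
Content-Disposition suggestion is used, unless a file or directory with that
name already exists; otherwise the usual url-derived name applies. A minimal
standalone sketch of that precedence (pickFile, sanitize, and fallback are
hypothetical stand-ins for the patch's truncateFilePath/sanitizeFilePath and
url2file pipeline):

    import System.Directory (doesDirectoryExist, doesFileExist)

    -- Hypothetical condensation of the filename choice made in startWeb.
    pickFile :: Maybe FilePath -> Maybe FilePath -> (FilePath -> FilePath)
             -> FilePath -> IO FilePath
    pickFile optfile suggested sanitize fallback = case optfile of
        Just f  -> return f            -- --file overrides everything
        Nothing -> case suggested of
            Nothing -> return fallback -- no Content-Disposition suggestion
            Just sf -> do
                let f = sanitize sf
                conflict <- (||) <$> doesFileExist f <*> doesDirectoryExist f
                -- never clobber: an existing file or directory means the
                -- suggestion is discarded in favor of the url-derived name
                return $ if conflict then fallback else f
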
diff --git a/Command/AddUrl.hs b/Command/AddUrl.hs
index 9e3aa31fb..67f883d69 100644
--- a/Command/AddUrl.hs
+++ b/Command/AddUrl.hs
@@ -120,17 +120,16 @@ downloadRemoteFile r relaxed uri file sz = do
loguri = setDownloader uri OtherDownloader
startWeb :: Bool -> Maybe FilePath -> Maybe Int -> String -> CommandStart
-startWeb relaxed optfile pathdepth s = go $ fromMaybe bad $ parseURI s
+startWeb relaxed optfile pathdepth s = go $ fromMaybe bad $ parseURI urlstring
where
- (s', downloader) = getDownloader s
- bad = fromMaybe (error $ "bad url " ++ s') $
- parseURI $ escapeURIString isUnescapedInURI s'
- choosefile = flip fromMaybe optfile
+ (urlstring, downloader) = getDownloader s
+ bad = fromMaybe (error $ "bad url " ++ urlstring) $
+ parseURI $ escapeURIString isUnescapedInURI urlstring
go url = case downloader of
QuviDownloader -> usequvi
_ ->
#ifdef WITH_QUVI
- ifM (quviSupported s')
+ ifM (quviSupported urlstring)
( usequvi
, regulardownload url
)
@@ -139,30 +138,44 @@ startWeb relaxed optfile pathdepth s = go $ fromMaybe bad $ parseURI s
#endif
regulardownload url = do
pathmax <- liftIO $ fileNameLengthLimit "."
- let file = choosefile $ url2file url pathdepth pathmax
+ urlinfo <- if relaxed
+ then pure $ Url.UrlInfo True Nothing Nothing
+ else Url.withUrlOptions (Url.getUrlInfo urlstring)
+ file <- case optfile of
+ Just f -> pure f
+ Nothing -> case Url.urlSuggestedFile urlinfo of
+ Nothing -> pure $ url2file url pathdepth pathmax
+ Just sf -> do
+ let f = truncateFilePath pathmax $
+ sanitizeFilePath sf
+ ifM (liftIO $ doesFileExist f <||> doesDirectoryExist f)
+ ( pure $ url2file url pathdepth pathmax
+ , pure f
+ )
showStart "addurl" file
- next $ performWeb relaxed s' file
+ next $ performWeb relaxed urlstring file urlinfo
#ifdef WITH_QUVI
- badquvi = error $ "quvi does not know how to download url " ++ s'
+ badquvi = error $ "quvi does not know how to download url " ++ urlstring
usequvi = do
page <- fromMaybe badquvi
- <$> withQuviOptions Quvi.forceQuery [Quvi.quiet, Quvi.httponly] s'
+ <$> withQuviOptions Quvi.forceQuery [Quvi.quiet, Quvi.httponly] urlstring
let link = fromMaybe badquvi $ headMaybe $ Quvi.pageLinks page
pathmax <- liftIO $ fileNameLengthLimit "."
- let file = choosefile $ truncateFilePath pathmax $ sanitizeFilePath $
- Quvi.pageTitle page ++ "." ++ Quvi.linkSuffix link
+ let file = flip fromMaybe optfile $
+ truncateFilePath pathmax $ sanitizeFilePath $
+ Quvi.pageTitle page ++ "." ++ Quvi.linkSuffix link
showStart "addurl" file
- next $ performQuvi relaxed s' (Quvi.linkUrl link) file
+ next $ performQuvi relaxed urlstring (Quvi.linkUrl link) file
#else
usequvi = error "not built with quvi support"
#endif
-performWeb :: Bool -> URLString -> FilePath -> CommandPerform
-performWeb relaxed url file = ifAnnexed file addurl geturl
+performWeb :: Bool -> URLString -> FilePath -> Url.UrlInfo -> CommandPerform
+performWeb relaxed url file urlinfo = ifAnnexed file addurl geturl
where
- geturl = next $ isJust <$> addUrlFile relaxed url file
- addurl = addUrlChecked relaxed url webUUID checkexistssize
- checkexistssize = Url.withUrlOptions . Url.check url . keySize
+ geturl = next $ isJust <$> addUrlFile relaxed url urlinfo file
+ addurl = addUrlChecked relaxed url webUUID $ \k -> return $
+ (Url.urlExists urlinfo, Url.urlSize urlinfo == keySize k)
#ifdef WITH_QUVI
performQuvi :: Bool -> URLString -> URLString -> FilePath -> CommandPerform
@@ -189,7 +202,8 @@ addUrlFileQuvi relaxed quviurl videourl file = do
- retained, because the size of a video stream
- might change and we want to be able to download
- it later. -}
- sizedkey <- addSizeUrlKey videourl key
+ urlinfo <- Url.withUrlOptions (Url.getUrlInfo videourl)
+ let sizedkey = addSizeUrlKey urlinfo key
prepGetViaTmpChecked sizedkey Nothing $ do
tmp <- fromRepo $ gitAnnexTmpObjectLocation key
showOutput
@@ -225,17 +239,17 @@ addUrlChecked relaxed url u checkexistssize key
stop
)
-addUrlFile :: Bool -> URLString -> FilePath -> Annex (Maybe Key)
-addUrlFile relaxed url file = do
+addUrlFile :: Bool -> URLString -> Url.UrlInfo -> FilePath -> Annex (Maybe Key)
+addUrlFile relaxed url urlinfo file = do
liftIO $ createDirectoryIfMissing True (parentDir file)
ifM (Annex.getState Annex.fast <||> pure relaxed)
- ( nodownload relaxed url file
- , downloadWeb url file
+ ( nodownload relaxed url urlinfo file
+ , downloadWeb url urlinfo file
)
-downloadWeb :: URLString -> FilePath -> Annex (Maybe Key)
-downloadWeb url file = do
- dummykey <- addSizeUrlKey url =<< Backend.URL.fromUrl url Nothing
+downloadWeb :: URLString -> Url.UrlInfo -> FilePath -> Annex (Maybe Key)
+downloadWeb url urlinfo file = do
+ dummykey <- addSizeUrlKey urlinfo <$> Backend.URL.fromUrl url Nothing
let downloader f _ = do
showOutput
downloadUrl [url] f
@@ -272,15 +286,9 @@ downloadWith downloader dummykey u url file =
liftIO $ createDirectoryIfMissing True (parentDir tmp)
downloader tmp p
-{- Hits the url to get the size, if available.
- -
- - This is needed to avoid exceeding the diskreserve when downloading,
- - and so the assistant can display a pretty progress bar.
- -}
-addSizeUrlKey :: URLString -> Key -> Annex Key
-addSizeUrlKey url key = do
- size <- snd <$> Url.withUrlOptions (Url.exists url)
- return $ key { keySize = size }
+{- Adds the url size to the Key. -}
+addSizeUrlKey :: Url.UrlInfo -> Key -> Key
+addSizeUrlKey urlinfo key = key { keySize = Url.urlSize urlinfo }
cleanup :: UUID -> URLString -> FilePath -> Key -> Maybe FilePath -> Annex ()
cleanup u url file key mtmp = do
@@ -295,19 +303,15 @@ cleanup u url file key mtmp = do
Annex.Queue.flush
maybe noop (moveAnnex key) mtmp
-nodownload :: Bool -> URLString -> FilePath -> Annex (Maybe Key)
-nodownload relaxed url file = do
- (exists, size) <- if relaxed
- then pure (True, Nothing)
- else Url.withUrlOptions (Url.exists url)
- if exists
- then do
- key <- Backend.URL.fromUrl url size
- cleanup webUUID url file key Nothing
- return (Just key)
- else do
- warning $ "unable to access url: " ++ url
- return Nothing
+nodownload :: Bool -> URLString -> Url.UrlInfo -> FilePath -> Annex (Maybe Key)
+nodownload relaxed url urlinfo file
+ | Url.urlExists urlinfo = do
+ key <- Backend.URL.fromUrl url (Url.urlSize urlinfo)
+ cleanup webUUID url file key Nothing
+ return (Just key)
+ | otherwise = do
+ warning $ "unable to access url: " ++ url
+ return Nothing
url2file :: URI -> Maybe Int -> Int -> FilePath
url2file url pathdepth pathmax = case pathdepth of
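
One detail worth flagging in the hunk above: with --relaxed, startWeb makes no
HEAD request at all and instead synthesizes a UrlInfo stub, so relaxed adds
never see a Content-Disposition suggestion. Spelled out, using the UrlInfo
record the patch declares in Utility/Url.hs below:

    data UrlInfo = UrlInfo
        { urlExists :: Bool
        , urlSize :: Maybe Integer
        , urlSuggestedFile :: Maybe FilePath
        }

    -- The --relaxed stand-in for a real HEAD request:
    relaxedInfo :: UrlInfo
    relaxedInfo = UrlInfo
        { urlExists        = True     -- assumed, not verified
        , urlSize          = Nothing  -- unknown without contacting the server
        , urlSuggestedFile = Nothing  -- no headers were ever fetched
        }
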
diff --git a/Command/ImportFeed.hs b/Command/ImportFeed.hs
index b9d78d713..ed035fa85 100644
--- a/Command/ImportFeed.hs
+++ b/Command/ImportFeed.hs
@@ -144,7 +144,9 @@ performDownload relaxed cache todownload = case location todownload of
rundownload url (takeExtension url) $ \f -> do
r <- Remote.claimingUrl url
if Remote.uuid r == webUUID
- then maybeToList <$> addUrlFile relaxed url f
+ then do
+ urlinfo <- Url.withUrlOptions (Url.getUrlInfo url)
+ maybeToList <$> addUrlFile relaxed url urlinfo f
else do
res <- tryNonAsync $ maybe
(error $ "unable to checkUrl of " ++ Remote.name r)
diff --git a/Utility/Url.hs b/Utility/Url.hs
index b6af123b9..cb4fc7d37 100644
--- a/Utility/Url.hs
+++ b/Utility/Url.hs
@@ -17,6 +17,8 @@ module Utility.Url (
check,
checkBoth,
exists,
+ UrlInfo(..),
+ getUrlInfo,
download,
downloadQuiet,
parseURIRelaxed
@@ -84,18 +86,27 @@ checkBoth url expected_size uo = do
v <- check url expected_size uo
return (fst v && snd v)
check :: URLString -> Maybe Integer -> UrlOptions -> IO (Bool, Bool)
-check url expected_size = go <$$> exists url
+check url expected_size = go <$$> getUrlInfo url
where
- go (False, _) = (False, False)
- go (True, Nothing) = (True, True)
- go (True, s) = case expected_size of
+ go (UrlInfo False _ _) = (False, False)
+ go (UrlInfo True Nothing _) = (True, True)
+ go (UrlInfo True s _) = case expected_size of
Just _ -> (True, expected_size == s)
Nothing -> (True, True)
+exists :: URLString -> UrlOptions -> IO Bool
+exists url uo = urlExists <$> getUrlInfo url uo
+
+data UrlInfo = UrlInfo
+ { urlExists :: Bool
+ , urlSize :: Maybe Integer
+ , urlSuggestedFile :: Maybe FilePath
+ }
+
{- Checks that an url exists and could be successfully downloaded,
- - also returning its size if available. -}
-exists :: URLString -> UrlOptions -> IO (Bool, Maybe Integer)
-exists url uo = case parseURIRelaxed url of
+ - also returning its size and suggested filename if available. -}
+getUrlInfo :: URLString -> UrlOptions -> IO UrlInfo
+getUrlInfo url uo = case parseURIRelaxed url of
Just u -> case parseUrl (show u) of
Just req -> existsconduit req `catchNonAsync` const dne
-- http-conduit does not support file:, ftp:, etc urls,
@@ -107,18 +118,21 @@ exists url uo = case parseURIRelaxed url of
case s of
Just stat -> do
sz <- getFileSize' f stat
- return (True, Just sz)
+ found (Just sz) Nothing
Nothing -> dne
| Build.SysConfig.curl -> do
output <- catchDefaultIO "" $
readProcess "curl" $ toCommand curlparams
case lastMaybe (lines output) of
- Just ('2':_:_) -> return (True, extractlencurl output)
+ Just ('2':_:_) -> found
+ (extractlencurl output)
+ Nothing
_ -> dne
| otherwise -> dne
Nothing -> dne
where
- dne = return (False, Nothing)
+ dne = return $ UrlInfo False Nothing Nothing
+ found sz f = return $ UrlInfo True sz f
curlparams = addUserAgent uo $
[ Param "-s"
@@ -133,23 +147,36 @@ exists url uo = case parseURIRelaxed url of
_ -> Nothing
_ -> Nothing
- extractlen resp = readish . B8.toString =<< headMaybe lenheaders
- where
- lenheaders = map snd $
- filter (\(h, _) -> h == hContentLength)
- (responseHeaders resp)
-
+ extractlen = readish . B8.toString <=< firstheader hContentLength
+
+ extractfilename = contentDispositionFilename . B8.toString
+ <=< firstheader hContentDisposition
+
+ firstheader h = headMaybe . map snd .
+ filter (\p -> fst p == h) . responseHeaders
+
existsconduit req = withManager $ \mgr -> do
let req' = headRequest (applyRequest uo req)
resp <- http req' mgr
-- forces processing the response before the
-- manager is closed
- ret <- if responseStatus resp == ok200
- then return (True, extractlen resp)
- else liftIO dne
+ ret <- liftIO $ if responseStatus resp == ok200
+ then found
+ (extractlen resp)
+ (extractfilename resp)
+ else dne
liftIO $ closeManager mgr
return ret
+-- Parse eg: attachment; filename="fname.ext"
+-- per RFC 2616
+contentDispositionFilename :: String -> Maybe FilePath
+contentDispositionFilename s
+ | "attachment; filename=\"" `isPrefixOf` s && "\"" `isSuffixOf` s =
+ Just $ reverse $ drop 1 $ reverse $
+ drop 1 $ dropWhile (/= '"') s
+ | otherwise = Nothing
+
#if MIN_VERSION_http_conduit(2,0,0)
headRequest :: Request -> Request
#else
@@ -229,6 +256,9 @@ parseURIRelaxed = parseURI . escapeURIString isAllowedInURI
hAcceptEncoding :: CI.CI B.ByteString
hAcceptEncoding = "Accept-Encoding"
+hContentDisposition :: CI.CI B.ByteString
+hContentDisposition = "Content-Disposition"
+
#if ! MIN_VERSION_http_types(0,7,0)
hContentLength :: CI.CI B.ByteString
hContentLength = "Content-Length"
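
The Content-Disposition parser added above is deliberately narrow: it accepts
only the exact quoted attachment; filename="..." shape and rejects everything
else, which keeps the question of whether a suggestion is "reasonable" simple.
A standalone copy for experimenting with its behavior:

    import Data.List (isPrefixOf, isSuffixOf)

    -- Verbatim copy of the parser from the patch, runnable on its own.
    contentDispositionFilename :: String -> Maybe FilePath
    contentDispositionFilename s
        | "attachment; filename=\"" `isPrefixOf` s && "\"" `isSuffixOf` s =
            Just $ reverse $ drop 1 $ reverse $
                drop 1 $ dropWhile (/= '"') s
        | otherwise = Nothing

    main :: IO ()
    main = do
        print $ contentDispositionFilename "attachment; filename=\"fname.ext\""
        -- Just "fname.ext"
        print $ contentDispositionFilename "inline; filename=\"fname.ext\""
        -- Nothing: only the attachment form is recognized
        print $ contentDispositionFilename "attachment; filename=fname.ext"
        -- Nothing: unquoted filenames are not parsed
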
diff --git a/debian/changelog b/debian/changelog
index eb48707f2..fc23d900d 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -16,6 +16,9 @@ git-annex (5.20150114) UNRELEASED; urgency=medium
* Avoid using fileSize which maxes out at just 2 gb on Windows.
Instead, use hFileSize, which doesn't have a bounded size.
Fixes support for files > 2 gb on Windows.
+ * addurl: When a Content-Disposition header suggests a filename to use,
+ addurl will consider using it, if it's reasonable and doesn't conflict
+ with an existing file. (--file overrides this)
-- Joey Hess <id@joeyh.name> Tue, 13 Jan 2015 17:03:39 -0400
diff --git a/doc/git-annex.mdwn b/doc/git-annex.mdwn
index c92208e5c..c4586ba1d 100644
--- a/doc/git-annex.mdwn
+++ b/doc/git-annex.mdwn
@@ -218,8 +218,10 @@ subdirectories).
is there at a future point, specify `--relaxed`. (Implies `--fast`.)
Normally the filename is based on the full url, so will look like
- "www.example.com_dir_subdir_bigfile". For a shorter filename, specify
- `--pathdepth=N`. For example, `--pathdepth=1` will use "dir/subdir/bigfile",
+ "www.example.com_dir_subdir_bigfile". In some cases, addurl is able to
+ come up with a better filename based on other information. Or, for a
+ shorter filename, specify `--pathdepth=N`. For example,
+ `--pathdepth=1` will use "dir/subdir/bigfile",
while `--pathdepth=3` will use "bigfile". It can also be negative;
`--pathdepth=-2` will use the last two parts of the url.
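
For readers puzzling over the --pathdepth examples above: a positive depth
drops that many leading url components, while a negative depth keeps that many
trailing ones. A hedged illustration (pathdepthParts is hypothetical; the real
url2file in Command/AddUrl.hs also sanitizes and length-limits each component):

    import Data.List (intercalate)

    -- Hypothetical sketch of the --pathdepth slicing documented above.
    pathdepthParts :: Int -> [String] -> FilePath
    pathdepthParts depth parts
        | depth > 0 = intercalate "/" (drop depth parts)            -- drop leading
        | depth < 0 = intercalate "/" (lastN (negate depth) parts)  -- keep trailing
        | otherwise = intercalate "/" parts
      where
        lastN n = reverse . take n . reverse

    main :: IO ()
    main = do
        let parts = ["www.example.com", "dir", "subdir", "bigfile"]
        putStrLn $ pathdepthParts 1 parts     -- dir/subdir/bigfile
        putStrLn $ pathdepthParts 3 parts     -- bigfile
        putStrLn $ pathdepthParts (-2) parts  -- subdir/bigfile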