diff options
author | Joey Hess <joeyh@joeyh.name> | 2015-01-22 14:52:52 -0400 |
---|---|---|
committer | Joey Hess <joeyh@joeyh.name> | 2015-01-22 14:52:52 -0400 |
commit | 07211ad1d80d2be9693c5f37f1bf104c6ce6baa0 (patch) | |
tree | 43fedb306be22459b548589afa08efd5c64f636a | |
parent | c222a65270a95a59307377f81522a46d95db6e9a (diff) |
addurl: When a Content-Disposition header suggests a filename to use, addurl will consider using it, if it's reasonable and doesn't conflict with an existing file. (--file overrides this)
-rw-r--r-- | Command/AddUrl.hs | 100 | ||||
-rw-r--r-- | Command/ImportFeed.hs | 4 | ||||
-rw-r--r-- | Utility/Url.hs | 68 | ||||
-rw-r--r-- | debian/changelog | 3 | ||||
-rw-r--r-- | doc/git-annex.mdwn | 6 |
5 files changed, 111 insertions, 70 deletions
diff --git a/Command/AddUrl.hs b/Command/AddUrl.hs index 9e3aa31fb..67f883d69 100644 --- a/Command/AddUrl.hs +++ b/Command/AddUrl.hs @@ -120,17 +120,16 @@ downloadRemoteFile r relaxed uri file sz = do loguri = setDownloader uri OtherDownloader startWeb :: Bool -> Maybe FilePath -> Maybe Int -> String -> CommandStart -startWeb relaxed optfile pathdepth s = go $ fromMaybe bad $ parseURI s +startWeb relaxed optfile pathdepth s = go $ fromMaybe bad $ parseURI urlstring where - (s', downloader) = getDownloader s - bad = fromMaybe (error $ "bad url " ++ s') $ - parseURI $ escapeURIString isUnescapedInURI s' - choosefile = flip fromMaybe optfile + (urlstring, downloader) = getDownloader s + bad = fromMaybe (error $ "bad url " ++ urlstring) $ + parseURI $ escapeURIString isUnescapedInURI urlstring go url = case downloader of QuviDownloader -> usequvi _ -> #ifdef WITH_QUVI - ifM (quviSupported s') + ifM (quviSupported urlstring) ( usequvi , regulardownload url ) @@ -139,30 +138,44 @@ startWeb relaxed optfile pathdepth s = go $ fromMaybe bad $ parseURI s #endif regulardownload url = do pathmax <- liftIO $ fileNameLengthLimit "." - let file = choosefile $ url2file url pathdepth pathmax + urlinfo <- if relaxed + then pure $ Url.UrlInfo True Nothing Nothing + else Url.withUrlOptions (Url.getUrlInfo urlstring) + file <- case optfile of + Just f -> pure f + Nothing -> case Url.urlSuggestedFile urlinfo of + Nothing -> pure $ url2file url pathdepth pathmax + Just sf -> do + let f = truncateFilePath pathmax $ + sanitizeFilePath sf + ifM (liftIO $ doesFileExist f <||> doesDirectoryExist f) + ( pure $ url2file url pathdepth pathmax + , pure f + ) showStart "addurl" file - next $ performWeb relaxed s' file + next $ performWeb relaxed urlstring file urlinfo #ifdef WITH_QUVI - badquvi = error $ "quvi does not know how to download url " ++ s' + badquvi = error $ "quvi does not know how to download url " ++ urlstring usequvi = do page <- fromMaybe badquvi - <$> withQuviOptions Quvi.forceQuery [Quvi.quiet, Quvi.httponly] s' + <$> withQuviOptions Quvi.forceQuery [Quvi.quiet, Quvi.httponly] urlstring let link = fromMaybe badquvi $ headMaybe $ Quvi.pageLinks page pathmax <- liftIO $ fileNameLengthLimit "." - let file = choosefile $ truncateFilePath pathmax $ sanitizeFilePath $ - Quvi.pageTitle page ++ "." ++ Quvi.linkSuffix link + let file = flip fromMaybe optfile $ + truncateFilePath pathmax $ sanitizeFilePath $ + Quvi.pageTitle page ++ "." ++ Quvi.linkSuffix link showStart "addurl" file - next $ performQuvi relaxed s' (Quvi.linkUrl link) file + next $ performQuvi relaxed urlstring (Quvi.linkUrl link) file #else usequvi = error "not built with quvi support" #endif -performWeb :: Bool -> URLString -> FilePath -> CommandPerform -performWeb relaxed url file = ifAnnexed file addurl geturl +performWeb :: Bool -> URLString -> FilePath -> Url.UrlInfo -> CommandPerform +performWeb relaxed url file urlinfo = ifAnnexed file addurl geturl where - geturl = next $ isJust <$> addUrlFile relaxed url file - addurl = addUrlChecked relaxed url webUUID checkexistssize - checkexistssize = Url.withUrlOptions . Url.check url . keySize + geturl = next $ isJust <$> addUrlFile relaxed url urlinfo file + addurl = addUrlChecked relaxed url webUUID $ \k -> return $ + (Url.urlExists urlinfo, Url.urlSize urlinfo == keySize k) #ifdef WITH_QUVI performQuvi :: Bool -> URLString -> URLString -> FilePath -> CommandPerform @@ -189,7 +202,8 @@ addUrlFileQuvi relaxed quviurl videourl file = do - retained, because the size of a video stream - might change and we want to be able to download - it later. -} - sizedkey <- addSizeUrlKey videourl key + urlinfo <- Url.withUrlOptions (Url.getUrlInfo videourl) + let sizedkey = addSizeUrlKey urlinfo key prepGetViaTmpChecked sizedkey Nothing $ do tmp <- fromRepo $ gitAnnexTmpObjectLocation key showOutput @@ -225,17 +239,17 @@ addUrlChecked relaxed url u checkexistssize key stop ) -addUrlFile :: Bool -> URLString -> FilePath -> Annex (Maybe Key) -addUrlFile relaxed url file = do +addUrlFile :: Bool -> URLString -> Url.UrlInfo -> FilePath -> Annex (Maybe Key) +addUrlFile relaxed url urlinfo file = do liftIO $ createDirectoryIfMissing True (parentDir file) ifM (Annex.getState Annex.fast <||> pure relaxed) - ( nodownload relaxed url file - , downloadWeb url file + ( nodownload relaxed url urlinfo file + , downloadWeb url urlinfo file ) -downloadWeb :: URLString -> FilePath -> Annex (Maybe Key) -downloadWeb url file = do - dummykey <- addSizeUrlKey url =<< Backend.URL.fromUrl url Nothing +downloadWeb :: URLString -> Url.UrlInfo -> FilePath -> Annex (Maybe Key) +downloadWeb url urlinfo file = do + dummykey <- addSizeUrlKey urlinfo <$> Backend.URL.fromUrl url Nothing let downloader f _ = do showOutput downloadUrl [url] f @@ -272,15 +286,9 @@ downloadWith downloader dummykey u url file = liftIO $ createDirectoryIfMissing True (parentDir tmp) downloader tmp p -{- Hits the url to get the size, if available. - - - - This is needed to avoid exceeding the diskreserve when downloading, - - and so the assistant can display a pretty progress bar. - -} -addSizeUrlKey :: URLString -> Key -> Annex Key -addSizeUrlKey url key = do - size <- snd <$> Url.withUrlOptions (Url.exists url) - return $ key { keySize = size } +{- Adds the url size to the Key. -} +addSizeUrlKey :: Url.UrlInfo -> Key -> Key +addSizeUrlKey urlinfo key = key { keySize = Url.urlSize urlinfo } cleanup :: UUID -> URLString -> FilePath -> Key -> Maybe FilePath -> Annex () cleanup u url file key mtmp = do @@ -295,19 +303,15 @@ cleanup u url file key mtmp = do Annex.Queue.flush maybe noop (moveAnnex key) mtmp -nodownload :: Bool -> URLString -> FilePath -> Annex (Maybe Key) -nodownload relaxed url file = do - (exists, size) <- if relaxed - then pure (True, Nothing) - else Url.withUrlOptions (Url.exists url) - if exists - then do - key <- Backend.URL.fromUrl url size - cleanup webUUID url file key Nothing - return (Just key) - else do - warning $ "unable to access url: " ++ url - return Nothing +nodownload :: Bool -> URLString -> Url.UrlInfo -> FilePath -> Annex (Maybe Key) +nodownload relaxed url urlinfo file + | Url.urlExists urlinfo = do + key <- Backend.URL.fromUrl url (Url.urlSize urlinfo) + cleanup webUUID url file key Nothing + return (Just key) + | otherwise = do + warning $ "unable to access url: " ++ url + return Nothing url2file :: URI -> Maybe Int -> Int -> FilePath url2file url pathdepth pathmax = case pathdepth of diff --git a/Command/ImportFeed.hs b/Command/ImportFeed.hs index b9d78d713..ed035fa85 100644 --- a/Command/ImportFeed.hs +++ b/Command/ImportFeed.hs @@ -144,7 +144,9 @@ performDownload relaxed cache todownload = case location todownload of rundownload url (takeExtension url) $ \f -> do r <- Remote.claimingUrl url if Remote.uuid r == webUUID - then maybeToList <$> addUrlFile relaxed url f + then do + urlinfo <- Url.withUrlOptions (Url.getUrlInfo url) + maybeToList <$> addUrlFile relaxed url urlinfo f else do res <- tryNonAsync $ maybe (error $ "unable to checkUrl of " ++ Remote.name r) diff --git a/Utility/Url.hs b/Utility/Url.hs index b6af123b9..cb4fc7d37 100644 --- a/Utility/Url.hs +++ b/Utility/Url.hs @@ -17,6 +17,8 @@ module Utility.Url ( check, checkBoth, exists, + UrlInfo(..), + getUrlInfo, download, downloadQuiet, parseURIRelaxed @@ -84,18 +86,27 @@ checkBoth url expected_size uo = do v <- check url expected_size uo return (fst v && snd v) check :: URLString -> Maybe Integer -> UrlOptions -> IO (Bool, Bool) -check url expected_size = go <$$> exists url +check url expected_size = go <$$> getUrlInfo url where - go (False, _) = (False, False) - go (True, Nothing) = (True, True) - go (True, s) = case expected_size of + go (UrlInfo False _ _) = (False, False) + go (UrlInfo True Nothing _) = (True, True) + go (UrlInfo True s _) = case expected_size of Just _ -> (True, expected_size == s) Nothing -> (True, True) +exists :: URLString -> UrlOptions -> IO Bool +exists url uo = urlExists <$> getUrlInfo url uo + +data UrlInfo = UrlInfo + { urlExists :: Bool + , urlSize :: Maybe Integer + , urlSuggestedFile :: Maybe FilePath + } + {- Checks that an url exists and could be successfully downloaded, - - also returning its size if available. -} -exists :: URLString -> UrlOptions -> IO (Bool, Maybe Integer) -exists url uo = case parseURIRelaxed url of + - also returning its size and suggested filename if available. -} +getUrlInfo :: URLString -> UrlOptions -> IO UrlInfo +getUrlInfo url uo = case parseURIRelaxed url of Just u -> case parseUrl (show u) of Just req -> existsconduit req `catchNonAsync` const dne -- http-conduit does not support file:, ftp:, etc urls, @@ -107,18 +118,21 @@ exists url uo = case parseURIRelaxed url of case s of Just stat -> do sz <- getFileSize' f stat - return (True, Just sz) + found (Just sz) Nothing Nothing -> dne | Build.SysConfig.curl -> do output <- catchDefaultIO "" $ readProcess "curl" $ toCommand curlparams case lastMaybe (lines output) of - Just ('2':_:_) -> return (True, extractlencurl output) + Just ('2':_:_) -> found + (extractlencurl output) + Nothing _ -> dne | otherwise -> dne Nothing -> dne where - dne = return (False, Nothing) + dne = return $ UrlInfo False Nothing Nothing + found sz f = return $ UrlInfo True sz f curlparams = addUserAgent uo $ [ Param "-s" @@ -133,23 +147,36 @@ exists url uo = case parseURIRelaxed url of _ -> Nothing _ -> Nothing - extractlen resp = readish . B8.toString =<< headMaybe lenheaders - where - lenheaders = map snd $ - filter (\(h, _) -> h == hContentLength) - (responseHeaders resp) - + extractlen = readish . B8.toString <=< firstheader hContentLength + + extractfilename = contentDispositionFilename . B8.toString + <=< firstheader hContentDisposition + + firstheader h = headMaybe . map snd . + filter (\p -> fst p == h) . responseHeaders + existsconduit req = withManager $ \mgr -> do let req' = headRequest (applyRequest uo req) resp <- http req' mgr -- forces processing the response before the -- manager is closed - ret <- if responseStatus resp == ok200 - then return (True, extractlen resp) - else liftIO dne + ret <- liftIO $ if responseStatus resp == ok200 + then found + (extractlen resp) + (extractfilename resp) + else dne liftIO $ closeManager mgr return ret +-- Parse eg: attachment; filename="fname.ext" +-- per RFC 2616 +contentDispositionFilename :: String -> Maybe FilePath +contentDispositionFilename s + | "attachment; filename=\"" `isPrefixOf` s && "\"" `isSuffixOf` s = + Just $ reverse $ drop 1 $ reverse $ + drop 1 $ dropWhile (/= '"') s + | otherwise = Nothing + #if MIN_VERSION_http_conduit(2,0,0) headRequest :: Request -> Request #else @@ -229,6 +256,9 @@ parseURIRelaxed = parseURI . escapeURIString isAllowedInURI hAcceptEncoding :: CI.CI B.ByteString hAcceptEncoding = "Accept-Encoding" +hContentDisposition :: CI.CI B.ByteString +hContentDisposition = "Content-Disposition" + #if ! MIN_VERSION_http_types(0,7,0) hContentLength :: CI.CI B.ByteString hContentLength = "Content-Length" diff --git a/debian/changelog b/debian/changelog index eb48707f2..fc23d900d 100644 --- a/debian/changelog +++ b/debian/changelog @@ -16,6 +16,9 @@ git-annex (5.20150114) UNRELEASED; urgency=medium * Avoid using fileSize which maxes out at just 2 gb on Windows. Instead, use hFileSize, which doesn't have a bounded size. Fixes support for files > 2 gb on Windows. + * addurl: When a Content-Disposition header suggests a filename to use, + addurl will consider using it, if it's reasonable and doesn't conflict + with an existing file. (--file overrides this) -- Joey Hess <id@joeyh.name> Tue, 13 Jan 2015 17:03:39 -0400 diff --git a/doc/git-annex.mdwn b/doc/git-annex.mdwn index c92208e5c..c4586ba1d 100644 --- a/doc/git-annex.mdwn +++ b/doc/git-annex.mdwn @@ -218,8 +218,10 @@ subdirectories). is there at a future point, specify `--relaxed`. (Implies `--fast`.) Normally the filename is based on the full url, so will look like - "www.example.com_dir_subdir_bigfile". For a shorter filename, specify - `--pathdepth=N`. For example, `--pathdepth=1` will use "dir/subdir/bigfile", + "www.example.com_dir_subdir_bigfile". In some cases, addurl is able to + come up with a better filename based on other information. Or, for a + shorter filename, specify `--pathdepth=N`. For example, + `--pathdepth=1` will use "dir/subdir/bigfile", while `--pathdepth=3` will use "bigfile". It can also be negative; `--pathdepth=-2` will use the last two parts of the url. |