From 563036b66fcc78ffe1e18086289b89b457221cac Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Fri, 8 Aug 2014 18:54:04 -0400 Subject: WIP converting S3 special remote from hS3 to aws library Currently, initremote works, but not the other operations. They should be fairly easy to add from this base. Also, https://github.com/aristidb/aws/issues/119 blocks internet archive support. Note that since http-conduit is used, this also adds https support to S3. Although git-annex encrypts everything anyway, so that may not be extremely useful. It is not enabled by default, because existing S3 special remotes have port=80 in their config. Setting port=443 will enable it. This commit was sponsored by Daniel Brockman. --- debian/changelog | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'debian') diff --git a/debian/changelog b/debian/changelog index 3a8ab302e..cfc47906d 100644 --- a/debian/changelog +++ b/debian/changelog @@ -21,6 +21,10 @@ git-annex (5.20140718) UNRELEASED; urgency=medium * WebDAV: Avoid buffering whole file in memory when uploading and downloading. * WebDAV: Dropped support for DAV before 1.0. + * S3: Switched to using the haskell aws library. + * S3: Now supports https. To enable this, configure a S3 special remote to + use port=443. However, with encrypted special remotes, this does not + add any security. * testremote: New command to test uploads/downloads to a remote. 
* Dropping an object from a bup special remote now deletes the git branch for the object, although of course the object's content cannot be deleted -- cgit v1.2.3 From 620d1e4273310373f38866259f9f6b3bc6995a5e Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Sat, 9 Aug 2014 15:31:12 -0400 Subject: deps --- debian/control | 6 +++++- git-annex.cabal | 6 +++--- 2 files changed, 8 insertions(+), 4 deletions(-) (limited to 'debian') diff --git a/debian/control b/debian/control index 522b7c5cc..7defd8a7c 100644 --- a/debian/control +++ b/debian/control @@ -13,7 +13,11 @@ Build-Depends: libghc-cryptohash-dev, libghc-dataenc-dev, libghc-utf8-string-dev, - libghc-hs3-dev (>= 0.5.6), + libghc-aws-dev, + libghc-conduit-dev, + libghc-resourcet-dev, + libghc-http-conduit-dev, + libghc-http-client-dev, libghc-dav-dev (>= 1.0) [amd64 i386 kfreebsd-amd64 kfreebsd-i386 powerpc], libghc-quickcheck2-dev, libghc-monad-control-dev (>= 0.3), diff --git a/git-annex.cabal b/git-annex.cabal index 70bd9c88b..b05863b79 100644 --- a/git-annex.cabal +++ b/git-annex.cabal @@ -137,12 +137,12 @@ Executable git-annex CPP-Options: -DWITH_CRYPTOHASH if flag(S3) - Build-Depends: hS3, http-conduit, http-client, resourcet, http-types, aws + Build-Depends: aws, conduit, resourcet, + http-conduit, http-client, http-types CPP-Options: -DWITH_S3 if flag(WebDAV) - Build-Depends: DAV (>= 1.0), - http-client, http-types + Build-Depends: DAV (>= 1.0), http-client, http-types CPP-Options: -DWITH_WEBDAV if flag(Assistant) && ! os(solaris) -- cgit v1.2.3 From 1ebf529787d1766a5b6a25c6c2b58f126053816f Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Sat, 9 Aug 2014 15:58:01 -0400 Subject: S3: finish converting to aws library Implemented the Retriever. Unfortunately, it is a fileRetriever and not a byteRetriever. 
It should be possible to convert this to a byteRetriever, but I got stuck: The conduit sink needs to process individual chunks, but a byteRetriever needs to pass a single L.ByteString to its callback for processing. I looked into using unsafeInterleaveIO to build up the bytestring lazily, but the sink is already operating under conduit's inversion of control, and does not run directly in IO anyway. On the plus side, no more memory leak.. --- Remote/S3.hs | 41 +++++++++++++++++++++++++++++++---------- debian/changelog | 4 ++-- doc/bugs/S3_memory_leaks.mdwn | 2 ++ 3 files changed, 35 insertions(+), 12 deletions(-) (limited to 'debian') diff --git a/Remote/S3.hs b/Remote/S3.hs index 885396f98..e06a3d6c8 100644 --- a/Remote/S3.hs +++ b/Remote/S3.hs @@ -15,6 +15,7 @@ import qualified Aws.S3 as S3 import qualified Data.Text as T import qualified Data.Text.Encoding as T import qualified Data.ByteString.Lazy as L +import qualified Data.ByteString as S import qualified Data.Map as M import Data.Char import Network.Socket (HostName) @@ -23,6 +24,7 @@ import Network.HTTP.Client (defaultManagerSettings, managerResponseTimeout, resp import Network.HTTP.Types import Control.Monad.Trans.Resource import Control.Monad.Catch +import Data.Conduit import Common.Annex import Types.Remote @@ -36,6 +38,7 @@ import qualified Remote.Helper.AWS as AWS import Creds import Annex.UUID import Logs.Web +import Utility.Metered type BucketName = String @@ -145,14 +148,27 @@ store r h = fileStorer $ \k f p -> do return True +{- Implemented as a fileRetriever, that uses conduit to stream the chunks + - out to the file. Would be better to implement a byteRetriever, but + - that is difficult. -} retrieve :: S3Handle -> Retriever -retrieve _h = error "TODO" - {- - resourcePrepare (const $ s3Action r False) $ \(conn, bucket) -> - byteRetriever $ \k sink -> - liftIO (getObject conn $ bucketKey r bucket k) - >>= either s3Error (sink . 
obj_data) - -} +retrieve h = fileRetriever $ \f k p -> liftIO $ runResourceT $ do + (fr, fh) <- allocate (openFile f WriteMode) hClose + let req = S3.getObject (hBucket h) (hBucketObject h k) + S3.GetObjectResponse { S3.gorResponse = rsp } <- sendS3Handle' h req + responseBody rsp $$+- sinkprogressfile fh p zeroBytesProcessed + release fr + where + sinkprogressfile fh meterupdate sofar = do + mbs <- await + case mbs of + Nothing -> return () + Just bs -> do + let sofar' = sofar -- addBytesProcessed $ S.length bs + liftIO $ do + void $ meterupdate sofar' + S.hPut fh bs + sinkprogressfile fh meterupdate sofar' retrieveCheap :: Key -> FilePath -> Annex Bool retrieveCheap _ _ = return False @@ -289,9 +305,14 @@ sendS3Handle => S3Handle -> req -> Annex res -sendS3Handle h = liftIO . runResourceT . call - where - call = AWS.pureAws (hawscfg h) (hs3cfg h) (hmanager h) +sendS3Handle h r = liftIO $ runResourceT $ sendS3Handle' h r + +sendS3Handle' + :: (AWS.Transaction r a, AWS.ServiceConfiguration r ~ S3.S3Configuration) + => S3Handle + -> r + -> ResourceT IO a +sendS3Handle' h = AWS.pureAws (hawscfg h) (hs3cfg h) (hmanager h) withS3Handle :: RemoteConfig -> UUID -> (S3Handle -> Annex a) -> Annex a withS3Handle c u a = do diff --git a/debian/changelog b/debian/changelog index fb30e7736..48d4d9144 100644 --- a/debian/changelog +++ b/debian/changelog @@ -16,9 +16,9 @@ git-annex (5.20140718) UNRELEASED; urgency=medium were incompletely repaired before. * Fix cost calculation for non-encrypted remotes. * Display exception message when a transfer fails due to an exception. - * WebDAV: Sped up by avoiding making multiple http connections + * WebDAV, S3: Sped up by avoiding making multiple http connections when storing a file. - * WebDAV: Avoid buffering whole file in memory when uploading and + * WebDAV, S3: Avoid buffering whole file in memory when uploading and downloading. * WebDAV: Dropped support for DAV before 1.0. * S3: Switched to using the haskell aws library. 
diff --git a/doc/bugs/S3_memory_leaks.mdwn b/doc/bugs/S3_memory_leaks.mdwn index 88dd6eaa6..7dc1e5757 100644 --- a/doc/bugs/S3_memory_leaks.mdwn +++ b/doc/bugs/S3_memory_leaks.mdwn @@ -7,6 +7,8 @@ Sending a file to S3 causes a slow memory increase toward the file size. Copying the file back from S3 causes a slow memory increase toward the file size. +> [[fixed|done]] too! --[[Joey]] + The author of hS3 is aware of the problem, and working on it. I think I have identified the root cause of the buffering; it's done by hS3 so it can resend the data if S3 sends it a 307 redirect. --[[Joey]] -- cgit v1.2.3 From e9128138ef1e19a341d7edfda4db2d6b1f8f6b0a Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Fri, 15 Aug 2014 17:17:19 -0400 Subject: Switched from the old haskell HTTP library to http-conduit. The hoary old HTTP library was only used when checking if an url exists, when curl was not available. It had many problems, including not supporting https at all. Now, this is done using http-conduit for all urls that it supports. Falls back to curl for any url that http-conduit doesn't like (probably ftp etc, but could also be an url that its parser chokes on for whatever reason). This adds a new dependency on http-conduit, but webdav support already indirectly depended on that, and the s3-aws branch also uses it. --- Utility/Url.hs | 125 +++++++++++++++++++++++-------------------------------- debian/changelog | 1 + debian/control | 2 +- git-annex.cabal | 9 ++-- 4 files changed, 57 insertions(+), 80 deletions(-) (limited to 'debian') diff --git a/Utility/Url.hs b/Utility/Url.hs index 4137a5d8b..ebcae55ca 100644 --- a/Utility/Url.hs +++ b/Utility/Url.hs @@ -1,6 +1,6 @@ {- Url downloading. 
- - - Copyright 2011,2013 Joey Hess + - Copyright 2011-2014 Joey Hess - - License: BSD-2-clause -} @@ -21,10 +21,11 @@ module Utility.Url ( import Common import Network.URI -import qualified Network.Browser as Browser -import Network.HTTP -import Data.Either +import Network.HTTP.Conduit +import Network.HTTP.Types import Data.Default +import qualified Data.CaseInsensitive as CI +import qualified Data.ByteString.UTF8 as B8 import qualified Build.SysConfig @@ -60,33 +61,26 @@ check url expected_size = go <$$> exists url Nothing -> (True, True) {- Checks that an url exists and could be successfully downloaded, - - also returning its size if available. - - - - For a file: url, check it directly. - - - - Uses curl otherwise, when available, since curl handles https better - - than does Haskell's Network.Browser. - -} + - also returning its size if available. -} exists :: URLString -> UrlOptions -> IO (Bool, Maybe Integer) exists url uo = case parseURIRelaxed url of - Just u - | uriScheme u == "file:" -> do - s <- catchMaybeIO $ getFileStatus (unEscapeString $ uriPath u) - case s of - Just stat -> return (True, Just $ fromIntegral $ fileSize stat) - Nothing -> dne - | otherwise -> if Build.SysConfig.curl - then do + Just u -> case parseUrl (show u) of + Just req -> existsconduit req `catchNonAsync` const dne + -- http-conduit does not support file:, ftp:, etc urls, + -- so fall back to reading files and using curl. 
+ Nothing + | uriScheme u == "file:" -> do + s <- catchMaybeIO $ getFileStatus (unEscapeString $ uriPath u) + case s of + Just stat -> return (True, Just $ fromIntegral $ fileSize stat) + Nothing -> dne + | Build.SysConfig.curl -> do output <- catchDefaultIO "" $ readProcess "curl" $ toCommand curlparams case lastMaybe (lines output) of - Just ('2':_:_) -> return (True, extractsize output) + Just ('2':_:_) -> return (True, extractlencurl output) _ -> dne - else do - r <- request u HEAD uo - case rspCode r of - (2,_,_) -> return (True, size r) - _ -> return (False, Nothing) + | otherwise -> dne Nothing -> dne where dne = return (False, Nothing) @@ -98,13 +92,28 @@ exists url uo = case parseURIRelaxed url of , Param "-w", Param "%{http_code}" ] ++ concatMap (\h -> [Param "-H", Param h]) (reqHeaders uo) ++ (reqParams uo) - extractsize s = case lastMaybe $ filter ("Content-Length:" `isPrefixOf`) (lines s) of + extractlencurl s = case lastMaybe $ filter ("Content-Length:" `isPrefixOf`) (lines s) of Just l -> case lastMaybe $ words l of Just sz -> readish sz _ -> Nothing _ -> Nothing - - size = liftM Prelude.read . lookupHeader HdrContentLength . rspHeaders + + extractlen resp = readish . 
B8.toString =<< headMaybe lenheaders + where + lenheaders = map snd $ + filter (\(h, _) -> h == hContentLength) + (responseHeaders resp) + + existsconduit req = withManager $ \mgr -> do + let req' = (addUrlOptions uo req) { method = methodHead } + resp <- http req' mgr + -- forces processing the response before the + -- manager is closed + ret <- if responseStatus resp == ok200 + then return (True, extractlen resp) + else liftIO dne + liftIO $ closeManager mgr + return ret -- works for both wget and curl commands addUserAgent :: UrlOptions -> [CommandParam] -> [CommandParam] @@ -112,6 +121,20 @@ addUserAgent uo ps = case userAgent uo of Nothing -> ps Just ua -> ps ++ [Param "--user-agent", Param ua] +addUrlOptions :: UrlOptions -> Request -> Request +addUrlOptions uo r = r { requestHeaders = requestHeaders r ++ uaheader ++ otherheaders} + where + uaheader = case userAgent uo of + Nothing -> [] + Just ua -> [(hUserAgent, B8.fromString ua)] + otherheaders = map toheader (reqHeaders uo) + toheader s = + let (h, v) = separate (== ':') s + h' = CI.mk (B8.fromString h) + in case v of + (' ':v') -> (h', B8.fromString v') + _ -> (h', B8.fromString v) + {- Used to download large files, such as the contents of keys. - - Uses wget or curl program for its progress bar. (Wget has a better one, @@ -161,52 +184,6 @@ download' quiet url file uo = | quiet = [Param s] | otherwise = [] -{- Uses Network.Browser to make a http request of an url. - - For example, HEAD can be used to check if the url exists, - - or GET used to get the url content (best for small urls). - - - - This does its own redirect following because Browser's is buggy for HEAD - - requests. - - - - Unfortunately, does not handle https, so should only be used - - when curl is not available. 
- -} -request :: URI -> RequestMethod -> UrlOptions -> IO (Response String) -request url requesttype uo = go 5 url - where - go :: Int -> URI -> IO (Response String) - go 0 _ = error "Too many redirects " - go n u = do - rsp <- Browser.browse $ do - maybe noop Browser.setUserAgent (userAgent uo) - Browser.setErrHandler ignore - Browser.setOutHandler ignore - Browser.setAllowRedirects False - let req = mkRequest requesttype u :: Request_String - snd <$> Browser.request (addheaders req) - case rspCode rsp of - (3,0,x) | x /= 5 -> redir (n - 1) u rsp - _ -> return rsp - addheaders req = setHeaders req (rqHeaders req ++ userheaders) - userheaders = rights $ map parseHeader (reqHeaders uo) - ignore = const noop - redir n u rsp = case retrieveHeaders HdrLocation rsp of - [] -> return rsp - (Header _ newu:_) -> - case parseURIReference newu of - Nothing -> return rsp - Just newURI -> go n $ -#if defined VERSION_network -#if ! MIN_VERSION_network(2,4,0) -#define WITH_OLD_URI -#endif -#endif -#ifdef WITH_OLD_URI - fromMaybe newURI (newURI `relativeTo` u) -#else - newURI `relativeTo` u -#endif - {- Allows for spaces and other stuff in urls, properly escaping them. -} parseURIRelaxed :: URLString -> Maybe URI parseURIRelaxed = parseURI . escapeURIString isAllowedInURI diff --git a/debian/changelog b/debian/changelog index c55fbabd3..0d884cd81 100644 --- a/debian/changelog +++ b/debian/changelog @@ -39,6 +39,7 @@ git-annex (5.20140718) UNRELEASED; urgency=medium * git-annex-shell sendkey: Don't fail if a remote asks for a key to be sent that already has a transfer lock file indicating it's being sent to that remote. The remote may have moved between networks, or reconnected. + * Switched from the old haskell HTTP library to http-conduit. 
-- Joey Hess Mon, 21 Jul 2014 14:41:26 -0400 diff --git a/debian/control b/debian/control index 522b7c5cc..1106bc89d 100644 --- a/debian/control +++ b/debian/control @@ -46,6 +46,7 @@ Build-Depends: libghc-dns-dev, libghc-case-insensitive-dev, libghc-http-types-dev, + libghc-http-conduit-dev, libghc-blaze-builder-dev, libghc-crypto-api-dev, libghc-network-multicast-dev, @@ -55,7 +56,6 @@ Build-Depends: libghc-gnutls-dev (>= 0.1.4), libghc-xml-types-dev, libghc-async-dev, - libghc-http-dev, libghc-feed-dev (>= 0.3.9.2), libghc-regex-tdfa-dev [!mipsel !s390], libghc-regex-compat-dev [mipsel s390], diff --git a/git-annex.cabal b/git-annex.cabal index 097fee4cb..58aac39b3 100644 --- a/git-annex.cabal +++ b/git-annex.cabal @@ -96,11 +96,11 @@ Executable git-annex Main-Is: git-annex.hs Build-Depends: MissingH, hslogger, directory, filepath, containers, utf8-string, network (>= 2.0), mtl (>= 2), - bytestring, old-locale, time, HTTP, dataenc, SHA, process, json, + bytestring, old-locale, time, dataenc, SHA, process, json, base (>= 4.5 && < 4.9), monad-control, exceptions (>= 0.5), transformers, IfElse, text, QuickCheck >= 2.1, bloomfilter, edit-distance, process, SafeSemaphore, uuid, random, dlist, unix-compat, async, stm (>= 2.3), - data-default, case-insensitive + data-default, case-insensitive, http-conduit, http-types CC-Options: -Wall GHC-Options: -Wall Extensions: PackageImports @@ -141,8 +141,7 @@ Executable git-annex CPP-Options: -DWITH_S3 if flag(WebDAV) - Build-Depends: DAV (>= 1.0), - http-client, http-types + Build-Depends: DAV (>= 1.0), http-client CPP-Options: -DWITH_WEBDAV if flag(Assistant) && ! 
os(solaris) @@ -188,7 +187,7 @@ Executable git-annex if flag(Webapp) Build-Depends: yesod, yesod-default, yesod-static, yesod-form, yesod-core, - http-types, wai, wai-extra, warp, warp-tls, + wai, wai-extra, warp, warp-tls, blaze-builder, crypto-api, hamlet, clientsession, template-haskell, data-default, aeson, path-pieces, shakespeare -- cgit v1.2.3 From b9343ca1d3e0c7e136f7894cd159edb1d0730a0a Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Mon, 18 Aug 2014 15:26:18 -0400 Subject: update aws version requirements --- debian/control | 2 +- git-annex.cabal | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'debian') diff --git a/debian/control b/debian/control index 89141a7e3..2b430b4dc 100644 --- a/debian/control +++ b/debian/control @@ -13,7 +13,7 @@ Build-Depends: libghc-cryptohash-dev, libghc-dataenc-dev, libghc-utf8-string-dev, - libghc-aws-dev, + libghc-aws-dev (>= 0.10.2), libghc-conduit-dev, libghc-resourcet-dev, libghc-dav-dev (>= 1.0) [amd64 i386 kfreebsd-amd64 kfreebsd-i386 powerpc], diff --git a/git-annex.cabal b/git-annex.cabal index b0835f50a..6e57dd1f4 100644 --- a/git-annex.cabal +++ b/git-annex.cabal @@ -137,7 +137,7 @@ Executable git-annex CPP-Options: -DWITH_CRYPTOHASH if flag(S3) - Build-Depends: aws, conduit, resourcet + Build-Depends: aws (>= 0.10.2), conduit, resourcet CPP-Options: -DWITH_S3 if flag(WebDAV) -- cgit v1.2.3 From 737267c0a0c21823ec65d786e59589900859320d Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Thu, 23 Oct 2014 16:39:55 -0400 Subject: add cabal flag for use with debian's older version of aws, which is now patched with the necessary stuff --- debian/control | 2 +- debian/rules | 4 ++++ git-annex.cabal | 10 +++++++++- 3 files changed, 14 insertions(+), 2 deletions(-) (limited to 'debian') diff --git a/debian/control b/debian/control index 659ec05fa..55fc17448 100644 --- a/debian/control +++ b/debian/control @@ -14,7 +14,7 @@ Build-Depends: libghc-cryptohash-dev, libghc-dataenc-dev, libghc-utf8-string-dev, - 
libghc-aws-dev (>= 0.10.2), + libghc-aws-dev (>= 0.9.2), libghc-conduit-dev, libghc-resourcet-dev, libghc-dav-dev (>= 1.0) [amd64 i386 kfreebsd-amd64 kfreebsd-i386 powerpc], diff --git a/debian/rules b/debian/rules index 7c8f8a560..22be48195 100755 --- a/debian/rules +++ b/debian/rules @@ -8,6 +8,10 @@ export RELEASE_BUILD=1 %: dh $@ +# Debian currently has a patched aws 0.9.2, rather than the newer 0.10.2. +override_dh_auto_configure: + debian/cabal-wrapper configure -fPatchedAWS + # Not intended for use by anyone except the author. announcedir: @echo ${HOME}/src/git-annex/doc/news diff --git a/git-annex.cabal b/git-annex.cabal index 94b1ed3be..863f35cf8 100644 --- a/git-annex.cabal +++ b/git-annex.cabal @@ -34,6 +34,10 @@ Description: Flag S3 Description: Enable S3 support +Flag PatchedAWS + Description: Building on system, like Debian, with old AWS patched to support git-annex + Default: False + Flag WebDAV Description: Enable WebDAV support @@ -151,7 +155,11 @@ Executable git-annex CPP-Options: -DWITH_CRYPTOHASH if flag(S3) - Build-Depends: aws (>= 0.10.2), conduit, resourcet + Build-Depends: conduit, resourcet + if flag(PatchedAWS) + Build-Depends: aws (>= 0.9.2) + else + Build-Depends: aws (>= 0.10.2) CPP-Options: -DWITH_S3 if flag(WebDAV) -- cgit v1.2.3 From 718932c895b38228ab8aed4477d7ce8bba205e5a Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Thu, 6 Nov 2014 14:38:51 -0400 Subject: add changelog entries for when this branch gets merged --- debian/changelog | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'debian') diff --git a/debian/changelog b/debian/changelog index efaba2d95..b57ca6678 100644 --- a/debian/changelog +++ b/debian/changelog @@ -1,3 +1,10 @@ + * S3: Switched to using the haskell aws library. + * S3: No longer buffers entire files in memory when uploading without + chunking. 
+ * S3: When built with a new enough version of the haskell aws library, + supports doing multipart uploads, in order to store extremely large + files in S3 when not using chunking. + git-annex (5.20141025) UNRELEASED; urgency=medium * Windows: Fix crash when user.name is not set in git config. -- cgit v1.2.3