diff options
author | Joey Hess <joey@kitenet.net> | 2014-08-02 15:51:58 -0400 |
---|---|---|
committer | Joey Hess <joey@kitenet.net> | 2014-08-02 15:51:58 -0400 |
commit | 81b339034e8871f211ede2cf3bdb7319ad16caed (patch) | |
tree | 8fe89688fc4c224958bb822c9dcbd2cf434c8a4b | |
parent | 0ddf8152ce1353bbbcd7c87c3f67063b4aed892b (diff) |
S3: support chunking
The assistant defaults to a 1MiB chunk size for new S3 special remotes,
which works around a couple of bugs:
http://git-annex.branchable.com/bugs/S3_memory_leaks/
http://git-annex.branchable.com/bugs/S3_upload_not_using_multipart/
-rw-r--r-- | Remote/S3.hs | 88 | ||||
-rw-r--r-- | Types/Key.hs | 4 | ||||
-rw-r--r-- | debian/changelog | 2 | ||||
-rw-r--r-- | doc/special_remotes/S3.mdwn | 3 | ||||
-rw-r--r-- | doc/tips/using_Amazon_S3.mdwn | 2 |
5 files changed, 38 insertions, 61 deletions
diff --git a/Remote/S3.hs b/Remote/S3.hs index c30d07b8a..ed9122cab 100644 --- a/Remote/S3.hs +++ b/Remote/S3.hs @@ -25,12 +25,10 @@ import qualified Git import Config import Config.Cost import Remote.Helper.Special -import Remote.Helper.Encryptable +import Remote.Helper.ChunkedEncryptable import qualified Remote.Helper.AWS as AWS -import Crypto import Creds import Utility.Metered -import Annex.Content import Annex.UUID import Logs.Web @@ -47,17 +45,17 @@ remote = RemoteType { gen :: Git.Repo -> UUID -> RemoteConfig -> RemoteGitConfig -> Annex (Maybe Remote) gen r u c gc = new <$> remoteCost gc expensiveRemoteCost where - new cst = Just $ encryptableRemote c - (storeEncrypted this) - (retrieveEncrypted this) + new cst = Just $ chunkedEncryptableRemote c + (prepareStore this) + (prepareRetrieve this) this where this = Remote { uuid = u, cost = cst, name = Git.repoDescribe r, - storeKey = store this, - retrieveKeyFile = retrieve this, + storeKey = storeKeyDummy, + retrieveKeyFile = retreiveKeyFileDummy, retrieveKeyFileCheap = retrieveCheap this, removeKey = remove this c, hasKey = checkPresent this, @@ -123,67 +121,39 @@ s3Setup' u c = if isIA c then archiveorg else defaulthost writeUUIDFile archiveconfig u use archiveconfig -store :: Remote -> Key -> AssociatedFile -> MeterUpdate -> Annex Bool -store r k _f p = s3Action r False $ \(conn, bucket) -> - sendAnnex k (void $ remove' r k) $ \src -> do - ok <- s3Bool =<< storeHelper (conn, bucket) r k p src +prepareStore :: Remote -> Preparer Storer +prepareStore r = resourcePrepare (const $ s3Action r False) $ \(conn, bucket) -> + fileStorer $ \k src p -> do + ok <- s3Bool =<< liftIO (store (conn, bucket) r k p src) -- Store public URL to item in Internet Archive. 
- when (ok && isIA (config r)) $ + when (ok && isIA (config r) && not (isChunkKey k)) $ setUrlPresent k (iaKeyUrl r k) return ok -storeEncrypted :: Remote -> (Cipher, Key) -> Key -> MeterUpdate -> Annex Bool -storeEncrypted r (cipher, enck) k p = s3Action r False $ \(conn, bucket) -> - -- To get file size of the encrypted content, have to use a temp file. - -- (An alternative would be chunking to to a constant size.) - withTmp enck $ \tmp -> sendAnnex k (void $ remove' r enck) $ \src -> do - liftIO $ encrypt (getGpgEncParams r) cipher (feedFile src) $ - readBytes $ L.writeFile tmp - s3Bool =<< storeHelper (conn, bucket) r enck p tmp - -storeHelper :: (AWSConnection, Bucket) -> Remote -> Key -> MeterUpdate -> FilePath -> Annex (AWSResult ()) -storeHelper (conn, bucket) r k p file = do - size <- maybe getsize (return . fromIntegral) $ keySize k - meteredBytes (Just p) size $ \meterupdate -> - liftIO $ withMeteredFile file meterupdate $ \content -> do - -- size is provided to S3 so the whole content - -- does not need to be buffered to calculate it - let object = S3Object - bucket (bucketFile r k) "" - (("Content-Length", show size) : getXheaders (config r)) - content - sendObject conn $ - setStorageClass (getStorageClass $ config r) object - where - getsize = liftIO $ fromIntegral . fileSize <$> getFileStatus file - -retrieve :: Remote -> Key -> AssociatedFile -> FilePath -> MeterUpdate -> Annex Bool -retrieve r k _f d p = s3Action r False $ \(conn, bucket) -> - metered (Just p) k $ \meterupdate -> do - res <- liftIO $ getObject conn $ bucketKey r bucket k - case res of - Right o -> do - liftIO $ meteredWriteFile meterupdate d $ - obj_data o - return True - Left e -> s3Warning e +store :: (AWSConnection, Bucket) -> Remote -> Key -> MeterUpdate -> FilePath -> IO (AWSResult ()) +store (conn, bucket) r k p file = do + size <- (fromIntegral . 
fileSize <$> getFileStatus file) :: IO Integer + withMeteredFile file p $ \content -> do + -- size is provided to S3 so the whole content + -- does not need to be buffered to calculate it + let object = S3Object + bucket (bucketFile r k) "" + (("Content-Length", show size) : getXheaders (config r)) + content + sendObject conn $ + setStorageClass (getStorageClass $ config r) object + +prepareRetrieve :: Remote -> Preparer Retriever +prepareRetrieve r = resourcePrepare (const $ s3Action r False) $ \(conn, bucket) -> + byteRetriever $ \k -> + liftIO (getObject conn $ bucketKey r bucket k) + >>= either s3Error (return . obj_data) retrieveCheap :: Remote -> Key -> FilePath -> Annex Bool retrieveCheap _ _ _ = return False -retrieveEncrypted :: Remote -> (Cipher, Key) -> Key -> FilePath -> MeterUpdate -> Annex Bool -retrieveEncrypted r (cipher, enck) k d p = s3Action r False $ \(conn, bucket) -> - metered (Just p) k $ \meterupdate -> do - res <- liftIO $ getObject conn $ bucketKey r bucket enck - case res of - Right o -> liftIO $ decrypt cipher (\h -> meteredWrite meterupdate h $ obj_data o) $ - readBytes $ \content -> do - L.writeFile d content - return True - Left e -> s3Warning e - {- Internet Archive doesn't easily allow removing content. - While it may remove the file, there are generally other files - derived from it that it does not remove. 
-} diff --git a/Types/Key.hs b/Types/Key.hs index 154e813ff..5bb41e15f 100644 --- a/Types/Key.hs +++ b/Types/Key.hs @@ -15,6 +15,7 @@ module Types.Key ( file2key, nonChunkKey, chunkKeyOffset, + isChunkKey, prop_idempotent_key_encode, prop_idempotent_key_decode @@ -62,6 +63,9 @@ chunkKeyOffset k = (*) <$> keyChunkSize k <*> (pred <$> keyChunkNum k) +isChunkKey :: Key -> Bool +isChunkKey k = isJust (keyChunkSize k) && isJust (keyChunkNum k) + fieldSep :: Char fieldSep = '-' diff --git a/debian/changelog b/debian/changelog index f8b700ae7..d8c23af9b 100644 --- a/debian/changelog +++ b/debian/changelog @@ -1,7 +1,7 @@ git-annex (5.20140718) UNRELEASED; urgency=medium * New chunk= option to chunk files stored in special remotes. - Currently supported by: directory, and all external special remotes. + Currently supported by: directory, S3, and all external special remotes. * Partially transferred files are automatically resumed when using chunked remotes! * The old chunksize= option is deprecated. Do not use for new remotes. diff --git a/doc/special_remotes/S3.mdwn b/doc/special_remotes/S3.mdwn index 5291a4eb6..fe46948b3 100644 --- a/doc/special_remotes/S3.mdwn +++ b/doc/special_remotes/S3.mdwn @@ -18,6 +18,9 @@ the S3 remote. * `encryption` - One of "none", "hybrid", "shared", or "pubkey". See [[encryption]]. +* `chunk` - Enables [[chunking]] when storing large files. + `chunk=1MiB` is a good starting point for chunking. + * `keyid` - Specifies the gpg key to use for [[encryption]]. * `embedcreds` - Optional. Set to "yes" embed the login credentials inside diff --git a/doc/tips/using_Amazon_S3.mdwn b/doc/tips/using_Amazon_S3.mdwn index 0c68c7387..ede3f952f 100644 --- a/doc/tips/using_Amazon_S3.mdwn +++ b/doc/tips/using_Amazon_S3.mdwn @@ -14,7 +14,7 @@ like "2512E3C7" Next, create the S3 remote, and describe it. 
- # git annex initremote cloud type=S3 keyid=2512E3C7 + # git annex initremote cloud type=S3 chunk=1MiB keyid=2512E3C7 initremote cloud (encryption setup with gpg key C910D9222512E3C7) (checking bucket) (creating bucket in US) (gpg) ok # git annex describe cloud "at Amazon's US datacenter" describe cloud ok |