author    Joey Hess <joey@kitenet.net>  2011-04-21 10:31:54 -0400
committer Joey Hess <joey@kitenet.net>  2011-04-21 10:33:17 -0400
commit    6fcd3e1ef77d3dc99da30cdf2b82489d4bd3d7df
tree      5508b32f00b72af6eaee42ba132758af5d076651
parent    d8329731c617b278967304389c300f2c832db28d
fix S3 upload buffering problem
Provide file size to new version of hS3.
-rw-r--r--  Remote/S3real.hs               | 45
-rw-r--r--  debian/changelog               |  9
-rw-r--r--  debian/control                 |  1
-rw-r--r--  doc/bugs/S3_memory_leaks.mdwn  | 18
4 files changed, 43 insertions, 30 deletions
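A minimal standalone sketch of the idea described in the commit message, before the file-by-file diff: take the file's size from the filesystem rather than by forcing the lazy ByteString, and hand it to hS3 (0.5.6 or newer) as a Content-Length header so the library need not buffer the content to measure it. The bucket, the credentials, and the name uploadUnbuffered are placeholders, and amazonS3Connection is assumed to be hS3's connection constructor; this is a sketch of the approach, not code from the patch.

import Network.AWS.AWSConnection
import Network.AWS.S3Object
import Network.AWS.AWSResult
import qualified Data.ByteString.Lazy.Char8 as L
import System.Posix.Files (getFileStatus, fileSize)

-- Upload a local file without buffering it just to learn its size:
-- the size comes from the file's stat, and travels as Content-Length.
uploadUnbuffered :: String -> FilePath -> IO (AWSResult ())
uploadUnbuffered bucket file = do
        let conn = amazonS3Connection "ACCESSKEYID" "SECRETACCESSKEY"
        stat <- getFileStatus file
        let size = fileSize stat
        content <- L.readFile file
        let object = S3Object bucket file ""
                [("Content-Length", show size)] content
        sendObject conn object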
diff --git a/Remote/S3real.hs b/Remote/S3real.hs
index 5095b4039..2b0234dc2 100644
--- a/Remote/S3real.hs
+++ b/Remote/S3real.hs
@@ -10,7 +10,7 @@ module Remote.S3 (remote) where
import Control.Exception.Extensible (IOException)
import Network.AWS.AWSConnection
import Network.AWS.S3Object
-import Network.AWS.S3Bucket
+import Network.AWS.S3Bucket hiding (size)
import Network.AWS.AWSResult
import qualified Data.ByteString.Lazy.Char8 as L
import qualified Data.Map as M
@@ -18,6 +18,8 @@ import Data.Maybe
import Control.Monad (when)
import Control.Monad.State (liftIO)
import System.Environment
+import System.Posix.Files
+import System.Directory
import RemoteClass
import Types
@@ -30,6 +32,7 @@ import Config
import Remote.Special
import Remote.Encryptable
import Crypto
+import Key
remote :: RemoteType Annex
remote = RemoteType {
@@ -100,21 +103,35 @@ s3Setup u c = do
store :: Remote Annex -> Key -> Annex Bool
store r k = s3Action r False $ \(conn, bucket) -> do
g <- Annex.gitRepo
- content <- liftIO $ L.readFile $ gitAnnexLocation g k
- res <- liftIO $ storeHelper (conn, bucket) r k content
+ res <- liftIO $ storeHelper (conn, bucket) r k $ gitAnnexLocation g k
s3Bool res
storeEncrypted :: Remote Annex -> (Cipher, Key) -> Key -> Annex Bool
storeEncrypted r (cipher, enck) k = s3Action r False $ \(conn, bucket) -> do
g <- Annex.gitRepo
let f = gitAnnexLocation g k
- res <- liftIO $ withEncryptedContent cipher (L.readFile f) $ \s -> do
- storeHelper (conn, bucket) r enck s
+ -- To get file size of the encrypted content, have to use a temp file.
+ -- (An alternative would be chunking to a constant size.)
+ let tmp = gitAnnexTmpLocation g enck
+ liftIO $ withEncryptedContent cipher (L.readFile f) $ \s -> L.writeFile tmp s
+ res <- liftIO $ storeHelper (conn, bucket) r enck tmp
+ tmp_exists <- liftIO $ doesFileExist tmp
+ when tmp_exists $ liftIO $ removeFile tmp
s3Bool res
-storeHelper :: (AWSConnection, String) -> Remote Annex -> Key -> L.ByteString -> IO (AWSResult ())
-storeHelper (conn, bucket) r k content = do
- let object = setStorageClass storageclass $ bucketKey bucket k content
+storeHelper :: (AWSConnection, String) -> Remote Annex -> Key -> FilePath -> IO (AWSResult ())
+storeHelper (conn, bucket) r k file = do
+ content <- liftIO $ L.readFile file
+ -- size is provided to S3 so the whole content does not need to be
+ -- buffered to calculate it
+ size <- case keySize k of
+ Just s -> return $ fromIntegral s
+ Nothing -> do
+ s <- liftIO $ getFileStatus file
+ return $ fileSize s
+ let object = setStorageClass storageclass $
+ S3Object bucket (show k) ""
+ [("Content-Length",(show size))] content
sendObject conn object
where
storageclass =
@@ -124,7 +141,7 @@ storeHelper (conn, bucket) r k content = do
retrieve :: Remote Annex -> Key -> FilePath -> Annex Bool
retrieve r k f = s3Action r False $ \(conn, bucket) -> do
- res <- liftIO $ getObject conn $ bucketKey bucket k L.empty
+ res <- liftIO $ getObject conn $ bucketKey bucket k
case res of
Right o -> do
liftIO $ L.writeFile f $ obj_data o
@@ -133,7 +150,7 @@ retrieve r k f = s3Action r False $ \(conn, bucket) -> do
retrieveEncrypted :: Remote Annex -> (Cipher, Key) -> FilePath -> Annex Bool
retrieveEncrypted r (cipher, enck) f = s3Action r False $ \(conn, bucket) -> do
- res <- liftIO $ getObject conn $ bucketKey bucket enck L.empty
+ res <- liftIO $ getObject conn $ bucketKey bucket enck
case res of
Right o -> liftIO $
withDecryptedContent cipher (return $ obj_data o) $ \content -> do
@@ -143,13 +160,13 @@ retrieveEncrypted r (cipher, enck) f = s3Action r False $ \(conn, bucket) -> do
remove :: Remote Annex -> Key -> Annex Bool
remove r k = s3Action r False $ \(conn, bucket) -> do
- res <- liftIO $ deleteObject conn $ bucketKey bucket k L.empty
+ res <- liftIO $ deleteObject conn $ bucketKey bucket k
s3Bool res
checkPresent :: Remote Annex -> Key -> Annex (Either IOException Bool)
checkPresent r k = s3Action r noconn $ \(conn, bucket) -> do
showNote ("checking " ++ name r ++ "...")
- res <- liftIO $ getObjectInfo conn $ bucketKey bucket k L.empty
+ res <- liftIO $ getObjectInfo conn $ bucketKey bucket k
case res of
Right _ -> return $ Right True
Left (AWSError _ _) -> return $ Right False
@@ -205,5 +222,5 @@ s3Action r noconn action = do
(Just b, Just c) -> action (c, b)
_ -> return noconn
-bucketKey :: String -> Key -> L.ByteString -> S3Object
-bucketKey bucket k content = S3Object bucket (show k) "" [] content
+bucketKey :: String -> Key -> S3Object
+bucketKey bucket k = S3Object bucket (show k) "" [] L.empty
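The size lookup in storeHelper falls back to the file on disk when the key does not carry a size, which is the situation the temp-file comment describes for encrypted content. A small hypothetical helper, not part of the patch, that isolates that fallback:

import System.Posix.Files (getFileStatus, fileSize)

-- Prefer the size recorded with the key; only stat the file when the
-- key has none (e.g. the temp file holding encrypted content).
objectSize :: Maybe Integer -> FilePath -> IO Integer
objectSize (Just s) _ = return s
objectSize Nothing file = do
        stat <- getFileStatus file
        return $ fromIntegral $ fileSize stat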
diff --git a/debian/changelog b/debian/changelog
index cdb530c24..30a3e7cc5 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -1,9 +1,12 @@
git-annex (0.20110420) UNRELEASED; urgency=low
* Update Debian build dependencies for ghc 7.
- * Debian package is now built with S3 support. Thanks Joachim Breitner for
- making this possible, also thanks Greg Heartsfield for working to improve
- the hS3 library for git-annex.
+ * Debian package is now built with S3 support.
+ Thanks Joachim Breitner for making this possible.
+ * No longer needs to buffer entire files when sending them to S3.
+ (However, getting files from S3 still requires buffering.)
+ Thanks Greg Heartsfield for ongoing work to improve the hS3 library
+ for git-annex.
-- Joey Hess <joeyh@debian.org> Thu, 21 Apr 2011 02:00:00 -0400
diff --git a/debian/control b/debian/control
index 444d71261..be2e35911 100644
--- a/debian/control
+++ b/debian/control
@@ -43,4 +43,3 @@ Description: manage files with git, without checking their contents into git
versioned files, which is convenient for maintaining documents, Makefiles,
etc that are associated with annexed files but that benefit from full
revision control.
-
diff --git a/doc/bugs/S3_memory_leaks.mdwn b/doc/bugs/S3_memory_leaks.mdwn
index 86e3dd1a4..bfb3e1ec5 100644
--- a/doc/bugs/S3_memory_leaks.mdwn
+++ b/doc/bugs/S3_memory_leaks.mdwn
@@ -1,4 +1,4 @@
-S3 has two memory leaks.
+S3 has memory leaks.
## with encryption
@@ -8,16 +8,10 @@ not yet for S3, in 5985acdfad8a6791f0b2fc54a1e116cee9c12479.
## always
-The other occurs independant of encryption use. Copying a 100 mb
-file to S3 causes an immediate sharp memory spike to 119 mb.
Copying the file back from S3 causes a slow memory increase toward 119 mb.
-It's likely that this memory is used by the hS3 library, if it does not
-construct the message to Amazon lazily. (And it may not be possible to
-construct it lazily, if it includes checksum headers..)
-I have emailed the hS3 author about this. He wrote back quickly, seems
-only getting the size of the file is causing it to be buffered, and a quick
-fix should be forthcoming. Update: 0.5.6 has been released which will
-allow providing file size out of band to avoid buffering when uploading.
-Downloading will take further work in hS3.
---[[Joey]]
+The author of hS3 is aware of the problem, and working on it.
+
+## fixed
+
+Memory leak while uploading content to S3.
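For contrast with the now-unbuffered upload, a sketch of the download path as it stands after this patch, restating the retrieve function from the diff with placeholder names: getObject hands back the entire body as obj_data, so the remaining memory growth on download happens inside hS3, as the bug page notes.

import Network.AWS.AWSConnection
import Network.AWS.S3Object
import qualified Data.ByteString.Lazy.Char8 as L

-- The whole object body arrives in obj_data before it is written out,
-- which is the buffering that still needs work in hS3.
downloadBuffered :: AWSConnection -> String -> String -> FilePath -> IO Bool
downloadBuffered conn bucket key dest = do
        res <- getObject conn $ S3Object bucket key "" [] L.empty
        case res of
                Right o -> do
                        L.writeFile dest $ obj_data o
                        return True
                Left _ -> return False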