From 2b9e3d5ca89f88fbcae237270e95c592e3ce3bc1 Mon Sep 17 00:00:00 2001
From: Joey Hess
Date: Fri, 25 Jul 2014 16:09:23 -0400
Subject: use same hash directories for chunked key as are used for its parent

This avoids a proliferation of hash directories when using new-style
chunking, and should improve performance since chunks are accessed in
sequence and so should have a common locality.

Of course, when a chunked key is encrypted, its hash directories have no
relation to the parent key.

This commit was sponsored by Christian Kellermann.
---
 Types/Key.hs | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'Types/Key.hs')

diff --git a/Types/Key.hs b/Types/Key.hs
index 90f66f23e..3015b1e86 100644
--- a/Types/Key.hs
+++ b/Types/Key.hs
@@ -13,6 +13,8 @@ module Types.Key (
 	stubKey,
 	key2file,
 	file2key,
+	isChunkKey,
+	nonChunkKey,
 
 	prop_idempotent_key_encode,
 	prop_idempotent_key_decode
@@ -47,6 +49,16 @@ stubKey = Key
 	, keyChunkNum = Nothing
 	}
 
+isChunkKey :: Key -> Bool
+isChunkKey k = isJust (keyChunkSize k) && isJust (keyChunkNum k)
+
+-- Gets the parent of a chunk key.
+nonChunkKey :: Key -> Key
+nonChunkKey k = k
+	{ keyChunkSize = Nothing
+	, keyChunkNum = Nothing
+	}
+
 fieldSep :: Char
 fieldSep = '-'
-- 
cgit v1.2.3
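
The effect of the commit above is easiest to see in a small standalone sketch. Here the cut-down Key record and the hashDir function are hypothetical stand-ins for git-annex's real Key type and hash-directory calculation (the real one hashes the serialized key); only nonChunkKey is taken from the patch. The point is simply that the directory is derived from the parent key, so every chunk of a key lands in the parent's directory:

import Data.Char (toLower)

-- Cut-down stand-in for git-annex's Key type.
data Key = Key
	{ keyName :: String
	, keyChunkSize :: Maybe Integer
	, keyChunkNum :: Maybe Integer
	}

-- From the patch: gets the parent of a chunk key.
nonChunkKey :: Key -> Key
nonChunkKey k = k
	{ keyChunkSize = Nothing
	, keyChunkNum = Nothing
	}

-- Hypothetical stand-in for the real hash-directory calculation.
hashDir :: Key -> FilePath
hashDir k = map toLower (take 2 (keyName k)) ++ "/"

-- The idea of the commit: derive the directory from the *parent* key,
-- so chunk 1, chunk 2, ... of one key all share a single directory.
chunkKeyDir :: Key -> FilePath
chunkKeyDir = hashDir . nonChunkKey

main :: IO ()
main = do
	let parent = Key "SHA256-s100--d0db15c" Nothing Nothing
	let chunk3 = parent { keyChunkSize = Just 1048576, keyChunkNum = Just 3 }
	-- Both print the same directory.
	putStrLn (hashDir parent)
	putStrLn (chunkKeyDir chunk3)
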
From f6e7037c0ccdf646d1ca955c48cece814f5700a4 Mon Sep 17 00:00:00 2001
From: Joey Hess
Date: Sun, 27 Jul 2014 18:52:42 -0400
Subject: resume interrupted chunked downloads
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Leverage the new chunked remotes to automatically resume downloads.
Sort of like rsync, although of course not as efficient since this
needs to start at a chunk boundary.

But, unlike rsync, this method will work for S3, WebDAV, external
special remotes, etc, etc. Only directory special remotes so far, but
many more soon!

This implementation will also properly handle starting a download from
one remote, interrupting, and resuming from another one, and so on.

(Resuming interrupted chunked uploads is similarly doable, although
slightly more expensive.)

This commit was sponsored by Thomas Djärv.
---
 Remote/Helper/Chunked.hs            | 97 ++++++++++++++++++++++++++++---------
 Remote/Helper/ChunkedEncryptable.hs |  4 +-
 Types/Key.hs                        | 11 +++--
 Utility/Metered.hs                  |  2 +-
 debian/changelog                    |  4 +-
 doc/chunking.mdwn                   |  2 +
 6 files changed, 88 insertions(+), 32 deletions(-)

(limited to 'Types/Key.hs')

diff --git a/Remote/Helper/Chunked.hs b/Remote/Helper/Chunked.hs
index 3415c2df6..9ba6d9cbd 100644
--- a/Remote/Helper/Chunked.hs
+++ b/Remote/Helper/Chunked.hs
@@ -1,6 +1,6 @@
 {- git-annex chunked remotes
  -
- - Copyright 2012-2014 Joey Hess
+ - Copyright 2014 Joey Hess
  -
  - Licensed under the GNU GPL version 3 or higher.
  -}
@@ -27,6 +27,7 @@ import Annex.Exception
 
 import qualified Data.ByteString.Lazy as L
 import qualified Data.Map as M
+import Control.Exception
 
 data ChunkConfig
 	= NoChunks
@@ -147,12 +148,16 @@ removeChunks remover u chunkconfig encryptor k = do
  -
  - When the remote is chunked, tries each of the options returned by
  - chunkKeys until it finds one where the retriever successfully
- - gets the first key in the list. The content of that key, and any
+ - gets the first chunked key. The content of that key, and any
  - other chunks in the list is fed to the sink.
  -
  - If retrieval of one of the subsequent chunks throws an exception,
  - gives up and returns False. Note that partial data may have been
  - written to the sink in this case.
+ -
+ - Resuming is supported when using chunks. When the destination file
+ - already exists, it skips to the next chunked key that would be needed
+ - to resume.
  -}
 retrieveChunks
 	:: (Key -> IO L.ByteString)
@@ -160,43 +165,88 @@ retrieveChunks
 	-> ChunkConfig
 	-> EncKey
 	-> Key
+	-> FilePath
 	-> MeterUpdate
-	-> (MeterUpdate -> L.ByteString -> IO ())
+	-> (Handle -> MeterUpdate -> L.ByteString -> IO ())
 	-> Annex Bool
-retrieveChunks retriever u chunkconfig encryptor basek basep sink
+retrieveChunks retriever u chunkconfig encryptor basek dest basep sink
 	| noChunks chunkconfig =
 		-- Optimisation: Try the unchunked key first, to avoid
-		-- looking in the git-annex branch for chunk counts.
-		liftIO (retriever (encryptor basek) >>= sink basep >> return True)
-			`catchNonAsyncAnnex`
-		const (go =<< chunkKeysOnly u basek)
+		-- looking in the git-annex branch for chunk counts
+		-- that are likely not there.
+		getunchunked `catchNonAsyncAnnex`
+			const (go =<< chunkKeysOnly u basek)
 	| otherwise = go =<< chunkKeys u chunkconfig basek
   where
-	go ls = liftIO $ firstavail ls `catchNonAsync` giveup
+	go ls = liftIO $ do
+		currsize <- catchMaybeIO $
+			toInteger . fileSize <$> getFileStatus dest
+		let ls' = maybe ls (setupResume ls) currsize
+		firstavail currsize ls' `catchNonAsync` giveup
 
 	giveup e = do
 		warningIO (show e)
 		return False
 
-	firstavail [] = return False
-	firstavail ([]:ls) = firstavail ls
-	firstavail ((k:ks):ls) = do
+	firstavail _ [] = return False
+	firstavail currsize ([]:ls) = firstavail currsize ls
+	firstavail currsize ((k:ks):ls) = do
 		v <- tryNonAsync $ retriever (encryptor k)
 		case v of
 			Left e
 				| null ls -> giveup e
-				| otherwise -> firstavail ls
+				| otherwise -> firstavail currsize ls
 			Right b -> do
-				sink basep b
-				let sz = toBytesProcessed $
-					fromMaybe 0 $ keyChunkSize k
-				getrest sz sz ks
+				let offset = resumeOffset currsize k
+				let p = maybe basep
+					(offsetMeterUpdate basep . toBytesProcessed)
+					offset
+				bracket (maybe opennew openresume offset) hClose $ \h -> do
+					sink h p b
+					let sz = toBytesProcessed $
+						fromMaybe 0 $ keyChunkSize k
+					getrest p h sz sz ks
+
+	getrest _ _ _ _ [] = return True
+	getrest p h sz bytesprocessed (k:ks) = do
+		let p' = offsetMeterUpdate p bytesprocessed
+		sink h p' =<< retriever (encryptor k)
+		getrest p h sz (addBytesProcessed bytesprocessed sz) ks
+
+	getunchunked = liftIO $ bracket opennew hClose $ \h -> do
+		retriever (encryptor basek) >>= sink h basep
+		return True
+
+	opennew = openBinaryFile dest WriteMode
 
-	getrest _ _ [] = return True
-	getrest sz bytesprocessed (k:ks) = do
-		let p = offsetMeterUpdate basep bytesprocessed
-		sink p =<< retriever (encryptor k)
-		getrest sz (addBytesProcessed bytesprocessed sz) ks
+	-- Open the file and seek to the start point in order to resume.
+	openresume startpoint = do
+		-- ReadWriteMode allows seeking; AppendMode does not.
+		h <- openBinaryFile dest ReadWriteMode
+		hSeek h AbsoluteSeek startpoint
+		return h
+
+{- Can resume when the chunk's offset is at or before the end of
+ - the dest file. -}
+resumeOffset :: Maybe Integer -> Key -> Maybe Integer
+resumeOffset Nothing _ = Nothing
+resumeOffset currsize k
+	| offset <= currsize = offset
+	| otherwise = Nothing
+  where
+	offset = chunkKeyOffset k
+
+{- Drops chunks that are already present in a file, based on its size.
+ - Keeps any non-chunk keys.
+ -}
+setupResume :: [[Key]] -> Integer -> [[Key]]
+setupResume ls currsize = map dropunneeded ls
+  where
+	dropunneeded [] = []
+	dropunneeded l@(k:_) = case keyChunkSize k of
+		Just chunksize | chunksize > 0 ->
+			genericDrop (currsize `div` chunksize) l
+		_ -> l
 
 {- Checks if a key is present in a remote. This requires any one
  - of the lists of options returned by chunkKeys to all check out
@@ -212,7 +262,8 @@ hasKeyChunks
 hasKeyChunks checker u chunkconfig encryptor basek
 	| noChunks chunkconfig =
 		-- Optimisation: Try the unchunked key first, to avoid
-		-- looking in the git-annex branch for chunk counts.
+		-- looking in the git-annex branch for chunk counts
+		-- that are likely not there.
 		ifM ((Right True ==) <$> checker (encryptor basek))
 			( return (Right True)
 			, checklists impossible =<< chunkKeysOnly u basek
diff --git a/Remote/Helper/ChunkedEncryptable.hs b/Remote/Helper/ChunkedEncryptable.hs
index ac8917851..66e02da12 100644
--- a/Remote/Helper/ChunkedEncryptable.hs
+++ b/Remote/Helper/ChunkedEncryptable.hs
@@ -103,9 +103,7 @@ chunkedEncryptableRemote c preparestorer prepareretriever r = encr
 		safely $ prepareretriever k $ safely . go
 	  where
 		go (Just retriever) = metered (Just p) k $ \p' ->
-			bracketIO (openBinaryFile dest WriteMode) hClose $ \h ->
-				retrieveChunks retriever (uuid r) chunkconfig enck k p' $
-					sink h
+			retrieveChunks retriever (uuid r) chunkconfig enck k dest p' sink
 		go Nothing = return False
 		sink h p' b = do
 			let write = meteredWrite p' h
diff --git a/Types/Key.hs b/Types/Key.hs
index 3015b1e86..154e813ff 100644
--- a/Types/Key.hs
+++ b/Types/Key.hs
@@ -13,8 +13,8 @@ module Types.Key (
 	stubKey,
 	key2file,
 	file2key,
-	isChunkKey,
 	nonChunkKey,
+	chunkKeyOffset,
 
 	prop_idempotent_key_encode,
 	prop_idempotent_key_decode
@@ -49,9 +49,6 @@ stubKey = Key
 	, keyChunkNum = Nothing
 	}
 
-isChunkKey :: Key -> Bool
-isChunkKey k = isJust (keyChunkSize k) && isJust (keyChunkNum k)
-
 -- Gets the parent of a chunk key.
 nonChunkKey :: Key -> Key
 nonChunkKey k = k
@@ -59,6 +56,12 @@ nonChunkKey k = k
 	, keyChunkNum = Nothing
 	}
 
+-- Where a chunk key is offset within its parent.
+chunkKeyOffset :: Key -> Maybe Integer
+chunkKeyOffset k = (*)
+	<$> keyChunkSize k
+	<*> (pred <$> keyChunkNum k)
+
 fieldSep :: Char
 fieldSep = '-'
diff --git a/Utility/Metered.hs b/Utility/Metered.hs
index bca7f58e7..cc07f9c35 100644
--- a/Utility/Metered.hs
+++ b/Utility/Metered.hs
@@ -88,7 +88,7 @@ meteredWriteFile meterupdate f b = withBinaryFile f WriteMode $ \h ->
 
 {- Applies an offset to a MeterUpdate. This can be useful when
  - performing a sequence of actions, such as multiple meteredWriteFiles,
- - that all update a common meter progressively.
+ - that all update a common meter progressively. Or when resuming.
  -}
 offsetMeterUpdate :: MeterUpdate -> BytesProcessed -> MeterUpdate
 offsetMeterUpdate base offset = \n -> base (offset `addBytesProcessed` n)
diff --git a/debian/changelog b/debian/changelog
index d5c000003..c85247b69 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -1,7 +1,9 @@
 git-annex (5.20140718) UNRELEASED; urgency=medium
 
   * New chunk= option to chunk files stored in directory remotes.
-  * The old chunksize= option is deprecated. Do not use for new remotes!
+  * Partially transferred files are automatically resumed when using
+    chunked remotes!
+  * The old chunksize= option is deprecated. Do not use for new remotes.
   * Legacy code for directory remotes using the old chunksize= option
     will keep them working, but more slowly than before.
   * webapp: Automatically install Konqueror integration scripts
diff --git a/doc/chunking.mdwn b/doc/chunking.mdwn
index 1be1fbef6..d1dce317c 100644
--- a/doc/chunking.mdwn
+++ b/doc/chunking.mdwn
@@ -4,6 +4,8 @@ chunks that are stored on the remote.
 This can be useful to work around limitations on the size
 of files on the remote.
 
+Chunking also allows for resuming interrupted downloads and uploads.
+
 Note that git-annex has to buffer chunks in memory before they are sent to
 a remote. So, using a large chunk size will make it use more memory.
-- 
cgit v1.2.3
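
To make the resume arithmetic above concrete, the following standalone sketch reuses chunkKeyOffset, resumeOffset, and setupResume essentially as they appear in the patch, with a simplified Key record standing in for the real type. With 1 MiB chunks and 2.5 MiB of the destination file already on disk, the two complete chunks are dropped and the download resumes at chunk 3, at offset 2 MiB:

import Data.List (genericDrop)

-- Simplified stand-in for git-annex's Key type.
data Key = Key
	{ keyChunkSize :: Maybe Integer
	, keyChunkNum :: Maybe Integer
	} deriving Show

-- Where a chunk key is offset within its parent (as in Types/Key.hs).
chunkKeyOffset :: Key -> Maybe Integer
chunkKeyOffset k = (*)
	<$> keyChunkSize k
	<*> (pred <$> keyChunkNum k)

-- Can resume when the chunk's offset is at or before the end of the file.
resumeOffset :: Maybe Integer -> Key -> Maybe Integer
resumeOffset Nothing _ = Nothing
resumeOffset currsize k
	| offset <= currsize = offset
	| otherwise = Nothing
  where
	offset = chunkKeyOffset k

-- Drops chunks that are already fully present in the file, going by size.
setupResume :: [[Key]] -> Integer -> [[Key]]
setupResume ls currsize = map dropunneeded ls
  where
	dropunneeded [] = []
	dropunneeded l@(k:_) = case keyChunkSize k of
		Just chunksize | chunksize > 0 ->
			genericDrop (currsize `div` chunksize) l
		_ -> l

main :: IO ()
main = do
	let chunks = [ Key (Just 1048576) (Just n) | n <- [1 .. 4] ]
	print (setupResume [chunks] 2621440)              -- chunks 3 and 4 remain
	print (resumeOffset (Just 2621440) (chunks !! 2)) -- Just 2097152

Since resuming can only start at a chunk boundary, a partially written trailing chunk is simply fetched again; that is the inefficiency relative to rsync that the commit message concedes.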