diff options
-rw-r--r-- | Remote/Directory.hs | 12 | ||||
-rw-r--r-- | Types/Remote.hs | 3 | ||||
-rw-r--r-- | Utility/Tmp.hs | 2 | ||||
-rw-r--r-- | doc/design/exporting_trees_to_special_remotes.mdwn | 74 |
4 files changed, 50 insertions, 41 deletions
diff --git a/Remote/Directory.hs b/Remote/Directory.hs index abbde1ceb..f5d7f7e49 100644 --- a/Remote/Directory.hs +++ b/Remote/Directory.hs @@ -29,6 +29,7 @@ import qualified Remote.Directory.LegacyChunked as Legacy import Annex.Content import Annex.UUID import Utility.Metered +import Utility.Tmp remote :: RemoteType remote = RemoteType { @@ -116,11 +117,6 @@ getLocation d k = do storeDir :: FilePath -> Key -> FilePath storeDir d k = addTrailingPathSeparator $ d </> hashDirLower def k </> keyFile k -{- Where we store temporary data for a key, in the directory, as it's being - - written. -} -tmpDir :: FilePath -> Key -> FilePath -tmpDir d k = addTrailingPathSeparator $ d </> "tmp" </> keyFile k - {- Check if there is enough free disk space in the remote's directory to - store the key. Note that the unencrypted key size is checked. -} prepareStore :: FilePath -> ChunkConfig -> Preparer Storer @@ -148,7 +144,7 @@ store d chunkconfig k b p = liftIO $ do finalizeStoreGeneric tmpdir destdir return True where - tmpdir = tmpDir d k + tmpdir = addTrailingPathSeparator $ d </> "tmp" </> keyFile k destdir = storeDir d k {- Passed a temp directory that contains the files that should be placed @@ -233,7 +229,9 @@ checkPresentGeneric d ps = liftIO $ storeExportDirectory :: FilePath -> FilePath -> Key -> ExportLocation -> MeterUpdate -> Annex Bool storeExportDirectory d src _k loc p = liftIO $ catchBoolIO $ do createDirectoryIfMissing True (takeDirectory dest) - withMeteredFile src p (L.writeFile dest) + -- Write via temp file so that checkPresentGeneric will not + -- see it until it's fully stored. + viaTmp (\tmp () -> withMeteredFile src p (L.writeFile tmp)) dest () return True where dest = exportPath d loc diff --git a/Types/Remote.hs b/Types/Remote.hs index 6a4d2039e..6e78bf238 100644 --- a/Types/Remote.hs +++ b/Types/Remote.hs @@ -98,7 +98,8 @@ data RemoteA a = Remote { checkPresentCheap :: Bool, -- Exports content to an ExportLocation. - -- The exported file does not need to be updated atomically. + -- The exported file should not appear to be present on the remote + -- until all of its contents have been transferred. storeExport :: Maybe (FilePath -> Key -> ExportLocation -> MeterUpdate -> a Bool), -- Retrieves exported content to a file. -- (The MeterUpdate does not need to be used if it writes diff --git a/Utility/Tmp.hs b/Utility/Tmp.hs index 6a541cfe4..ca611e0b4 100644 --- a/Utility/Tmp.hs +++ b/Utility/Tmp.hs @@ -28,7 +28,7 @@ type Template = String {- Runs an action like writeFile, writing to a temp file first and - then moving it into place. The temp file is stored in the same - directory as the final file to avoid cross-device renames. -} -viaTmp :: (MonadMask m, MonadIO m) => (FilePath -> String -> m ()) -> FilePath -> String -> m () +viaTmp :: (MonadMask m, MonadIO m) => (FilePath -> v -> m ()) -> FilePath -> v -> m () viaTmp a file content = bracketIO setup cleanup use where (dir, base) = splitFileName file diff --git a/doc/design/exporting_trees_to_special_remotes.mdwn b/doc/design/exporting_trees_to_special_remotes.mdwn index c9b2b72e5..835a86640 100644 --- a/doc/design/exporting_trees_to_special_remotes.mdwn +++ b/doc/design/exporting_trees_to_special_remotes.mdwn @@ -69,11 +69,6 @@ To efficiently update an export, git-annex can diff the tree that was exported with the new tree. The naive approach is to upload new and modified files and remove deleted files. -Note that a file may have been partially uploaded to an export, and then -the export updated to a tree without that file. So, need to try to delete -all removed files, even if location tracking does not say that the special -remote contains them. - With rename detection, if the special remote supports moving files, more efficient updates can be done. It gets complicated; consider two files that swap names. @@ -81,33 +76,6 @@ that swap names. If the special remote supports copying files, that would also make some updates more efficient. -## resuming exports - -Resuming an interrupted export needs to work well. - -There are two cases here: - -1. Some of the files in the tree have been uploaded; others have not. -2. A file has been partially uploaded. - -These two cases need to be disentangled somehow in order to handle -them. One way is to use the location log as follows: - -* Before a file is uploaded, look up what key is currently exported - using that filename. If there is one, update the location log, - saying it's not present in the special remote. -* Upload the file. -* Update the location log for the newly exported key. - -Note that this method does not allow resuming a partial upload by appending to -a file, because we don't know if the file actually started to be uploaded, or -if the file instead still has the old key's content. Instead, the whole -file needs to be re-uploaded. - -Alternative: Keep an index file that's the current state of the export. -See comment #4 of [[todo/export]]. Not sure if that works? Perhaps it -would be overkill if it's only used to support resuming partial uploads. - ## changes to special remote interface This needs some additional methods added to special remotes, and to @@ -123,6 +91,9 @@ Here's the changes to the latter: * `TRANSFEREXPORT STORE|RETRIEVE Key File` Requests the transfer of a File on local disk to or from the previously provided Name on the special remote. + Note that it's important that, while a file is being stored, + CHECKPRESENTEXPORT not indicate it's present until all the data has + been transferred. The remote responds with either `TRANSFER-SUCCESS` or `TRANSFER-FAILURE`, and a remote where exports do not make sense may always fail. @@ -241,3 +212,42 @@ re-uploads, but it's reasonably efficient. The documentation should suggest strongly only exporting to a given special remote from a single repository, or having some other rule that avoids export conflicts. + +## when to update export.log for efficient resuming of exports + +When should `export.log` be updated? Possibilities: + +* Before performing any work, to set the goal. +* After the export is fully successful, to record the current state. +* After some mid-point. + +Lots of things could go wrong during an export. A file might fail to be +transferred or only part of it be transferred; a file's content might not +be present to transfer at all. The export could be interrupted part way. +Updating the export.log at the right point in time is important to handle +these cases efficiently. + +If the export.log is updated first, then it's only a goal and does not tell +us what's been done already. + +If the export.log is updated only after complete success, then the common +case of some files not having content locally present will prevent it from +being updated. When we resume, we again don't know what's been done +already. + +If the export.log is updated after deleting any files from the +remote that are not the same in the new treeish as in the old treeish, +and as long as TRANSFEREXPORT STORE is atomic, then when resuming we can +trust CHECKPRESENTEXPORT to only find files that have the correct content +for the current treeish. (Unless a conflicting export was made from +elsewhere, but in that case, the conflict resolution will have to fix up +later.) + +Efficient resuming can then first check if the location log says the +export contains the content. (If not, transfer a copy.) If the location +log says the export contains the content, use CHECKPRESENTEXPORT to see if +the file exists, and if not transfer a copy. The CHECKPRESENTEXPORT check +deals with the case where the treeish has two files with the same content. +If we have a key-to-files map for the export, then we can skip the +CHECKPRESENTEXPORT check when there's only one file using a key. So, +resuming can be quite efficient. |