summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorGravatar Joey Hess <joeyh@joeyh.name>2017-09-06 15:33:40 -0400
committerGravatar Joey Hess <joeyh@joeyh.name>2017-09-06 15:44:10 -0400
commit6fcefbdb6629c3e94c41bc05a6b7c224ade99ba0 (patch)
treedba0a4b10efa30c3fe491c5163a2942eda56eb69
parent9dd2651e8e5efbbf3a9cc59cab3afa1fef7446f2 (diff)
export file renaming
This is seriously super hairy. It has to handle interrupted exports, which may be resumed with the same or a different tree. It also has to recover from export conflicts, which could cause the wrong content to be renamed to a file. I think this works, or is close to working. See the update to the design for how it works. This is definitely not optimal, in that it does more renames than are necessary. It would probably be worth finding the keys that are really renamed and only renaming those. But let's get the "simple" approach to work first.. This commit was supported by the NSF-funded DataLad project.
-rw-r--r--Command/Export.hs167
-rw-r--r--Types/Remote.hs2
-rw-r--r--doc/design/exporting_trees_to_special_remotes.mdwn44
-rw-r--r--doc/git-annex-export.mdwn19
4 files changed, 190 insertions, 42 deletions
diff --git a/Command/Export.hs b/Command/Export.hs
index 878cda8e3..6090b2603 100644
--- a/Command/Export.hs
+++ b/Command/Export.hs
@@ -67,6 +67,12 @@ exportKey sha = mk <$> catKey sha
, keyChunkNum = Nothing
}
+-- To handle renames which swap files, the exported file is first renamed
+-- to a stable temporary name based on the key.
+exportTempName :: ExportKey -> ExportLocation
+exportTempName ek = ExportLocation $
+ ".git-annex-tmp-content-" ++ key2file (asKey (ek))
+
seek :: ExportOptions -> CommandSeek
seek o = do
r <- getParsed (exportRemote o)
@@ -78,23 +84,51 @@ seek o = do
-- or tag.
inRepo (Git.Ref.tree (exportTreeish o))
old <- getExport (uuid r)
-
recordExportBeginning (uuid r) new
- when (length old > 1) $
- warning "Export conflict detected. Different trees have been exported to the same special remote. Resolving.."
-
db <- openDb (uuid r)
- -- First, diff the old and new trees and delete all changed
- -- files in the export. Every file that remains in the export will
- -- have the content from the new treeish.
+ -- Clean up after incomplete export of a tree, in which
+ -- the next block of code below may have renamed some files to
+ -- temp files. Diff from the incomplete tree to the new tree,
+ -- and delete any temp files that the new tree can't use.
+ forM_ (concatMap incompleteExportedTreeish old) $ \incomplete ->
+ mapdiff (startUnexportTempName r db . Git.DiffTree.srcsha) incomplete new
+
+ -- Diff the old and new trees, and delete or rename to new name all
+ -- changed files in the export. After this, every file that remains
+ -- in the export will have the content from the new treeish.
--
-- (Also, when there was an export conflict, this resolves it.)
- forM_ (map exportedTreeish old) $ \oldtreesha -> do
- (diff, cleanup) <- inRepo $
- Git.DiffTree.diffTreeRecursive oldtreesha new
- seekActions $ pure $ map (startUnexport r db) diff
- void $ liftIO cleanup
+ case map exportedTreeish old of
+ [] -> return ()
+ [oldtreesha] -> do
+ -- Rename all old files to temp.
+ mapdiff
+ (\diff -> startMoveToTempName r db (Git.DiffTree.file diff) (Git.DiffTree.srcsha diff))
+ oldtreesha new
+ -- Rename from temp to new files.
+ mapdiff (\diff -> startMoveFromTempName r db (Git.DiffTree.dstsha diff) (Git.DiffTree.file diff))
+ new oldtreesha
+ -- Remove all remaining temps.
+ mapdiff
+ (startUnexportTempName r db . Git.DiffTree.srcsha)
+ oldtreesha new
+ ts -> do
+ warning "Export conflict detected. Different trees have been exported to the same special remote. Resolving.."
+ forM_ ts $ \oldtreesha -> do
+ -- Unexport both the srcsha and the dstsha,
+ -- because the wrong content may have
+ -- been renamed to the dstsha due to the
+ -- export conflict.
+ let unexportboth d =
+ [ Git.DiffTree.srcsha d
+ , Git.DiffTree.dstsha d
+ ]
+ -- Don't rename to temp, because the
+ -- content is unknown; unexport instead.
+ mapdiff
+ (\diff -> startUnexport r db (Git.DiffTree.file diff) (unexportboth diff))
+ oldtreesha new
-- Waiting until now to record the export guarantees that,
-- if this export is interrupted, there are no files left over
@@ -110,6 +144,12 @@ seek o = do
void $ liftIO cleanup'
closeDb db
+ where
+ mapdiff a oldtreesha newtreesha = do
+ (diff, cleanup) <- inRepo $
+ Git.DiffTree.diffTreeRecursive oldtreesha newtreesha
+ seekActions $ pure $ map a diff
+ void $ liftIO cleanup
startExport :: Remote -> ExportHandle -> Git.LsTree.TreeItem -> CommandStart
startExport r db ti = do
@@ -127,7 +167,7 @@ performExport r db ek contentsha loc = do
sent <- case ek of
AnnexKey k -> ifM (inAnnex k)
( metered Nothing k $ \m -> do
- let rollback = void $ performUnexport r db ek loc
+ let rollback = void $ performUnexport r db [ek] loc
sendAnnex k rollback
(\f -> storer f k loc m)
, do
@@ -151,32 +191,89 @@ cleanupExport r db ek loc = do
logChange (asKey ek) (uuid r) InfoPresent
return True
-startUnexport :: Remote -> ExportHandle -> Git.DiffTree.DiffTreeItem -> CommandStart
-startUnexport r db diff
- | Git.DiffTree.srcsha diff /= nullSha = do
- showStart "unexport" f
- ek <- exportKey (Git.DiffTree.srcsha diff)
- next $ performUnexport r db ek loc
- | otherwise = stop
+startUnexport :: Remote -> ExportHandle -> TopFilePath -> [Git.Sha] -> CommandStart
+startUnexport r db f shas = do
+ eks <- forM (filter (/= nullSha) shas) exportKey
+ if null eks
+ then stop
+ else do
+ showStart "unexport" f'
+ next $ performUnexport r db eks loc
where
- loc = ExportLocation $ toInternalGitPath f
- f = getTopFilePath $ Git.DiffTree.file diff
-
-performUnexport :: Remote -> ExportHandle -> ExportKey -> ExportLocation -> CommandPerform
-performUnexport r db ek loc = do
- let remover = removeExport $ exportActions r
- ok <- remover (asKey ek) loc
- if ok
- then next $ cleanupUnexport r db ek loc
- else stop
+ loc = ExportLocation $ toInternalGitPath f'
+ f' = getTopFilePath f
+
+performUnexport :: Remote -> ExportHandle -> [ExportKey] -> ExportLocation -> CommandPerform
+performUnexport r db eks loc = do
+ ifM (allM (\ek -> removeExport (exportActions r) (asKey ek) loc) eks)
+ ( next $ cleanupUnexport r db eks loc
+ , stop
+ )
-cleanupUnexport :: Remote -> ExportHandle -> ExportKey -> ExportLocation -> CommandCleanup
-cleanupUnexport r db ek loc = do
+cleanupUnexport :: Remote -> ExportHandle -> [ExportKey] -> ExportLocation -> CommandCleanup
+cleanupUnexport r db eks loc = do
liftIO $ do
- removeExportLocation db (asKey ek) loc
+ forM_ eks $ \ek ->
+ removeExportLocation db (asKey ek) loc
-- Flush so that getExportLocation sees this and any
-- other removals of the key.
flushDbQueue db
- whenM (liftIO $ null <$> getExportLocation db (asKey ek)) $
- logChange (asKey ek) (uuid r) InfoMissing
+ remaininglocs <- liftIO $
+ concat <$> forM eks (\ek -> getExportLocation db (asKey ek))
+ when (null remaininglocs) $
+ forM_ eks $ \ek ->
+ logChange (asKey ek) (uuid r) InfoMissing
+ return True
+
+startUnexportTempName :: Remote -> ExportHandle -> Git.Sha -> CommandStart
+startUnexportTempName r db sha
+ | sha == nullSha = stop
+ | otherwise = do
+ ek <- exportKey sha
+ let loc@(ExportLocation f) = exportTempName ek
+ stopUnless (liftIO $ elem loc <$> getExportLocation db (asKey ek)) $ do
+ showStart "unexport" f
+ next $ performUnexport r db [ek] loc
+
+startMoveToTempName :: Remote -> ExportHandle -> TopFilePath -> Git.Sha -> CommandStart
+startMoveToTempName r db f sha
+ | sha == nullSha = stop
+ | otherwise = do
+ ek <- exportKey sha
+ let tmploc@(ExportLocation tmpf) = exportTempName ek
+ showStart "rename" (f' ++ " -> " ++ tmpf)
+ next $ performRename r db ek loc tmploc
+ where
+ loc = ExportLocation $ toInternalGitPath f'
+ f' = getTopFilePath f
+
+startMoveFromTempName :: Remote -> ExportHandle -> Git.Sha -> TopFilePath -> CommandStart
+startMoveFromTempName r db sha f
+ | sha == nullSha = stop
+ | otherwise = do
+ ek <- exportKey sha
+ stopUnless (liftIO $ elem loc <$> getExportLocation db (asKey ek)) $ do
+ let tmploc@(ExportLocation tmpf) = exportTempName ek
+ showStart "rename" (tmpf ++ " -> " ++ f')
+ next $ performRename r db ek tmploc loc
+ where
+ loc = ExportLocation $ toInternalGitPath f'
+ f' = getTopFilePath f
+
+performRename :: Remote -> ExportHandle -> ExportKey -> ExportLocation -> ExportLocation -> CommandPerform
+performRename r db ek src dest = do
+ ifM (renameExport (exportActions r) (asKey ek) src dest)
+ ( next $ cleanupRename db ek src dest
+ -- In case the special remote does not support renaming,
+ -- unexport the src instead.
+ , performUnexport r db [ek] src
+ )
+
+cleanupRename :: ExportHandle -> ExportKey -> ExportLocation -> ExportLocation -> CommandCleanup
+cleanupRename db ek src dest = do
+ liftIO $ do
+ removeExportLocation db (asKey ek) src
+ addExportLocation db (asKey ek) dest
+ -- Flush so that getExportLocation sees this.
+ flushDbQueue db
return True
diff --git a/Types/Remote.hs b/Types/Remote.hs
index 46750ee8d..6f0a312f4 100644
--- a/Types/Remote.hs
+++ b/Types/Remote.hs
@@ -178,5 +178,7 @@ data ExportActions a = ExportActions
-- Throws an exception if the remote cannot be accessed.
, checkPresentExport :: Key -> ExportLocation -> a Bool
-- Renames an already exported file.
+ -- This may fail, if the file doesn't exist, or the remote does not
+ -- support renames.
, renameExport :: Key -> ExportLocation -> ExportLocation -> a Bool
}
diff --git a/doc/design/exporting_trees_to_special_remotes.mdwn b/doc/design/exporting_trees_to_special_remotes.mdwn
index 0469a4fcc..a8247d2b9 100644
--- a/doc/design/exporting_trees_to_special_remotes.mdwn
+++ b/doc/design/exporting_trees_to_special_remotes.mdwn
@@ -205,7 +205,7 @@ a tree that resolves the conflict as they desire (it could be the same as
one of the exported trees, or some merge of them or an entirely new tree).
The UI to do this can just be another `git annex export $tree --to remote`.
To resolve, diff each exported tree in turn against the resolving tree
-and delete all files that differ.
+and delete all files that differ. Then, upload all missing files.
## when to update export.log for efficient resuming of exports
@@ -256,18 +256,48 @@ tree, so no state needs to be maintained to clean it up. Also, using the
key in the name simplifies calculation of complicated renames (eg, renaming
A to B, B to C, C to A)
-Export can first try to rename the temp name of all keys
-whose files are added in the diff. Followed by deleting the temp name
-of all keys whose files are removed in the diff. That is more renames and
+Export can first try to rename all files that are deleted/modified
+to their key's temp name (falling back to deleting since not all
+special remotes support rename), and then, in a second pass, rename
+from the temp name to the new name. Followed by deleting the temp name
+of all keys whose files are deleted in the diff. That is more renames and
deletes than strictly necessary, but it will statelessly clean up
an interruped export as long as it's run again with the same new tree.
But, an export of tree B should clean up after
an interrupted export of tree A. Some state is needed to handle this.
Before starting the export of tree A, record it somewhere. Then when
-resuming, diff A..B, and rename/delete the temp names of the keys in the
-diff. As well as diffing from the last fully exported tree to B and doing
-the same rename/delete.
+resuming, diff A..B, and delete the temp names of the keys in the
+diff. (Can't rename here, because we don't know what was the content
+of a file when an export was interrupted.)
So, before an export does anything, need to record the tree that's about
to be exported to export.log, not as an exported tree, but as a goal.
+
+## renames and export conflicts
+
+What is there's an export conflict going on at the same time that a file
+in the export gets renamed?
+
+Suppose that there are two git repos A and B, each exporting to the same
+remote. A and B are not currently communicating. A exports T1 which
+contains F. B exports T2, which has a different content for F.
+
+Then A exports T3, which renames F to G. If that rename is done
+on the remote, then A will think it's successfully exported T3,
+but G will have F's content from T2, not from T1.
+
+When A and B reconnect, the export conflict will be detected.
+To resolve the export conflict, it says above to:
+
+> To resolve, diff each exported tree in turn against the resolving tree
+> and delete all files that differ. Then, upload all missing files.
+
+Assume that the resolving tree is T3. So B's export of T2 is diffed against
+T3. F differs and is deleted (no change). G differs and is deleted,
+which fixes up the problem that the wrong content was renamed to G.
+G is missing so gets uploaded.
+
+So, this works, as long as "delete all files that differ" means it
+deletes both old and new files. And as long as conflict resolution does not
+itself stash away files in the temp name for later renaming.
diff --git a/doc/git-annex-export.mdwn b/doc/git-annex-export.mdwn
index c8d8eac9a..e3cbcbd7a 100644
--- a/doc/git-annex-export.mdwn
+++ b/doc/git-annex-export.mdwn
@@ -31,6 +31,25 @@ verification of content downloaded from an export. Some types of keys,
that are not based on checksums, cannot be downloaded from an export.
And, git-annex will never trust an export to retain the content of a key.
+# EXPORT CONFLICTS
+
+If two different git-annex repositories are both exporting different trees
+to the same special remote, it's possible for an export conflict to occur.
+This leaves the special remote with some files from one tree, and some
+files from the other. Files in the special remote may have entirely the
+wrong content as well.
+
+It's not possible for git-annex to detect when making an export will result
+in an export conflict. The best way to avoid export conflicts is to either
+only ever export to a special remote from a single repository, or to have a
+rule about the tree that you export to the special remote. For example, if
+you always export origin/master after pushing to origin, then an export
+conflict can't happen.
+
+An export conflict can only be detected after the two git repositories
+that produced it get back in sync. Then the next time you run `git annex
+export`, it will detect the export conflict, and resolve it.
+
# SEE ALSO
[[git-annex]](1)