summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorGravatar Joey Hess <joeyh@joeyh.name>2017-09-04 14:33:09 -0400
committerGravatar Joey Hess <joeyh@joeyh.name>2017-09-04 14:39:32 -0400
commit25ed1e54abcc25f729fed016ec77a8cd049142fa (patch)
treef6130202fc2f23a6b641f8646705c900b757a012
parent3db6fd7621d3fa57bebfea216019e522f6d8d781 (diff)
use export db to correctly handle duplicate files
Removed uncorrect UniqueKey key in db schema; a key can appear multiple times with different files. The database has to be flushed after each removal. But when adding files to the export, lots of changes are able to be queued up w/o flushing. So it's still fairly efficient. If large removals of files from exports are too slow, an alternative would be to make two passes over the diff, one pass queueing deletions from the database, then a flush and the a second pass updating the location log. But that would use more memory, and need to look up exportKey twice per removed file, so I've avoided such optimisation yet. This commit was supported by the NSF-funded DataLad project.
-rw-r--r--Command/Export.hs41
-rw-r--r--Database/Export.hs7
-rw-r--r--Types/Remote.hs1
-rw-r--r--doc/design/exporting_trees_to_special_remotes.mdwn3
-rw-r--r--doc/todo/export.mdwn11
5 files changed, 31 insertions, 32 deletions
diff --git a/Command/Export.hs b/Command/Export.hs
index c09253dc9..3387a14ad 100644
--- a/Command/Export.hs
+++ b/Command/Export.hs
@@ -113,21 +113,20 @@ seek o = do
startExport :: Remote -> ExportHandle -> Git.LsTree.TreeItem -> CommandStart
startExport r db ti = do
ek <- exportKey (Git.LsTree.sha ti)
- liftIO $ addExportLocation db (asKey ek) loc
- stopUnless (notElem (uuid r) <$> loggedLocations (asKey ek)) $ do
+ stopUnless (liftIO $ notElem loc <$> getExportLocation db (asKey ek)) $ do
showStart "export" f
- next $ performExport r ek (Git.LsTree.sha ti) loc
+ next $ performExport r db ek (Git.LsTree.sha ti) loc
where
loc = ExportLocation $ toInternalGitPath f
f = getTopFilePath $ Git.LsTree.file ti
-performExport :: Remote -> ExportKey -> Sha -> ExportLocation -> CommandPerform
-performExport r ek contentsha loc = do
+performExport :: Remote -> ExportHandle -> ExportKey -> Sha -> ExportLocation -> CommandPerform
+performExport r db ek contentsha loc = do
let storer = storeExport $ exportActions r
sent <- case ek of
AnnexKey k -> ifM (inAnnex k)
( metered Nothing k $ \m -> do
- let rollback = void $ performUnexport r ek loc
+ let rollback = void $ performUnexport r db ek loc
sendAnnex k rollback
(\f -> storer f k loc m)
, do
@@ -142,11 +141,12 @@ performExport r ek contentsha loc = do
liftIO $ hClose h
storer tmp sha1k loc m
if sent
- then next $ cleanupExport r ek
+ then next $ cleanupExport r db ek loc
else stop
-cleanupExport :: Remote -> ExportKey -> CommandCleanup
-cleanupExport r ek = do
+cleanupExport :: Remote -> ExportHandle -> ExportKey -> ExportLocation -> CommandCleanup
+cleanupExport r db ek loc = do
+ liftIO $ addExportLocation db (asKey ek) loc
logChange (asKey ek) (uuid r) InfoPresent
return True
@@ -154,23 +154,28 @@ startUnexport :: Remote -> ExportHandle -> Git.DiffTree.DiffTreeItem -> CommandS
startUnexport r db diff
| Git.DiffTree.srcsha diff /= nullSha = do
showStart "unexport" f
- oldk <- exportKey (Git.DiffTree.srcsha diff)
- liftIO $ removeExportLocation db (asKey oldk) loc
- next $ performUnexport r oldk loc
+ ek <- exportKey (Git.DiffTree.srcsha diff)
+ next $ performUnexport r db ek loc
| otherwise = stop
where
loc = ExportLocation $ toInternalGitPath f
f = getTopFilePath $ Git.DiffTree.file diff
-performUnexport :: Remote -> ExportKey -> ExportLocation -> CommandPerform
-performUnexport r ek loc = do
+performUnexport :: Remote -> ExportHandle -> ExportKey -> ExportLocation -> CommandPerform
+performUnexport r db ek loc = do
let remover = removeExport $ exportActions r
ok <- remover (asKey ek) loc
if ok
- then next $ cleanupUnexport r ek
+ then next $ cleanupUnexport r db ek loc
else stop
-cleanupUnexport :: Remote -> ExportKey -> CommandCleanup
-cleanupUnexport r ek = do
- logChange (asKey ek) (uuid r) InfoMissing
+cleanupUnexport :: Remote -> ExportHandle -> ExportKey -> ExportLocation -> CommandCleanup
+cleanupUnexport r db ek loc = do
+ liftIO $ do
+ removeExportLocation db (asKey ek) loc
+ -- Flush so that getExportLocation sees this and any
+ -- other removals of the key.
+ flushDbQueue db
+ whenM (liftIO $ null <$> getExportLocation db (asKey ek)) $
+ logChange (asKey ek) (uuid r) InfoMissing
return True
diff --git a/Database/Export.hs b/Database/Export.hs
index e2986d075..dcef88854 100644
--- a/Database/Export.hs
+++ b/Database/Export.hs
@@ -16,6 +16,7 @@ module Database.Export (
closeDb,
addExportLocation,
removeExportLocation,
+ flushDbQueue,
getExportLocation,
ExportedId,
) where
@@ -37,7 +38,6 @@ Exported
key IKey
file SFilePath
KeyFileIndex key file
- UniqueKey key
|]
{- Opens the database, creating it if it doesn't exist yet. -}
@@ -74,7 +74,10 @@ removeExportLocation h k (ExportLocation f) = queueDb h $
ik = toIKey k
ef = toSFilePath f
-{- Doesn't know about recently queued changes. -}
+flushDbQueue :: ExportHandle -> IO ()
+flushDbQueue (ExportHandle h) = H.flushDbQueue h
+
+{- Note that this does not see recently queued changes. -}
getExportLocation :: ExportHandle -> Key -> IO [ExportLocation]
getExportLocation (ExportHandle h) k = H.queryDbQueue h $ do
l <- select $ from $ \r -> do
diff --git a/Types/Remote.hs b/Types/Remote.hs
index a0174ebee..81f1dbe23 100644
--- a/Types/Remote.hs
+++ b/Types/Remote.hs
@@ -159,6 +159,7 @@ unVerified a = do
-- The FilePath will be relative, and may contain unix-style path
-- separators.
newtype ExportLocation = ExportLocation FilePath
+ deriving (Eq)
data ExportActions a = ExportActions
{ exportSupported :: a Bool
diff --git a/doc/design/exporting_trees_to_special_remotes.mdwn b/doc/design/exporting_trees_to_special_remotes.mdwn
index 118f12978..7ff1df870 100644
--- a/doc/design/exporting_trees_to_special_remotes.mdwn
+++ b/doc/design/exporting_trees_to_special_remotes.mdwn
@@ -147,7 +147,8 @@ remotes, don't let it be turned off.
The same file contents may be in a treeish multiple times under different
filenames. That complicates using location tracking. One file may have been
exported and the other not, and location tracking says that the content
-is present in the export.
+is present in the export. A sqlite database is needed to keep track of
+this.
## recording exported filenames in git-annex branch
diff --git a/doc/todo/export.mdwn b/doc/todo/export.mdwn
index 828e1c55b..99877423b 100644
--- a/doc/todo/export.mdwn
+++ b/doc/todo/export.mdwn
@@ -18,17 +18,6 @@ there need to be a new interface in supported remotes?
Work is in progress. Todo list:
* Use retrieveExport when getting from export remotes.
- (Needs a map from key to ExportLocation)
* Efficient handling of renames.
-* If the same content is present in two different files, export
- location tracking can be messed up.
-
- When one of the files is deleted and
- that tree is exported, the location log for the key will be updated
- to say it's not present, even though the other file is still present.
-
- And, once one of the files is uploaded, the location log will
- say the content is present, so the pass over the tree won't try to
- upload the other file. (See design for a fix for this.)
* Support export to aditional special remotes (S3 etc)
* Support export to external special remotes.