summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorGravatar Joey Hess <joeyh@joeyh.name>2015-06-16 18:38:12 -0400
committerGravatar Joey Hess <joeyh@joeyh.name>2015-06-16 18:50:13 -0400
commit2a4a641a3c940a652f62371f417433b47f7f3e79 (patch)
treee610e549f25dd4cd6719fcce9dcde14a69c46bd2
parente3da28295e11972bcb14749ef294d1f39fb03efa (diff)
use bloom filter in second pass of sync --all --content
This is needed because when preferred content matches on files, the second pass would otherwise want to drop all keys. Using a bloom filter avoids this, and in the case of a false positive, a key will be left undropped that preferred content would allow dropping. Chances of that happening are a mere 1 in 1 million.
-rw-r--r--Annex/BloomFilter.hs4
-rw-r--r--Command/Sync.hs52
-rw-r--r--Types/Key.hs4
-rw-r--r--Utility/Bloom.hs7
4 files changed, 46 insertions, 21 deletions
diff --git a/Annex/BloomFilter.hs b/Annex/BloomFilter.hs
index 3ac81fa58..5773a88ee 100644
--- a/Annex/BloomFilter.hs
+++ b/Annex/BloomFilter.hs
@@ -40,11 +40,11 @@ bloomBitsHashes = do
- Once the action completes, the mutable filter is frozen
- for later use.
-}
-genBloomFilter :: Hashable v => ((v -> Annex ()) -> Annex b) -> Annex (Bloom v)
+genBloomFilter :: Hashable v => ((v -> Annex ()) -> Annex ()) -> Annex (Bloom v)
genBloomFilter populate = do
(numbits, numhashes) <- bloomBitsHashes
bloom <- lift $ newMB (cheapHashes numhashes) numbits
- _ <- populate $ \v -> lift $ insertMB bloom v
+ populate $ \v -> lift $ insertMB bloom v
lift $ unsafeFreezeMB bloom
where
lift = liftIO . stToIO
diff --git a/Command/Sync.hs b/Command/Sync.hs
index 80ecce43e..90f4281a4 100644
--- a/Command/Sync.hs
+++ b/Command/Sync.hs
@@ -45,6 +45,7 @@ import Annex.UUID
import Logs.UUID
import Annex.AutoMerge
import Annex.Ssh
+import Annex.BloomFilter
import Utility.Bloom
import Control.Concurrent.MVar
@@ -358,7 +359,12 @@ newer remote b = do
, return True
)
-{- If it's preferred content, and we don't have it, get it from one of the
+{- Without --all, only looks at files in the work tree. With --all,
+ - makes 2 passes, first looking at the work tree and then all keys.
+ - This ensures that preferred content expressions that match on
+ - filenames work, even when in --all mode.
+ -
+ - If it's preferred content, and we don't have it, get it from one of the
- listed remotes (preferring the cheaper earlier ones).
-
- Send it to each remote that doesn't have it, and for which it's
@@ -374,22 +380,20 @@ newer remote b = do
seekSyncContent :: [Remote] -> Annex Bool
seekSyncContent rs = do
mvar <- liftIO newEmptyMVar
- -- Always start with the work tree; this ensures that preferred
- -- content expressions that match files match, even when in --all
- -- mode.
- seekworktree mvar []
- withKeyOptions' False (seekkeys mvar) (const noop) []
+ bloom <- genBloomFilter (seekworktree mvar [])
+ withKeyOptions' False (seekkeys mvar bloom) (const noop) []
liftIO $ not <$> isEmptyMVar mvar
where
- seekworktree mvar = seekHelper LsFiles.inRepo >=>
- mapM_ (\f -> ifAnnexed f (go mvar (Just f)) noop)
- seekkeys mvar getkeys = mapM_ (go mvar Nothing) =<< getkeys
- go mvar af k = do
+ seekworktree mvar l bloomfeeder = seekHelper LsFiles.inRepo l >>=
+ mapM_ (\f -> ifAnnexed f (go (Right bloomfeeder) mvar (Just f)) noop)
+ seekkeys mvar bloom getkeys =
+ mapM_ (go (Left bloom) mvar Nothing) =<< getkeys
+ go ebloom mvar af k = do
void $ liftIO $ tryPutMVar mvar ()
- syncFile rs af k
+ syncFile ebloom rs af k
-syncFile :: [Remote] -> AssociatedFile -> Key -> Annex ()
-syncFile rs af k = do
+syncFile :: Either (Bloom Key) (Key -> Annex ()) -> [Remote] -> AssociatedFile -> Key -> Annex ()
+syncFile ebloom rs af k = do
locs <- loggedLocations k
let (have, lack) = partition (\r -> Remote.uuid r `elem` locs) rs
@@ -399,10 +403,24 @@ syncFile rs af k = do
u <- getUUID
let locs' = concat [[u | got], putrs, locs]
- -- Using callCommandAction rather than includeCommandAction for drops,
- -- because a failure to drop does not mean the sync failed.
- handleDropsFrom locs' rs "unwanted" True k af
- Nothing callCommandAction
+ -- A bloom filter is populated with all the keys in the first pass.
+ -- On the second pass, avoid dropping keys that were seen in the
+ -- first pass, which would happen otherwise when preferred content
+ -- matches on the filename, which is not available in the second
+ -- pass.
+ --
+ -- When there's a false positive in the bloom filter, the result
+ -- is keeping a key that preferred content doesn't really want.
+ seenbloom <- case ebloom of
+ Left bloom -> pure (elemB k bloom)
+ Right bloomfeeder -> bloomfeeder k >> return False
+ unless seenbloom $
+ -- Using callCommandAction rather than
+ -- includeCommandAction for drops,
+ -- because a failure to drop does not mean
+ -- the sync failed.
+ handleDropsFrom locs' rs "unwanted" True k af
+ Nothing callCommandAction
where
wantget have = allM id
[ pure (not $ null have)
diff --git a/Types/Key.hs b/Types/Key.hs
index 7f9f514c7..06adc4b91 100644
--- a/Types/Key.hs
+++ b/Types/Key.hs
@@ -132,8 +132,8 @@ instance Arbitrary Key where
<*> ((succ . abs <$>) <$> arbitrary) -- chunknum cannot be 0 or negative
instance Hashable Key where
- hashIO32 = hashIO32 . show
- hashIO64 = hashIO64 . show
+ hashIO32 = hashIO32 . key2file
+ hashIO64 = hashIO64 . key2file
prop_idempotent_key_encode :: Key -> Bool
prop_idempotent_key_encode k = Just k == (file2key . key2file) k
diff --git a/Utility/Bloom.hs b/Utility/Bloom.hs
index 9076abddb..668901f76 100644
--- a/Utility/Bloom.hs
+++ b/Utility/Bloom.hs
@@ -12,6 +12,7 @@ module Utility.Bloom (
safeSuggestSizing,
Hashable(..),
cheapHashes,
+ elemB,
notElemB,
newMB,
@@ -34,6 +35,9 @@ import Control.Monad.ST (ST)
notElemB :: a -> Bloom a -> Bool
notElemB = Bloom.notElem
+elemB :: a -> Bloom a -> Bool
+elemB = Bloom.elem
+
newMB :: (a -> [Bloom.Hash]) -> Int -> ST s (MBloom.MBloom s a)
newMB = MBloom.new
@@ -48,6 +52,9 @@ unsafeFreezeMB = Bloom.unsafeFreeze
notElemB :: a -> Bloom a -> Bool
notElemB = Bloom.notElemB
+elemB :: a -> Bloom a -> Bool
+elemB = Bloom.elem
+
newMB :: (a -> [Bloom.Hash]) -> Int -> ST s (Bloom.MBloom s a)
newMB = Bloom.newMB