From 2a4a641a3c940a652f62371f417433b47f7f3e79 Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Tue, 16 Jun 2015 18:38:12 -0400 Subject: use bloom filter in second pass of sync --all --content This is needed because when preferred content matches on files, the second pass would otherwise want to drop all keys. Using a bloom filter avoids this, and in the case of a false positive, a key will be left undropped that preferred content would allow dropping. Chances of that happening are a mere 1 in 1 million. --- Annex/BloomFilter.hs | 4 ++-- Command/Sync.hs | 52 +++++++++++++++++++++++++++++++++++----------------- Types/Key.hs | 4 ++-- Utility/Bloom.hs | 7 +++++++ 4 files changed, 46 insertions(+), 21 deletions(-) diff --git a/Annex/BloomFilter.hs b/Annex/BloomFilter.hs index 3ac81fa58..5773a88ee 100644 --- a/Annex/BloomFilter.hs +++ b/Annex/BloomFilter.hs @@ -40,11 +40,11 @@ bloomBitsHashes = do - Once the action completes, the mutable filter is frozen - for later use. -} -genBloomFilter :: Hashable v => ((v -> Annex ()) -> Annex b) -> Annex (Bloom v) +genBloomFilter :: Hashable v => ((v -> Annex ()) -> Annex ()) -> Annex (Bloom v) genBloomFilter populate = do (numbits, numhashes) <- bloomBitsHashes bloom <- lift $ newMB (cheapHashes numhashes) numbits - _ <- populate $ \v -> lift $ insertMB bloom v + populate $ \v -> lift $ insertMB bloom v lift $ unsafeFreezeMB bloom where lift = liftIO . stToIO diff --git a/Command/Sync.hs b/Command/Sync.hs index 80ecce43e..90f4281a4 100644 --- a/Command/Sync.hs +++ b/Command/Sync.hs @@ -45,6 +45,7 @@ import Annex.UUID import Logs.UUID import Annex.AutoMerge import Annex.Ssh +import Annex.BloomFilter import Utility.Bloom import Control.Concurrent.MVar @@ -358,7 +359,12 @@ newer remote b = do , return True ) -{- If it's preferred content, and we don't have it, get it from one of the +{- Without --all, only looks at files in the work tree. With --all, + - makes 2 passes, first looking at the work tree and then all keys. + - This ensures that preferred content expressions that match on + - filenames work, even when in --all mode. + - + - If it's preferred content, and we don't have it, get it from one of the - listed remotes (preferring the cheaper earlier ones). - - Send it to each remote that doesn't have it, and for which it's @@ -374,22 +380,20 @@ newer remote b = do seekSyncContent :: [Remote] -> Annex Bool seekSyncContent rs = do mvar <- liftIO newEmptyMVar - -- Always start with the work tree; this ensures that preferred - -- content expressions that match files match, even when in --all - -- mode. - seekworktree mvar [] - withKeyOptions' False (seekkeys mvar) (const noop) [] + bloom <- genBloomFilter (seekworktree mvar []) + withKeyOptions' False (seekkeys mvar bloom) (const noop) [] liftIO $ not <$> isEmptyMVar mvar where - seekworktree mvar = seekHelper LsFiles.inRepo >=> - mapM_ (\f -> ifAnnexed f (go mvar (Just f)) noop) - seekkeys mvar getkeys = mapM_ (go mvar Nothing) =<< getkeys - go mvar af k = do + seekworktree mvar l bloomfeeder = seekHelper LsFiles.inRepo l >>= + mapM_ (\f -> ifAnnexed f (go (Right bloomfeeder) mvar (Just f)) noop) + seekkeys mvar bloom getkeys = + mapM_ (go (Left bloom) mvar Nothing) =<< getkeys + go ebloom mvar af k = do void $ liftIO $ tryPutMVar mvar () - syncFile rs af k + syncFile ebloom rs af k -syncFile :: [Remote] -> AssociatedFile -> Key -> Annex () -syncFile rs af k = do +syncFile :: Either (Bloom Key) (Key -> Annex ()) -> [Remote] -> AssociatedFile -> Key -> Annex () +syncFile ebloom rs af k = do locs <- loggedLocations k let (have, lack) = partition (\r -> Remote.uuid r `elem` locs) rs @@ -399,10 +403,24 @@ syncFile rs af k = do u <- getUUID let locs' = concat [[u | got], putrs, locs] - -- Using callCommandAction rather than includeCommandAction for drops, - -- because a failure to drop does not mean the sync failed. - handleDropsFrom locs' rs "unwanted" True k af - Nothing callCommandAction + -- A bloom filter is populated with all the keys in the first pass. + -- On the second pass, avoid dropping keys that were seen in the + -- first pass, which would happen otherwise when preferred content + -- matches on the filename, which is not available in the second + -- pass. + -- + -- When there's a false positive in the bloom filter, the result + -- is keeping a key that preferred content doesn't really want. + seenbloom <- case ebloom of + Left bloom -> pure (elemB k bloom) + Right bloomfeeder -> bloomfeeder k >> return False + unless seenbloom $ + -- Using callCommandAction rather than + -- includeCommandAction for drops, + -- because a failure to drop does not mean + -- the sync failed. + handleDropsFrom locs' rs "unwanted" True k af + Nothing callCommandAction where wantget have = allM id [ pure (not $ null have) diff --git a/Types/Key.hs b/Types/Key.hs index 7f9f514c7..06adc4b91 100644 --- a/Types/Key.hs +++ b/Types/Key.hs @@ -132,8 +132,8 @@ instance Arbitrary Key where <*> ((succ . abs <$>) <$> arbitrary) -- chunknum cannot be 0 or negative instance Hashable Key where - hashIO32 = hashIO32 . show - hashIO64 = hashIO64 . show + hashIO32 = hashIO32 . key2file + hashIO64 = hashIO64 . key2file prop_idempotent_key_encode :: Key -> Bool prop_idempotent_key_encode k = Just k == (file2key . key2file) k diff --git a/Utility/Bloom.hs b/Utility/Bloom.hs index 9076abddb..668901f76 100644 --- a/Utility/Bloom.hs +++ b/Utility/Bloom.hs @@ -12,6 +12,7 @@ module Utility.Bloom ( safeSuggestSizing, Hashable(..), cheapHashes, + elemB, notElemB, newMB, @@ -34,6 +35,9 @@ import Control.Monad.ST (ST) notElemB :: a -> Bloom a -> Bool notElemB = Bloom.notElem +elemB :: a -> Bloom a -> Bool +elemB = Bloom.elem + newMB :: (a -> [Bloom.Hash]) -> Int -> ST s (MBloom.MBloom s a) newMB = MBloom.new @@ -48,6 +52,9 @@ unsafeFreezeMB = Bloom.unsafeFreeze notElemB :: a -> Bloom a -> Bool notElemB = Bloom.notElemB +elemB :: a -> Bloom a -> Bool +elemB = Bloom.elem + newMB :: (a -> [Bloom.Hash]) -> Int -> ST s (Bloom.MBloom s a) newMB = Bloom.newMB -- cgit v1.2.3