diff options
author | Joey Hess <joey@kitenet.net> | 2012-03-12 15:21:20 -0400 |
---|---|---|
committer | Joey Hess <joey@kitenet.net> | 2012-03-12 15:21:58 -0400 |
commit | faf3a94fa7dfaaf7f95477895c645ff793dcf2f4 (patch) | |
tree | 9a7854ad4b5516f7451c71cb02625472de5e9a45 | |
parent | 32f9742a883c2a9f6ed29f1261c64c7590ecc8ed (diff) |
added second stage bloom filter
-rw-r--r-- | Command/Unused.hs | 112 |
1 files changed, 68 insertions, 44 deletions
diff --git a/Command/Unused.hs b/Command/Unused.hs index cc7ff7c71..028e20445 100644 --- a/Command/Unused.hs +++ b/Command/Unused.hs @@ -69,9 +69,7 @@ checkUnused = chain 0 return [] findunused False = do showAction "checking for unused data" - b <- genBloomFilter show withKeysReferenced' - bloomFilter b <$> getKeysPresent - -- TODO: check branches + excludeReferenced =<< getKeysPresent chain _ [] = next $ return True chain v (a:as) = do v' <- a v @@ -145,28 +143,32 @@ dropMsg (Just r) = dropMsg' $ " --from " ++ Remote.name r dropMsg' :: String -> String dropMsg' s = "\nTo remove unwanted data: git-annex dropunused" ++ s ++ " NUMBER\n" -{- Finds keys in the list that are not referenced in the git repository. -} +{- Finds keys in the list that are not referenced in the git repository. + - + - Strategy: + - + - * Build a bloom filter of all keys referenced by symlinks. This + - is the fastest one to build and will filter out most keys. + - * If keys remain, build a second bloom filter of keys referenced by + - all branches. + - * The list is streamed through these bloom filters lazily, so both will + - exist at the same time. This means that twice the memory is used, + - but they're relatively small, so the added complexity of using a + - mutable bloom filter does not seem worthwhile. + - * Generating the second bloom filter can take quite a while, since + - it needs enumerating all keys in all git branches. But, the common + - case, if the second filter is needed, is for some keys to be globally + - unused, and in that case, no short-circuit is possible. + - Short-circuiting if the first filter filters all the keys handles the + - other common case. + -} excludeReferenced :: [Key] -> Annex [Key] -excludeReferenced [] = return [] -- optimisation -excludeReferenced l = do - let s = S.fromList l - !s' <- withKeysReferenced s S.delete - go s' =<< refs <$> (inRepo $ Git.Command.pipeRead [Param "show-ref"]) +excludeReferenced ks = runfilter firstlevel ks >>= runfilter secondlevel where - -- Skip the git-annex branches, and get all other unique refs. - refs = map (Git.Ref . snd) . - nubBy uniqref . - filter ourbranches . - map (separate (== ' ')) . lines - uniqref (a, _) (b, _) = a == b - ourbranchend = '/' : show Annex.Branch.name - ourbranches (_, b) = not $ ourbranchend `isSuffixOf` b - go s [] = return $ S.toList s - go s (r:rs) - | s == S.empty = return [] -- optimisation - | otherwise = do - s' <- withKeysReferencedInGit r s S.delete - go s' rs + runfilter _ [] = return [] -- optimisation + runfilter a l = bloomFilter show l <$> genBloomFilter show a + firstlevel = withKeysReferencedM + secondlevel = withKeysReferencedInGit {- Finds items in the first, smaller list, that are not - present in the second, larger list. @@ -180,7 +182,7 @@ exclude smaller larger = S.toList $ remove larger $ S.fromList smaller where remove a b = foldl (flip S.delete) b a -{- Creates a bloom filter, and runs an action, such as withKeysReferenced', +{- Creates a bloom filter, and runs an action, such as withKeysReferenced, - to populate it. - - The action is passed a callback that it can use to feed values into the @@ -189,29 +191,36 @@ exclude smaller larger = S.toList $ remove larger $ S.fromList smaller - Once the action completes, the mutable filter is frozen - for later use. -} -genBloomFilter :: Hashable t => (v -> t) -> (() -> (v -> () -> Annex ()) -> Annex b) -> Annex (Bloom t) +genBloomFilter :: Hashable t => (v -> t) -> ((v -> Annex ()) -> Annex b) -> Annex (Bloom t) genBloomFilter convert populate = do - -- A bloom filter capable of holding one million keys with a - -- false positive rate of 0.1% uses 16 mb of memory. + -- A bloom filter capable of holding half a million keys with a + -- false positive rate of 0.1% uses around 8 mb of memory. -- TODO: make this configurable, for the really large repos, -- or really low false positive rates. - let (numbits, numhashes) = suggestSizing 1000000 0.0001 + let (numbits, numhashes) = suggestSizing 500000 0.001 bloom <- lift $ newMB (cheapHashes numhashes) numbits - _ <- populate () $ \v _ -> lift $ insertMB bloom (convert v) + _ <- populate $ \v -> lift $ insertMB bloom (convert v) lift $ unsafeFreezeMB bloom where lift = liftIO . stToIO -bloomFilter :: Bloom String -> [Key] -> [Key] -bloomFilter b l = filter (\k -> show k `notElemB` b) l +bloomFilter :: Hashable t => (v -> t) -> [v] -> Bloom t -> [v] +bloomFilter convert l bloom = filter (\k -> convert k `notElemB` bloom) l -{- Given an initial value, mutates it using an action for each - - key referenced by symlinks in the git repo. -} +{- Given an initial value, folds it with each key referenced by + - symlinks in the git repo. -} withKeysReferenced :: v -> (Key -> v -> v) -> Annex v -withKeysReferenced initial a = withKeysReferenced' initial reta +withKeysReferenced initial a = withKeysReferenced' initial folda + where + folda k v = return $ a k v + +{- Runs an action on each referenced key in the git repo. -} +withKeysReferencedM :: (Key -> Annex ()) -> Annex () +withKeysReferencedM a = withKeysReferenced' () calla where - reta k v = return $ a k v + calla k _ = a k + withKeysReferenced' :: v -> (Key -> v -> Annex v) -> Annex v withKeysReferenced' initial a = go initial =<< files where @@ -227,21 +236,36 @@ withKeysReferenced' initial a = go initial =<< files !v' <- a k v go v' fs -withKeysReferencedInGit :: Git.Ref -> v -> (Key -> v -> v) -> Annex v -withKeysReferencedInGit ref initial a = do + +withKeysReferencedInGit :: (Key -> Annex ()) -> Annex () +withKeysReferencedInGit a = do + rs <- relevantrefs <$> showref + forM_ rs (withKeysReferencedInGitRef a) + where + showref = inRepo $ Git.Command.pipeRead [Param "show-ref"] + relevantrefs = map (Git.Ref . snd) . + nubBy uniqref . + filter ourbranches . + map (separate (== ' ')) . lines + uniqref (x, _) (y, _) = x == y + ourbranchend = '/' : show Annex.Branch.name + ourbranches (_, b) = not $ ourbranchend `isSuffixOf` b + +withKeysReferencedInGitRef :: (Key -> Annex ()) -> Git.Ref -> Annex () +withKeysReferencedInGitRef a ref = do showAction $ "checking " ++ Git.Ref.describe ref - go initial =<< inRepo (LsTree.lsTree ref) + go =<< inRepo (LsTree.lsTree ref) where - go v [] = return v - go v (l:ls) + go [] = return () + go (l:ls) | isSymLink (LsTree.mode l) = do content <- L.decodeUtf8 <$> catFile ref (LsTree.file l) case fileKey (takeFileName $ L.unpack content) of - Nothing -> go v ls + Nothing -> go ls Just k -> do - let !v' = a k v - go v' ls - | otherwise = go v ls + a k + go ls + | otherwise = go ls {- Looks in the specified directory for bad/tmp keys, and returns a list - of those that might still have value, or might be stale and removable. |