diff options
author | Joey Hess <joey@kitenet.net> | 2012-03-11 22:00:49 -0400 |
---|---|---|
committer | Joey Hess <joey@kitenet.net> | 2012-03-12 02:39:25 -0400 |
commit | 160715166b95f17b76b4b0bd47bbec4fdc6c1aac (patch) | |
tree | 4b6dab0754e8aba0a386bd2eff1f1405e1cc6c3e /Command | |
parent | 83bbb3bc9330de7c4d2b8c07ef48da3bea82433d (diff) |
try at using bloom filters
leaks memory
Diffstat (limited to 'Command')
-rw-r--r-- | Command/Unused.hs | 19 |
1 files changed, 18 insertions, 1 deletions
diff --git a/Command/Unused.hs b/Command/Unused.hs index 71cbfd470..2fb278126 100644 --- a/Command/Unused.hs +++ b/Command/Unused.hs @@ -12,6 +12,9 @@ module Command.Unused where import qualified Data.Set as S import qualified Data.Text.Lazy as L import qualified Data.Text.Lazy.Encoding as L +import Data.BloomFilter +import Data.BloomFilter.Easy +import Data.BloomFilter.Hash import Common.Annex import Command @@ -53,6 +56,18 @@ start = do showStart "unused" name next action +genBloomFilter :: [Key] -> Annex (Bloom String) +genBloomFilter ks = do + -- A bloom filter capable of holding one million keys with a + -- false positive rate of 0.1% uses 16 mb of memory. + -- TODO: make this configurable, for the really large repos, + -- or really low false positive rates. + let (numbits, numhashes) = suggestSizing 1000000 0.0001 + return $ fromListB (cheapHashes numhashes) numbits $ map show ks + +bloomFilter :: Bloom String -> [Key] -> [Key] +bloomFilter b l = filter (\k -> show k `notElemB` b) l + checkUnused :: CommandPerform checkUnused = chain 0 [ check "" unusedMsg $ findunused =<< Annex.getState Annex.fast @@ -65,7 +80,9 @@ checkUnused = chain 0 return [] findunused False = do showAction "checking for unused data" - excludeReferenced =<< getKeysPresent + b <- genBloomFilter =<< withKeysReferenced [] (:) + bloomFilter b <$> getKeysPresent + -- TODO: check branches chain _ [] = next $ return True chain v (a:as) = do v' <- a v |