summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorGravatar Joey Hess <joey@kitenet.net>2012-03-11 22:00:49 -0400
committerGravatar Joey Hess <joey@kitenet.net>2012-03-12 02:39:25 -0400
commit160715166b95f17b76b4b0bd47bbec4fdc6c1aac (patch)
tree4b6dab0754e8aba0a386bd2eff1f1405e1cc6c3e
parent83bbb3bc9330de7c4d2b8c07ef48da3bea82433d (diff)
try at using bloom filters
leaks memory
-rw-r--r--Command/Unused.hs19
1 files changed, 18 insertions, 1 deletions
diff --git a/Command/Unused.hs b/Command/Unused.hs
index 71cbfd470..2fb278126 100644
--- a/Command/Unused.hs
+++ b/Command/Unused.hs
@@ -12,6 +12,9 @@ module Command.Unused where
import qualified Data.Set as S
import qualified Data.Text.Lazy as L
import qualified Data.Text.Lazy.Encoding as L
+import Data.BloomFilter
+import Data.BloomFilter.Easy
+import Data.BloomFilter.Hash
import Common.Annex
import Command
@@ -53,6 +56,18 @@ start = do
showStart "unused" name
next action
+genBloomFilter :: [Key] -> Annex (Bloom String)
+genBloomFilter ks = do
+ -- A bloom filter capable of holding one million keys with a
+ -- false positive rate of 0.1% uses 16 mb of memory.
+ -- TODO: make this configurable, for the really large repos,
+ -- or really low false positive rates.
+ let (numbits, numhashes) = suggestSizing 1000000 0.0001
+ return $ fromListB (cheapHashes numhashes) numbits $ map show ks
+
+bloomFilter :: Bloom String -> [Key] -> [Key]
+bloomFilter b l = filter (\k -> show k `notElemB` b) l
+
checkUnused :: CommandPerform
checkUnused = chain 0
[ check "" unusedMsg $ findunused =<< Annex.getState Annex.fast
@@ -65,7 +80,9 @@ checkUnused = chain 0
return []
findunused False = do
showAction "checking for unused data"
- excludeReferenced =<< getKeysPresent
+ b <- genBloomFilter =<< withKeysReferenced [] (:)
+ bloomFilter b <$> getKeysPresent
+ -- TODO: check branches
chain _ [] = next $ return True
chain v (a:as) = do
v' <- a v