diff options
author | Joey Hess <joey@kitenet.net> | 2012-03-12 16:18:14 -0400 |
---|---|---|
committer | Joey Hess <joey@kitenet.net> | 2012-03-12 16:18:35 -0400 |
commit | 25809ce2e0861a54ec63a414037b95fe29acc6df (patch) | |
tree | 0aae10e37dc9c430ce4c182b686772f9504fb332 /Command/Unused.hs | |
parent | faf3a94fa7dfaaf7f95477895c645ff793dcf2f4 (diff) |
finish bloom filters
Add tuning, docs, etc.
Not sure if status is the right place to report the size.. perhaps unused
should report the size and also warn if it sees more keys than the bloom
filter allows?
Diffstat (limited to 'Command/Unused.hs')
-rw-r--r-- | Command/Unused.hs | 24 |
1 files changed, 18 insertions, 6 deletions
diff --git a/Command/Unused.hs b/Command/Unused.hs
index 028e20445..b878ab265 100644
--- a/Command/Unused.hs
+++ b/Command/Unused.hs
@@ -29,6 +29,7 @@ import qualified Git.Command
 import qualified Git.Ref
 import qualified Git.LsFiles as LsFiles
 import qualified Git.LsTree as LsTree
+import qualified Git.Config
 import qualified Backend
 import qualified Remote
 import qualified Annex.Branch
@@ -182,6 +183,22 @@ exclude smaller larger = S.toList $ remove larger $ S.fromList smaller
 	where
 		remove a b = foldl (flip S.delete) b a
 
+{- A bloom filter capable of holding half a million keys with a
+ - false positive rate of 1 in 1000 uses around 8 mb of memory,
+ - so will easily fit on even my lowest memory systems.
+ -}
+bloomCapacity :: Annex Int
+bloomCapacity = fromMaybe 500000 . readish
+	<$> fromRepo (Git.Config.get "annex.bloomcapacity" "")
+bloomAccuracy :: Annex Int
+bloomAccuracy = fromMaybe 1000 . readish
+	<$> fromRepo (Git.Config.get "annex.bloomaccuracy" "")
+bloomBitsHashes :: Annex (Int, Int)
+bloomBitsHashes = do
+	capacity <- bloomCapacity
+	accuracy <- bloomAccuracy
+	return $ suggestSizing capacity (1/ fromIntegral accuracy)
+
 {- Creates a bloom filter, and runs an action, such as withKeysReferenced,
  - to populate it.
  -
@@ -193,12 +210,7 @@ exclude smaller larger = S.toList $ remove larger $ S.fromList smaller
  -}
 genBloomFilter :: Hashable t => (v -> t) -> ((v -> Annex ()) -> Annex b) -> Annex (Bloom t)
 genBloomFilter convert populate = do
-	-- A bloom filter capable of holding half a million keys with a
-	-- false positive rate of 0.1% uses around 8 mb of memory.
-	-- TODO: make this configurable, for the really large repos,
-	-- or really low false positive rates.
-	let (numbits, numhashes) = suggestSizing 500000 0.001
-
+	(numbits, numhashes) <- bloomBitsHashes
 	bloom <- lift $ newMB (cheapHashes numhashes) numbits
 	_ <- populate $ \v -> lift $ insertMB bloom (convert v)
 	lift $ unsafeFreezeMB bloom