aboutsummaryrefslogtreecommitdiff
path: root/Annex/BloomFilter.hs
blob: af467503a031bec5b36250fed9543a178dc0f453 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
{- git-annex bloom filter
 -
 - Copyright 2010-2015 Joey Hess <id@joeyh.name>
 -
 - Licensed under the GNU GPL version 3 or higher.
 -}

module Annex.BloomFilter where

import Annex.Common
import qualified Annex
import Utility.Bloom

import Control.Monad.ST

{- A bloom filter capable of holding half a million keys with a
 - false positive rate of 1 in 10000000 uses around 16 mb of memory,
 - so will easily fit on even my lowest memory systems.
 -}
bloomCapacity :: Annex Int
bloomCapacity = fromMaybe 500000 . annexBloomCapacity <$> Annex.getGitConfig
bloomAccuracy :: Annex Int
bloomAccuracy = fromMaybe 10000000 . annexBloomAccuracy <$> Annex.getGitConfig
bloomBitsHashes :: Annex (Int, Int)
bloomBitsHashes = do
	capacity <- bloomCapacity
	accuracy <- bloomAccuracy
	case safeSuggestSizing capacity (1 / fromIntegral accuracy) of
		Left e -> do
			warning $ "bloomfilter " ++ e ++ "; falling back to sane value"
			-- precaulculated value for 500000 (1/10000000)
			return (16777216,23)
		Right v -> return v

{- Creates a bloom filter, and runs an action to populate it.
 -
 - The action is passed a callback that it can use to feed values into the
 - bloom filter. 
 -
 - Once the action completes, the mutable filter is frozen
 - for later use.
 -}
genBloomFilter :: Hashable v => ((v -> Annex ()) -> Annex ()) -> Annex (Bloom v)
genBloomFilter populate = do
	(numbits, numhashes) <- bloomBitsHashes
	bloom <- lift $ newMB (cheapHashes numhashes) numbits
	populate $ \v -> lift $ insertMB bloom v
	lift $ unsafeFreezeMB bloom
  where
	lift = liftIO . stToIO

bloomFilter :: [v] -> Bloom v -> [v]
bloomFilter l bloom = filter (\v -> v `notElemB` bloom) l