summaryrefslogtreecommitdiff
path: root/Annex/BloomFilter.hs
diff options
context:
space:
mode:
Diffstat (limited to 'Annex/BloomFilter.hs')
-rw-r--r--Annex/BloomFilter.hs53
1 files changed, 53 insertions, 0 deletions
diff --git a/Annex/BloomFilter.hs b/Annex/BloomFilter.hs
new file mode 100644
index 000000000..3dcd8140b
--- /dev/null
+++ b/Annex/BloomFilter.hs
@@ -0,0 +1,53 @@
+{- git-annex bloom filter
+ -
+ - Copyright 2010-2015 Joey Hess <id@joeyh.name>
+ -
+ - Licensed under the GNU GPL version 3 or higher.
+ -}
+
+module Annex.BloomFilter where
+
+import Common.Annex
+import qualified Annex
+import Utility.Bloom
+
+import Control.Monad.ST
+
+{- A bloom filter capable of holding half a million keys with a
+ - false positive rate of 1 in 10000000 uses around 16 mb of memory,
+ - so will easily fit on even my lowest memory systems.
+ -}
+bloomCapacity :: Annex Int
+bloomCapacity = fromMaybe 500000 . annexBloomCapacity <$> Annex.getGitConfig
+bloomAccuracy :: Annex Int
+bloomAccuracy = fromMaybe 10000000 . annexBloomAccuracy <$> Annex.getGitConfig
+bloomBitsHashes :: Annex (Int, Int)
+bloomBitsHashes = do
+ capacity <- bloomCapacity
+ accuracy <- bloomAccuracy
+ case safeSuggestSizing capacity (1 / fromIntegral accuracy) of
+ Left e -> do
+ warning $ "bloomfilter " ++ e ++ "; falling back to sane value"
+ -- precaulculated value for 500000 (1/10000000)
+ return (16777216,23)
+ Right v -> return v
+
+{- Creates a bloom filter, and runs an action to populate it.
+ -
+ - The action is passed a callback that it can use to feed values into the
+ - bloom filter.
+ -
+ - Once the action completes, the mutable filter is frozen
+ - for later use.
+ -}
+genBloomFilter :: Hashable t => (v -> t) -> ((v -> Annex ()) -> Annex b) -> Annex (Bloom t)
+genBloomFilter convert populate = do
+ (numbits, numhashes) <- bloomBitsHashes
+ bloom <- lift $ newMB (cheapHashes numhashes) numbits
+ _ <- populate $ \v -> lift $ insertMB bloom (convert v)
+ lift $ unsafeFreezeMB bloom
+ where
+ lift = liftIO . stToIO
+
+bloomFilter :: Hashable t => (v -> t) -> [v] -> Bloom t -> [v]
+bloomFilter convert l bloom = filter (\k -> convert k `notElemB` bloom) l