finish bloom filters

Add tuning, docs, etc. Not sure if status is the right place to remote size.. perhaps unused should report the size and also warn if it sees more keys than the bloom filter allows?
author: Joey Hess <joey@kitenet.net> 2012-03-12 16:18:14 -0400
committer: Joey Hess <joey@kitenet.net> 2012-03-12 16:18:35 -0400
commit: 25809ce2e0861a54ec63a414037b95fe29acc6df (patch)
tree: 0aae10e37dc9c430ce4c182b686772f9504fb332
parent: faf3a94fa7dfaaf7f95477895c645ff793dcf2f4 (diff)
7 files changed, 64 insertions, 8 deletions
diff --git a/Command/Status.hs b/Command/Status.hs
index 0b1741dc0..eadb4f163 100644
--- a/Command/Status.hs
+++ b/Command/Status.hs
@@ -76,6 +76,7 @@ slow_stats =
 	, local_annex_size
 	, known_annex_keys
 	, known_annex_size
+	, bloom_info
 	, backend_usage
 	]
 
@@ -127,7 +128,7 @@ remote_list level desc = stat n $ nojson $ lift $ do
 	return $ if null s then "0" else show (length rs) ++ "\n" ++ beginning s
 	where
 		n = desc ++ " repositories"
-
+	
 local_annex_size :: Stat
 local_annex_size = stat "local annex size" $ json id $
 	showSizeKeys <$> cachedPresentData
@@ -136,6 +137,22 @@ local_annex_keys :: Stat
 local_annex_keys = stat "local annex keys" $ json show $
 	countKeys <$> cachedPresentData
 
+bloom_info :: Stat
+bloom_info = stat "bloom filter size" $ json id $ do
+	localkeys <- countKeys <$> cachedPresentData
+	capacity <- fromIntegral <$> lift Command.Unused.bloomCapacity
+	let note = aside $
+		if localkeys >= capacity
+		then "appears too small for this repository; adjust annex.bloomcapacity"
+		else "has room for " ++ show (capacity - localkeys) ++ " more local annex keys"
+
+	-- Two bloom filters are used at the same time, so double the size
+	-- of one.
+	size <- roughSize memoryUnits True . (* 2) . fromIntegral . fst <$>
+		lift Command.Unused.bloomBitsHashes
+
+	return $ size ++ note
+
 known_annex_size :: Stat
 known_annex_size = stat "known annex size" $ json id $
 	showSizeKeys <$> cachedReferencedData
diff --git a/Command/Unused.hs b/Command/Unused.hs
index 028e20445..b878ab265 100644
--- a/Command/Unused.hs
+++ b/Command/Unused.hs
@@ -29,6 +29,7 @@ import qualified Git.Command
 import qualified Git.Ref
 import qualified Git.LsFiles as LsFiles
 import qualified Git.LsTree as LsTree
+import qualified Git.Config
 import qualified Backend
 import qualified Remote
 import qualified Annex.Branch
@@ -182,6 +183,22 @@ exclude smaller larger = S.toList $ remove larger $ S.fromList smaller
 	where
 		remove a b = foldl (flip S.delete) b a
 
+{- A bloom filter capable of holding half a million keys with a
+ - false positive rate of 1 in 1000 uses around 8 mb of memory,
+ - so will easily fit on even my lowest memory systems.
+ -}
+bloomCapacity :: Annex Int
+bloomCapacity = fromMaybe 500000 . readish
+	<$> fromRepo (Git.Config.get "annex.bloomcapacity" "")
+bloomAccuracy :: Annex Int
+bloomAccuracy = fromMaybe 1000 . readish
+	<$> fromRepo (Git.Config.get "annex.bloomaccuracy" "")
+bloomBitsHashes :: Annex (Int, Int)
+bloomBitsHashes = do
+	capacity <- bloomCapacity
+	accuracy <- bloomAccuracy
+	return $ suggestSizing capacity (1/ fromIntegral accuracy)
+
 {- Creates a bloom filter, and runs an action, such as withKeysReferenced,
  - to populate it.
  -
@@ -193,12 +210,7 @@ exclude smaller larger = S.toList $ remove larger $ S.fromList smaller
  -}
 genBloomFilter :: Hashable t => (v -> t) -> ((v -> Annex ()) -> Annex b) -> Annex (Bloom t)
 genBloomFilter convert populate = do
-	-- A bloom filter capable of holding half a million keys with a
-	-- false positive rate of 0.1% uses around 8 mb of memory.
-	-- TODO: make this configurable, for the really large repos,
-	-- or really low false positive rates.
-	let (numbits, numhashes) = suggestSizing 500000 0.001
-
+	(numbits, numhashes) <- bloomBitsHashes
 	bloom <- lift $ newMB (cheapHashes numhashes) numbits
 	_ <- populate $ \v -> lift $ insertMB bloom (convert v)
 	lift $ unsafeFreezeMB bloom
diff --git a/debian/changelog b/debian/changelog
index 120513806..8d7337116 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -7,6 +7,14 @@ git-annex (3.20120310) UNRELEASED; urgency=low
     space, but now only needs to store the set of file contents that
     are present in the annex in memory.
   * status: Fixed to run in constant space.
+  * unused: Now uses a bloom filter, and runs in constant space.
+    Use of a bloom filter does mean it will not notice a small
+    number of unused keys. For repos with up to half a million keys,
+    it will miss one key in 1000.
+  * Added annex.bloomcapacity and annex.bloomaccuracy, which can be
+    adjusted as desired to tune the bloom filter.
+  * status: Display about of memory used by bloom filter, and
+    detect then it's too small for the number of keys in a repository.
 
  -- Joey Hess <joeyh@debian.org>  Sat, 10 Mar 2012 14:03:22 -0400
 
diff --git a/debian/control b/debian/control
index 8ea1a6259..a73433c2a 100644
--- a/debian/control
+++ b/debian/control
@@ -18,6 +18,7 @@ Build-Depends:
 	libghc-lifted-base-dev,
 	libghc-json-dev,
 	libghc-ifelse-dev,
+	libghc-bloomfilter-dev,
 	ikiwiki,
 	perlmagick,
 	git,
diff --git a/doc/git-annex.mdwn b/doc/git-annex.mdwn
index a941d4420..10899d12c 100644
--- a/doc/git-annex.mdwn
+++ b/doc/git-annex.mdwn
@@ -598,6 +598,23 @@ Here are all the supported configuration settings.
   of memory and are working with very large numbers of files, increasing
   the queue size can speed it up.
 
+* `annex.bloomcapacity`
+
+  The `git annex unused` command uses a bloom filter to determine
+  what data is no longer used. The default bloom filter is sized to handle
+  up to 500000 keys. If your repository is larger than that,
+  you can adjust this to avoid `git annex unused` not noticing some unused
+  data files. Increasing this will make `git-annex unused` consume more memory;
+  run `git annex status` for memory usage numbers.
+
+* `annex.bloomaccuracy`
+
+  Adjusts the accuracy of the bloom filter used by
+  `git annex unused`. The default accuracy is 1000 -- 
+  1 unused file out of 1000 will be missed by `git annex unused`. Increasing
+  the accuracy will make `git annex unused` consume more memory;
+  run `git annex status` for memory usage numbers.
+
 * `annex.version`
 
   Automatically maintained, and used to automate upgrades between versions.
diff --git a/doc/install.mdwn b/doc/install.mdwn
index 8de24d40d..0698a8bc4 100644
--- a/doc/install.mdwn
+++ b/doc/install.mdwn
@@ -35,6 +35,7 @@ To build and use git-annex, you will need:
   * [hS3](http://hackage.haskell.org/package/hS3)
   * [json](http://hackage.haskell.org/package/json)
   * [IfElse](http://hackage.haskell.org/package/IfElse)
+  * [bloomfilter](http://hackage.haskell.org/package/bloomfilter)
 * Shell commands
   * [git](http://git-scm.com/)
   * [uuid](http://www.ossp.org/pkg/lib/uuid/)
diff --git a/git-annex.cabal b/git-annex.cabal
index 6efebc66e..278d87555 100644
--- a/git-annex.cabal
+++ b/git-annex.cabal
@@ -32,7 +32,7 @@ Executable git-annex
    unix, containers, utf8-string, network, mtl, bytestring, old-locale, time,
    pcre-light, extensible-exceptions, dataenc, SHA, process, hs3, json, HTTP,
    base >= 4.5, base < 5, monad-control, transformers-base, lifted-base,
-   IfElse, text, QuickCheck >= 2.1
+   IfElse, text, QuickCheck >= 2.1, bloomfilter
   Other-Modules: Utility.StatFS, Utility.Touch
 
 Executable git-annex-shell
author	Joey Hess <joey@kitenet.net>	2012-03-12 16:18:14 -0400
committer	Joey Hess <joey@kitenet.net>	2012-03-12 16:18:35 -0400
commit	25809ce2e0861a54ec63a414037b95fe29acc6df (patch)
tree	0aae10e37dc9c430ce4c182b686772f9504fb332
parent	faf3a94fa7dfaaf7f95477895c645ff793dcf2f4 (diff)