Increased the default annex.bloomaccuracy from 1000 to 10000000

This makes git annex unused use around 48 mb more memory than it did before, but the massive increase in accuracy makes this worthwhile for all but the smallest systems. Also, I want to use the bloom filter for sync --all --content, to avoid dropping files that the preferred content doesn't want, and 1/1000 false positives would be far too many in that use case, even if it were acceptable for unused. Actual memory use numbers: 1000: 21.06user 3.42system 0:26.40elapsed 92%CPU (0avgtext+0avgdata 501552maxresident)k 1000000: 21.41user 3.55system 0:26.84elapsed 93%CPU (0avgtext+0avgdata 549496maxresident)k 10000000: 21.84user 3.52system 0:27.89elapsed 90%CPU (0avgtext+0avgdata 549920maxresident)k Based on these numbers, 10 million seemed a better pick than 1 million.
author: Joey Hess <joeyh@joeyh.name> 2015-06-16 17:58:15 -0400
committer: Joey Hess <joeyh@joeyh.name> 2015-06-16 18:12:00 -0400
commit: 87ba1abc7cd1b199b0f7d778d9f27375b50de709 (patch)
tree: a8dcb4479872a1ddfd39053a7532e987c489f85e
parent: a5ae3ecdb722219d3cdaee652450be1b96795f83 (diff)
6 files changed, 76 insertions, 56 deletions
diff --git a/Annex/BloomFilter.hs b/Annex/BloomFilter.hs
new file mode 100644
index 000000000..3dcd8140b
--- /dev/null
+++ b/Annex/BloomFilter.hs
@@ -0,0 +1,53 @@
+{- git-annex bloom filter
+ -
+ - Copyright 2010-2015 Joey Hess <id@joeyh.name>
+ -
+ - Licensed under the GNU GPL version 3 or higher.
+ -}
+
+module Annex.BloomFilter where
+
+import Common.Annex
+import qualified Annex
+import Utility.Bloom
+
+import Control.Monad.ST
+
+{- A bloom filter capable of holding half a million keys with a
+ - false positive rate of 1 in 10000000 uses around 16 mb of memory,
+ - so will easily fit on even my lowest memory systems.
+ -}
+bloomCapacity :: Annex Int
+bloomCapacity = fromMaybe 500000 . annexBloomCapacity <$> Annex.getGitConfig
+bloomAccuracy :: Annex Int
+bloomAccuracy = fromMaybe 10000000 . annexBloomAccuracy <$> Annex.getGitConfig
+bloomBitsHashes :: Annex (Int, Int)
+bloomBitsHashes = do
+	capacity <- bloomCapacity
+	accuracy <- bloomAccuracy
+	case safeSuggestSizing capacity (1 / fromIntegral accuracy) of
+		Left e -> do
+			warning $ "bloomfilter " ++ e ++ "; falling back to sane value"
+			-- precaulculated value for 500000 (1/10000000)
+			return (16777216,23)
+		Right v -> return v
+
+{- Creates a bloom filter, and runs an action to populate it.
+ -
+ - The action is passed a callback that it can use to feed values into the
+ - bloom filter. 
+ -
+ - Once the action completes, the mutable filter is frozen
+ - for later use.
+ -}
+genBloomFilter :: Hashable t => (v -> t) -> ((v -> Annex ()) -> Annex b) -> Annex (Bloom t)
+genBloomFilter convert populate = do
+	(numbits, numhashes) <- bloomBitsHashes
+	bloom <- lift $ newMB (cheapHashes numhashes) numbits
+	_ <- populate $ \v -> lift $ insertMB bloom (convert v)
+	lift $ unsafeFreezeMB bloom
+  where
+	lift = liftIO . stToIO
+
+bloomFilter :: Hashable t => (v -> t) -> [v] -> Bloom t -> [v]
+bloomFilter convert l bloom = filter (\k -> convert k `notElemB` bloom) l
diff --git a/Command/Info.hs b/Command/Info.hs
index f5fa9c6bf..e6e0194ce 100644
--- a/Command/Info.hs
+++ b/Command/Info.hs
@@ -16,7 +16,6 @@ import Data.Tuple
 import Data.Ord
 
 import Common.Annex
-import qualified Command.Unused
 import qualified Git
 import qualified Annex
 import qualified Remote
@@ -39,6 +38,8 @@ import Types.TrustLevel
 import Types.FileMatcher
 import qualified Limit
 import Messages.JSON (DualDisp(..))
+import Annex.BloomFilter
+import qualified Command.Unused
 
 -- a named computation that produces a statistic
 type Stat = StatState (Maybe (String, StatState String))
@@ -330,17 +331,17 @@ key_name k = simpleStat "key" $ pure $ key2file k
 bloom_info :: Stat
 bloom_info = simpleStat "bloom filter size" $ do
 	localkeys <- countKeys <$> cachedPresentData
-	capacity <- fromIntegral <$> lift Command.Unused.bloomCapacity
+	capacity <- fromIntegral <$> lift bloomCapacity
 	let note = aside $
 		if localkeys >= capacity
 		then "appears too small for this repository; adjust annex.bloomcapacity"
 		else showPercentage 1 (percentage capacity localkeys) ++ " full"
 
-	-- Two bloom filters are used at the same time, so double the size
-	-- of one.
+	-- Two bloom filters are used at the same time when running
+	-- git-annex unused, so double the size of one.
 	sizer <- lift mkSizer
 	size <- sizer memoryUnits False . (* 2) . fromIntegral . fst <$>
-		lift Command.Unused.bloomBitsHashes
+		lift bloomBitsHashes
 
 	return $ size ++ note
 
diff --git a/Command/Sync.hs b/Command/Sync.hs
index 88449384d..80ecce43e 100644
--- a/Command/Sync.hs
+++ b/Command/Sync.hs
@@ -45,6 +45,7 @@ import Annex.UUID
 import Logs.UUID
 import Annex.AutoMerge
 import Annex.Ssh
+import Utility.Bloom
 
 import Control.Concurrent.MVar
 import qualified Data.Map as M
diff --git a/Command/Unused.hs b/Command/Unused.hs
index 4f844081a..82a605290 100644
--- a/Command/Unused.hs
+++ b/Command/Unused.hs
@@ -9,7 +9,6 @@
 
 module Command.Unused where
 
-import Control.Monad.ST
 import qualified Data.Map as M
 
 import Common.Annex
@@ -32,7 +31,7 @@ import Types.Key
 import Types.RefSpec
 import Git.FilePath
 import Logs.View (is_branchView)
-import Utility.Bloom
+import Annex.BloomFilter
 
 cmd :: [Command]
 cmd = [withOptions [unusedFromOption, refSpecOption] $
@@ -172,46 +171,6 @@ excludeReferenced refspec ks = runfilter firstlevel ks >>= runfilter secondlevel
 	firstlevel = withKeysReferencedM
 	secondlevel = withKeysReferencedInGit refspec
 
-{- A bloom filter capable of holding half a million keys with a
- - false positive rate of 1 in 1000 uses around 8 mb of memory,
- - so will easily fit on even my lowest memory systems.
- -}
-bloomCapacity :: Annex Int
-bloomCapacity = fromMaybe 500000 . annexBloomCapacity <$> Annex.getGitConfig
-bloomAccuracy :: Annex Int
-bloomAccuracy = fromMaybe 1000 . annexBloomAccuracy <$> Annex.getGitConfig
-bloomBitsHashes :: Annex (Int, Int)
-bloomBitsHashes = do
-	capacity <- bloomCapacity
-	accuracy <- bloomAccuracy
-	case safeSuggestSizing capacity (1 / fromIntegral accuracy) of
-		Left e -> do
-			warning $ "bloomfilter " ++ e ++ "; falling back to sane value"
-			-- precaulculated value for 500000 (1/1000)
-			return (8388608,10)
-		Right v -> return v
-
-{- Creates a bloom filter, and runs an action, such as withKeysReferenced,
- - to populate it.
- -
- - The action is passed a callback that it can use to feed values into the
- - bloom filter. 
- -
- - Once the action completes, the mutable filter is frozen
- - for later use.
- -}
-genBloomFilter :: Hashable t => (v -> t) -> ((v -> Annex ()) -> Annex b) -> Annex (Bloom t)
-genBloomFilter convert populate = do
-	(numbits, numhashes) <- bloomBitsHashes
-	bloom <- lift $ newMB (cheapHashes numhashes) numbits
-	_ <- populate $ \v -> lift $ insertMB bloom (convert v)
-	lift $ unsafeFreezeMB bloom
-  where
-	lift = liftIO . stToIO
-
-bloomFilter :: Hashable t => (v -> t) -> [v] -> Bloom t -> [v]
-bloomFilter convert l bloom = filter (\k -> convert k `notElemB` bloom) l
-
 {- Given an initial value, folds it with each key referenced by
  - symlinks in the git repo. -}
 withKeysReferenced :: v -> (Key -> v -> v) -> Annex v
diff --git a/debian/changelog b/debian/changelog
index c7b4f34e6..0c04ef8e0 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -44,6 +44,10 @@ git-annex (5.20150616) UNRELEASED; urgency=medium
   * Fix incremental backup standard preferred content expression to match
     its documentation, which says it does not want files that have reached
     a backup repository.
+  * Increased the default annex.bloomaccuracy from 1000 to 10000000.
+    This makes git annex unused use up to 16 mb more memory than it did
+    before, but the massive increase in accuracy makes this worthwhile
+    for all but the smallest systems.
 
  -- Joey Hess <id@joeyh.name>  Sat, 30 May 2015 02:07:18 -0400
 
diff --git a/doc/git-annex.mdwn b/doc/git-annex.mdwn
index e7c80f3cd..c90ef5ec2 100644
--- a/doc/git-annex.mdwn
+++ b/doc/git-annex.mdwn
@@ -830,20 +830,22 @@ Here are all the supported configuration settings.
 
 * `annex.bloomcapacity`
 
-  The `git annex unused` command uses a bloom filter to determine
-  what data is no longer used. The default bloom filter is sized to handle
-  up to 500000 keys. If your repository is larger than that,
-  you can adjust this to avoid `git annex unused` not noticing some unused
-  data files. Increasing this will make `git-annex unused` consume more memory;
+  The `git annex unused` and `git annex sync --content` commands use
+  a bloom filter to determine what files are present in eg, the work tree.
+  The default bloom filter is sized to handle
+  up to 500000 files. If your repository is larger than that,
+  you should increase this value. Larger values will
+  make `git-annex unused` and `git annex sync --content` consume more memory;
   run `git annex info` for memory usage numbers.
 
 * `annex.bloomaccuracy`
 
   Adjusts the accuracy of the bloom filter used by
-  `git annex unused`. The default accuracy is 1000 --
-  1 unused file out of 1000 will be missed by `git annex unused`. Increasing
-  the accuracy will make `git annex unused` consume more memory;
-  run `git annex info` for memory usage numbers.
+  `git annex unused` and `git annex sync --content`. 
+  The default accuracy is 10000000 -- 1 unused file out of 10000000
+  will be missed by `git annex unused`. Increasing the accuracy will make
+  `git annex unused` consume more memory; run `git annex info`
+  for memory usage numbers.
 
 * `annex.sshcaching`
author	Joey Hess <joeyh@joeyh.name>	2015-06-16 17:58:15 -0400
committer	Joey Hess <joeyh@joeyh.name>	2015-06-16 18:12:00 -0400
commit	87ba1abc7cd1b199b0f7d778d9f27375b50de709 (patch)
tree	a8dcb4479872a1ddfd39053a7532e987c489f85e
parent	a5ae3ecdb722219d3cdaee652450be1b96795f83 (diff)