6 files changed, 76 insertions, 56 deletions
diff --git a/Annex/BloomFilter.hs b/Annex/BloomFilter.hs
new file mode 100644
index 000000000..3dcd8140b
--- /dev/null
+++ b/Annex/BloomFilter.hs
@@ -0,0 +1,53 @@
+{- git-annex bloom filter
+ -
+ - Copyright 2010-2015 Joey Hess <id@joeyh.name>
+ -
+ - Licensed under the GNU GPL version 3 or higher.
+ -}
+
+module Annex.BloomFilter where
+
+import Common.Annex
+import qualified Annex
+import Utility.Bloom
+
+import Control.Monad.ST
+
+{- A bloom filter capable of holding half a million keys with a
+ - false positive rate of 1 in 10000000 uses around 16 mb of memory,
+ - so will easily fit on even my lowest memory systems.
+ -}
+bloomCapacity :: Annex Int
+bloomCapacity = fromMaybe 500000 . annexBloomCapacity <$> Annex.getGitConfig
+bloomAccuracy :: Annex Int
+bloomAccuracy = fromMaybe 10000000 . annexBloomAccuracy <$> Annex.getGitConfig
+bloomBitsHashes :: Annex (Int, Int)
+bloomBitsHashes = do
+	capacity <- bloomCapacity
+	accuracy <- bloomAccuracy
+	case safeSuggestSizing capacity (1 / fromIntegral accuracy) of
+		Left e -> do
+			warning $ "bloomfilter " ++ e ++ "; falling back to sane value"
+			-- precaulculated value for 500000 (1/10000000)
+			return (16777216,23)
+		Right v -> return v
+
+{- Creates a bloom filter, and runs an action to populate it.
+ -
+ - The action is passed a callback that it can use to feed values into the
+ - bloom filter. 
+ -
+ - Once the action completes, the mutable filter is frozen
+ - for later use.
+ -}
+genBloomFilter :: Hashable t => (v -> t) -> ((v -> Annex ()) -> Annex b) -> Annex (Bloom t)
+genBloomFilter convert populate = do
+	(numbits, numhashes) <- bloomBitsHashes
+	bloom <- lift $ newMB (cheapHashes numhashes) numbits
+	_ <- populate $ \v -> lift $ insertMB bloom (convert v)
+	lift $ unsafeFreezeMB bloom
+  where
+	lift = liftIO . stToIO
+
+bloomFilter :: Hashable t => (v -> t) -> [v] -> Bloom t -> [v]
+bloomFilter convert l bloom = filter (\k -> convert k `notElemB` bloom) l
diff --git a/Command/Info.hs b/Command/Info.hs
index f5fa9c6bf..e6e0194ce 100644
--- a/Command/Info.hs
+++ b/Command/Info.hs
@@ -16,7 +16,6 @@ import Data.Tuple
 import Data.Ord
 
 import Common.Annex
-import qualified Command.Unused
 import qualified Git
 import qualified Annex
 import qualified Remote
@@ -39,6 +38,8 @@ import Types.TrustLevel
 import Types.FileMatcher
 import qualified Limit
 import Messages.JSON (DualDisp(..))
+import Annex.BloomFilter
+import qualified Command.Unused
 
 -- a named computation that produces a statistic
 type Stat = StatState (Maybe (String, StatState String))
@@ -330,17 +331,17 @@ key_name k = simpleStat "key" $ pure $ key2file k
 bloom_info :: Stat
 bloom_info = simpleStat "bloom filter size" $ do
 	localkeys <- countKeys <$> cachedPresentData
-	capacity <- fromIntegral <$> lift Command.Unused.bloomCapacity
+	capacity <- fromIntegral <$> lift bloomCapacity
 	let note = aside $
 		if localkeys >= capacity
 		then "appears too small for this repository; adjust annex.bloomcapacity"
 		else showPercentage 1 (percentage capacity localkeys) ++ " full"
 
-	-- Two bloom filters are used at the same time, so double the size
-	-- of one.
+	-- Two bloom filters are used at the same time when running
+	-- git-annex unused, so double the size of one.
 	sizer <- lift mkSizer
 	size <- sizer memoryUnits False . (* 2) . fromIntegral . fst <$>
-		lift Command.Unused.bloomBitsHashes
+		lift bloomBitsHashes
 
 	return $ size ++ note
 
diff --git a/Command/Sync.hs b/Command/Sync.hs
index 88449384d..80ecce43e 100644
--- a/Command/Sync.hs
+++ b/Command/Sync.hs
@@ -45,6 +45,7 @@ import Annex.UUID
 import Logs.UUID
 import Annex.AutoMerge
 import Annex.Ssh
+import Utility.Bloom
 
 import Control.Concurrent.MVar
 import qualified Data.Map as M
diff --git a/Command/Unused.hs b/Command/Unused.hs
index 4f844081a..82a605290 100644
--- a/Command/Unused.hs
+++ b/Command/Unused.hs
@@ -9,7 +9,6 @@
 
 module Command.Unused where
 
-import Control.Monad.ST
 import qualified Data.Map as M
 
 import Common.Annex
@@ -32,7 +31,7 @@ import Types.Key
 import Types.RefSpec
 import Git.FilePath
 import Logs.View (is_branchView)
-import Utility.Bloom
+import Annex.BloomFilter
 
 cmd :: [Command]
 cmd = [withOptions [unusedFromOption, refSpecOption] $
@@ -172,46 +171,6 @@ excludeReferenced refspec ks = runfilter firstlevel ks >>= runfilter secondlevel
 	firstlevel = withKeysReferencedM
 	secondlevel = withKeysReferencedInGit refspec
 
-{- A bloom filter capable of holding half a million keys with a
- - false positive rate of 1 in 1000 uses around 8 mb of memory,
- - so will easily fit on even my lowest memory systems.
- -}
-bloomCapacity :: Annex Int
-bloomCapacity = fromMaybe 500000 . annexBloomCapacity <$> Annex.getGitConfig
-bloomAccuracy :: Annex Int
-bloomAccuracy = fromMaybe 1000 . annexBloomAccuracy <$> Annex.getGitConfig
-bloomBitsHashes :: Annex (Int, Int)
-bloomBitsHashes = do
-	capacity <- bloomCapacity
-	accuracy <- bloomAccuracy
-	case safeSuggestSizing capacity (1 / fromIntegral accuracy) of
-		Left e -> do
-			warning $ "bloomfilter " ++ e ++ "; falling back to sane value"
-			-- precaulculated value for 500000 (1/1000)
-			return (8388608,10)
-		Right v -> return v
-
-{- Creates a bloom filter, and runs an action, such as withKeysReferenced,
- - to populate it.
- -
- - The action is passed a callback that it can use to feed values into the
- - bloom filter. 
- -
- - Once the action completes, the mutable filter is frozen
- - for later use.
- -}
-genBloomFilter :: Hashable t => (v -> t) -> ((v -> Annex ()) -> Annex b) -> Annex (Bloom t)
-genBloomFilter convert populate = do
-	(numbits, numhashes) <- bloomBitsHashes
-	bloom <- lift $ newMB (cheapHashes numhashes) numbits
-	_ <- populate $ \v -> lift $ insertMB bloom (convert v)
-	lift $ unsafeFreezeMB bloom
-  where
-	lift = liftIO . stToIO
-
-bloomFilter :: Hashable t => (v -> t) -> [v] -> Bloom t -> [v]
-bloomFilter convert l bloom = filter (\k -> convert k `notElemB` bloom) l
-
 {- Given an initial value, folds it with each key referenced by
  - symlinks in the git repo. -}
 withKeysReferenced :: v -> (Key -> v -> v) -> Annex v
diff --git a/debian/changelog b/debian/changelog
index c7b4f34e6..0c04ef8e0 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -44,6 +44,10 @@ git-annex (5.20150616) UNRELEASED; urgency=medium
   * Fix incremental backup standard preferred content expression to match
     its documentation, which says it does not want files that have reached
     a backup repository.
+  * Increased the default annex.bloomaccuracy from 1000 to 10000000.
+    This makes git annex unused use up to 16 mb more memory than it did
+    before, but the massive increase in accuracy makes this worthwhile
+    for all but the smallest systems.
 
  -- Joey Hess <id@joeyh.name>  Sat, 30 May 2015 02:07:18 -0400
 
diff --git a/doc/git-annex.mdwn b/doc/git-annex.mdwn
index e7c80f3cd..c90ef5ec2 100644
--- a/doc/git-annex.mdwn
+++ b/doc/git-annex.mdwn
@@ -830,20 +830,22 @@ Here are all the supported configuration settings.
 
 * `annex.bloomcapacity`
 
-  The `git annex unused` command uses a bloom filter to determine
-  what data is no longer used. The default bloom filter is sized to handle
-  up to 500000 keys. If your repository is larger than that,
-  you can adjust this to avoid `git annex unused` not noticing some unused
-  data files. Increasing this will make `git-annex unused` consume more memory;
+  The `git annex unused` and `git annex sync --content` commands use
+  a bloom filter to determine what files are present in eg, the work tree.
+  The default bloom filter is sized to handle
+  up to 500000 files. If your repository is larger than that,
+  you should increase this value. Larger values will
+  make `git-annex unused` and `git annex sync --content` consume more memory;
   run `git annex info` for memory usage numbers.
 
 * `annex.bloomaccuracy`
 
   Adjusts the accuracy of the bloom filter used by
-  `git annex unused`. The default accuracy is 1000 --
-  1 unused file out of 1000 will be missed by `git annex unused`. Increasing
-  the accuracy will make `git annex unused` consume more memory;
-  run `git annex info` for memory usage numbers.
+  `git annex unused` and `git annex sync --content`. 
+  The default accuracy is 10000000 -- 1 unused file out of 10000000
+  will be missed by `git annex unused`. Increasing the accuracy will make
+  `git annex unused` consume more memory; run `git annex info`
+  for memory usage numbers.
 
 * `annex.sshcaching`