summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Annex/BloomFilter.hs53
-rw-r--r--Command/Info.hs11
-rw-r--r--Command/Sync.hs1
-rw-r--r--Command/Unused.hs43
-rw-r--r--debian/changelog4
-rw-r--r--doc/git-annex.mdwn20
6 files changed, 76 insertions, 56 deletions
diff --git a/Annex/BloomFilter.hs b/Annex/BloomFilter.hs
new file mode 100644
index 000000000..3dcd8140b
--- /dev/null
+++ b/Annex/BloomFilter.hs
@@ -0,0 +1,53 @@
+{- git-annex bloom filter
+ -
+ - Copyright 2010-2015 Joey Hess <id@joeyh.name>
+ -
+ - Licensed under the GNU GPL version 3 or higher.
+ -}
+
+module Annex.BloomFilter where
+
+import Common.Annex
+import qualified Annex
+import Utility.Bloom
+
+import Control.Monad.ST
+
+{- A bloom filter capable of holding half a million keys with a
+ - false positive rate of 1 in 10000000 uses around 16 mb of memory,
+ - so will easily fit on even my lowest memory systems.
+ -}
+bloomCapacity :: Annex Int
+bloomCapacity = fromMaybe 500000 . annexBloomCapacity <$> Annex.getGitConfig
+bloomAccuracy :: Annex Int
+bloomAccuracy = fromMaybe 10000000 . annexBloomAccuracy <$> Annex.getGitConfig
+bloomBitsHashes :: Annex (Int, Int)
+bloomBitsHashes = do
+ capacity <- bloomCapacity
+ accuracy <- bloomAccuracy
+ case safeSuggestSizing capacity (1 / fromIntegral accuracy) of
+ Left e -> do
+ warning $ "bloomfilter " ++ e ++ "; falling back to sane value"
+ -- precaulculated value for 500000 (1/10000000)
+ return (16777216,23)
+ Right v -> return v
+
+{- Creates a bloom filter, and runs an action to populate it.
+ -
+ - The action is passed a callback that it can use to feed values into the
+ - bloom filter.
+ -
+ - Once the action completes, the mutable filter is frozen
+ - for later use.
+ -}
+genBloomFilter :: Hashable t => (v -> t) -> ((v -> Annex ()) -> Annex b) -> Annex (Bloom t)
+genBloomFilter convert populate = do
+ (numbits, numhashes) <- bloomBitsHashes
+ bloom <- lift $ newMB (cheapHashes numhashes) numbits
+ _ <- populate $ \v -> lift $ insertMB bloom (convert v)
+ lift $ unsafeFreezeMB bloom
+ where
+ lift = liftIO . stToIO
+
+bloomFilter :: Hashable t => (v -> t) -> [v] -> Bloom t -> [v]
+bloomFilter convert l bloom = filter (\k -> convert k `notElemB` bloom) l
diff --git a/Command/Info.hs b/Command/Info.hs
index f5fa9c6bf..e6e0194ce 100644
--- a/Command/Info.hs
+++ b/Command/Info.hs
@@ -16,7 +16,6 @@ import Data.Tuple
import Data.Ord
import Common.Annex
-import qualified Command.Unused
import qualified Git
import qualified Annex
import qualified Remote
@@ -39,6 +38,8 @@ import Types.TrustLevel
import Types.FileMatcher
import qualified Limit
import Messages.JSON (DualDisp(..))
+import Annex.BloomFilter
+import qualified Command.Unused
-- a named computation that produces a statistic
type Stat = StatState (Maybe (String, StatState String))
@@ -330,17 +331,17 @@ key_name k = simpleStat "key" $ pure $ key2file k
bloom_info :: Stat
bloom_info = simpleStat "bloom filter size" $ do
localkeys <- countKeys <$> cachedPresentData
- capacity <- fromIntegral <$> lift Command.Unused.bloomCapacity
+ capacity <- fromIntegral <$> lift bloomCapacity
let note = aside $
if localkeys >= capacity
then "appears too small for this repository; adjust annex.bloomcapacity"
else showPercentage 1 (percentage capacity localkeys) ++ " full"
- -- Two bloom filters are used at the same time, so double the size
- -- of one.
+ -- Two bloom filters are used at the same time when running
+ -- git-annex unused, so double the size of one.
sizer <- lift mkSizer
size <- sizer memoryUnits False . (* 2) . fromIntegral . fst <$>
- lift Command.Unused.bloomBitsHashes
+ lift bloomBitsHashes
return $ size ++ note
diff --git a/Command/Sync.hs b/Command/Sync.hs
index 88449384d..80ecce43e 100644
--- a/Command/Sync.hs
+++ b/Command/Sync.hs
@@ -45,6 +45,7 @@ import Annex.UUID
import Logs.UUID
import Annex.AutoMerge
import Annex.Ssh
+import Utility.Bloom
import Control.Concurrent.MVar
import qualified Data.Map as M
diff --git a/Command/Unused.hs b/Command/Unused.hs
index 4f844081a..82a605290 100644
--- a/Command/Unused.hs
+++ b/Command/Unused.hs
@@ -9,7 +9,6 @@
module Command.Unused where
-import Control.Monad.ST
import qualified Data.Map as M
import Common.Annex
@@ -32,7 +31,7 @@ import Types.Key
import Types.RefSpec
import Git.FilePath
import Logs.View (is_branchView)
-import Utility.Bloom
+import Annex.BloomFilter
cmd :: [Command]
cmd = [withOptions [unusedFromOption, refSpecOption] $
@@ -172,46 +171,6 @@ excludeReferenced refspec ks = runfilter firstlevel ks >>= runfilter secondlevel
firstlevel = withKeysReferencedM
secondlevel = withKeysReferencedInGit refspec
-{- A bloom filter capable of holding half a million keys with a
- - false positive rate of 1 in 1000 uses around 8 mb of memory,
- - so will easily fit on even my lowest memory systems.
- -}
-bloomCapacity :: Annex Int
-bloomCapacity = fromMaybe 500000 . annexBloomCapacity <$> Annex.getGitConfig
-bloomAccuracy :: Annex Int
-bloomAccuracy = fromMaybe 1000 . annexBloomAccuracy <$> Annex.getGitConfig
-bloomBitsHashes :: Annex (Int, Int)
-bloomBitsHashes = do
- capacity <- bloomCapacity
- accuracy <- bloomAccuracy
- case safeSuggestSizing capacity (1 / fromIntegral accuracy) of
- Left e -> do
- warning $ "bloomfilter " ++ e ++ "; falling back to sane value"
- -- precaulculated value for 500000 (1/1000)
- return (8388608,10)
- Right v -> return v
-
-{- Creates a bloom filter, and runs an action, such as withKeysReferenced,
- - to populate it.
- -
- - The action is passed a callback that it can use to feed values into the
- - bloom filter.
- -
- - Once the action completes, the mutable filter is frozen
- - for later use.
- -}
-genBloomFilter :: Hashable t => (v -> t) -> ((v -> Annex ()) -> Annex b) -> Annex (Bloom t)
-genBloomFilter convert populate = do
- (numbits, numhashes) <- bloomBitsHashes
- bloom <- lift $ newMB (cheapHashes numhashes) numbits
- _ <- populate $ \v -> lift $ insertMB bloom (convert v)
- lift $ unsafeFreezeMB bloom
- where
- lift = liftIO . stToIO
-
-bloomFilter :: Hashable t => (v -> t) -> [v] -> Bloom t -> [v]
-bloomFilter convert l bloom = filter (\k -> convert k `notElemB` bloom) l
-
{- Given an initial value, folds it with each key referenced by
- symlinks in the git repo. -}
withKeysReferenced :: v -> (Key -> v -> v) -> Annex v
diff --git a/debian/changelog b/debian/changelog
index c7b4f34e6..0c04ef8e0 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -44,6 +44,10 @@ git-annex (5.20150616) UNRELEASED; urgency=medium
* Fix incremental backup standard preferred content expression to match
its documentation, which says it does not want files that have reached
a backup repository.
+ * Increased the default annex.bloomaccuracy from 1000 to 10000000.
+ This makes git annex unused use up to 16 mb more memory than it did
+ before, but the massive increase in accuracy makes this worthwhile
+ for all but the smallest systems.
-- Joey Hess <id@joeyh.name> Sat, 30 May 2015 02:07:18 -0400
diff --git a/doc/git-annex.mdwn b/doc/git-annex.mdwn
index e7c80f3cd..c90ef5ec2 100644
--- a/doc/git-annex.mdwn
+++ b/doc/git-annex.mdwn
@@ -830,20 +830,22 @@ Here are all the supported configuration settings.
* `annex.bloomcapacity`
- The `git annex unused` command uses a bloom filter to determine
- what data is no longer used. The default bloom filter is sized to handle
- up to 500000 keys. If your repository is larger than that,
- you can adjust this to avoid `git annex unused` not noticing some unused
- data files. Increasing this will make `git-annex unused` consume more memory;
+ The `git annex unused` and `git annex sync --content` commands use
+ a bloom filter to determine what files are present in eg, the work tree.
+ The default bloom filter is sized to handle
+ up to 500000 files. If your repository is larger than that,
+ you should increase this value. Larger values will
+ make `git-annex unused` and `git annex sync --content` consume more memory;
run `git annex info` for memory usage numbers.
* `annex.bloomaccuracy`
Adjusts the accuracy of the bloom filter used by
- `git annex unused`. The default accuracy is 1000 --
- 1 unused file out of 1000 will be missed by `git annex unused`. Increasing
- the accuracy will make `git annex unused` consume more memory;
- run `git annex info` for memory usage numbers.
+ `git annex unused` and `git annex sync --content`.
+ The default accuracy is 10000000 -- 1 unused file out of 10000000
+ will be missed by `git annex unused`. Increasing the accuracy will make
+ `git annex unused` consume more memory; run `git annex info`
+ for memory usage numbers.
* `annex.sshcaching`