summary | refs | log | tree | commit | diff
path: root/Command
diff options
context:
space:
mode:
author: Joey Hess <joeyh@joeyh.name>, 2015-06-16 17:58:15 -0400
committer: Joey Hess <joeyh@joeyh.name>, 2015-06-16 18:12:00 -0400
commit: 87ba1abc7cd1b199b0f7d778d9f27375b50de709 (patch)
tree: a8dcb4479872a1ddfd39053a7532e987c489f85e /Command
parent: a5ae3ecdb722219d3cdaee652450be1b96795f83 (diff)
Increased the default annex.bloomaccuracy from 1000 to 10000000
This makes git annex unused use around 48 MB more memory than it did before, but the massive increase in accuracy makes this worthwhile for all but the smallest systems.

Also, I want to use the bloom filter for sync --all --content, to avoid dropping files that the preferred content doesn't want, and 1/1000 false positives would be far too many in that use case, even if it were acceptable for unused.

Actual memory use numbers:

1000: 21.06user 3.42system 0:26.40elapsed 92%CPU (0avgtext+0avgdata 501552maxresident)k
1000000: 21.41user 3.55system 0:26.84elapsed 93%CPU (0avgtext+0avgdata 549496maxresident)k
10000000: 21.84user 3.52system 0:27.89elapsed 90%CPU (0avgtext+0avgdata 549920maxresident)k

Based on these numbers, 10 million seemed a better pick than 1 million.
Diffstat (limited to 'Command')
-rw-r--r-- Command/Info.hs 11
-rw-r--r-- Command/Sync.hs 1
-rw-r--r-- Command/Unused.hs 43
3 files changed, 8 insertions, 47 deletions
diff --git a/Command/Info.hs b/Command/Info.hs
index f5fa9c6bf..e6e0194ce 100644
--- a/Command/Info.hs
+++ b/Command/Info.hs
@@ -16,7 +16,6 @@ import Data.Tuple
import Data.Ord
import Common.Annex
-import qualified Command.Unused
import qualified Git
import qualified Annex
import qualified Remote
@@ -39,6 +38,8 @@ import Types.TrustLevel
import Types.FileMatcher
import qualified Limit
import Messages.JSON (DualDisp(..))
+import Annex.BloomFilter
+import qualified Command.Unused
-- a named computation that produces a statistic
type Stat = StatState (Maybe (String, StatState String))
@@ -330,17 +331,17 @@ key_name k = simpleStat "key" $ pure $ key2file k
bloom_info :: Stat
bloom_info = simpleStat "bloom filter size" $ do
localkeys <- countKeys <$> cachedPresentData
- capacity <- fromIntegral <$> lift Command.Unused.bloomCapacity
+ capacity <- fromIntegral <$> lift bloomCapacity
let note = aside $
if localkeys >= capacity
then "appears too small for this repository; adjust annex.bloomcapacity"
else showPercentage 1 (percentage capacity localkeys) ++ " full"
- -- Two bloom filters are used at the same time, so double the size
- -- of one.
+ -- Two bloom filters are used at the same time when running
+ -- git-annex unused, so double the size of one.
sizer <- lift mkSizer
size <- sizer memoryUnits False . (* 2) . fromIntegral . fst <$>
- lift Command.Unused.bloomBitsHashes
+ lift bloomBitsHashes
return $ size ++ note
diff --git a/Command/Sync.hs b/Command/Sync.hs
index 88449384d..80ecce43e 100644
--- a/Command/Sync.hs
+++ b/Command/Sync.hs
@@ -45,6 +45,7 @@ import Annex.UUID
import Logs.UUID
import Annex.AutoMerge
import Annex.Ssh
+import Utility.Bloom
import Control.Concurrent.MVar
import qualified Data.Map as M
diff --git a/Command/Unused.hs b/Command/Unused.hs
index 4f844081a..82a605290 100644
--- a/Command/Unused.hs
+++ b/Command/Unused.hs
@@ -9,7 +9,6 @@
module Command.Unused where
-import Control.Monad.ST
import qualified Data.Map as M
import Common.Annex
@@ -32,7 +31,7 @@ import Types.Key
import Types.RefSpec
import Git.FilePath
import Logs.View (is_branchView)
-import Utility.Bloom
+import Annex.BloomFilter
cmd :: [Command]
cmd = [withOptions [unusedFromOption, refSpecOption] $
@@ -172,46 +171,6 @@ excludeReferenced refspec ks = runfilter firstlevel ks >>= runfilter secondlevel
firstlevel = withKeysReferencedM
secondlevel = withKeysReferencedInGit refspec
-{- A bloom filter capable of holding half a million keys with a
- - false positive rate of 1 in 1000 uses around 8 mb of memory,
- - so will easily fit on even my lowest memory systems.
- -}
-bloomCapacity :: Annex Int
-bloomCapacity = fromMaybe 500000 . annexBloomCapacity <$> Annex.getGitConfig
-bloomAccuracy :: Annex Int
-bloomAccuracy = fromMaybe 1000 . annexBloomAccuracy <$> Annex.getGitConfig
-bloomBitsHashes :: Annex (Int, Int)
-bloomBitsHashes = do
- capacity <- bloomCapacity
- accuracy <- bloomAccuracy
- case safeSuggestSizing capacity (1 / fromIntegral accuracy) of
- Left e -> do
- warning $ "bloomfilter " ++ e ++ "; falling back to sane value"
- -- precaulculated value for 500000 (1/1000)
- return (8388608,10)
- Right v -> return v
-
-{- Creates a bloom filter, and runs an action, such as withKeysReferenced,
- - to populate it.
- -
- - The action is passed a callback that it can use to feed values into the
- - bloom filter.
- -
- - Once the action completes, the mutable filter is frozen
- - for later use.
- -}
-genBloomFilter :: Hashable t => (v -> t) -> ((v -> Annex ()) -> Annex b) -> Annex (Bloom t)
-genBloomFilter convert populate = do
- (numbits, numhashes) <- bloomBitsHashes
- bloom <- lift $ newMB (cheapHashes numhashes) numbits
- _ <- populate $ \v -> lift $ insertMB bloom (convert v)
- lift $ unsafeFreezeMB bloom
- where
- lift = liftIO . stToIO
-
-bloomFilter :: Hashable t => (v -> t) -> [v] -> Bloom t -> [v]
-bloomFilter convert l bloom = filter (\k -> convert k `notElemB` bloom) l
-
{- Given an initial value, folds it with each key referenced by
- symlinks in the git repo. -}
withKeysReferenced :: v -> (Key -> v -> v) -> Annex v