summaryrefslogtreecommitdiff
path: root/Command
diff options
context:
space:
mode:
authorGravatar Joey Hess <joey@kitenet.net>2012-03-12 16:18:14 -0400
committerGravatar Joey Hess <joey@kitenet.net>2012-03-12 16:18:35 -0400
commit25809ce2e0861a54ec63a414037b95fe29acc6df (patch)
tree0aae10e37dc9c430ce4c182b686772f9504fb332 /Command
parentfaf3a94fa7dfaaf7f95477895c645ff793dcf2f4 (diff)
finish bloom filters
Add tuning, docs, etc. Not sure if status is the right place to report the size; perhaps unused should report the size and also warn if it sees more keys than the bloom filter allows?
Diffstat (limited to 'Command')
-rw-r--r--Command/Status.hs19
-rw-r--r--Command/Unused.hs24
2 files changed, 36 insertions, 7 deletions
diff --git a/Command/Status.hs b/Command/Status.hs
index 0b1741dc0..eadb4f163 100644
--- a/Command/Status.hs
+++ b/Command/Status.hs
@@ -76,6 +76,7 @@ slow_stats =
, local_annex_size
, known_annex_keys
, known_annex_size
+ , bloom_info
, backend_usage
]
@@ -127,7 +128,7 @@ remote_list level desc = stat n $ nojson $ lift $ do
return $ if null s then "0" else show (length rs) ++ "\n" ++ beginning s
where
n = desc ++ " repositories"
-
+
local_annex_size :: Stat
local_annex_size = stat "local annex size" $ json id $
showSizeKeys <$> cachedPresentData
@@ -136,6 +137,22 @@ local_annex_keys :: Stat
local_annex_keys = stat "local annex keys" $ json show $
countKeys <$> cachedPresentData
+bloom_info :: Stat
+bloom_info = stat "bloom filter size" $ json id $ do
+ localkeys <- countKeys <$> cachedPresentData
+ capacity <- fromIntegral <$> lift Command.Unused.bloomCapacity
+ let note = aside $
+ if localkeys >= capacity
+ then "appears too small for this repository; adjust annex.bloomcapacity"
+ else "has room for " ++ show (capacity - localkeys) ++ " more local annex keys"
+
+ -- Two bloom filters are used at the same time, so double the size
+ -- of one.
+ size <- roughSize memoryUnits True . (* 2) . fromIntegral . fst <$>
+ lift Command.Unused.bloomBitsHashes
+
+ return $ size ++ note
+
known_annex_size :: Stat
known_annex_size = stat "known annex size" $ json id $
showSizeKeys <$> cachedReferencedData
diff --git a/Command/Unused.hs b/Command/Unused.hs
index 028e20445..b878ab265 100644
--- a/Command/Unused.hs
+++ b/Command/Unused.hs
@@ -29,6 +29,7 @@ import qualified Git.Command
import qualified Git.Ref
import qualified Git.LsFiles as LsFiles
import qualified Git.LsTree as LsTree
+import qualified Git.Config
import qualified Backend
import qualified Remote
import qualified Annex.Branch
@@ -182,6 +183,22 @@ exclude smaller larger = S.toList $ remove larger $ S.fromList smaller
where
remove a b = foldl (flip S.delete) b a
+{- A bloom filter capable of holding half a million keys with a
+ - false positive rate of 1 in 1000 uses around 8 mb of memory,
+ - so will easily fit on even my lowest memory systems.
+ -}
+bloomCapacity :: Annex Int
+bloomCapacity = fromMaybe 500000 . readish
+ <$> fromRepo (Git.Config.get "annex.bloomcapacity" "")
+bloomAccuracy :: Annex Int
+bloomAccuracy = fromMaybe 1000 . readish
+ <$> fromRepo (Git.Config.get "annex.bloomaccuracy" "")
+bloomBitsHashes :: Annex (Int, Int)
+bloomBitsHashes = do
+ capacity <- bloomCapacity
+ accuracy <- bloomAccuracy
+ return $ suggestSizing capacity (1/ fromIntegral accuracy)
+
{- Creates a bloom filter, and runs an action, such as withKeysReferenced,
- to populate it.
-
@@ -193,12 +210,7 @@ exclude smaller larger = S.toList $ remove larger $ S.fromList smaller
-}
genBloomFilter :: Hashable t => (v -> t) -> ((v -> Annex ()) -> Annex b) -> Annex (Bloom t)
genBloomFilter convert populate = do
- -- A bloom filter capable of holding half a million keys with a
- -- false positive rate of 0.1% uses around 8 mb of memory.
- -- TODO: make this configurable, for the really large repos,
- -- or really low false positive rates.
- let (numbits, numhashes) = suggestSizing 500000 0.001
-
+ (numbits, numhashes) <- bloomBitsHashes
bloom <- lift $ newMB (cheapHashes numhashes) numbits
_ <- populate $ \v -> lift $ insertMB bloom (convert v)
lift $ unsafeFreezeMB bloom