summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Command/Status.hs19
-rw-r--r--Command/Unused.hs24
-rw-r--r--debian/changelog8
-rw-r--r--debian/control1
-rw-r--r--doc/git-annex.mdwn17
-rw-r--r--doc/install.mdwn1
-rw-r--r--git-annex.cabal2
7 files changed, 64 insertions, 8 deletions
diff --git a/Command/Status.hs b/Command/Status.hs
index 0b1741dc0..eadb4f163 100644
--- a/Command/Status.hs
+++ b/Command/Status.hs
@@ -76,6 +76,7 @@ slow_stats =
, local_annex_size
, known_annex_keys
, known_annex_size
+ , bloom_info
, backend_usage
]
@@ -127,7 +128,7 @@ remote_list level desc = stat n $ nojson $ lift $ do
return $ if null s then "0" else show (length rs) ++ "\n" ++ beginning s
where
n = desc ++ " repositories"
-
+
local_annex_size :: Stat
local_annex_size = stat "local annex size" $ json id $
showSizeKeys <$> cachedPresentData
@@ -136,6 +137,22 @@ local_annex_keys :: Stat
local_annex_keys = stat "local annex keys" $ json show $
countKeys <$> cachedPresentData
+bloom_info :: Stat
+bloom_info = stat "bloom filter size" $ json id $ do
+ localkeys <- countKeys <$> cachedPresentData
+ capacity <- fromIntegral <$> lift Command.Unused.bloomCapacity
+ let note = aside $
+ if localkeys >= capacity
+ then "appears too small for this repository; adjust annex.bloomcapacity"
+ else "has room for " ++ show (capacity - localkeys) ++ " more local annex keys"
+
+ -- Two bloom filters are used at the same time, so double the size
+ -- of one.
+ size <- roughSize memoryUnits True . (* 2) . fromIntegral . fst <$>
+ lift Command.Unused.bloomBitsHashes
+
+ return $ size ++ note
+
known_annex_size :: Stat
known_annex_size = stat "known annex size" $ json id $
showSizeKeys <$> cachedReferencedData
diff --git a/Command/Unused.hs b/Command/Unused.hs
index 028e20445..b878ab265 100644
--- a/Command/Unused.hs
+++ b/Command/Unused.hs
@@ -29,6 +29,7 @@ import qualified Git.Command
import qualified Git.Ref
import qualified Git.LsFiles as LsFiles
import qualified Git.LsTree as LsTree
+import qualified Git.Config
import qualified Backend
import qualified Remote
import qualified Annex.Branch
@@ -182,6 +183,22 @@ exclude smaller larger = S.toList $ remove larger $ S.fromList smaller
where
remove a b = foldl (flip S.delete) b a
+{- A bloom filter capable of holding half a million keys with a
+ - false positive rate of 1 in 1000 uses around 8 mb of memory,
+ - so will easily fit on even my lowest memory systems.
+ -}
+bloomCapacity :: Annex Int
+bloomCapacity = fromMaybe 500000 . readish
+ <$> fromRepo (Git.Config.get "annex.bloomcapacity" "")
+bloomAccuracy :: Annex Int
+bloomAccuracy = fromMaybe 1000 . readish
+ <$> fromRepo (Git.Config.get "annex.bloomaccuracy" "")
+bloomBitsHashes :: Annex (Int, Int)
+bloomBitsHashes = do
+ capacity <- bloomCapacity
+ accuracy <- bloomAccuracy
+ return $ suggestSizing capacity (1/ fromIntegral accuracy)
+
{- Creates a bloom filter, and runs an action, such as withKeysReferenced,
- to populate it.
-
@@ -193,12 +210,7 @@ exclude smaller larger = S.toList $ remove larger $ S.fromList smaller
-}
genBloomFilter :: Hashable t => (v -> t) -> ((v -> Annex ()) -> Annex b) -> Annex (Bloom t)
genBloomFilter convert populate = do
- -- A bloom filter capable of holding half a million keys with a
- -- false positive rate of 0.1% uses around 8 mb of memory.
- -- TODO: make this configurable, for the really large repos,
- -- or really low false positive rates.
- let (numbits, numhashes) = suggestSizing 500000 0.001
-
+ (numbits, numhashes) <- bloomBitsHashes
bloom <- lift $ newMB (cheapHashes numhashes) numbits
_ <- populate $ \v -> lift $ insertMB bloom (convert v)
lift $ unsafeFreezeMB bloom
diff --git a/debian/changelog b/debian/changelog
index 120513806..8d7337116 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -7,6 +7,14 @@ git-annex (3.20120310) UNRELEASED; urgency=low
space, but now only needs to store the set of file contents that
are present in the annex in memory.
* status: Fixed to run in constant space.
+ * unused: Now uses a bloom filter, and runs in constant space.
+ Use of a bloom filter does mean it will not notice a small
+ number of unused keys. For repos with up to half a million keys,
+ it will miss one key in 1000.
+ * Added annex.bloomcapacity and annex.bloomaccuracy, which can be
+ adjusted as desired to tune the bloom filter.
+ * status: Display about of memory used by bloom filter, and
+ detect then it's too small for the number of keys in a repository.
-- Joey Hess <joeyh@debian.org> Sat, 10 Mar 2012 14:03:22 -0400
diff --git a/debian/control b/debian/control
index 8ea1a6259..a73433c2a 100644
--- a/debian/control
+++ b/debian/control
@@ -18,6 +18,7 @@ Build-Depends:
libghc-lifted-base-dev,
libghc-json-dev,
libghc-ifelse-dev,
+ libghc-bloomfilter-dev,
ikiwiki,
perlmagick,
git,
diff --git a/doc/git-annex.mdwn b/doc/git-annex.mdwn
index a941d4420..10899d12c 100644
--- a/doc/git-annex.mdwn
+++ b/doc/git-annex.mdwn
@@ -598,6 +598,23 @@ Here are all the supported configuration settings.
of memory and are working with very large numbers of files, increasing
the queue size can speed it up.
+* `annex.bloomcapacity`
+
+ The `git annex unused` command uses a bloom filter to determine
+ what data is no longer used. The default bloom filter is sized to handle
+ up to 500000 keys. If your repository is larger than that,
+ you can adjust this to avoid `git annex unused` not noticing some unused
+ data files. Increasing this will make `git-annex unused` consume more memory;
+ run `git annex status` for memory usage numbers.
+
+* `annex.bloomaccuracy`
+
+ Adjusts the accuracy of the bloom filter used by
+ `git annex unused`. The default accuracy is 1000 --
+ 1 unused file out of 1000 will be missed by `git annex unused`. Increasing
+ the accuracy will make `git annex unused` consume more memory;
+ run `git annex status` for memory usage numbers.
+
* `annex.version`
Automatically maintained, and used to automate upgrades between versions.
diff --git a/doc/install.mdwn b/doc/install.mdwn
index 8de24d40d..0698a8bc4 100644
--- a/doc/install.mdwn
+++ b/doc/install.mdwn
@@ -35,6 +35,7 @@ To build and use git-annex, you will need:
* [hS3](http://hackage.haskell.org/package/hS3)
* [json](http://hackage.haskell.org/package/json)
* [IfElse](http://hackage.haskell.org/package/IfElse)
+ * [bloomfilter](http://hackage.haskell.org/package/bloomfilter)
* Shell commands
* [git](http://git-scm.com/)
* [uuid](http://www.ossp.org/pkg/lib/uuid/)
diff --git a/git-annex.cabal b/git-annex.cabal
index 6efebc66e..278d87555 100644
--- a/git-annex.cabal
+++ b/git-annex.cabal
@@ -32,7 +32,7 @@ Executable git-annex
unix, containers, utf8-string, network, mtl, bytestring, old-locale, time,
pcre-light, extensible-exceptions, dataenc, SHA, process, hs3, json, HTTP,
base >= 4.5, base < 5, monad-control, transformers-base, lifted-base,
- IfElse, text, QuickCheck >= 2.1
+ IfElse, text, QuickCheck >= 2.1, bloomfilter
Other-Modules: Utility.StatFS, Utility.Touch
Executable git-annex-shell