diff options
-rw-r--r-- | Command/Status.hs | 19 | ||||
-rw-r--r-- | Command/Unused.hs | 24 | ||||
-rw-r--r-- | debian/changelog | 8 | ||||
-rw-r--r-- | debian/control | 1 | ||||
-rw-r--r-- | doc/git-annex.mdwn | 17 | ||||
-rw-r--r-- | doc/install.mdwn | 1 | ||||
-rw-r--r-- | git-annex.cabal | 2 |
7 files changed, 64 insertions, 8 deletions
diff --git a/Command/Status.hs b/Command/Status.hs index 0b1741dc0..eadb4f163 100644 --- a/Command/Status.hs +++ b/Command/Status.hs @@ -76,6 +76,7 @@ slow_stats = , local_annex_size , known_annex_keys , known_annex_size + , bloom_info , backend_usage ] @@ -127,7 +128,7 @@ remote_list level desc = stat n $ nojson $ lift $ do return $ if null s then "0" else show (length rs) ++ "\n" ++ beginning s where n = desc ++ " repositories" - + local_annex_size :: Stat local_annex_size = stat "local annex size" $ json id $ showSizeKeys <$> cachedPresentData @@ -136,6 +137,22 @@ local_annex_keys :: Stat local_annex_keys = stat "local annex keys" $ json show $ countKeys <$> cachedPresentData +bloom_info :: Stat +bloom_info = stat "bloom filter size" $ json id $ do + localkeys <- countKeys <$> cachedPresentData + capacity <- fromIntegral <$> lift Command.Unused.bloomCapacity + let note = aside $ + if localkeys >= capacity + then "appears too small for this repository; adjust annex.bloomcapacity" + else "has room for " ++ show (capacity - localkeys) ++ " more local annex keys" + + -- Two bloom filters are used at the same time, so double the size + -- of one. + size <- roughSize memoryUnits True . (* 2) . fromIntegral . 
fst <$> + lift Command.Unused.bloomBitsHashes + + return $ size ++ note + known_annex_size :: Stat known_annex_size = stat "known annex size" $ json id $ showSizeKeys <$> cachedReferencedData diff --git a/Command/Unused.hs b/Command/Unused.hs index 028e20445..b878ab265 100644 --- a/Command/Unused.hs +++ b/Command/Unused.hs @@ -29,6 +29,7 @@ import qualified Git.Command import qualified Git.Ref import qualified Git.LsFiles as LsFiles import qualified Git.LsTree as LsTree +import qualified Git.Config import qualified Backend import qualified Remote import qualified Annex.Branch @@ -182,6 +183,22 @@ exclude smaller larger = S.toList $ remove larger $ S.fromList smaller where remove a b = foldl (flip S.delete) b a +{- A bloom filter capable of holding half a million keys with a + - false positive rate of 1 in 1000 uses around 8 mb of memory, + - so will easily fit on even my lowest memory systems. + -} +bloomCapacity :: Annex Int +bloomCapacity = fromMaybe 500000 . readish + <$> fromRepo (Git.Config.get "annex.bloomcapacity" "") +bloomAccuracy :: Annex Int +bloomAccuracy = fromMaybe 1000 . readish + <$> fromRepo (Git.Config.get "annex.bloomaccuracy" "") +bloomBitsHashes :: Annex (Int, Int) +bloomBitsHashes = do + capacity <- bloomCapacity + accuracy <- bloomAccuracy + return $ suggestSizing capacity (1/ fromIntegral accuracy) + {- Creates a bloom filter, and runs an action, such as withKeysReferenced, - to populate it. - @@ -193,12 +210,7 @@ exclude smaller larger = S.toList $ remove larger $ S.fromList smaller -} genBloomFilter :: Hashable t => (v -> t) -> ((v -> Annex ()) -> Annex b) -> Annex (Bloom t) genBloomFilter convert populate = do - -- A bloom filter capable of holding half a million keys with a - -- false positive rate of 0.1% uses around 8 mb of memory. - -- TODO: make this configurable, for the really large repos, - -- or really low false positive rates. 
- let (numbits, numhashes) = suggestSizing 500000 0.001 - + (numbits, numhashes) <- bloomBitsHashes bloom <- lift $ newMB (cheapHashes numhashes) numbits _ <- populate $ \v -> lift $ insertMB bloom (convert v) lift $ unsafeFreezeMB bloom diff --git a/debian/changelog b/debian/changelog index 120513806..8d7337116 100644 --- a/debian/changelog +++ b/debian/changelog @@ -7,6 +7,14 @@ git-annex (3.20120310) UNRELEASED; urgency=low space, but now only needs to store the set of file contents that are present in the annex in memory. * status: Fixed to run in constant space. + * unused: Now uses a bloom filter, and runs in constant space. + Use of a bloom filter does mean it will not notice a small + number of unused keys. For repos with up to half a million keys, + it will miss one key in 1000. + * Added annex.bloomcapacity and annex.bloomaccuracy, which can be + adjusted as desired to tune the bloom filter. + * status: Display amount of memory used by bloom filter, and + detect when it's too small for the number of keys in a repository. -- Joey Hess <joeyh@debian.org> Sat, 10 Mar 2012 14:03:22 -0400 diff --git a/debian/control b/debian/control index 8ea1a6259..a73433c2a 100644 --- a/debian/control +++ b/debian/control @@ -18,6 +18,7 @@ Build-Depends: libghc-lifted-base-dev, libghc-json-dev, libghc-ifelse-dev, + libghc-bloomfilter-dev, ikiwiki, perlmagick, git, diff --git a/doc/git-annex.mdwn b/doc/git-annex.mdwn index a941d4420..10899d12c 100644 --- a/doc/git-annex.mdwn +++ b/doc/git-annex.mdwn @@ -598,6 +598,23 @@ Here are all the supported configuration settings. of memory and are working with very large numbers of files, increasing the queue size can speed it up. +* `annex.bloomcapacity` + + The `git annex unused` command uses a bloom filter to determine + what data is no longer used. The default bloom filter is sized to handle + up to 500000 keys. 
If your repository is larger than that, + you can adjust this to avoid `git annex unused` not noticing some unused + data files. Increasing this will make `git-annex unused` consume more memory; + run `git annex status` for memory usage numbers. + +* `annex.bloomaccuracy` + + Adjusts the accuracy of the bloom filter used by + `git annex unused`. The default accuracy is 1000 -- + 1 unused file out of 1000 will be missed by `git annex unused`. Increasing + the accuracy will make `git annex unused` consume more memory; + run `git annex status` for memory usage numbers. + * `annex.version` Automatically maintained, and used to automate upgrades between versions. diff --git a/doc/install.mdwn b/doc/install.mdwn index 8de24d40d..0698a8bc4 100644 --- a/doc/install.mdwn +++ b/doc/install.mdwn @@ -35,6 +35,7 @@ To build and use git-annex, you will need: * [hS3](http://hackage.haskell.org/package/hS3) * [json](http://hackage.haskell.org/package/json) * [IfElse](http://hackage.haskell.org/package/IfElse) + * [bloomfilter](http://hackage.haskell.org/package/bloomfilter) * Shell commands * [git](http://git-scm.com/) * [uuid](http://www.ossp.org/pkg/lib/uuid/) diff --git a/git-annex.cabal b/git-annex.cabal index 6efebc66e..278d87555 100644 --- a/git-annex.cabal +++ b/git-annex.cabal @@ -32,7 +32,7 @@ Executable git-annex unix, containers, utf8-string, network, mtl, bytestring, old-locale, time, pcre-light, extensible-exceptions, dataenc, SHA, process, hs3, json, HTTP, base >= 4.5, base < 5, monad-control, transformers-base, lifted-base, - IfElse, text, QuickCheck >= 2.1 + IfElse, text, QuickCheck >= 2.1, bloomfilter Other-Modules: Utility.StatFS, Utility.Touch Executable git-annex-shell |