aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Command/Status.hs19
-rw-r--r--Command/Unused.hs138
-rw-r--r--debian/changelog10
-rw-r--r--debian/control1
-rw-r--r--doc/git-annex.mdwn17
-rw-r--r--doc/install.mdwn1
-rw-r--r--doc/todo/git-annex_unused_eats_memory.mdwn1
-rw-r--r--git-annex.cabal2
8 files changed, 153 insertions, 36 deletions
diff --git a/Command/Status.hs b/Command/Status.hs
index 96345e92b..39e71e750 100644
--- a/Command/Status.hs
+++ b/Command/Status.hs
@@ -76,6 +76,7 @@ slow_stats =
, local_annex_size
, known_annex_keys
, known_annex_size
+ , bloom_info
, backend_usage
]
@@ -127,7 +128,7 @@ remote_list level desc = stat n $ nojson $ lift $ do
return $ if null s then "0" else show (length rs) ++ "\n" ++ beginning s
where
n = desc ++ " repositories"
-
+
local_annex_size :: Stat
local_annex_size = stat "local annex size" $ json id $
showSizeKeys <$> cachedPresentData
@@ -136,6 +137,22 @@ local_annex_keys :: Stat
local_annex_keys = stat "local annex keys" $ json show $
countKeys <$> cachedPresentData
+bloom_info :: Stat
+bloom_info = stat "bloom filter size" $ json id $ do
+ localkeys <- countKeys <$> cachedPresentData
+ capacity <- fromIntegral <$> lift Command.Unused.bloomCapacity
+ let note = aside $
+ if localkeys >= capacity
+ then "appears too small for this repository; adjust annex.bloomcapacity"
+ else "has room for " ++ show (capacity - localkeys) ++ " more local annex keys"
+
+ -- Two bloom filters are used at the same time, so double the size
+ -- of one.
+ size <- roughSize memoryUnits True . (* 2) . fromIntegral . fst <$>
+ lift Command.Unused.bloomBitsHashes
+
+ return $ size ++ note
+
known_annex_size :: Stat
known_annex_size = stat "known annex size" $ json id $
showSizeKeys <$> cachedReferencedData
diff --git a/Command/Unused.hs b/Command/Unused.hs
index 71cbfd470..b878ab265 100644
--- a/Command/Unused.hs
+++ b/Command/Unused.hs
@@ -12,6 +12,10 @@ module Command.Unused where
import qualified Data.Set as S
import qualified Data.Text.Lazy as L
import qualified Data.Text.Lazy.Encoding as L
+import Data.BloomFilter
+import Data.BloomFilter.Easy
+import Data.BloomFilter.Hash
+import Control.Monad.ST
import Common.Annex
import Command
@@ -25,6 +29,7 @@ import qualified Git.Command
import qualified Git.Ref
import qualified Git.LsFiles as LsFiles
import qualified Git.LsTree as LsTree
+import qualified Git.Config
import qualified Backend
import qualified Remote
import qualified Annex.Branch
@@ -139,28 +144,32 @@ dropMsg (Just r) = dropMsg' $ " --from " ++ Remote.name r
dropMsg' :: String -> String
dropMsg' s = "\nTo remove unwanted data: git-annex dropunused" ++ s ++ " NUMBER\n"
-{- Finds keys in the list that are not referenced in the git repository. -}
+{- Finds keys in the list that are not referenced in the git repository.
+ -
+ - Strategy:
+ -
+ - * Build a bloom filter of all keys referenced by symlinks. This
+ - is the fastest one to build and will filter out most keys.
+ - * If keys remain, build a second bloom filter of keys referenced by
+ - all branches.
+ - * The list is streamed through these bloom filters lazily, so both will
+ - exist at the same time. This means that twice the memory is used,
+ - but they're relatively small, so the added complexity of using a
+ - mutable bloom filter does not seem worthwhile.
+ - * Generating the second bloom filter can take quite a while, since
+ - it needs enumerating all keys in all git branches. But, the common
+ - case, if the second filter is needed, is for some keys to be globally
+ - unused, and in that case, no short-circuit is possible.
+ - Short-circuiting if the first filter filters all the keys handles the
+ - other common case.
+ -}
excludeReferenced :: [Key] -> Annex [Key]
-excludeReferenced [] = return [] -- optimisation
-excludeReferenced l = do
- let s = S.fromList l
- !s' <- withKeysReferenced s S.delete
- go s' =<< refs <$> (inRepo $ Git.Command.pipeRead [Param "show-ref"])
+excludeReferenced ks = runfilter firstlevel ks >>= runfilter secondlevel
where
- -- Skip the git-annex branches, and get all other unique refs.
- refs = map (Git.Ref . snd) .
- nubBy uniqref .
- filter ourbranches .
- map (separate (== ' ')) . lines
- uniqref (a, _) (b, _) = a == b
- ourbranchend = '/' : show Annex.Branch.name
- ourbranches (_, b) = not $ ourbranchend `isSuffixOf` b
- go s [] = return $ S.toList s
- go s (r:rs)
- | s == S.empty = return [] -- optimisation
- | otherwise = do
- s' <- withKeysReferencedInGit r s S.delete
- go s' rs
+ runfilter _ [] = return [] -- optimisation
+ runfilter a l = bloomFilter show l <$> genBloomFilter show a
+ firstlevel = withKeysReferencedM
+ secondlevel = withKeysReferencedInGit
{- Finds items in the first, smaller list, that are not
- present in the second, larger list.
@@ -174,10 +183,58 @@ exclude smaller larger = S.toList $ remove larger $ S.fromList smaller
where
remove a b = foldl (flip S.delete) b a
-{- Given an initial value, mutates it using an action for each
- - key referenced by symlinks in the git repo. -}
+{- A bloom filter capable of holding half a million keys with a
+ - false positive rate of 1 in 1000 uses around 8 mb of memory,
+ - so will easily fit on even my lowest memory systems.
+ -}
+bloomCapacity :: Annex Int
+bloomCapacity = fromMaybe 500000 . readish
+ <$> fromRepo (Git.Config.get "annex.bloomcapacity" "")
+bloomAccuracy :: Annex Int
+bloomAccuracy = fromMaybe 1000 . readish
+ <$> fromRepo (Git.Config.get "annex.bloomaccuracy" "")
+bloomBitsHashes :: Annex (Int, Int)
+bloomBitsHashes = do
+ capacity <- bloomCapacity
+ accuracy <- bloomAccuracy
+ return $ suggestSizing capacity (1/ fromIntegral accuracy)
+
+{- Creates a bloom filter, and runs an action, such as withKeysReferenced,
+ - to populate it.
+ -
+ - The action is passed a callback that it can use to feed values into the
+ - bloom filter.
+ -
+ - Once the action completes, the mutable filter is frozen
+ - for later use.
+ -}
+genBloomFilter :: Hashable t => (v -> t) -> ((v -> Annex ()) -> Annex b) -> Annex (Bloom t)
+genBloomFilter convert populate = do
+ (numbits, numhashes) <- bloomBitsHashes
+ bloom <- lift $ newMB (cheapHashes numhashes) numbits
+ _ <- populate $ \v -> lift $ insertMB bloom (convert v)
+ lift $ unsafeFreezeMB bloom
+ where
+ lift = liftIO . stToIO
+
+bloomFilter :: Hashable t => (v -> t) -> [v] -> Bloom t -> [v]
+bloomFilter convert l bloom = filter (\k -> convert k `notElemB` bloom) l
+
+{- Given an initial value, folds it with each key referenced by
+ - symlinks in the git repo. -}
withKeysReferenced :: v -> (Key -> v -> v) -> Annex v
-withKeysReferenced initial a = go initial =<< files
+withKeysReferenced initial a = withKeysReferenced' initial folda
+ where
+ folda k v = return $ a k v
+
+{- Runs an action on each referenced key in the git repo. -}
+withKeysReferencedM :: (Key -> Annex ()) -> Annex ()
+withKeysReferencedM a = withKeysReferenced' () calla
+ where
+ calla k _ = a k
+
+withKeysReferenced' :: v -> (Key -> v -> Annex v) -> Annex v
+withKeysReferenced' initial a = go initial =<< files
where
files = do
top <- fromRepo Git.workTree
@@ -188,24 +245,39 @@ withKeysReferenced initial a = go initial =<< files
case x of
Nothing -> go v fs
Just (k, _) -> do
- let !v' = a k v
+ !v' <- a k v
go v' fs
-withKeysReferencedInGit :: Git.Ref -> v -> (Key -> v -> v) -> Annex v
-withKeysReferencedInGit ref initial a = do
+
+withKeysReferencedInGit :: (Key -> Annex ()) -> Annex ()
+withKeysReferencedInGit a = do
+ rs <- relevantrefs <$> showref
+ forM_ rs (withKeysReferencedInGitRef a)
+ where
+ showref = inRepo $ Git.Command.pipeRead [Param "show-ref"]
+ relevantrefs = map (Git.Ref . snd) .
+ nubBy uniqref .
+ filter ourbranches .
+ map (separate (== ' ')) . lines
+ uniqref (x, _) (y, _) = x == y
+ ourbranchend = '/' : show Annex.Branch.name
+ ourbranches (_, b) = not $ ourbranchend `isSuffixOf` b
+
+withKeysReferencedInGitRef :: (Key -> Annex ()) -> Git.Ref -> Annex ()
+withKeysReferencedInGitRef a ref = do
showAction $ "checking " ++ Git.Ref.describe ref
- go initial =<< inRepo (LsTree.lsTree ref)
+ go =<< inRepo (LsTree.lsTree ref)
where
- go v [] = return v
- go v (l:ls)
+ go [] = return ()
+ go (l:ls)
| isSymLink (LsTree.mode l) = do
content <- L.decodeUtf8 <$> catFile ref (LsTree.file l)
case fileKey (takeFileName $ L.unpack content) of
- Nothing -> go v ls
+ Nothing -> go ls
Just k -> do
- let !v' = a k v
- go v' ls
- | otherwise = go v ls
+ a k
+ go ls
+ | otherwise = go ls
{- Looks in the specified directory for bad/tmp keys, and returns a list
- of those that might still have value, or might be stale and removable.
diff --git a/debian/changelog b/debian/changelog
index b6b62708e..393234ba5 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -8,9 +8,17 @@ git-annex (3.20120310) UNRELEASED; urgency=low
are present in the annex in memory.
* status: Fixed to run in constant space.
* status: More accurate display of sizes of tmp and bad keys.
- * Work around a bug in rsync (IMHO) introduced by openSUSE's SIP patch.
+ * unused: Now uses a bloom filter, and runs in constant space.
+ Use of a bloom filter does mean it will not notice a small
+ number of unused keys. For repos with up to half a million keys,
+ it will miss one key in 1000.
+ * Added annex.bloomcapacity and annex.bloomaccuracy, which can be
+ adjusted as desired to tune the bloom filter.
+ * status: Display about of memory used by bloom filter, and
+ detect then it's too small for the number of keys in a repository.
* git-annex-shell: Runs hooks/annex-content after content is received
or dropped.
+ * Work around a bug in rsync (IMHO) introduced by openSUSE's SIP patch.
-- Joey Hess <joeyh@debian.org> Sat, 10 Mar 2012 14:03:22 -0400
diff --git a/debian/control b/debian/control
index 8ea1a6259..a73433c2a 100644
--- a/debian/control
+++ b/debian/control
@@ -18,6 +18,7 @@ Build-Depends:
libghc-lifted-base-dev,
libghc-json-dev,
libghc-ifelse-dev,
+ libghc-bloomfilter-dev,
ikiwiki,
perlmagick,
git,
diff --git a/doc/git-annex.mdwn b/doc/git-annex.mdwn
index a941d4420..10899d12c 100644
--- a/doc/git-annex.mdwn
+++ b/doc/git-annex.mdwn
@@ -598,6 +598,23 @@ Here are all the supported configuration settings.
of memory and are working with very large numbers of files, increasing
the queue size can speed it up.
+* `annex.bloomcapacity`
+
+ The `git annex unused` command uses a bloom filter to determine
+ what data is no longer used. The default bloom filter is sized to handle
+ up to 500000 keys. If your repository is larger than that,
+ you can adjust this to avoid `git annex unused` not noticing some unused
+ data files. Increasing this will make `git-annex unused` consume more memory;
+ run `git annex status` for memory usage numbers.
+
+* `annex.bloomaccuracy`
+
+ Adjusts the accuracy of the bloom filter used by
+ `git annex unused`. The default accuracy is 1000 --
+ 1 unused file out of 1000 will be missed by `git annex unused`. Increasing
+ the accuracy will make `git annex unused` consume more memory;
+ run `git annex status` for memory usage numbers.
+
* `annex.version`
Automatically maintained, and used to automate upgrades between versions.
diff --git a/doc/install.mdwn b/doc/install.mdwn
index 8de24d40d..0698a8bc4 100644
--- a/doc/install.mdwn
+++ b/doc/install.mdwn
@@ -35,6 +35,7 @@ To build and use git-annex, you will need:
* [hS3](http://hackage.haskell.org/package/hS3)
* [json](http://hackage.haskell.org/package/json)
* [IfElse](http://hackage.haskell.org/package/IfElse)
+ * [bloomfilter](http://hackage.haskell.org/package/bloomfilter)
* Shell commands
* [git](http://git-scm.com/)
* [uuid](http://www.ossp.org/pkg/lib/uuid/)
diff --git a/doc/todo/git-annex_unused_eats_memory.mdwn b/doc/todo/git-annex_unused_eats_memory.mdwn
index caaecfa8c..760a6ccf5 100644
--- a/doc/todo/git-annex_unused_eats_memory.mdwn
+++ b/doc/todo/git-annex_unused_eats_memory.mdwn
@@ -22,6 +22,7 @@ miss finding some unused keys. The probability/size of filter
could be tunable.
> Fixed in `bloom` branch in git. --[[Joey]]
+>> [[done]]! --[[Joey]]
Another way might be to scan the git log for files that got removed
or changed what key they pointed to. Correlate with keys with content
diff --git a/git-annex.cabal b/git-annex.cabal
index 6efebc66e..278d87555 100644
--- a/git-annex.cabal
+++ b/git-annex.cabal
@@ -32,7 +32,7 @@ Executable git-annex
unix, containers, utf8-string, network, mtl, bytestring, old-locale, time,
pcre-light, extensible-exceptions, dataenc, SHA, process, hs3, json, HTTP,
base >= 4.5, base < 5, monad-control, transformers-base, lifted-base,
- IfElse, text, QuickCheck >= 2.1
+ IfElse, text, QuickCheck >= 2.1, bloomfilter
Other-Modules: Utility.StatFS, Utility.Touch
Executable git-annex-shell