diff options
-rw-r--r-- | Command/Status.hs | 19 | ||||
-rw-r--r-- | Command/Unused.hs | 138 | ||||
-rw-r--r-- | debian/changelog | 8 | ||||
-rw-r--r-- | debian/control | 1 | ||||
-rw-r--r-- | doc/git-annex.mdwn | 17 | ||||
-rw-r--r-- | doc/install.mdwn | 1 | ||||
-rw-r--r-- | doc/todo/git-annex_unused_eats_memory.mdwn | 1 | ||||
-rw-r--r-- | git-annex.cabal | 2 |
8 files changed, 152 insertions, 35 deletions
diff --git a/Command/Status.hs b/Command/Status.hs index 96345e92b..39e71e750 100644 --- a/Command/Status.hs +++ b/Command/Status.hs @@ -76,6 +76,7 @@ slow_stats = , local_annex_size , known_annex_keys , known_annex_size + , bloom_info , backend_usage ] @@ -127,7 +128,7 @@ remote_list level desc = stat n $ nojson $ lift $ do return $ if null s then "0" else show (length rs) ++ "\n" ++ beginning s where n = desc ++ " repositories" - + local_annex_size :: Stat local_annex_size = stat "local annex size" $ json id $ showSizeKeys <$> cachedPresentData @@ -136,6 +137,22 @@ local_annex_keys :: Stat local_annex_keys = stat "local annex keys" $ json show $ countKeys <$> cachedPresentData +bloom_info :: Stat +bloom_info = stat "bloom filter size" $ json id $ do + localkeys <- countKeys <$> cachedPresentData + capacity <- fromIntegral <$> lift Command.Unused.bloomCapacity + let note = aside $ + if localkeys >= capacity + then "appears too small for this repository; adjust annex.bloomcapacity" + else "has room for " ++ show (capacity - localkeys) ++ " more local annex keys" + + -- Two bloom filters are used at the same time, so double the size + -- of one. + size <- roughSize memoryUnits True . (* 2) . fromIntegral . fst <$> + lift Command.Unused.bloomBitsHashes + + return $ size ++ note + known_annex_size :: Stat known_annex_size = stat "known annex size" $ json id $ showSizeKeys <$> cachedReferencedData diff --git a/Command/Unused.hs b/Command/Unused.hs index 71cbfd470..b878ab265 100644 --- a/Command/Unused.hs +++ b/Command/Unused.hs @@ -12,6 +12,10 @@ module Command.Unused where import qualified Data.Set as S import qualified Data.Text.Lazy as L import qualified Data.Text.Lazy.Encoding as L +import Data.BloomFilter +import Data.BloomFilter.Easy +import Data.BloomFilter.Hash +import Control.Monad.ST import Common.Annex import Command @@ -25,6 +29,7 @@ import qualified Git.Command import qualified Git.Ref import qualified Git.LsFiles as LsFiles import qualified Git.LsTree as LsTree +import qualified Git.Config import qualified Backend import qualified Remote import qualified Annex.Branch @@ -139,28 +144,32 @@ dropMsg (Just r) = dropMsg' $ " --from " ++ Remote.name r dropMsg' :: String -> String dropMsg' s = "\nTo remove unwanted data: git-annex dropunused" ++ s ++ " NUMBER\n" -{- Finds keys in the list that are not referenced in the git repository. -} +{- Finds keys in the list that are not referenced in the git repository. + - + - Strategy: + - + - * Build a bloom filter of all keys referenced by symlinks. This + - is the fastest one to build and will filter out most keys. + - * If keys remain, build a second bloom filter of keys referenced by + - all branches. + - * The list is streamed through these bloom filters lazily, so both will + - exist at the same time. This means that twice the memory is used, + - but they're relatively small, so the added complexity of using a + - mutable bloom filter does not seem worthwhile. + - * Generating the second bloom filter can take quite a while, since + - it needs enumerating all keys in all git branches. But, the common + - case, if the second filter is needed, is for some keys to be globally + - unused, and in that case, no short-circuit is possible. + - Short-circuiting if the first filter filters all the keys handles the + - other common case. + -} excludeReferenced :: [Key] -> Annex [Key] -excludeReferenced [] = return [] -- optimisation -excludeReferenced l = do - let s = S.fromList l - !s' <- withKeysReferenced s S.delete - go s' =<< refs <$> (inRepo $ Git.Command.pipeRead [Param "show-ref"]) +excludeReferenced ks = runfilter firstlevel ks >>= runfilter secondlevel where - -- Skip the git-annex branches, and get all other unique refs. - refs = map (Git.Ref . snd) . - nubBy uniqref . - filter ourbranches . - map (separate (== ' ')) . lines - uniqref (a, _) (b, _) = a == b - ourbranchend = '/' : show Annex.Branch.name - ourbranches (_, b) = not $ ourbranchend `isSuffixOf` b - go s [] = return $ S.toList s - go s (r:rs) - | s == S.empty = return [] -- optimisation - | otherwise = do - s' <- withKeysReferencedInGit r s S.delete - go s' rs + runfilter _ [] = return [] -- optimisation + runfilter a l = bloomFilter show l <$> genBloomFilter show a + firstlevel = withKeysReferencedM + secondlevel = withKeysReferencedInGit {- Finds items in the first, smaller list, that are not - present in the second, larger list. @@ -174,10 +183,58 @@ exclude smaller larger = S.toList $ remove larger $ S.fromList smaller where remove a b = foldl (flip S.delete) b a -{- Given an initial value, mutates it using an action for each - - key referenced by symlinks in the git repo. -} +{- A bloom filter capable of holding half a million keys with a + - false positive rate of 1 in 1000 uses around 8 mb of memory, + - so will easily fit on even my lowest memory systems. + -} +bloomCapacity :: Annex Int +bloomCapacity = fromMaybe 500000 . readish + <$> fromRepo (Git.Config.get "annex.bloomcapacity" "") +bloomAccuracy :: Annex Int +bloomAccuracy = fromMaybe 1000 . readish + <$> fromRepo (Git.Config.get "annex.bloomaccuracy" "") +bloomBitsHashes :: Annex (Int, Int) +bloomBitsHashes = do + capacity <- bloomCapacity + accuracy <- bloomAccuracy + return $ suggestSizing capacity (1/ fromIntegral accuracy) + +{- Creates a bloom filter, and runs an action, such as withKeysReferenced, + - to populate it. + - + - The action is passed a callback that it can use to feed values into the + - bloom filter. + - + - Once the action completes, the mutable filter is frozen + - for later use. + -} +genBloomFilter :: Hashable t => (v -> t) -> ((v -> Annex ()) -> Annex b) -> Annex (Bloom t) +genBloomFilter convert populate = do + (numbits, numhashes) <- bloomBitsHashes + bloom <- lift $ newMB (cheapHashes numhashes) numbits + _ <- populate $ \v -> lift $ insertMB bloom (convert v) + lift $ unsafeFreezeMB bloom + where + lift = liftIO . stToIO + +bloomFilter :: Hashable t => (v -> t) -> [v] -> Bloom t -> [v] +bloomFilter convert l bloom = filter (\k -> convert k `notElemB` bloom) l + +{- Given an initial value, folds it with each key referenced by + - symlinks in the git repo. -} withKeysReferenced :: v -> (Key -> v -> v) -> Annex v -withKeysReferenced initial a = go initial =<< files +withKeysReferenced initial a = withKeysReferenced' initial folda + where + folda k v = return $ a k v + +{- Runs an action on each referenced key in the git repo. -} +withKeysReferencedM :: (Key -> Annex ()) -> Annex () +withKeysReferencedM a = withKeysReferenced' () calla + where + calla k _ = a k + +withKeysReferenced' :: v -> (Key -> v -> Annex v) -> Annex v +withKeysReferenced' initial a = go initial =<< files where files = do top <- fromRepo Git.workTree @@ -188,24 +245,39 @@ withKeysReferenced initial a = go initial =<< files case x of Nothing -> go v fs Just (k, _) -> do - let !v' = a k v + !v' <- a k v go v' fs -withKeysReferencedInGit :: Git.Ref -> v -> (Key -> v -> v) -> Annex v -withKeysReferencedInGit ref initial a = do + +withKeysReferencedInGit :: (Key -> Annex ()) -> Annex () +withKeysReferencedInGit a = do + rs <- relevantrefs <$> showref + forM_ rs (withKeysReferencedInGitRef a) + where + showref = inRepo $ Git.Command.pipeRead [Param "show-ref"] + relevantrefs = map (Git.Ref . snd) . + nubBy uniqref . + filter ourbranches . + map (separate (== ' ')) . lines + uniqref (x, _) (y, _) = x == y + ourbranchend = '/' : show Annex.Branch.name + ourbranches (_, b) = not $ ourbranchend `isSuffixOf` b + +withKeysReferencedInGitRef :: (Key -> Annex ()) -> Git.Ref -> Annex () +withKeysReferencedInGitRef a ref = do showAction $ "checking " ++ Git.Ref.describe ref - go initial =<< inRepo (LsTree.lsTree ref) + go =<< inRepo (LsTree.lsTree ref) where - go v [] = return v - go v (l:ls) + go [] = return () + go (l:ls) | isSymLink (LsTree.mode l) = do content <- L.decodeUtf8 <$> catFile ref (LsTree.file l) case fileKey (takeFileName $ L.unpack content) of - Nothing -> go v ls + Nothing -> go ls Just k -> do - let !v' = a k v - go v' ls - | otherwise = go v ls + a k + go ls + | otherwise = go ls {- Looks in the specified directory for bad/tmp keys, and returns a list - of those that might still have value, or might be stale and removable. diff --git a/debian/changelog b/debian/changelog index 8fc2cc330..fd5838683 100644 --- a/debian/changelog +++ b/debian/changelog @@ -8,6 +8,14 @@ git-annex (3.20120310) UNRELEASED; urgency=low are present in the annex in memory. * status: Fixed to run in constant space. * status: More accurate display of sizes of tmp and bad keys. + * unused: Now uses a bloom filter, and runs in constant space. + Use of a bloom filter does mean it will not notice a small + number of unused keys. For repos with up to half a million keys, + it will miss one key in 1000. + * Added annex.bloomcapacity and annex.bloomaccuracy, which can be + adjusted as desired to tune the bloom filter. + * status: Display about of memory used by bloom filter, and + detect then it's too small for the number of keys in a repository. -- Joey Hess <joeyh@debian.org> Sat, 10 Mar 2012 14:03:22 -0400 diff --git a/debian/control b/debian/control index 8ea1a6259..a73433c2a 100644 --- a/debian/control +++ b/debian/control @@ -18,6 +18,7 @@ Build-Depends: libghc-lifted-base-dev, libghc-json-dev, libghc-ifelse-dev, + libghc-bloomfilter-dev, ikiwiki, perlmagick, git, diff --git a/doc/git-annex.mdwn b/doc/git-annex.mdwn index a941d4420..10899d12c 100644 --- a/doc/git-annex.mdwn +++ b/doc/git-annex.mdwn @@ -598,6 +598,23 @@ Here are all the supported configuration settings. of memory and are working with very large numbers of files, increasing the queue size can speed it up. +* `annex.bloomcapacity` + + The `git annex unused` command uses a bloom filter to determine + what data is no longer used. The default bloom filter is sized to handle + up to 500000 keys. If your repository is larger than that, + you can adjust this to avoid `git annex unused` not noticing some unused + data files. Increasing this will make `git-annex unused` consume more memory; + run `git annex status` for memory usage numbers. + +* `annex.bloomaccuracy` + + Adjusts the accuracy of the bloom filter used by + `git annex unused`. The default accuracy is 1000 -- + 1 unused file out of 1000 will be missed by `git annex unused`. Increasing + the accuracy will make `git annex unused` consume more memory; + run `git annex status` for memory usage numbers. + * `annex.version` Automatically maintained, and used to automate upgrades between versions. diff --git a/doc/install.mdwn b/doc/install.mdwn index 8de24d40d..0698a8bc4 100644 --- a/doc/install.mdwn +++ b/doc/install.mdwn @@ -35,6 +35,7 @@ To build and use git-annex, you will need: * [hS3](http://hackage.haskell.org/package/hS3) * [json](http://hackage.haskell.org/package/json) * [IfElse](http://hackage.haskell.org/package/IfElse) + * [bloomfilter](http://hackage.haskell.org/package/bloomfilter) * Shell commands * [git](http://git-scm.com/) * [uuid](http://www.ossp.org/pkg/lib/uuid/) diff --git a/doc/todo/git-annex_unused_eats_memory.mdwn b/doc/todo/git-annex_unused_eats_memory.mdwn index caaecfa8c..760a6ccf5 100644 --- a/doc/todo/git-annex_unused_eats_memory.mdwn +++ b/doc/todo/git-annex_unused_eats_memory.mdwn @@ -22,6 +22,7 @@ miss finding some unused keys. The probability/size of filter could be tunable. > Fixed in `bloom` branch in git. --[[Joey]] +>> [[done]]! --[[Joey]] Another way might be to scan the git log for files that got removed or changed what key they pointed to. Correlate with keys with content diff --git a/git-annex.cabal b/git-annex.cabal index 6efebc66e..278d87555 100644 --- a/git-annex.cabal +++ b/git-annex.cabal @@ -32,7 +32,7 @@ Executable git-annex unix, containers, utf8-string, network, mtl, bytestring, old-locale, time, pcre-light, extensible-exceptions, dataenc, SHA, process, hs3, json, HTTP, base >= 4.5, base < 5, monad-control, transformers-base, lifted-base, - IfElse, text, QuickCheck >= 2.1 + IfElse, text, QuickCheck >= 2.1, bloomfilter Other-Modules: Utility.StatFS, Utility.Touch Executable git-annex-shell |