diff options
Diffstat (limited to 'Command')
-rw-r--r-- | Command/Status.hs | 19 | ||||
-rw-r--r-- | Command/Unused.hs | 138 |
2 files changed, 123 insertions, 34 deletions
diff --git a/Command/Status.hs b/Command/Status.hs index 96345e92b..39e71e750 100644 --- a/Command/Status.hs +++ b/Command/Status.hs @@ -76,6 +76,7 @@ slow_stats = , local_annex_size , known_annex_keys , known_annex_size + , bloom_info , backend_usage ] @@ -127,7 +128,7 @@ remote_list level desc = stat n $ nojson $ lift $ do return $ if null s then "0" else show (length rs) ++ "\n" ++ beginning s where n = desc ++ " repositories" - + local_annex_size :: Stat local_annex_size = stat "local annex size" $ json id $ showSizeKeys <$> cachedPresentData @@ -136,6 +137,22 @@ local_annex_keys :: Stat local_annex_keys = stat "local annex keys" $ json show $ countKeys <$> cachedPresentData +bloom_info :: Stat +bloom_info = stat "bloom filter size" $ json id $ do + localkeys <- countKeys <$> cachedPresentData + capacity <- fromIntegral <$> lift Command.Unused.bloomCapacity + let note = aside $ + if localkeys >= capacity + then "appears too small for this repository; adjust annex.bloomcapacity" + else "has room for " ++ show (capacity - localkeys) ++ " more local annex keys" + + -- Two bloom filters are used at the same time, so double the size + -- of one. + size <- roughSize memoryUnits True . (* 2) . fromIntegral . fst <$> + lift Command.Unused.bloomBitsHashes + + return $ size ++ note + known_annex_size :: Stat known_annex_size = stat "known annex size" $ json id $ showSizeKeys <$> cachedReferencedData diff --git a/Command/Unused.hs b/Command/Unused.hs index 71cbfd470..b878ab265 100644 --- a/Command/Unused.hs +++ b/Command/Unused.hs @@ -12,6 +12,10 @@ module Command.Unused where import qualified Data.Set as S import qualified Data.Text.Lazy as L import qualified Data.Text.Lazy.Encoding as L +import Data.BloomFilter +import Data.BloomFilter.Easy +import Data.BloomFilter.Hash +import Control.Monad.ST import Common.Annex import Command @@ -25,6 +29,7 @@ import qualified Git.Command import qualified Git.Ref import qualified Git.LsFiles as LsFiles import qualified Git.LsTree as LsTree +import qualified Git.Config import qualified Backend import qualified Remote import qualified Annex.Branch @@ -139,28 +144,32 @@ dropMsg (Just r) = dropMsg' $ " --from " ++ Remote.name r dropMsg' :: String -> String dropMsg' s = "\nTo remove unwanted data: git-annex dropunused" ++ s ++ " NUMBER\n" -{- Finds keys in the list that are not referenced in the git repository. -} +{- Finds keys in the list that are not referenced in the git repository. + - + - Strategy: + - + - * Build a bloom filter of all keys referenced by symlinks. This + - is the fastest one to build and will filter out most keys. + - * If keys remain, build a second bloom filter of keys referenced by + - all branches. + - * The list is streamed through these bloom filters lazily, so both will + - exist at the same time. This means that twice the memory is used, + - but they're relatively small, so the added complexity of using a + - mutable bloom filter does not seem worthwhile. + - * Generating the second bloom filter can take quite a while, since + - it needs enumerating all keys in all git branches. But, the common + - case, if the second filter is needed, is for some keys to be globally + - unused, and in that case, no short-circuit is possible. + - Short-circuiting if the first filter filters all the keys handles the + - other common case. + -} excludeReferenced :: [Key] -> Annex [Key] -excludeReferenced [] = return [] -- optimisation -excludeReferenced l = do - let s = S.fromList l - !s' <- withKeysReferenced s S.delete - go s' =<< refs <$> (inRepo $ Git.Command.pipeRead [Param "show-ref"]) +excludeReferenced ks = runfilter firstlevel ks >>= runfilter secondlevel where - -- Skip the git-annex branches, and get all other unique refs. - refs = map (Git.Ref . snd) . - nubBy uniqref . - filter ourbranches . - map (separate (== ' ')) . lines - uniqref (a, _) (b, _) = a == b - ourbranchend = '/' : show Annex.Branch.name - ourbranches (_, b) = not $ ourbranchend `isSuffixOf` b - go s [] = return $ S.toList s - go s (r:rs) - | s == S.empty = return [] -- optimisation - | otherwise = do - s' <- withKeysReferencedInGit r s S.delete - go s' rs + runfilter _ [] = return [] -- optimisation + runfilter a l = bloomFilter show l <$> genBloomFilter show a + firstlevel = withKeysReferencedM + secondlevel = withKeysReferencedInGit {- Finds items in the first, smaller list, that are not - present in the second, larger list. @@ -174,10 +183,58 @@ exclude smaller larger = S.toList $ remove larger $ S.fromList smaller where remove a b = foldl (flip S.delete) b a -{- Given an initial value, mutates it using an action for each - - key referenced by symlinks in the git repo. -} +{- A bloom filter capable of holding half a million keys with a + - false positive rate of 1 in 1000 uses around 8 mb of memory, + - so will easily fit on even my lowest memory systems. + -} +bloomCapacity :: Annex Int +bloomCapacity = fromMaybe 500000 . readish + <$> fromRepo (Git.Config.get "annex.bloomcapacity" "") +bloomAccuracy :: Annex Int +bloomAccuracy = fromMaybe 1000 . readish + <$> fromRepo (Git.Config.get "annex.bloomaccuracy" "") +bloomBitsHashes :: Annex (Int, Int) +bloomBitsHashes = do + capacity <- bloomCapacity + accuracy <- bloomAccuracy + return $ suggestSizing capacity (1/ fromIntegral accuracy) + +{- Creates a bloom filter, and runs an action, such as withKeysReferenced, + - to populate it. + - + - The action is passed a callback that it can use to feed values into the + - bloom filter. + - + - Once the action completes, the mutable filter is frozen + - for later use. + -} +genBloomFilter :: Hashable t => (v -> t) -> ((v -> Annex ()) -> Annex b) -> Annex (Bloom t) +genBloomFilter convert populate = do + (numbits, numhashes) <- bloomBitsHashes + bloom <- lift $ newMB (cheapHashes numhashes) numbits + _ <- populate $ \v -> lift $ insertMB bloom (convert v) + lift $ unsafeFreezeMB bloom + where + lift = liftIO . stToIO + +bloomFilter :: Hashable t => (v -> t) -> [v] -> Bloom t -> [v] +bloomFilter convert l bloom = filter (\k -> convert k `notElemB` bloom) l + +{- Given an initial value, folds it with each key referenced by + - symlinks in the git repo. -} withKeysReferenced :: v -> (Key -> v -> v) -> Annex v -withKeysReferenced initial a = go initial =<< files +withKeysReferenced initial a = withKeysReferenced' initial folda + where + folda k v = return $ a k v + +{- Runs an action on each referenced key in the git repo. -} +withKeysReferencedM :: (Key -> Annex ()) -> Annex () +withKeysReferencedM a = withKeysReferenced' () calla + where + calla k _ = a k + +withKeysReferenced' :: v -> (Key -> v -> Annex v) -> Annex v +withKeysReferenced' initial a = go initial =<< files where files = do top <- fromRepo Git.workTree @@ -188,24 +245,39 @@ withKeysReferenced initial a = go initial =<< files case x of Nothing -> go v fs Just (k, _) -> do - let !v' = a k v + !v' <- a k v go v' fs -withKeysReferencedInGit :: Git.Ref -> v -> (Key -> v -> v) -> Annex v -withKeysReferencedInGit ref initial a = do + +withKeysReferencedInGit :: (Key -> Annex ()) -> Annex () +withKeysReferencedInGit a = do + rs <- relevantrefs <$> showref + forM_ rs (withKeysReferencedInGitRef a) + where + showref = inRepo $ Git.Command.pipeRead [Param "show-ref"] + relevantrefs = map (Git.Ref . snd) . + nubBy uniqref . + filter ourbranches . + map (separate (== ' ')) . lines + uniqref (x, _) (y, _) = x == y + ourbranchend = '/' : show Annex.Branch.name + ourbranches (_, b) = not $ ourbranchend `isSuffixOf` b + +withKeysReferencedInGitRef :: (Key -> Annex ()) -> Git.Ref -> Annex () +withKeysReferencedInGitRef a ref = do showAction $ "checking " ++ Git.Ref.describe ref - go initial =<< inRepo (LsTree.lsTree ref) + go =<< inRepo (LsTree.lsTree ref) where - go v [] = return v - go v (l:ls) + go [] = return () + go (l:ls) | isSymLink (LsTree.mode l) = do content <- L.decodeUtf8 <$> catFile ref (LsTree.file l) case fileKey (takeFileName $ L.unpack content) of - Nothing -> go v ls + Nothing -> go ls Just k -> do - let !v' = a k v - go v' ls - | otherwise = go v ls + a k + go ls + | otherwise = go ls {- Looks in the specified directory for bad/tmp keys, and returns a list - of those that might still have value, or might be stale and removable. |