summaryrefslogtreecommitdiff
path: root/Command
diff options
context:
space:
mode:
Diffstat (limited to 'Command')
-rw-r--r--Command/Status.hs19
-rw-r--r--Command/Unused.hs138
2 files changed, 123 insertions, 34 deletions
diff --git a/Command/Status.hs b/Command/Status.hs
index 96345e92b..39e71e750 100644
--- a/Command/Status.hs
+++ b/Command/Status.hs
@@ -76,6 +76,7 @@ slow_stats =
, local_annex_size
, known_annex_keys
, known_annex_size
+ , bloom_info
, backend_usage
]
@@ -127,7 +128,7 @@ remote_list level desc = stat n $ nojson $ lift $ do
return $ if null s then "0" else show (length rs) ++ "\n" ++ beginning s
where
n = desc ++ " repositories"
-
+
local_annex_size :: Stat
local_annex_size = stat "local annex size" $ json id $
showSizeKeys <$> cachedPresentData
@@ -136,6 +137,22 @@ local_annex_keys :: Stat
local_annex_keys = stat "local annex keys" $ json show $
countKeys <$> cachedPresentData
+bloom_info :: Stat
+bloom_info = stat "bloom filter size" $ json id $ do
+ localkeys <- countKeys <$> cachedPresentData
+ capacity <- fromIntegral <$> lift Command.Unused.bloomCapacity
+ let note = aside $
+ if localkeys >= capacity
+ then "appears too small for this repository; adjust annex.bloomcapacity"
+ else "has room for " ++ show (capacity - localkeys) ++ " more local annex keys"
+
+ -- Two bloom filters are used at the same time, so double the size
+ -- of one.
+ size <- roughSize memoryUnits True . (* 2) . fromIntegral . fst <$>
+ lift Command.Unused.bloomBitsHashes
+
+ return $ size ++ note
+
known_annex_size :: Stat
known_annex_size = stat "known annex size" $ json id $
showSizeKeys <$> cachedReferencedData
diff --git a/Command/Unused.hs b/Command/Unused.hs
index 71cbfd470..b878ab265 100644
--- a/Command/Unused.hs
+++ b/Command/Unused.hs
@@ -12,6 +12,10 @@ module Command.Unused where
import qualified Data.Set as S
import qualified Data.Text.Lazy as L
import qualified Data.Text.Lazy.Encoding as L
+import Data.BloomFilter
+import Data.BloomFilter.Easy
+import Data.BloomFilter.Hash
+import Control.Monad.ST
import Common.Annex
import Command
@@ -25,6 +29,7 @@ import qualified Git.Command
import qualified Git.Ref
import qualified Git.LsFiles as LsFiles
import qualified Git.LsTree as LsTree
+import qualified Git.Config
import qualified Backend
import qualified Remote
import qualified Annex.Branch
@@ -139,28 +144,32 @@ dropMsg (Just r) = dropMsg' $ " --from " ++ Remote.name r
dropMsg' :: String -> String
dropMsg' s = "\nTo remove unwanted data: git-annex dropunused" ++ s ++ " NUMBER\n"
-{- Finds keys in the list that are not referenced in the git repository. -}
+{- Finds keys in the list that are not referenced in the git repository.
+ -
+ - Strategy:
+ -
+ - * Build a bloom filter of all keys referenced by symlinks. This
+ - is the fastest one to build and will filter out most keys.
+ - * If keys remain, build a second bloom filter of keys referenced by
+ - all branches.
+ - * The list is streamed through these bloom filters lazily, so both will
+ - exist at the same time. This means that twice the memory is used,
+ - but they're relatively small, so the added complexity of using a
+ - mutable bloom filter does not seem worthwhile.
+ - * Generating the second bloom filter can take quite a while, since
+ - it needs enumerating all keys in all git branches. But, the common
+ - case, if the second filter is needed, is for some keys to be globally
+ - unused, and in that case, no short-circuit is possible.
+ - Short-circuiting if the first filter filters all the keys handles the
+ - other common case.
+ -}
excludeReferenced :: [Key] -> Annex [Key]
-excludeReferenced [] = return [] -- optimisation
-excludeReferenced l = do
- let s = S.fromList l
- !s' <- withKeysReferenced s S.delete
- go s' =<< refs <$> (inRepo $ Git.Command.pipeRead [Param "show-ref"])
+excludeReferenced ks = runfilter firstlevel ks >>= runfilter secondlevel
where
- -- Skip the git-annex branches, and get all other unique refs.
- refs = map (Git.Ref . snd) .
- nubBy uniqref .
- filter ourbranches .
- map (separate (== ' ')) . lines
- uniqref (a, _) (b, _) = a == b
- ourbranchend = '/' : show Annex.Branch.name
- ourbranches (_, b) = not $ ourbranchend `isSuffixOf` b
- go s [] = return $ S.toList s
- go s (r:rs)
- | s == S.empty = return [] -- optimisation
- | otherwise = do
- s' <- withKeysReferencedInGit r s S.delete
- go s' rs
+ runfilter _ [] = return [] -- optimisation
+ runfilter a l = bloomFilter show l <$> genBloomFilter show a
+ firstlevel = withKeysReferencedM
+ secondlevel = withKeysReferencedInGit
{- Finds items in the first, smaller list, that are not
- present in the second, larger list.
@@ -174,10 +183,58 @@ exclude smaller larger = S.toList $ remove larger $ S.fromList smaller
where
remove a b = foldl (flip S.delete) b a
-{- Given an initial value, mutates it using an action for each
- - key referenced by symlinks in the git repo. -}
+{- A bloom filter capable of holding half a million keys with a
+ - false positive rate of 1 in 1000 uses around 8 mb of memory,
+ - so will easily fit on even my lowest memory systems.
+ -}
+bloomCapacity :: Annex Int
+bloomCapacity = fromMaybe 500000 . readish
+ <$> fromRepo (Git.Config.get "annex.bloomcapacity" "")
+bloomAccuracy :: Annex Int
+bloomAccuracy = fromMaybe 1000 . readish
+ <$> fromRepo (Git.Config.get "annex.bloomaccuracy" "")
+bloomBitsHashes :: Annex (Int, Int)
+bloomBitsHashes = do
+ capacity <- bloomCapacity
+ accuracy <- bloomAccuracy
+ return $ suggestSizing capacity (1/ fromIntegral accuracy)
+
+{- Creates a bloom filter, and runs an action, such as withKeysReferenced,
+ - to populate it.
+ -
+ - The action is passed a callback that it can use to feed values into the
+ - bloom filter.
+ -
+ - Once the action completes, the mutable filter is frozen
+ - for later use.
+ -}
+genBloomFilter :: Hashable t => (v -> t) -> ((v -> Annex ()) -> Annex b) -> Annex (Bloom t)
+genBloomFilter convert populate = do
+ (numbits, numhashes) <- bloomBitsHashes
+ bloom <- lift $ newMB (cheapHashes numhashes) numbits
+ _ <- populate $ \v -> lift $ insertMB bloom (convert v)
+ lift $ unsafeFreezeMB bloom
+ where
+ lift = liftIO . stToIO
+
+bloomFilter :: Hashable t => (v -> t) -> [v] -> Bloom t -> [v]
+bloomFilter convert l bloom = filter (\k -> convert k `notElemB` bloom) l
+
+{- Given an initial value, folds it with each key referenced by
+ - symlinks in the git repo. -}
withKeysReferenced :: v -> (Key -> v -> v) -> Annex v
-withKeysReferenced initial a = go initial =<< files
+withKeysReferenced initial a = withKeysReferenced' initial folda
+ where
+ folda k v = return $ a k v
+
+{- Runs an action on each referenced key in the git repo. -}
+withKeysReferencedM :: (Key -> Annex ()) -> Annex ()
+withKeysReferencedM a = withKeysReferenced' () calla
+ where
+ calla k _ = a k
+
+withKeysReferenced' :: v -> (Key -> v -> Annex v) -> Annex v
+withKeysReferenced' initial a = go initial =<< files
where
files = do
top <- fromRepo Git.workTree
@@ -188,24 +245,39 @@ withKeysReferenced initial a = go initial =<< files
case x of
Nothing -> go v fs
Just (k, _) -> do
- let !v' = a k v
+ !v' <- a k v
go v' fs
-withKeysReferencedInGit :: Git.Ref -> v -> (Key -> v -> v) -> Annex v
-withKeysReferencedInGit ref initial a = do
+
+withKeysReferencedInGit :: (Key -> Annex ()) -> Annex ()
+withKeysReferencedInGit a = do
+ rs <- relevantrefs <$> showref
+ forM_ rs (withKeysReferencedInGitRef a)
+ where
+ showref = inRepo $ Git.Command.pipeRead [Param "show-ref"]
+ relevantrefs = map (Git.Ref . snd) .
+ nubBy uniqref .
+ filter ourbranches .
+ map (separate (== ' ')) . lines
+ uniqref (x, _) (y, _) = x == y
+ ourbranchend = '/' : show Annex.Branch.name
+ ourbranches (_, b) = not $ ourbranchend `isSuffixOf` b
+
+withKeysReferencedInGitRef :: (Key -> Annex ()) -> Git.Ref -> Annex ()
+withKeysReferencedInGitRef a ref = do
showAction $ "checking " ++ Git.Ref.describe ref
- go initial =<< inRepo (LsTree.lsTree ref)
+ go =<< inRepo (LsTree.lsTree ref)
where
- go v [] = return v
- go v (l:ls)
+ go [] = return ()
+ go (l:ls)
| isSymLink (LsTree.mode l) = do
content <- L.decodeUtf8 <$> catFile ref (LsTree.file l)
case fileKey (takeFileName $ L.unpack content) of
- Nothing -> go v ls
+ Nothing -> go ls
Just k -> do
- let !v' = a k v
- go v' ls
- | otherwise = go v ls
+ a k
+ go ls
+ | otherwise = go ls
{- Looks in the specified directory for bad/tmp keys, and returns a list
- of those that might still have value, or might be stale and removable.