summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorGravatar Joey Hess <joeyh@joeyh.name>2016-01-06 20:38:02 -0400
committerGravatar Joey Hess <joeyh@joeyh.name>2016-01-06 20:38:02 -0400
commit6dab3f4271138bf732caa37338e79f96863923a3 (patch)
treed7ad969f0ab5c7294971b0a584d8cad82ead0659
parent336a5da9097f9f2d6aa92188a05c5e7658f5a087 (diff)
optimise
d1ce927d95fe7c331cbff3317797a60aa288738b put a cat-file into the fast bloomfilter generation path. Instead, add another bloom filter which diffs from the work tree to the index. Also, pull the sha of the changed object out of the diffs, and cat that object directly, rather than indirecting through the filename. Finally, removed some hacks that are unncessary thanks to the worktree to index diff.
-rw-r--r--Command/Unused.hs101
-rw-r--r--Git/DiffTree.hs5
2 files changed, 54 insertions, 52 deletions
diff --git a/Command/Unused.hs b/Command/Unused.hs
index 1e404ac9f..de91951b4 100644
--- a/Command/Unused.hs
+++ b/Command/Unused.hs
@@ -30,8 +30,8 @@ import Annex.Link
import Annex.CatFile
import Types.Key
import Types.RefSpec
-import Git.FilePath
import Git.Types
+import Git.Sha
import Logs.View (is_branchView)
import Annex.BloomFilter
@@ -158,28 +158,23 @@ dropMsg' s = "\nTo remove unwanted data: git-annex dropunused" ++ s ++ " NUMBER\
-
- Strategy:
-
- - * Build a bloom filter of all keys referenced by symlinks. This
- - is the fastest one to build and will filter out most keys.
- - * If keys remain, build a second bloom filter of keys referenced by
- - branches maching the RefSpec.
- - * The list is streamed through these bloom filters lazily, so both will
- - exist at the same time. This means that twice the memory is used,
- - but they're relatively small, so the added complexity of using a
- - mutable bloom filter does not seem worthwhile.
- - * Generating the second bloom filter can take quite a while, since
- - it needs enumerating all keys in all git branches. But, the common
- - case, if the second filter is needed, is for some keys to be globally
- - unused, and in that case, no short-circuit is possible.
- - Short-circuiting if the first filter filters all the keys handles the
- - other common case.
+ - Pass keys through 3 bloom filters in order, only creating each bloom
+ - filter on demand if the previous one didn't filter out all keys.
+ -
+ - 1. All keys referenced by files in the work tree.
+ - This is the fastest one to build and will filter out most keys.
+ - 2. All keys in the diff from the work tree to the index.
+ - 3. All keys in the diffs between the index and branches matching the
+ - RefSpec. (This can take quite a while).
-}
excludeReferenced :: RefSpec -> [Key] -> Annex [Key]
-excludeReferenced refspec ks = runfilter firstlevel ks >>= runfilter secondlevel
+excludeReferenced refspec ks =
+ runfilter withKeysReferencedM ks
+ >>= runfilter withKeysReferencedDiffIndex
+ >>= runfilter (withKeysReferencedDiffGitRefs refspec)
where
runfilter _ [] = return [] -- optimisation
runfilter a l = bloomFilter l <$> genBloomFilter a
- firstlevel = withKeysReferencedM
- secondlevel = withKeysReferencedInGit refspec
{- Given an initial value, folds it with each key referenced by
- files in the working tree. -}
@@ -218,7 +213,6 @@ withKeysReferenced' mdir initial a = do
mk <- getM id
[ isAnnexLink f
, liftIO (isPointerFile f)
- , catKeyFile f
]
case mk of
Nothing -> go v fs
@@ -226,53 +220,56 @@ withKeysReferenced' mdir initial a = do
!v' <- a k f v
go v' fs
-withKeysReferencedInGit :: RefSpec -> (Key -> Annex ()) -> Annex ()
-withKeysReferencedInGit refspec a = do
- current <- inRepo Git.Branch.currentUnsafe
- shaHead <- maybe (return Nothing) (inRepo . Git.Ref.sha) current
- rs <- relevantrefs (shaHead, current)
- <$> inRepo (Git.Command.pipeReadStrict [Param "show-ref"])
- usedrefs <- applyRefSpec refspec rs (getreflog rs)
- forM_ usedrefs $
- withKeysReferencedInGitRef a
+withKeysReferencedDiffGitRefs :: RefSpec -> (Key -> Annex ()) -> Annex ()
+withKeysReferencedDiffGitRefs refspec a = do
+ rs <- relevantrefs <$> inRepo (Git.Command.pipeReadStrict [Param "show-ref"])
+ shaHead <- maybe (return Nothing) (inRepo . Git.Ref.sha)
+ =<< inRepo Git.Branch.currentUnsafe
+ let haveHead = any (\(shaRef, _) -> Just shaRef == shaHead) rs
+ let rs' = map snd (nubRefs rs)
+ usedrefs <- applyRefSpec refspec rs' (getreflog rs')
+ forM_ (if haveHead then usedrefs else Git.Ref.headRef : usedrefs) $
+ withKeysReferencedDiffGitRef a
where
- relevantrefs headRef = addHead headRef .
+ relevantrefs = map (\(r, h) -> (Git.Ref r, Git.Ref h)) .
filter ourbranches .
map (separate (== ' ')) .
lines
- nubRefs = map (Git.Ref . snd) . nubBy (\(x, _) (y, _) -> x == y)
+ nubRefs = nubBy (\(x, _) (y, _) -> x == y)
ourbranchend = '/' : Git.fromRef Annex.Branch.name
ourbranches (_, b) = not (ourbranchend `isSuffixOf` b)
&& not ("refs/synced/" `isPrefixOf` b)
&& not (is_branchView (Git.Ref b))
- addHead headRef refs = case headRef of
- -- if HEAD diverges from all branches (except the branch it
- -- points to), run the actions on staged keys (and keys
- -- that are only present in the work tree if the repo is
- -- non bare)
- (Just (Git.Ref x), Just (Git.Ref b))
- | all (\(x',b') -> x /= x' || b == b') refs ->
- Git.Ref.headRef
- : nubRefs (filter ((/= x) . fst) refs)
- _ -> nubRefs refs
getreflog rs = inRepo $ Git.RefLog.getMulti rs
{- Runs an action on keys referenced in the given Git reference which
- - differ from those referenced in the work tree. -}
-withKeysReferencedInGitRef :: (Key -> Annex ()) -> Git.Ref -> Annex ()
-withKeysReferencedInGitRef a ref = do
+ - differ from those referenced in the index. -}
+withKeysReferencedDiffGitRef :: (Key -> Annex ()) -> Git.Ref -> Annex ()
+withKeysReferencedDiffGitRef a ref = do
showAction $ "checking " ++ Git.Ref.describe ref
- bare <- isBareRepo
- (ts,clean) <- inRepo $ if bare
- then DiffTree.diffIndex ref
- else DiffTree.diffWorkTree ref
- let lookAtWorkingTree = not bare && ref == Git.Ref.headRef
- forM_ ts $ tKey lookAtWorkingTree >=> maybe noop a
+ withKeysReferencedDiff a
+ (inRepo $ DiffTree.diffIndex ref)
+ DiffTree.srcsha
+
+{- Runs an action on keys referenced in the index which differ from the
+ - work tree. -}
+withKeysReferencedDiffIndex :: (Key -> Annex ()) -> Annex ()
+withKeysReferencedDiffIndex a = unlessM (isBareRepo) $
+ withKeysReferencedDiff a
+ (inRepo $ DiffTree.diffFiles [])
+ DiffTree.srcsha
+
+withKeysReferencedDiff :: (Key -> Annex ()) -> (Annex ([DiffTree.DiffTreeItem], IO Bool)) -> (DiffTree.DiffTreeItem -> Sha) -> Annex ()
+withKeysReferencedDiff a getdiff extractsha = do
+ (ds, clean) <- getdiff
+ forM_ ds go
liftIO $ void clean
where
- tKey True = lookupFile . getTopFilePath . DiffTree.file
- tKey False = parseLinkOrPointer
- <$$> catFile ref . getTopFilePath . DiffTree.file
+ go d = do
+ let sha = extractsha d
+ unless (sha == nullSha) $
+ (parseLinkOrPointer <$> catObject sha)
+ >>= maybe noop a
data UnusedMaps = UnusedMaps
{ unusedMap :: UnusedMap
diff --git a/Git/DiffTree.hs b/Git/DiffTree.hs
index fecc9307c..838114872 100644
--- a/Git/DiffTree.hs
+++ b/Git/DiffTree.hs
@@ -12,6 +12,7 @@ module Git.DiffTree (
diffTreeRecursive,
diffIndex,
diffWorkTree,
+ diffFiles,
diffLog,
) where
@@ -66,6 +67,10 @@ diffIndex' ref params repo =
, return ([], return True)
)
+{- Diff between the index and work tree. -}
+diffFiles :: [CommandParam] -> Repo -> IO ([DiffTreeItem], IO Bool)
+diffFiles = getdiff (Param "diff-files")
+
{- Runs git log in --raw mode to get the changes that were made in
- a particular commit. The output format is adjusted to be the same
- as diff-tree --raw._-}