summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Joey Hess <joey@kitenet.net>, 2014-03-10 15:12:54 -0400
committer: Joey Hess <joey@kitenet.net>, 2014-03-10 15:14:09 -0400
commit: b7ac0db88f367ceb1df863c35c44d4a6e0f51a1e (patch)
tree: 14fb3dbd1201a879cfd2eb79e2c2d3dde30fc27b
parent: b590db53ef36f9446d32cf29423b658697cdf6b7 (diff)
Improve memory usage when git fsck finds a great many broken objects.
Memory usage drops from 1.7 GB to 900 MB when fsck reports 300 thousand unique shas. When the reported shas are not unique, this streams much better than before, so it won't buffer the full list before putting the shas into the Set and throwing away the duplicates. And when the fsck output includes ignorable lines, especially dangling object lines, they won't be buffered in memory at all.
-rw-r--r-- Git/Fsck.hs 20
-rw-r--r-- debian/changelog 2
2 files changed, 19 insertions, 3 deletions
diff --git a/Git/Fsck.hs b/Git/Fsck.hs
index e90683bc0..b3948cb1d 100644
--- a/Git/Fsck.hs
+++ b/Git/Fsck.hs
@@ -23,6 +23,7 @@ import Utility.Batch
import qualified Git.Version
import qualified Data.Set as S
+import System.Process (std_out, std_err)
type MissingObjects = S.Set Sha
@@ -46,9 +47,17 @@ findBroken batchmode r = do
(command', params') <- if batchmode
then toBatchCommand (command, params)
else return (command, params)
- (output, fsckok) <- processTranscript command' (toCommand params') Nothing
- let objs = findShas supportsNoDangling output
- badobjs <- findMissing objs r
+
+ p@(_, _, _, pid) <- createProcess $
+ (proc command' (toCommand params'))
+ { std_out = CreatePipe
+ , std_err = CreatePipe
+ }
+ bad1 <- readMissingObjs r supportsNoDangling (stdoutHandle p)
+ bad2 <- readMissingObjs r supportsNoDangling (stderrHandle p)
+ fsckok <- checkSuccessProcess pid
+ let badobjs = S.union bad1 bad2
+
if S.null badobjs && not fsckok
then return FsckFailed
else return $ FsckFoundMissing badobjs
@@ -69,6 +78,11 @@ knownMissing (FsckFoundMissing s) = s
findMissing :: [Sha] -> Repo -> IO MissingObjects
findMissing objs r = S.fromList <$> filterM (`isMissing` r) objs
+readMissingObjs :: Repo -> Bool -> Handle -> IO MissingObjects
+readMissingObjs r supportsNoDangling h = do
+ objs <- findShas supportsNoDangling <$> hGetContents h
+ findMissing objs r
+
isMissing :: Sha -> Repo -> IO Bool
isMissing s r = either (const True) (const False) <$> tryIO dump
where
diff --git a/debian/changelog b/debian/changelog
index 26153c5fd..7ff502ad3 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -5,6 +5,8 @@ git-annex (5.20140307) UNRELEASED; urgency=medium
* webapp: Added a "Sync now" item to each repository's menu.
* unused: In direct mode, files that are deleted from the work tree
are no longer incorrectly detected as unused.
+ * repair: Improve memory usage when git fsck finds a great many broken
+ objects.
-- Joey Hess <joeyh@debian.org> Thu, 06 Mar 2014 16:17:01 -0400