From b54de1dad4874b7561d2c5a345954b6b5c594078 Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Fri, 4 Jul 2014 15:28:07 -0400 Subject: Fix memory leak when committing millions of changes to the git-annex branch Eg after git-annex add has run on 2 million files in one go. Slightly unhappy with the need to use a temp file here, but I cannot see any other alternative (see comments on the bug report). This commit was sponsored by Hamish Coleman. --- Annex/Branch.hs | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) (limited to 'Annex/Branch.hs') diff --git a/Annex/Branch.hs b/Annex/Branch.hs index 3443730d2..5415876f8 100644 --- a/Annex/Branch.hs +++ b/Annex/Branch.hs @@ -390,18 +390,34 @@ stageJournal jl = withIndex $ do g <- gitRepo let dir = gitAnnexJournalDir g fs <- getJournalFiles jl + (jlogf, jlogh) <- openjlog liftIO $ do h <- hashObjectStart g Git.UpdateIndex.streamUpdateIndex g - [genstream dir h fs] + [genstream dir h fs jlogh] hashObjectStop h - return $ liftIO $ mapM_ (removeFile . (dir )) fs + return $ cleanup dir jlogh jlogf where - genstream dir h fs streamer = forM_ fs $ \file -> do + genstream dir h fs jlogh streamer = forM_ fs $ \file -> do let path = dir file sha <- hashFile h path + hPutStrLn jlogh file streamer $ Git.UpdateIndex.updateIndexLine sha FileBlob (asTopFilePath $ fileJournal file) + -- Clean up the staged files, as listed in the temp log file. + -- The temp file is used to avoid needing to buffer all the + -- filenames in memory. + cleanup dir jlogh jlogf = do + hFlush jlogh + hSeek jlogh AbsoluteSeek 0 + stagedfs <- lines <$> hGetContents jlogh + mapM_ (removeFile . (dir )) stagedfs + hClose jlogh + nukeFile jlogf + openjlog = do + tmpdir <- fromRepo gitAnnexTmpMiscDir + createAnnexDirectory tmpdir + liftIO $ openTempFile tmpdir "jlog" {- This is run after the refs have been merged into the index, - but before the result is committed to the branch. -- cgit v1.2.3