From b54de1dad4874b7561d2c5a345954b6b5c594078 Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Fri, 4 Jul 2014 15:28:07 -0400 Subject: Fix memory leak when committing millions of changes to the git-annex branch Eg after git-annex add has run on 2 million files in one go. Slightly unhappy with the need to use a temp file here, but I cannot see any other alternative (see comments on the bug report). This commit was sponsored by Hamish Coleman. --- Annex/Branch.hs | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) (limited to 'Annex/Branch.hs') diff --git a/Annex/Branch.hs b/Annex/Branch.hs index 3443730d2..5415876f8 100644 --- a/Annex/Branch.hs +++ b/Annex/Branch.hs @@ -390,18 +390,34 @@ stageJournal jl = withIndex $ do g <- gitRepo let dir = gitAnnexJournalDir g fs <- getJournalFiles jl + (jlogf, jlogh) <- openjlog liftIO $ do h <- hashObjectStart g Git.UpdateIndex.streamUpdateIndex g - [genstream dir h fs] + [genstream dir h fs jlogh] hashObjectStop h - return $ liftIO $ mapM_ (removeFile . (dir )) fs + return $ cleanup dir jlogh jlogf where - genstream dir h fs streamer = forM_ fs $ \file -> do + genstream dir h fs jlogh streamer = forM_ fs $ \file -> do let path = dir file sha <- hashFile h path + hPutStrLn jlogh file streamer $ Git.UpdateIndex.updateIndexLine sha FileBlob (asTopFilePath $ fileJournal file) + -- Clean up the staged files, as listed in the temp log file. + -- The temp file is used to avoid needing to buffer all the + -- filenames in memory. + cleanup dir jlogh jlogf = do + hFlush jlogh + hSeek jlogh AbsoluteSeek 0 + stagedfs <- lines <$> hGetContents jlogh + mapM_ (removeFile . (dir )) stagedfs + hClose jlogh + nukeFile jlogf + openjlog = do + tmpdir <- fromRepo gitAnnexTmpMiscDir + createAnnexDirectory tmpdir + liftIO $ openTempFile tmpdir "jlog" {- This is run after the refs have been merged into the index, - but before the result is committed to the branch. -- cgit v1.2.3