From 7523a9d447483f0e68d1bc1055df735389208f25 Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Wed, 9 Jul 2014 23:36:53 -0400 Subject: Fix minor FD leak in journal code. Minor because normally only 1 FD is leaked per git-annex run. However, the test suite leaks a few hundred FDs, and this broke it on the Debian autobuilders, which seem to have a tigher than usual ulimit. The leak was introduced by the lazy getDirectoryContents' that was introduced in b54de1dad4874b7561d2c5a345954b6b5c594078 in order to scale to millions of journal files -- if the lazy list was never fully consumed, the directory handle did not get closed. Instead, pull in openDirectory/readDirectory/closeDirectory code that I already developed and submitted in a patch to the haskell directory library earlier. Using this in journalDirty avoids the place that the lazy list caused a problem. And using it in stageJournal eliminates the need for getDirectoryContents'. The getJournalFiles* functions are switched back to using the regular strict getDirectoryContents. I'm not sure if those always consume the whole list, so this avoids any leak. And the things that call those are things like git annex unused, which also look at every file committed to the git-annex branch, so would need more work to scale to insane numbers of files anyway. --- Annex/Branch.hs | 23 ++++++---- Annex/Journal.hs | 19 ++++++++- Utility/Directory.hs | 116 ++++++++++++++++++++++++++++++++++----------------- debian/changelog | 6 +++ 4 files changed, 114 insertions(+), 50 deletions(-) diff --git a/Annex/Branch.hs b/Annex/Branch.hs index 5415876f8..a03d6ddf3 100644 --- a/Annex/Branch.hs +++ b/Annex/Branch.hs @@ -389,21 +389,26 @@ stageJournal jl = withIndex $ do prepareModifyIndex jl g <- gitRepo let dir = gitAnnexJournalDir g - fs <- getJournalFiles jl (jlogf, jlogh) <- openjlog - liftIO $ do + withJournalHandle $ \jh -> do h <- hashObjectStart g Git.UpdateIndex.streamUpdateIndex g - [genstream dir h fs jlogh] + [genstream dir h jh jlogh] hashObjectStop h return $ cleanup dir jlogh jlogf where - genstream dir h fs jlogh streamer = forM_ fs $ \file -> do - let path = dir file - sha <- hashFile h path - hPutStrLn jlogh file - streamer $ Git.UpdateIndex.updateIndexLine - sha FileBlob (asTopFilePath $ fileJournal file) + genstream dir h jh jlogh streamer = do + v <- readDirectory jh + case v of + Nothing -> return () + Just file -> do + unless (dirCruft file) $ do + let path = dir file + sha <- hashFile h path + hPutStrLn jlogh file + streamer $ Git.UpdateIndex.updateIndexLine + sha FileBlob (asTopFilePath $ fileJournal file) + genstream dir h jh jlogh streamer -- Clean up the staged files, as listed in the temp log file. -- The temp file is used to avoid needing to buffer all the -- filenames in memory. diff --git a/Annex/Journal.hs b/Annex/Journal.hs index 406155750..198388aa8 100644 --- a/Annex/Journal.hs +++ b/Annex/Journal.hs @@ -77,12 +77,27 @@ getJournalFilesStale :: Annex [FilePath] getJournalFilesStale = do g <- gitRepo fs <- liftIO $ catchDefaultIO [] $ - getDirectoryContents' $ gitAnnexJournalDir g + getDirectoryContents $ gitAnnexJournalDir g return $ filter (`notElem` [".", ".."]) fs +withJournalHandle :: (DirectoryHandle -> IO a) -> Annex a +withJournalHandle a = do + d <- fromRepo gitAnnexJournalDir + bracketIO (openDirectory d) closeDirectory (liftIO . a) + {- Checks if there are changes in the journal. -} journalDirty :: Annex Bool -journalDirty = not . null <$> getJournalFilesStale +journalDirty = withJournalHandle go + where + go h = do + v <- readDirectory h + case v of + (Just f) + | not (dirCruft f) -> do + closeDirectory h + return True + | otherwise -> go h + Nothing -> return False {- Produces a filename to use in the journal for a file on the branch. - diff --git a/Utility/Directory.hs b/Utility/Directory.hs index e769be7ef..d4e9b358f 100644 --- a/Utility/Directory.hs +++ b/Utility/Directory.hs @@ -1,4 +1,4 @@ -{- directory manipulation +{- directory traversal and manipulation - - Copyright 2011-2014 Joey Hess - @@ -16,7 +16,9 @@ import Control.Monad import Control.Monad.IfElse import System.FilePath import Control.Applicative +import Control.Concurrent import System.IO.Unsafe (unsafeInterleaveIO) +import Data.Maybe #ifdef mingw32_HOST_OS import qualified System.Win32 as Win32 @@ -31,44 +33,6 @@ import Utility.Exception import Utility.Monad import Utility.Applicative -{- Unlike getDirectoryContents, this can be used in arbitrarily - - large directories without using much memory; the list steams lazily. - - - - However, any errors that may be encountered while reading the directory - - contents are *ignored*, rather than throw them in the context of - - whatever code consumes the lazy list. - - - - See https://ghc.haskell.org/trac/ghc/ticket/9266 - -} -getDirectoryContents' :: FilePath -> IO [FilePath] -getDirectoryContents' path = loop =<< opendir - where -#ifndef mingw32_HOST_OS - opendir = Posix.openDirStream path - loop dirp = do - v <- tryNonAsync $ Posix.readDirStream dirp - case v of - (Right ent) | not (null ent) -> do - rest <- unsafeInterleaveIO (loop dirp) - return (ent:rest) - _ -> do - void $ tryNonAsync $ Posix.closeDirStream dirp - return [] -#else - opendir = Win32.findFirstFile (path "*") - loop (h, fdat) = do - -- there is always at least 1 file ("." and "..") - ent <- Win32.getFindDataFileName fdat - v <- tryNonAsync $ Win32.findNextFile h fdat - case v of - Right True -> do - rest <- unsafeInterleaveIO (loop (h, fdat)) - return (ent:rest) - _ -> do - void $ tryNonAsync $ Win32.findClose h - return [ent] -#endif - dirCruft :: FilePath -> Bool dirCruft "." = True dirCruft ".." = True @@ -177,3 +141,77 @@ nukeFile file = void $ tryWhenExists go #else go = removeFile file #endif + +#ifndef mingw32_HOST_OS +data DirectoryHandle = DirectoryHandle IsOpen Posix.DirStream +#else +data DirectoryHandle = DirectoryHandle IsOpen Win32.HANDLE Win32.FindData (MVar ()) +#endif + +type IsOpen = MVar () -- full when the handle is open + +openDirectory :: FilePath -> IO DirectoryHandle +openDirectory path = do +#ifndef mingw32_HOST_OS + dirp <- Posix.openDirStream path + isopen <- newMVar () + return (DirectoryHandle isopen dirp) +#else + (h, fdat) <- Win32.findFirstFile (path "*") + -- Indicate that the fdat contains a filename that readDirectory + -- has not yet returned, by making the MVar be full. + -- (There's always at least a "." entry.) + alreadyhave <- newMVar () + isopen <- newMVar () + return (DirectoryHandle isopen h fdat alreadyhave) +#endif + +closeDirectory :: DirectoryHandle -> IO () +#ifndef mingw32_HOST_OS +closeDirectory (DirectoryHandle isopen dirp) = + whenOpen isopen $ + Posix.closeDirStream dirp +#else +closeDirectory (DirectoryHandle isopen h _ alreadyhave) = + whenOpen isopen $ do + _ <- tryTakeMVar alreadyhave + Win32.findClose h +#endif + where + whenOpen :: IsOpen -> IO () -> IO () + whenOpen mv f = do + v <- tryTakeMVar mv + when (isJust v) f + +{- |Reads the next entry from the handle. Once the end of the directory +is reached, returns Nothing and automatically closes the handle. +-} +readDirectory :: DirectoryHandle -> IO (Maybe FilePath) +#ifndef mingw32_HOST_OS +readDirectory hdl@(DirectoryHandle _ dirp) = do + e <- Posix.readDirStream dirp + if null e + then do + closeDirectory hdl + return Nothing + else return (Just e) +#else +readDirectory hdl@(DirectoryHandle _ h fdat mv) = do + -- If the MVar is full, then the filename in fdat has + -- not yet been returned. Otherwise, need to find the next + -- file. + r <- tryTakeMVar mv + case r of + Just () -> getfn + Nothing -> do + more <- Win32.findNextFile h fdat + if more + then getfn + else do + closeDirectory hdl + return Nothing + where + getfn = do + filename <- Win32.getFindDataFileName fdat + return (Just filename) +#endif diff --git a/debian/changelog b/debian/changelog index c5fdcd600..b8d091925 100644 --- a/debian/changelog +++ b/debian/changelog @@ -1,3 +1,9 @@ +git-annex (5.20140710) UNRELEASED; urgency=medium + + * Fix minor FD leak in journal code. + + -- Joey Hess Wed, 09 Jul 2014 23:29:21 -0400 + git-annex (5.20140709) unstable; urgency=medium * Fix race in direct mode merge code that could cause all files in the -- cgit v1.2.3