summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorGravatar Joey Hess <joey@kitenet.net>2011-06-29 21:23:40 -0400
committerGravatar Joey Hess <joey@kitenet.net>2011-06-29 21:47:31 -0400
commite1c18ddec455e5d1259ab46ccccbe6a9c7079de6 (patch)
tree260e2570f87c85508712aa355b9c60531c67f08b
parent8725fde5c66984d9769558a07612361b112be58f (diff)
Sped back up fsck, copy --from etc
All commands that often have to read a lot of information from the git-annex branch should now be nearly as fast as before the branch was introduced. Before fsck was taking approximatly 3 hours, now it's running in 8 minutes. The code is very nasty. It should be rewritten to read the header line from git cat-file, and then read the specified number of bytes of content.
-rw-r--r--Branch.hs74
-rw-r--r--Types/BranchState.hs11
-rw-r--r--debian/changelog3
-rw-r--r--doc/todo/speed_up_fsck.mdwn2
4 files changed, 54 insertions, 36 deletions
diff --git a/Branch.hs b/Branch.hs
index 26aad4407..4f568e36b 100644
--- a/Branch.hs
+++ b/Branch.hs
@@ -26,9 +26,6 @@ import System.Cmd.Utils
import Data.Maybe
import Data.List
import System.IO
-import System.Posix.IO
-import System.Posix.Process
-import System.Log.Logger
import Types.BranchState
import qualified GitRepo as Git
@@ -142,7 +139,7 @@ commit message = whenM stageJournalFiles $ do
- data is read from it. Runs only once per git-annex run. -}
update :: Annex ()
update = do
- state <- Annex.getState Annex.branchstate
+ state <- getState
unless (branchUpdated state) $ withIndex $ do
{- Since branches get merged into the index, it's important to
- first stage the journal into the index. Otherwise, any
@@ -226,39 +223,48 @@ get file = do
setCache file content
return content
Nothing -> withIndexUpdate $ do
- g <- Annex.gitRepo
- content <- liftIO $ catch (cat g) (const $ return "")
+ content <- catFile file
setCache file content
return content
+
+{- Uses git cat-file in batch mode to read the content of a file.
+ -
+ - Only one process is run, and it persists and is used for all accesses. -}
+catFile :: FilePath -> Annex String
+catFile file = do
+ state <- getState
+ maybe (startup state) ask (catFileHandles state)
where
- cat g = cmdOutput "git" $ toCommand $ Git.gitCommandLine g
- [Param "cat-file", Param "blob", Param $ ':':file]
-
-{- Runs a command, returning its output, ignoring nonzero exit
- - status, and discarding stderr. -}
-cmdOutput :: FilePath -> [String] -> IO String
-cmdOutput cmd params = do
- pipepair <- createPipe
- let callfunc _ = do
- closeFd (snd pipepair)
- h <- fdToHandle (fst pipepair)
- x <- hGetContentsStrict h
- hClose h
- return $! x
- let child = do
- closeFd (fst pipepair)
- -- disable stderr output by this child,
- -- and since the logger uses it, also disable it
- liftIO $ updateGlobalLogger rootLoggerName $ setLevel EMERGENCY
- closeFd stdError
-
- debugM "Utility.executeFile" $ cmd ++ " " ++ show params
-
- pid <- pOpen3Raw Nothing (Just (snd pipepair)) Nothing cmd params child
- retval <- callfunc $! pid
- let rv = seq retval retval
- _ <- getProcessStatus True False pid
- return rv
+ startup state = do
+ g <- Annex.gitRepo
+ let cmd = Git.gitCommandLine g
+ [Param "cat-file", Param "--batch"]
+ let gitcmd = join " " $ "git" : toCommand cmd
+ (_, from, to) <- liftIO $ hPipeBoth "sh"
+ -- want stderr on stdin for sentinal, and
+ -- to ignore other error messages
+ ["-c", gitcmd ++ " 2>&1"]
+ setState state { catFileHandles = Just (from, to) }
+ ask (from, to)
+ ask (from, to) = do
+ _ <- liftIO $ do
+ hPutStr to $
+ fullname ++ ":" ++ file ++ "\n" ++
+ sentinal ++ "\n"
+ hFlush to
+ return . unlines =<< readContent from []
+ readContent from ls = do
+ l <- liftIO $ hGetLine from
+ if l == sentinal_line
+ -- first line is blob info,
+ -- or maybe an error message
+ then return $ drop 1 $ reverse ls
+ else readContent from (l:ls)
+ -- To find the end of a catted file, ask for a sentinal
+ -- value that is always missing, and look for the error
+ -- message. Utterly nasty, probably will break one day.
+ sentinal = ":"
+ sentinal_line = sentinal ++ " missing"
{- Lists all files on the branch. There may be duplicates in the list. -}
files :: Annex [FilePath]
diff --git a/Types/BranchState.hs b/Types/BranchState.hs
index 40d7f5c2c..bc1d32e69 100644
--- a/Types/BranchState.hs
+++ b/Types/BranchState.hs
@@ -7,11 +7,18 @@
module Types.BranchState where
+import System.IO
+
data BranchState = BranchState {
- branchUpdated :: Bool,
+ branchUpdated :: Bool, -- has the branch been updated this run?
+
+ -- (from, to) handles used to talk to a git-cat-file process
+ catFileHandles :: Maybe (Handle, Handle),
+
+ -- the content of one file is cached
cachedFile :: Maybe FilePath,
cachedContent :: String
}
startBranchState :: BranchState
-startBranchState = BranchState False Nothing ""
+startBranchState = BranchState False Nothing Nothing ""
diff --git a/debian/changelog b/debian/changelog
index 5dff0bbe5..a87a89860 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -3,6 +3,9 @@ git-annex (3.20110625) UNRELEASED; urgency=low
* Always ensure git-annex branch exists.
* Modify location log parser to allow future expansion.
* --force will cause add, etc, to operate on ignored files.
+ * Sped back up fsck, copy --from, and other commands that often
+ have to read a lot of information from the git-annex branch. Should
+ now be nearly as fast as before the branch was introduced.
-- Joey Hess <joeyh@debian.org> Sun, 26 Jun 2011 21:01:06 -0400
diff --git a/doc/todo/speed_up_fsck.mdwn b/doc/todo/speed_up_fsck.mdwn
index e22c01766..5d5e867f8 100644
--- a/doc/todo/speed_up_fsck.mdwn
+++ b/doc/todo/speed_up_fsck.mdwn
@@ -36,3 +36,5 @@ commands like whereis and add. --[[Joey]]
> Hmm, except that's actually an error message sent to stderr. Unless
> stderr is connected to stdout, it might be better to look for a known,
> empty object. Could just add a git-annex:empty file to that end.
+
+[[done]] --[[Joey]]