summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorGravatar Joey Hess <joey@kitenet.net>2011-06-23 11:37:26 -0400
committerGravatar Joey Hess <joey@kitenet.net>2011-06-23 11:37:26 -0400
commit5f494154a34bad7cc54915a2a408b830e8ca77be (patch)
tree2adfc75ea36e18b3655bb2dcfa94b702288fc6c5
parent23e765b67c38a9f02b3b5152e7e123819bb696de (diff)
add journaling to speed up changes to the git-annex branch
git is slow when the index file is large and has to be rewritten each time a file is changed. To speed this up, added a journal where changes are recorded before being fed into the index file and committed to the git-annex branch. The entire journal can be fed into git with just 2 commands, and only one write of the index file.
-rw-r--r--Branch.hs139
-rw-r--r--GitUnionMerge.hs14
-rw-r--r--Locations.hs1
-rw-r--r--Types/BranchState.hs3
-rw-r--r--doc/internals.mdwn10
-rw-r--r--doc/upgrades.mdwn5
-rw-r--r--git-union-merge.hs2
7 files changed, 133 insertions, 41 deletions
diff --git a/Branch.hs b/Branch.hs
index 00f406135..f0d97bfc3 100644
--- a/Branch.hs
+++ b/Branch.hs
@@ -33,6 +33,7 @@ import qualified Annex
import Utility
import Types
import Messages
+import Locations
{- Name of the branch that is used to store git-annex's information. -}
name :: String
@@ -42,6 +43,8 @@ name = "git-annex"
fullname :: String
fullname = "refs/heads/" ++ name
+{- Converts a fully qualified git ref into a short version for human
+ - consumptiom. -}
shortref :: String -> String
shortref = remove "refs/heads/" . remove "refs/remotes/"
where
@@ -56,7 +59,8 @@ index g = Git.workTree g </> Git.gitDir g </> "index." ++ name
{- Populates the branch's index file with the current branch contents.
-
- Usually, this is only done when the index doesn't yet exist, and
- - the index is used to build up changes to be commited to the branch.
+ - the index is used to build up changes to be commited to the branch,
+ - and merge in changes from other branches.
-}
genIndex :: Git.Repo -> IO ()
genIndex g = do
@@ -97,11 +101,6 @@ setCache file content = do
state <- getState
setState state { cachedFile = Just file, cachedContent = content }
-setCacheChanged :: FilePath -> String -> Annex ()
-setCacheChanged file content = do
- state <- getState
- setState state { cachedFile = Just file, cachedContent = content, branchChanged = True }
-
invalidateCache :: Annex ()
invalidateCache = do
state <- getState
@@ -133,11 +132,11 @@ create = do
liftIO $ Git.runBool g "show-ref"
[Param "--verify", Param "-q", Param ref]
-{- Commits any staged changes to the branch. -}
+{- Stages the journal, and commits staged changes to the branch. -}
commit :: String -> Annex ()
commit message = do
- state <- getState
- when (branchChanged state) $ do
+ staged <- stageJournalFiles
+ when staged $ do
g <- Annex.gitRepo
withIndex $ liftIO $
GitUnionMerge.commit g message fullname [fullname]
@@ -187,28 +186,32 @@ updateRef ref
liftIO $ GitUnionMerge.merge g [ref]
return $ Just ref
-{- Stages the content of a file into the branch's index. -}
+{- Records changed content of a file into the journal. -}
change :: FilePath -> String -> Annex ()
change file content = do
- g <- Annex.gitRepo
- sha <- liftIO $ Git.hashObject g content
- withIndex $ liftIO $ Git.run g "update-index"
- [ Param "--add", Param "--cacheinfo", Param "100644",
- Param sha, File file]
- setCacheChanged file content
-
-{- Gets the content of a file on the branch, or content staged in the index
- - if it's newer. Returns an empty string if the file didn't exist yet. -}
+ setJournalFile file content
+ setCache file content
+
+{- Gets the content of a file on the branch, or content from the journal, or
+ - staged in the index.
+ -
+ - Returns an empty string if the file doesn't exist yet. -}
get :: FilePath -> Annex String
get file = do
cached <- getCache file
case cached of
Just content -> return content
- Nothing -> withIndexUpdate $ do
- g <- Annex.gitRepo
- content <- liftIO $ catch (cat g) (const $ return "")
- setCache file content
- return content
+ Nothing -> do
+ j <- getJournalFile file
+ case j of
+ Just content -> do
+ setCache file content
+ return content
+ Nothing -> withIndexUpdate $ do
+ g <- Annex.gitRepo
+ content <- liftIO $ catch (cat g) (const $ return "")
+ setCache file content
+ return content
where
cat g = cmdOutput "git" $ toCommand $ Git.gitCommandLine g
[Param "cat-file", Param "blob", Param $ ':':file]
@@ -231,9 +234,93 @@ cmdOutput cmd params = do
_ <- getProcessStatus True False pid
return rv
-{- Lists all files on the branch. -}
+{- Lists all files on the branch. There may be duplicates in the list. -}
files :: Annex [FilePath]
files = withIndexUpdate $ do
g <- Annex.gitRepo
- liftIO $ Git.pipeNullSplit g
+ bfiles <- liftIO $ Git.pipeNullSplit g
[Params "ls-tree --name-only -r -z", Param fullname]
+ jfiles <- getJournalFiles
+ return $ jfiles ++ bfiles
+
+{- Records content for a file in the branch to the journal.
+ -
+ - Using the journal, rather than immediatly staging content to the index
+ - avoids git needing to rewrite the index after every change. -}
+setJournalFile :: FilePath -> String -> Annex ()
+setJournalFile file content = do
+ g <- Annex.gitRepo
+ liftIO $ catch (write g) $ const $ do
+ createDirectoryIfMissing True $ gitAnnexJournalDir g
+ createDirectoryIfMissing True $ gitAnnexTmpDir g
+ write g
+ where
+ -- journal file is written atomically
+ write g = do
+ let jfile = journalFile g file
+ let tmpfile = gitAnnexTmpDir g </> takeFileName jfile
+ writeFile tmpfile content
+ renameFile tmpfile jfile
+
+{- Gets journalled content for a file in the branch. -}
+getJournalFile :: FilePath -> Annex (Maybe String)
+getJournalFile file = do
+ g <- Annex.gitRepo
+ liftIO $ catch (liftM Just . readFileStrict $ journalFile g file)
+ (const $ return Nothing)
+
+{- List of journal files. -}
+getJournalFiles :: Annex [FilePath]
+getJournalFiles = getJournalFilesRaw >>= return . map fileJournal
+
+getJournalFilesRaw :: Annex [FilePath]
+getJournalFilesRaw = do
+ g <- Annex.gitRepo
+ fs <- liftIO $ catch (getDirectoryContents $ gitAnnexJournalDir g)
+ (const $ return [])
+ return $ filter (\f -> f /= "." && f /= "..") fs
+
+{- Stages all journal files into the index, and returns True if the index
+ - was modified. -}
+stageJournalFiles :: Annex Bool
+stageJournalFiles = do
+ l <- getJournalFilesRaw
+ if null l
+ then return False
+ else do
+ g <- Annex.gitRepo
+ withIndex $ liftIO $ stage g l
+ return True
+ where
+ stage g fs = do
+ let dir = gitAnnexJournalDir g
+ let paths = map (dir </>) fs
+ -- inject all the journal files directly into git
+ -- in one quick command
+ (h, s) <- Git.pipeWriteRead g [Param "hash-object",
+ Param "-w", Param "--stdin-paths"] $ unlines paths
+ -- update the index, also in just one command
+ GitUnionMerge.update_index g $
+ index_lines (lines s) $ map fileJournal fs
+ forceSuccess h
+ mapM_ removeFile paths
+ index_lines shas fs = map genline $ zip shas fs
+ genline (sha, file) = GitUnionMerge.update_index_line sha file
+
+{- Produces a filename to use in the journal for a file on the branch.
+ -
+ - The journal typically won't have a lot of files in it, so the hashing
+ - used in the branch is not necessary, and all the files are put directly
+ - in the journal directory.
+ -}
+journalFile :: Git.Repo -> FilePath -> FilePath
+journalFile repo file = gitAnnexJournalDir repo </> concatMap mangle file
+ where
+ mangle '/' = "_"
+ mangle '_' = "__"
+ mangle c = [c]
+
+{- Converts a journal file (relative to the journal dir) back to the
+ - filename on the branch. -}
+fileJournal :: FilePath -> FilePath
+fileJournal = replace "//" "_" . replace "_" "/"
diff --git a/GitUnionMerge.hs b/GitUnionMerge.hs
index 096a153a4..fa14a6bc3 100644
--- a/GitUnionMerge.hs
+++ b/GitUnionMerge.hs
@@ -7,7 +7,9 @@
module GitUnionMerge (
merge,
- commit
+ commit,
+ update_index,
+ update_index_line
) where
import System.Cmd.Utils
@@ -43,6 +45,11 @@ update_index g l = togit ["update-index", "-z", "--index-info"] (join "\0" l)
togit ps content = Git.pipeWrite g (map Param ps) content
>>= forceSuccess
+{- Generates a line suitable to be fed into update-index, to add
+ - a given file with a given sha. -}
+update_index_line :: String -> FilePath -> String
+update_index_line sha file = "100644 blob " ++ sha ++ "\t" ++ file
+
{- Gets the contents of a tree in a format suitable for update_index. -}
ls_tree :: Git.Repo -> String -> IO [String]
ls_tree g x = Git.pipeNullSplit g $
@@ -76,14 +83,13 @@ calc_merge g differ = do
mergeFile :: Git.Repo -> (String, FilePath) -> IO (Maybe String)
mergeFile g (info, file) = case filter (/= nullsha) [asha, bsha] of
[] -> return Nothing
- (sha:[]) -> return $ Just $ ls_tree_line sha
+ (sha:[]) -> return $ Just $ update_index_line sha file
shas -> do
content <- Git.pipeRead g $ map Param ("show":shas)
sha <- Git.hashObject g $ unionmerge content
- return $ Just $ ls_tree_line sha
+ return $ Just $ update_index_line sha file
where
[_colonamode, _bmode, asha, bsha, _status] = words info
- ls_tree_line sha = "100644 blob " ++ sha ++ "\t" ++ file
nullsha = take Git.shaSize $ repeat '0'
unionmerge = unlines . nub . lines
diff --git a/Locations.hs b/Locations.hs
index f93b0cc50..bfb0d3af9 100644
--- a/Locations.hs
+++ b/Locations.hs
@@ -17,6 +17,7 @@ module Locations (
gitAnnexBadDir,
gitAnnexBadLocation,
gitAnnexUnusedLog,
+ gitAnnexJournalDir,
isLinkToAnnex,
logFile,
logFileKey,
diff --git a/Types/BranchState.hs b/Types/BranchState.hs
index 65d0642a1..40d7f5c2c 100644
--- a/Types/BranchState.hs
+++ b/Types/BranchState.hs
@@ -9,10 +9,9 @@ module Types.BranchState where
data BranchState = BranchState {
branchUpdated :: Bool,
- branchChanged :: Bool,
cachedFile :: Maybe FilePath,
cachedContent :: String
}
startBranchState :: BranchState
-startBranchState = BranchState False False Nothing ""
+startBranchState = BranchState False Nothing ""
diff --git a/doc/internals.mdwn b/doc/internals.mdwn
index aaa125599..27b5bb1f2 100644
--- a/doc/internals.mdwn
+++ b/doc/internals.mdwn
@@ -21,9 +21,13 @@ deleting or changing the file contents.
This branch is managed by git-annex, with the contents listed below.
-Note that git-annex assumes only it will modify this branch. If you go in
-and make changes directly, it will probably revert your changes in its next
-commit to the branch.
+The file `.git/index.git-annex` is a separate git index file it uses
+to accumlate changes for the branch. Also, `.git/annex/journal/` is used
+to record changes before they are added to git.
+
+Note that for speed reasons, git-annex assumes only it will modify this
+branch. If you go in and make changes directly, it will probably revert
+your changes in its next commit to the branch.
The best way to make changes to the git-annex branch is instead
to create a branch of it, with a name like "my/git-annex", and then
diff --git a/doc/upgrades.mdwn b/doc/upgrades.mdwn
index 63fbcf75b..2e8f201fb 100644
--- a/doc/upgrades.mdwn
+++ b/doc/upgrades.mdwn
@@ -29,11 +29,6 @@ This upgrade is easier than the previous upgrades. You don't need to
upgrade every repository at once; it's sufficient to upgrade each
repository only when you next use it.
-This upgrade can be sped up by, before you start, making
-.git/index.git-annex into a symlink to a file on a ramdisk.
-For example: `ln -s /run/shm/index.git-annex.$(git config annex.uuid) .git/index.git-annex`
-but, if you do that, be sure to remove the symlink after the upgrade!
-
After the upgrade is complete, commit the changes it staged.
git commit -m "upgrade v2 to v3"
diff --git a/git-union-merge.hs b/git-union-merge.hs
index 7c0c1cd84..57232be67 100644
--- a/git-union-merge.hs
+++ b/git-union-merge.hs
@@ -42,7 +42,7 @@ main :: IO ()
main = do
[aref, bref, newref] <- parseArgs
g <- Git.configRead =<< Git.repoFromCwd
- Git.useIndex (tmpIndex g)
+ _ <- Git.useIndex (tmpIndex g)
setup g
GitUnionMerge.merge g [aref, bref]
GitUnionMerge.commit g "union merge" newref [aref, bref]