diff options
author | Joey Hess <joey@kitenet.net> | 2013-09-19 16:30:37 -0400 |
---|---|---|
committer | Joey Hess <joey@kitenet.net> | 2013-09-19 16:41:21 -0400 |
commit | 4aaa584eb632a981f5364c844f9293d4cdedaa65 (patch) | |
tree | 082addc10fa1a1554a616d54bb8e41d03e94c074 | |
parent | 5fddb08efccedd5c1542f5e16ec63a57498bc1f0 (diff) |
more completely solve catKey memory leak
Done using a mode witness, which ensures it's fixed everywhere.
Fixing catFileKey was a bear, because git cat-file does not provide a
nice way to query for the mode of a file and there is no other efficient
way to do it. Oh, for libgit2..
Note that I am looking at tree objects from HEAD, rather than the index.
Because I cat-file cannot show a tree object for the index.
So this fix is technically incomplete. The only cases where it matters
are:
1. A new large file has been directly staged in git, but not committed.
2. A file that was committed to HEAD as a symlink has been staged
directly in the index.
This could be fixed a lot better using libgit2.
-rw-r--r-- | Annex/CatFile.hs | 55 | ||||
-rw-r--r-- | Annex/Direct.hs | 10 | ||||
-rw-r--r-- | Command/Indirect.hs | 2 | ||||
-rw-r--r-- | Command/PreCommit.hs | 5 | ||||
-rw-r--r-- | Command/Sync.hs | 3 | ||||
-rw-r--r-- | Git/CatFile.hs | 2 | ||||
-rw-r--r-- | Git/FileMode.hs | 5 | ||||
-rw-r--r-- | debian/changelog | 4 | ||||
-rw-r--r-- | doc/bugs/Huge_annex_out_of_memory_on_switch_to_indirect_mode_and_status.mdwn | 6 | ||||
-rw-r--r-- | doc/bugs/Out_of_memory_error_in_fsck_whereis_find_and_status_cmds.mdwn | 6 |
10 files changed, 71 insertions, 27 deletions
diff --git a/Annex/CatFile.hs b/Annex/CatFile.hs index f70800302..c8be04b02 100644 --- a/Annex/CatFile.hs +++ b/Annex/CatFile.hs @@ -8,6 +8,7 @@ module Annex.CatFile ( catFile, catObject, + catTree, catObjectDetails, catFileHandle, catKey, @@ -17,6 +18,7 @@ module Annex.CatFile ( import qualified Data.ByteString.Lazy as L import qualified Data.Map as M +import System.PosixCompat.Types import Common.Annex import qualified Git @@ -24,6 +26,7 @@ import qualified Git.CatFile import qualified Annex import Git.Types import Git.FilePath +import Git.FileMode catFile :: Git.Branch -> FilePath -> Annex L.ByteString catFile branch file = do @@ -35,6 +38,11 @@ catObject ref = do h <- catFileHandle liftIO $ Git.CatFile.catObject h ref +catTree :: Git.Ref -> Annex [(FilePath, FileMode)] +catTree ref = do + h <- catFileHandle + liftIO $ Git.CatFile.catTree h ref + catObjectDetails :: Git.Ref -> Annex (Maybe (L.ByteString, Sha)) catObjectDetails ref = do h <- catFileHandle @@ -55,13 +63,39 @@ catFileHandle = do Annex.changeState $ \s -> s { Annex.catfilehandles = m' } return h -{- From the Sha or Ref of a symlink back to the key. -} -catKey :: Ref -> Annex (Maybe Key) -catKey ref = do - l <- fromInternalGitPath . encodeW8 . L.unpack <$> catObject ref - return $ if isLinkToAnnex l - then fileKey $ takeFileName l - else Nothing +{- From the Sha or Ref of a symlink back to the key. + - + - Requires a mode witness, to guarantee that the file is a symlink. + -} +catKey :: Ref -> FileMode -> Annex (Maybe Key) +catKey ref mode + | isSymLink mode = do + l <- fromInternalGitPath . encodeW8 . L.unpack <$> catObject ref + return $ if isLinkToAnnex l + then fileKey $ takeFileName l + else Nothing + | otherwise = return Nothing + +{- Looks up the file mode corresponding to the Ref using the running + - cat-file. + - + - Currently this always has to look in HEAD, because cat-file --batch + - does not offer a way to specify that we want to look up a tree object + - in the index. So if the index has a file staged not as a symlink, + - and it is a sylink in head, the wrong mode is gotten. This is a bug. + - Also, we have to assume the file is a symlink if it's not yet committed + - to HEAD. + -} +catKeyChecked :: Bool -> Ref -> Annex (Maybe Key) +catKeyChecked needhead ref@(Ref r) = + catKey ref =<< findmode <$> catTree treeref + where + pathparts = split "/" r + dir = intercalate "/" $ take (length pathparts - 1) pathparts + file = fromMaybe "" $ lastMaybe pathparts + treeref = Ref $ if needhead then "HEAD" ++ dir ++ "/" else dir ++ "/" + findmode = fromMaybe symLinkMode . headMaybe . + map snd . filter (\p -> fst p == file) {- From a file in the repository back to the key. - @@ -76,7 +110,8 @@ catKey ref = do - - For command-line git-annex use, that doesn't matter. It's perfectly - reasonable for things staged in the index after the currently running - - git-annex process to not be noticed by it. + - git-annex process to not be noticed by it. However, we do want to see + - what's in the index, since it may have uncommitted changes not in HEAD> - - For the assistant, this is much more of a problem, since it commits - files and then needs to be able to immediately look up their keys. @@ -89,8 +124,8 @@ catKey ref = do catKeyFile :: FilePath -> Annex (Maybe Key) catKeyFile f = ifM (Annex.getState Annex.daemon) ( catKeyFileHEAD f - , catKey $ Ref $ ":./" ++ f + , catKeyChecked True (Ref $ ":./" ++ f) ) catKeyFileHEAD :: FilePath -> Annex (Maybe Key) -catKeyFileHEAD f = catKey $ Ref $ "HEAD:./" ++ f +catKeyFileHEAD f = catKeyChecked False (Ref $ "HEAD:./" ++ f) diff --git a/Annex/Direct.hs b/Annex/Direct.hs index a6c30ad08..ad9338ec7 100644 --- a/Annex/Direct.hs +++ b/Annex/Direct.hs @@ -14,7 +14,6 @@ import qualified Git.Merge import qualified Git.DiffTree as DiffTree import Git.Sha import Git.Types -import Git.FileMode import Annex.CatFile import qualified Annex.Queue import Logs.Location @@ -46,9 +45,7 @@ stageDirect = do - efficiently as we can, by getting any key that's associated - with it in git, as well as its stat info. -} go (file, Just sha, Just mode) = do - shakey <- if isSymLink mode - then catKey sha - else return Nothing + shakey <- catKey sha mode mstat <- liftIO $ catchMaybeIO $ getSymbolicLinkStatus file filekey <- isAnnexLink file case (shakey, filekey, mstat, toInodeCache =<< mstat) of @@ -149,10 +146,9 @@ mergeDirectCleanup d oldsha newsha = do where go getsha getmode a araw | getsha item == nullSha = noop - | isSymLink (getmode item) = + | otherwise = maybe (araw f) (\k -> void $ a k f) - =<< catKey (getsha item) - | otherwise = araw f + =<< catKey (getsha item) (getmode item) f = DiffTree.file item moveout = removeDirect diff --git a/Command/Indirect.hs b/Command/Indirect.hs index 79e736d11..f866a93b6 100644 --- a/Command/Indirect.hs +++ b/Command/Indirect.hs @@ -78,7 +78,7 @@ perform = do return Nothing | otherwise -> maybe noop (fromdirect f) - =<< catKey sha + =<< catKey sha mode _ -> noop go _ = noop diff --git a/Command/PreCommit.hs b/Command/PreCommit.hs index c35cf61e1..afc5882d4 100644 --- a/Command/PreCommit.hs +++ b/Command/PreCommit.hs @@ -16,7 +16,6 @@ import qualified Git.Ref import Annex.CatFile import Annex.Content.Direct import Git.Sha -import Git.FileMode def :: [Command] def = [command "pre-commit" paramPaths seek SectionPlumbing @@ -48,8 +47,8 @@ startDirect _ = next $ do withkey (Git.DiffTree.srcsha diff) (Git.DiffTree.srcmode diff) removeAssociatedFile withkey (Git.DiffTree.dstsha diff) (Git.DiffTree.dstmode diff) addAssociatedFile where - withkey sha mode a = when (sha /= nullSha && isSymLink mode) $ do - k <- catKey sha + withkey sha mode a = when (sha /= nullSha) $ do + k <- catKey sha mode case k of Nothing -> noop Just key -> void $ a key (Git.DiffTree.file diff) diff --git a/Command/Sync.hs b/Command/Sync.hs index ca823c736..d8c6fb8d4 100644 --- a/Command/Sync.hs +++ b/Command/Sync.hs @@ -29,6 +29,7 @@ import qualified Remote.Git import Types.Key import Config import Annex.ReplaceFile +import Git.FileMode import Data.Hash.MD5 @@ -321,7 +322,7 @@ resolveMerge' u case msha of Nothing -> a Nothing Just sha -> do - key <- catKey sha + key <- catKey sha symLinkMode maybe (return False) (a . Just) key {- The filename to use when resolving a conflicted merge of a file, diff --git a/Git/CatFile.hs b/Git/CatFile.hs index 984d2f465..bd86ff326 100644 --- a/Git/CatFile.hs +++ b/Git/CatFile.hs @@ -130,4 +130,4 @@ catTree h treeref = go <$> catObjectDetails h treeref parsemodefile b = let (modestr, file) = separate (== ' ') (encodeW8 $ L.unpack b) in (file, readmode modestr) - readmode = fst . Prelude.head . readOct + readmode = fst . fromMaybe (0, undefined) . headMaybe . readOct diff --git a/Git/FileMode.hs b/Git/FileMode.hs index d42df9833..fc4d0264e 100644 --- a/Git/FileMode.hs +++ b/Git/FileMode.hs @@ -13,8 +13,11 @@ import Utility.FileMode import System.PosixCompat.Types +symLinkMode :: FileMode +symLinkMode = 40960 + {- Git uses a special file mode to indicate a symlink. This is the case - even on Windows, so we hard code the valuse here, rather than using - System.Posix.Files.symbolicLinkMode. -} isSymLink :: FileMode -> Bool -isSymLink = checkMode 40960 +isSymLink = checkMode symLinkMode diff --git a/debian/changelog b/debian/changelog index 6cc20e412..ff374fd04 100644 --- a/debian/changelog +++ b/debian/changelog @@ -18,8 +18,8 @@ git-annex (4.20130912) UNRELEASED; urgency=low numcopies levels. (--fast avoids calculating these) * gcrypt: Ensure that signing key is set to one of the participants keys. * webapp: Show encryption information when editing a remote. - * sync, pre-commit, indirect: Avoid unnecessarily catting non-symlink - files from git, which can be so large it runs out of memory. + * Avoid unnecessarily catting non-symlink files from git, which can be + so large it runs out of memory. -- Joey Hess <joeyh@debian.org> Thu, 12 Sep 2013 12:14:46 -0400 diff --git a/doc/bugs/Huge_annex_out_of_memory_on_switch_to_indirect_mode_and_status.mdwn b/doc/bugs/Huge_annex_out_of_memory_on_switch_to_indirect_mode_and_status.mdwn index 2b28cb089..07d6f3eb3 100644 --- a/doc/bugs/Huge_annex_out_of_memory_on_switch_to_indirect_mode_and_status.mdwn +++ b/doc/bugs/Huge_annex_out_of_memory_on_switch_to_indirect_mode_and_status.mdwn @@ -61,3 +61,9 @@ commit git-annex: out of memory (requested 985661440 bytes) # End of transcript or log. """]] + +> [[fixed|done]]. However, if you saw this behavior, +> you have large files checked directly into git. You may +> want to examine your repository and use git filter-branch to clean +> it up. +> --[[Joey]] diff --git a/doc/bugs/Out_of_memory_error_in_fsck_whereis_find_and_status_cmds.mdwn b/doc/bugs/Out_of_memory_error_in_fsck_whereis_find_and_status_cmds.mdwn index d63394b02..c127bac15 100644 --- a/doc/bugs/Out_of_memory_error_in_fsck_whereis_find_and_status_cmds.mdwn +++ b/doc/bugs/Out_of_memory_error_in_fsck_whereis_find_and_status_cmds.mdwn @@ -77,4 +77,8 @@ Any thoughts on how I can get git-annex (esp. fsck) to complete would be appreci Thanks Giovanni -[[!tag moreinfo]] +> [[fixed|done]]. However, if you saw this behavior, +> you have large files checked directly into git. You may +> want to examine your repository and use git filter-branch to clean +> it up. +> --[[Joey]] |