summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorGravatar Joey Hess <joey@kitenet.net>2013-09-19 16:30:37 -0400
committerGravatar Joey Hess <joey@kitenet.net>2013-09-19 16:41:21 -0400
commit4aaa584eb632a981f5364c844f9293d4cdedaa65 (patch)
tree082addc10fa1a1554a616d54bb8e41d03e94c074
parent5fddb08efccedd5c1542f5e16ec63a57498bc1f0 (diff)
more completely solve catKey memory leak
Done using a mode witness, which ensures it's fixed everywhere. Fixing catFileKey was a bear, because git cat-file does not provide a nice way to query for the mode of a file and there is no other efficient way to do it. Oh, for libgit2.. Note that I am looking at tree objects from HEAD, rather than the index. Because I cat-file cannot show a tree object for the index. So this fix is technically incomplete. The only cases where it matters are: 1. A new large file has been directly staged in git, but not committed. 2. A file that was committed to HEAD as a symlink has been staged directly in the index. This could be fixed a lot better using libgit2.
-rw-r--r--Annex/CatFile.hs55
-rw-r--r--Annex/Direct.hs10
-rw-r--r--Command/Indirect.hs2
-rw-r--r--Command/PreCommit.hs5
-rw-r--r--Command/Sync.hs3
-rw-r--r--Git/CatFile.hs2
-rw-r--r--Git/FileMode.hs5
-rw-r--r--debian/changelog4
-rw-r--r--doc/bugs/Huge_annex_out_of_memory_on_switch_to_indirect_mode_and_status.mdwn6
-rw-r--r--doc/bugs/Out_of_memory_error_in_fsck_whereis_find_and_status_cmds.mdwn6
10 files changed, 71 insertions, 27 deletions
diff --git a/Annex/CatFile.hs b/Annex/CatFile.hs
index f70800302..c8be04b02 100644
--- a/Annex/CatFile.hs
+++ b/Annex/CatFile.hs
@@ -8,6 +8,7 @@
module Annex.CatFile (
catFile,
catObject,
+ catTree,
catObjectDetails,
catFileHandle,
catKey,
@@ -17,6 +18,7 @@ module Annex.CatFile (
import qualified Data.ByteString.Lazy as L
import qualified Data.Map as M
+import System.PosixCompat.Types
import Common.Annex
import qualified Git
@@ -24,6 +26,7 @@ import qualified Git.CatFile
import qualified Annex
import Git.Types
import Git.FilePath
+import Git.FileMode
catFile :: Git.Branch -> FilePath -> Annex L.ByteString
catFile branch file = do
@@ -35,6 +38,11 @@ catObject ref = do
h <- catFileHandle
liftIO $ Git.CatFile.catObject h ref
+catTree :: Git.Ref -> Annex [(FilePath, FileMode)]
+catTree ref = do
+ h <- catFileHandle
+ liftIO $ Git.CatFile.catTree h ref
+
catObjectDetails :: Git.Ref -> Annex (Maybe (L.ByteString, Sha))
catObjectDetails ref = do
h <- catFileHandle
@@ -55,13 +63,39 @@ catFileHandle = do
Annex.changeState $ \s -> s { Annex.catfilehandles = m' }
return h
-{- From the Sha or Ref of a symlink back to the key. -}
-catKey :: Ref -> Annex (Maybe Key)
-catKey ref = do
- l <- fromInternalGitPath . encodeW8 . L.unpack <$> catObject ref
- return $ if isLinkToAnnex l
- then fileKey $ takeFileName l
- else Nothing
+{- From the Sha or Ref of a symlink back to the key.
+ -
+ - Requires a mode witness, to guarantee that the file is a symlink.
+ -}
+catKey :: Ref -> FileMode -> Annex (Maybe Key)
+catKey ref mode
+ | isSymLink mode = do
+ l <- fromInternalGitPath . encodeW8 . L.unpack <$> catObject ref
+ return $ if isLinkToAnnex l
+ then fileKey $ takeFileName l
+ else Nothing
+ | otherwise = return Nothing
+
+{- Looks up the file mode corresponding to the Ref using the running
+ - cat-file.
+ -
+ - Currently this always has to look in HEAD, because cat-file --batch
+ - does not offer a way to specify that we want to look up a tree object
+ - in the index. So if the index has a file staged not as a symlink,
+ - and it is a sylink in head, the wrong mode is gotten. This is a bug.
+ - Also, we have to assume the file is a symlink if it's not yet committed
+ - to HEAD.
+ -}
+catKeyChecked :: Bool -> Ref -> Annex (Maybe Key)
+catKeyChecked needhead ref@(Ref r) =
+ catKey ref =<< findmode <$> catTree treeref
+ where
+ pathparts = split "/" r
+ dir = intercalate "/" $ take (length pathparts - 1) pathparts
+ file = fromMaybe "" $ lastMaybe pathparts
+ treeref = Ref $ if needhead then "HEAD" ++ dir ++ "/" else dir ++ "/"
+ findmode = fromMaybe symLinkMode . headMaybe .
+ map snd . filter (\p -> fst p == file)
{- From a file in the repository back to the key.
-
@@ -76,7 +110,8 @@ catKey ref = do
-
- For command-line git-annex use, that doesn't matter. It's perfectly
- reasonable for things staged in the index after the currently running
- - git-annex process to not be noticed by it.
+ - git-annex process to not be noticed by it. However, we do want to see
+ - what's in the index, since it may have uncommitted changes not in HEAD>
-
- For the assistant, this is much more of a problem, since it commits
- files and then needs to be able to immediately look up their keys.
@@ -89,8 +124,8 @@ catKey ref = do
catKeyFile :: FilePath -> Annex (Maybe Key)
catKeyFile f = ifM (Annex.getState Annex.daemon)
( catKeyFileHEAD f
- , catKey $ Ref $ ":./" ++ f
+ , catKeyChecked True (Ref $ ":./" ++ f)
)
catKeyFileHEAD :: FilePath -> Annex (Maybe Key)
-catKeyFileHEAD f = catKey $ Ref $ "HEAD:./" ++ f
+catKeyFileHEAD f = catKeyChecked False (Ref $ "HEAD:./" ++ f)
diff --git a/Annex/Direct.hs b/Annex/Direct.hs
index a6c30ad08..ad9338ec7 100644
--- a/Annex/Direct.hs
+++ b/Annex/Direct.hs
@@ -14,7 +14,6 @@ import qualified Git.Merge
import qualified Git.DiffTree as DiffTree
import Git.Sha
import Git.Types
-import Git.FileMode
import Annex.CatFile
import qualified Annex.Queue
import Logs.Location
@@ -46,9 +45,7 @@ stageDirect = do
- efficiently as we can, by getting any key that's associated
- with it in git, as well as its stat info. -}
go (file, Just sha, Just mode) = do
- shakey <- if isSymLink mode
- then catKey sha
- else return Nothing
+ shakey <- catKey sha mode
mstat <- liftIO $ catchMaybeIO $ getSymbolicLinkStatus file
filekey <- isAnnexLink file
case (shakey, filekey, mstat, toInodeCache =<< mstat) of
@@ -149,10 +146,9 @@ mergeDirectCleanup d oldsha newsha = do
where
go getsha getmode a araw
| getsha item == nullSha = noop
- | isSymLink (getmode item) =
+ | otherwise =
maybe (araw f) (\k -> void $ a k f)
- =<< catKey (getsha item)
- | otherwise = araw f
+ =<< catKey (getsha item) (getmode item)
f = DiffTree.file item
moveout = removeDirect
diff --git a/Command/Indirect.hs b/Command/Indirect.hs
index 79e736d11..f866a93b6 100644
--- a/Command/Indirect.hs
+++ b/Command/Indirect.hs
@@ -78,7 +78,7 @@ perform = do
return Nothing
| otherwise ->
maybe noop (fromdirect f)
- =<< catKey sha
+ =<< catKey sha mode
_ -> noop
go _ = noop
diff --git a/Command/PreCommit.hs b/Command/PreCommit.hs
index c35cf61e1..afc5882d4 100644
--- a/Command/PreCommit.hs
+++ b/Command/PreCommit.hs
@@ -16,7 +16,6 @@ import qualified Git.Ref
import Annex.CatFile
import Annex.Content.Direct
import Git.Sha
-import Git.FileMode
def :: [Command]
def = [command "pre-commit" paramPaths seek SectionPlumbing
@@ -48,8 +47,8 @@ startDirect _ = next $ do
withkey (Git.DiffTree.srcsha diff) (Git.DiffTree.srcmode diff) removeAssociatedFile
withkey (Git.DiffTree.dstsha diff) (Git.DiffTree.dstmode diff) addAssociatedFile
where
- withkey sha mode a = when (sha /= nullSha && isSymLink mode) $ do
- k <- catKey sha
+ withkey sha mode a = when (sha /= nullSha) $ do
+ k <- catKey sha mode
case k of
Nothing -> noop
Just key -> void $ a key (Git.DiffTree.file diff)
diff --git a/Command/Sync.hs b/Command/Sync.hs
index ca823c736..d8c6fb8d4 100644
--- a/Command/Sync.hs
+++ b/Command/Sync.hs
@@ -29,6 +29,7 @@ import qualified Remote.Git
import Types.Key
import Config
import Annex.ReplaceFile
+import Git.FileMode
import Data.Hash.MD5
@@ -321,7 +322,7 @@ resolveMerge' u
case msha of
Nothing -> a Nothing
Just sha -> do
- key <- catKey sha
+ key <- catKey sha symLinkMode
maybe (return False) (a . Just) key
{- The filename to use when resolving a conflicted merge of a file,
diff --git a/Git/CatFile.hs b/Git/CatFile.hs
index 984d2f465..bd86ff326 100644
--- a/Git/CatFile.hs
+++ b/Git/CatFile.hs
@@ -130,4 +130,4 @@ catTree h treeref = go <$> catObjectDetails h treeref
parsemodefile b =
let (modestr, file) = separate (== ' ') (encodeW8 $ L.unpack b)
in (file, readmode modestr)
- readmode = fst . Prelude.head . readOct
+ readmode = fst . fromMaybe (0, undefined) . headMaybe . readOct
diff --git a/Git/FileMode.hs b/Git/FileMode.hs
index d42df9833..fc4d0264e 100644
--- a/Git/FileMode.hs
+++ b/Git/FileMode.hs
@@ -13,8 +13,11 @@ import Utility.FileMode
import System.PosixCompat.Types
+symLinkMode :: FileMode
+symLinkMode = 40960
+
{- Git uses a special file mode to indicate a symlink. This is the case
- even on Windows, so we hard code the valuse here, rather than using
- System.Posix.Files.symbolicLinkMode. -}
isSymLink :: FileMode -> Bool
-isSymLink = checkMode 40960
+isSymLink = checkMode symLinkMode
diff --git a/debian/changelog b/debian/changelog
index 6cc20e412..ff374fd04 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -18,8 +18,8 @@ git-annex (4.20130912) UNRELEASED; urgency=low
numcopies levels. (--fast avoids calculating these)
* gcrypt: Ensure that signing key is set to one of the participants keys.
* webapp: Show encryption information when editing a remote.
- * sync, pre-commit, indirect: Avoid unnecessarily catting non-symlink
- files from git, which can be so large it runs out of memory.
+ * Avoid unnecessarily catting non-symlink files from git, which can be
+ so large it runs out of memory.
-- Joey Hess <joeyh@debian.org> Thu, 12 Sep 2013 12:14:46 -0400
diff --git a/doc/bugs/Huge_annex_out_of_memory_on_switch_to_indirect_mode_and_status.mdwn b/doc/bugs/Huge_annex_out_of_memory_on_switch_to_indirect_mode_and_status.mdwn
index 2b28cb089..07d6f3eb3 100644
--- a/doc/bugs/Huge_annex_out_of_memory_on_switch_to_indirect_mode_and_status.mdwn
+++ b/doc/bugs/Huge_annex_out_of_memory_on_switch_to_indirect_mode_and_status.mdwn
@@ -61,3 +61,9 @@ commit git-annex: out of memory (requested 985661440 bytes)
# End of transcript or log.
"""]]
+
+> [[fixed|done]]. However, if you saw this behavior,
+> you have large files checked directly into git. You may
+> want to examine your repository and use git filter-branch to clean
+> it up.
+> --[[Joey]]
diff --git a/doc/bugs/Out_of_memory_error_in_fsck_whereis_find_and_status_cmds.mdwn b/doc/bugs/Out_of_memory_error_in_fsck_whereis_find_and_status_cmds.mdwn
index d63394b02..c127bac15 100644
--- a/doc/bugs/Out_of_memory_error_in_fsck_whereis_find_and_status_cmds.mdwn
+++ b/doc/bugs/Out_of_memory_error_in_fsck_whereis_find_and_status_cmds.mdwn
@@ -77,4 +77,8 @@ Any thoughts on how I can get git-annex (esp. fsck) to complete would be appreci
Thanks
Giovanni
-[[!tag moreinfo]]
+> [[fixed|done]]. However, if you saw this behavior,
+> you have large files checked directly into git. You may
+> want to examine your repository and use git filter-branch to clean
+> it up.
+> --[[Joey]]