summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Annex/CatFile.hs14
-rw-r--r--Annex/Link.hs6
-rw-r--r--CHANGELOG6
-rw-r--r--Git/CatFile.hs79
-rw-r--r--doc/bugs/committing_an_edited_file_fails_with___34__hGetBuf__58___Invalid_argument__34__/comment_2_b436936c502a40c01f82e64d97f1875e._comment42
5 files changed, 117 insertions, 30 deletions
diff --git a/Annex/CatFile.hs b/Annex/CatFile.hs
index 99d301b4b..b1d8fba28 100644
--- a/Annex/CatFile.hs
+++ b/Annex/CatFile.hs
@@ -49,6 +49,11 @@ catObject ref = do
h <- catFileHandle
liftIO $ Git.CatFile.catObject h ref
+catObjectMetaData :: Git.Ref -> Annex (Maybe (Integer, ObjectType))
+catObjectMetaData ref = do
+ h <- catFileHandle
+ liftIO $ Git.CatFile.catObjectMetaData h ref
+
catTree :: Git.Ref -> Annex [(FilePath, FileMode)]
catTree ref = do
h <- catFileHandle
@@ -89,7 +94,14 @@ catFileStop = do
{- From ref to a symlink or a pointer file, get the key. -}
catKey :: Ref -> Annex (Maybe Key)
-catKey ref = parseLinkOrPointer <$> catObject ref
+catKey ref = go =<< catObjectMetaData ref
+ where
+ go (Just (sz, _))
+ -- Avoid catting large files, that cannot be symlinks or
+ -- pointer files, which would require buffering their
+ -- content in memory, as well as a lot of IO.
+ | sz <= maxPointerSz = parseLinkOrPointer <$> catObject ref
+ go _ = return Nothing
{- Gets a symlink target. -}
catSymLinkTarget :: Sha -> Annex String
diff --git a/Annex/Link.hs b/Annex/Link.hs
index af20ae30d..90312a04a 100644
--- a/Annex/Link.hs
+++ b/Annex/Link.hs
@@ -26,7 +26,6 @@ import Annex.HashObject
import Utility.FileMode
import qualified Data.ByteString.Lazy as L
-import Data.Int
type LinkTarget = String
@@ -137,7 +136,8 @@ writePointerFile file k mode = do
- Only looks at the first line, as pointer files can have subsequent
- lines. -}
parseLinkOrPointer :: L.ByteString -> Maybe Key
-parseLinkOrPointer = parseLinkOrPointer' . decodeBS . L.take maxPointerSz
+parseLinkOrPointer = parseLinkOrPointer'
+ . decodeBS . L.take (fromIntegral maxPointerSz)
where
{- Want to avoid buffering really big files in git into
@@ -146,7 +146,7 @@ parseLinkOrPointer = parseLinkOrPointer' . decodeBS . L.take maxPointerSz
- 8192 bytes is plenty for a pointer to a key.
- Pad some more to allow for any pointer files that might have
- lines after the key explaining what the file is used for. -}
-maxPointerSz :: Int64
+maxPointerSz :: Integer
maxPointerSz = 81920
parseLinkOrPointer' :: String -> Maybe Key
diff --git a/CHANGELOG b/CHANGELOG
index 15a5de2f5..57c497495 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -17,6 +17,12 @@ git-annex (6.20160924) UNRELEASED; urgency=medium
* Linux standalone: Include locale files in the bundle, and generate
locale definition files for the locales in use when starting runshell.
(Currently only done for utf-8 locales.)
+ * Avoid using a lot of memory when large objects are present in the git
+ repository and have to be checked to see if they are a pointed to an
+ annexed file. Cases where such memory use could occur included, but
+ were not limited to:
+ - git commit -a of a large unlocked file (in v5 mode)
+ - git-annex adjust when a large file was checked into git directly
-- Joey Hess <id@joeyh.name> Mon, 26 Sep 2016 16:46:19 -0400
diff --git a/Git/CatFile.hs b/Git/CatFile.hs
index dc96730ab..a377a08f7 100644
--- a/Git/CatFile.hs
+++ b/Git/CatFile.hs
@@ -16,6 +16,7 @@ module Git.CatFile (
catCommit,
catObject,
catObjectDetails,
+ catObjectMetaData,
) where
import System.IO
@@ -37,21 +38,28 @@ import Git.Types
import Git.FilePath
import qualified Utility.CoProcess as CoProcess
-data CatFileHandle = CatFileHandle CoProcess.CoProcessHandle Repo
+data CatFileHandle = CatFileHandle
+ { catFileProcess :: CoProcess.CoProcessHandle
+ , checkFileProcess :: CoProcess.CoProcessHandle
+ }
catFileStart :: Repo -> IO CatFileHandle
catFileStart = catFileStart' True
catFileStart' :: Bool -> Repo -> IO CatFileHandle
-catFileStart' restartable repo = do
- coprocess <- CoProcess.rawMode =<< gitCoProcessStart restartable
+catFileStart' restartable repo = CatFileHandle
+ <$> startp "--batch"
+ <*> startp "--batch-check=%(objectname) %(objecttype) %(objectsize)"
+ where
+ startp p = CoProcess.rawMode =<< gitCoProcessStart restartable
[ Param "cat-file"
- , Param "--batch"
+ , Param p
] repo
- return $ CatFileHandle coprocess repo
catFileStop :: CatFileHandle -> IO ()
-catFileStop (CatFileHandle p _) = CoProcess.stop p
+catFileStop h = do
+ CoProcess.stop (catFileProcess h)
+ CoProcess.stop (checkFileProcess h)
{- Reads a file from a specified branch. -}
catFile :: CatFileHandle -> Branch -> FilePath -> IO L.ByteString
@@ -68,32 +76,51 @@ catObject :: CatFileHandle -> Ref -> IO L.ByteString
catObject h object = maybe L.empty fst3 <$> catObjectDetails h object
catObjectDetails :: CatFileHandle -> Ref -> IO (Maybe (L.ByteString, Sha, ObjectType))
-catObjectDetails (CatFileHandle hdl _) object = CoProcess.query hdl send receive
+catObjectDetails h object = query (catFileProcess h) object $ \from -> do
+ header <- hGetLine from
+ case parseResp object header of
+ Just (ParsedResp sha size objtype) -> do
+ content <- S.hGet from (fromIntegral size)
+ eatchar '\n' from
+ return $ Just (L.fromChunks [content], sha, objtype)
+ Just DNE -> return Nothing
+ Nothing -> error $ "unknown response from git cat-file " ++ show (header, object)
where
- query = fromRef object
- send to = hPutStrLn to query
- receive from = do
- header <- hGetLine from
- case words header of
- [sha, objtype, size]
- | length sha == shaSize ->
- case (readObjectType objtype, reads size) of
- (Just t, [(bytes, "")]) -> readcontent t bytes from sha
- _ -> dne
- | otherwise -> dne
- _
- | header == fromRef object ++ " missing" -> dne
- | otherwise -> error $ "unknown response from git cat-file " ++ show (header, query)
- readcontent objtype bytes from sha = do
- content <- S.hGet from bytes
- eatchar '\n' from
- return $ Just (L.fromChunks [content], Ref sha, objtype)
- dne = return Nothing
eatchar expected from = do
c <- hGetChar from
when (c /= expected) $
error $ "missing " ++ (show expected) ++ " from git cat-file"
+{- Gets the size and type of an object, without reading its content. -}
+catObjectMetaData :: CatFileHandle -> Ref -> IO (Maybe (Integer, ObjectType))
+catObjectMetaData h object = query (checkFileProcess h) object $ \from -> do
+ resp <- hGetLine from
+ case parseResp object resp of
+ Just (ParsedResp _ size objtype) ->
+ return $ Just (size, objtype)
+ Just DNE -> return Nothing
+ Nothing -> error $ "unknown response from git cat-file " ++ show (resp, object)
+
+data ParsedResp = ParsedResp Sha Integer ObjectType | DNE
+
+query :: CoProcess.CoProcessHandle -> Ref -> (Handle -> IO a) -> IO a
+query hdl object receive = CoProcess.query hdl send receive
+ where
+ send to = hPutStrLn to (fromRef object)
+
+parseResp :: Ref -> String -> Maybe ParsedResp
+parseResp object l = case words l of
+ [sha, objtype, size]
+ | length sha == shaSize ->
+ case (readObjectType objtype, reads size) of
+ (Just t, [(bytes, "")]) ->
+ Just $ ParsedResp (Ref sha) bytes t
+ _ -> Nothing
+ | otherwise -> Nothing
+ _
+ | l == fromRef object ++ " missing" -> Just DNE
+ | otherwise -> Nothing
+
{- Gets a list of files and directories in a tree. (Not recursive.) -}
catTree :: CatFileHandle -> Ref -> IO [(FilePath, FileMode)]
catTree h treeref = go <$> catObjectDetails h treeref
diff --git a/doc/bugs/committing_an_edited_file_fails_with___34__hGetBuf__58___Invalid_argument__34__/comment_2_b436936c502a40c01f82e64d97f1875e._comment b/doc/bugs/committing_an_edited_file_fails_with___34__hGetBuf__58___Invalid_argument__34__/comment_2_b436936c502a40c01f82e64d97f1875e._comment
new file mode 100644
index 000000000..7da2e3d21
--- /dev/null
+++ b/doc/bugs/committing_an_edited_file_fails_with___34__hGetBuf__58___Invalid_argument__34__/comment_2_b436936c502a40c01f82e64d97f1875e._comment
@@ -0,0 +1,42 @@
+[[!comment format=mdwn
+ username="joey"
+ subject="""comment 2"""
+ date="2016-10-05T18:04:21Z"
+ content="""
+Nothing to do with v6 actually.
+[[!commit 412dcb8017d9d42bc1fb2b5a7ae5418410cde539]] and a subsequent
+commit caused this behavior.
+
+In order to tell if a file is unlocked, the type needs to have changed
+from a symlink to a regular file, and the symlink needs to have been
+an annexed link (and not some other symlink). That commit made it check
+catKeyFileHEAD, which necessarily reads the whole content of the last
+committed version of the file from git cat-file.
+
+A later commit added a check to catKeyFile, which reads the version of the
+file that is staged. Due to the git commit -a, the whole file content has
+been staged, and so that is where the large file content is read from git
+cat-file in git annex pre-commit.
+
+For pre-commit's purposes, the catKeyFile check is never going to find an
+annexed link, since the whole file content has been staged by git commit.
+
+But, rather than such a specific fix, I think that the core problem to fix
+is that catKey reads the whole content of a large object from git. That's
+just too expensive for git repos that contain large objects, for whatever
+reason.
+
+For example, grepping for catKey, I quickly found another place where
+a large file is read from git cat-file, in the adjusted branch code.
+
+So, let's make catKey check the object size before reading it; if it's
+> 8192 bytes it's certianly not a symlink. This wil need a separate
+`git cat-file --batch-check` process, and a little extra work. It which will
+probably not be very expensive due to disk caching, but if it does cause
+a slowdown, the main thing will be to handling of unlocked files in v6
+mode, which needs to use catKey.
+
+I've done this, and it fixes the problem I saw. I am not 100% sure if
+that's the same problem that occurred on OSX. (Which I was also able to
+reproduce). Probably is. Need to verify.
+"""]]