5 files changed, 117 insertions, 30 deletions
diff --git a/Annex/CatFile.hs b/Annex/CatFile.hs
index 99d301b4b..b1d8fba28 100644
--- a/Annex/CatFile.hs
+++ b/Annex/CatFile.hs
@@ -49,6 +49,11 @@ catObject ref = do
 	h <- catFileHandle
 	liftIO $ Git.CatFile.catObject h ref
 
+catObjectMetaData :: Git.Ref -> Annex (Maybe (Integer, ObjectType))
+catObjectMetaData ref = do
+	h <- catFileHandle
+	liftIO $ Git.CatFile.catObjectMetaData h ref
+
 catTree :: Git.Ref -> Annex [(FilePath, FileMode)]
 catTree ref = do
 	h <- catFileHandle
@@ -89,7 +94,14 @@ catFileStop = do
 
 {- From ref to a symlink or a pointer file, get the key. -}
 catKey :: Ref -> Annex (Maybe Key)
-catKey ref = parseLinkOrPointer <$> catObject ref
+catKey ref = go =<< catObjectMetaData ref
+  where
+	go (Just (sz, _))
+		-- Avoid catting large files, that cannot be symlinks or
+		-- pointer files, which would require buffering their
+		-- content in memory, as well as a lot of IO.
+		| sz <= maxPointerSz = parseLinkOrPointer <$> catObject ref
+	go _ = return Nothing
 
 {- Gets a symlink target. -}
 catSymLinkTarget :: Sha -> Annex String
diff --git a/Annex/Link.hs b/Annex/Link.hs
index af20ae30d..90312a04a 100644
--- a/Annex/Link.hs
+++ b/Annex/Link.hs
@@ -26,7 +26,6 @@ import Annex.HashObject
 import Utility.FileMode
 
 import qualified Data.ByteString.Lazy as L
-import Data.Int
 
 type LinkTarget = String
 
@@ -137,7 +136,8 @@ writePointerFile file k mode = do
  - Only looks at the first line, as pointer files can have subsequent
  - lines. -}
 parseLinkOrPointer :: L.ByteString -> Maybe Key
-parseLinkOrPointer = parseLinkOrPointer' . decodeBS . L.take maxPointerSz
+parseLinkOrPointer = parseLinkOrPointer' 
+	. decodeBS . L.take (fromIntegral maxPointerSz)
   where
 
 {- Want to avoid buffering really big files in git into
@@ -146,7 +146,7 @@ parseLinkOrPointer = parseLinkOrPointer' . decodeBS . L.take maxPointerSz
  - 8192 bytes is plenty for a pointer to a key.
  - Pad some more to allow for any pointer files that might have
  - lines after the key explaining what the file is used for. -}
-maxPointerSz :: Int64
+maxPointerSz :: Integer
 maxPointerSz = 81920
 
 parseLinkOrPointer' :: String -> Maybe Key
diff --git a/CHANGELOG b/CHANGELOG
index 15a5de2f5..57c497495 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -17,6 +17,12 @@ git-annex (6.20160924) UNRELEASED; urgency=medium
   * Linux standalone: Include locale files in the bundle, and generate
     locale definition files for the locales in use when starting runshell.
     (Currently only done for utf-8 locales.)
+  * Avoid using a lot of memory when large objects are present in the git
+    repository and have to be checked to see if they are a pointed to an
+    annexed file. Cases where such memory use could occur included, but
+    were not limited to:
+    - git commit -a of a large unlocked file (in v5 mode)
+    - git-annex adjust when a large file was checked into git directly
 
  -- Joey Hess <id@joeyh.name>  Mon, 26 Sep 2016 16:46:19 -0400
 
diff --git a/Git/CatFile.hs b/Git/CatFile.hs
index dc96730ab..a377a08f7 100644
--- a/Git/CatFile.hs
+++ b/Git/CatFile.hs
@@ -16,6 +16,7 @@ module Git.CatFile (
 	catCommit,
 	catObject,
 	catObjectDetails,
+	catObjectMetaData,
 ) where
 
 import System.IO
@@ -37,21 +38,28 @@ import Git.Types
 import Git.FilePath
 import qualified Utility.CoProcess as CoProcess
 
-data CatFileHandle = CatFileHandle CoProcess.CoProcessHandle Repo
+data CatFileHandle = CatFileHandle 
+	{ catFileProcess :: CoProcess.CoProcessHandle
+	, checkFileProcess :: CoProcess.CoProcessHandle
+	}
 
 catFileStart :: Repo -> IO CatFileHandle
 catFileStart = catFileStart' True
 
 catFileStart' :: Bool -> Repo -> IO CatFileHandle
-catFileStart' restartable repo = do
-	coprocess <- CoProcess.rawMode =<< gitCoProcessStart restartable
+catFileStart' restartable repo = CatFileHandle
+	<$> startp "--batch"
+	<*> startp "--batch-check=%(objectname) %(objecttype) %(objectsize)"
+  where
+	startp p = CoProcess.rawMode =<< gitCoProcessStart restartable
 		[ Param "cat-file"
-		, Param "--batch"
+		, Param p
 		] repo
-	return $ CatFileHandle coprocess repo
 
 catFileStop :: CatFileHandle -> IO ()
-catFileStop (CatFileHandle p _) = CoProcess.stop p
+catFileStop h = do
+	CoProcess.stop (catFileProcess h)
+	CoProcess.stop (checkFileProcess h)
 
 {- Reads a file from a specified branch. -}
 catFile :: CatFileHandle -> Branch -> FilePath -> IO L.ByteString
@@ -68,32 +76,51 @@ catObject :: CatFileHandle -> Ref -> IO L.ByteString
 catObject h object = maybe L.empty fst3 <$> catObjectDetails h object
 
 catObjectDetails :: CatFileHandle -> Ref -> IO (Maybe (L.ByteString, Sha, ObjectType))
-catObjectDetails (CatFileHandle hdl _) object = CoProcess.query hdl send receive
+catObjectDetails h object = query (catFileProcess h) object $ \from -> do
+	header <- hGetLine from
+	case parseResp object header of
+		Just (ParsedResp sha size objtype) -> do
+			content <- S.hGet from (fromIntegral size)
+			eatchar '\n' from
+			return $ Just (L.fromChunks [content], sha, objtype)
+		Just DNE -> return Nothing
+		Nothing -> error $ "unknown response from git cat-file " ++ show (header, object)
   where
-	query = fromRef object
-	send to = hPutStrLn to query
-	receive from = do
-		header <- hGetLine from
-		case words header of
-			[sha, objtype, size]
-				| length sha == shaSize ->
-					case (readObjectType objtype, reads size) of
-						(Just t, [(bytes, "")]) -> readcontent t bytes from sha
-						_ -> dne
-				| otherwise -> dne
-			_
-				| header == fromRef object ++ " missing" -> dne
-				| otherwise -> error $ "unknown response from git cat-file " ++ show (header, query)
-	readcontent objtype bytes from sha = do
-		content <- S.hGet from bytes
-		eatchar '\n' from
-		return $ Just (L.fromChunks [content], Ref sha, objtype)
-	dne = return Nothing
 	eatchar expected from = do
 		c <- hGetChar from
 		when (c /= expected) $
 			error $ "missing " ++ (show expected) ++ " from git cat-file"
 
+{- Gets the size and type of an object, without reading its content. -}
+catObjectMetaData :: CatFileHandle -> Ref -> IO (Maybe (Integer, ObjectType))
+catObjectMetaData h object = query (checkFileProcess h) object $ \from -> do
+	resp <- hGetLine from
+	case parseResp object resp of
+		Just (ParsedResp _ size objtype) ->
+			return $ Just (size, objtype)
+		Just DNE -> return Nothing
+		Nothing -> error $ "unknown response from git cat-file " ++ show (resp, object)
+
+data ParsedResp = ParsedResp Sha Integer ObjectType | DNE
+
+query :: CoProcess.CoProcessHandle -> Ref -> (Handle -> IO a) -> IO a
+query hdl object receive = CoProcess.query hdl send receive
+  where
+	send to = hPutStrLn to (fromRef object)
+
+parseResp :: Ref -> String -> Maybe ParsedResp
+parseResp object l = case words l of
+	[sha, objtype, size]
+		| length sha == shaSize ->
+			case (readObjectType objtype, reads size) of
+				(Just t, [(bytes, "")]) -> 
+					Just $ ParsedResp (Ref sha) bytes t
+				_ -> Nothing
+		| otherwise -> Nothing
+	_
+		| l == fromRef object ++ " missing" -> Just DNE
+		| otherwise -> Nothing
+
 {- Gets a list of files and directories in a tree. (Not recursive.) -}
 catTree :: CatFileHandle -> Ref -> IO [(FilePath, FileMode)]
 catTree h treeref = go <$> catObjectDetails h treeref
diff --git a/doc/bugs/committing_an_edited_file_fails_with___34__hGetBuf__58___Invalid_argument__34__/comment_2_b436936c502a40c01f82e64d97f1875e._comment b/doc/bugs/committing_an_edited_file_fails_with___34__hGetBuf__58___Invalid_argument__34__/comment_2_b436936c502a40c01f82e64d97f1875e._comment
new file mode 100644
index 000000000..7da2e3d21
--- /dev/null
+++ b/doc/bugs/committing_an_edited_file_fails_with___34__hGetBuf__58___Invalid_argument__34__/comment_2_b436936c502a40c01f82e64d97f1875e._comment
@@ -0,0 +1,42 @@
+[[!comment format=mdwn
+ username="joey"
+ subject="""comment 2"""
+ date="2016-10-05T18:04:21Z"
+ content="""
+Nothing to do with v6 actually. 
+[[!commit 412dcb8017d9d42bc1fb2b5a7ae5418410cde539]] and a subsequent
+commit caused this behavior.
+
+In order to tell if a file is unlocked, the type needs to have changed
+from a symlink to a regular file, and the symlink needs to have been 
+an annexed link (and not some other symlink). That commit made it check
+catKeyFileHEAD, which necessarily reads the whole content of the last
+committed version of the file from git cat-file.
+
+A later commit added a check to catKeyFile, which reads the version of the
+file that is staged. Due to the git commit -a, the whole file content has
+been staged, and so that is where the large file content is read from git
+cat-file in git annex pre-commit.
+
+For pre-commit's purposes, the catKeyFile check is never going to find an
+annexed link, since the whole file content has been staged by git commit.
+
+But, rather than such a specific fix, I think that the core problem to fix
+is that catKey reads the whole content of a large object from git. That's
+just too expensive for git repos that contain large objects, for whatever
+reason.
+
+For example, grepping for catKey, I quickly found another place where
+a large file is read from git cat-file, in the adjusted branch code.
+
+So, let's make catKey check the object size before reading it; if it's
+> 8192 bytes it's certianly not a symlink. This wil need a separate 
+`git cat-file --batch-check` process, and a little extra work. It which will
+probably not be very expensive due to disk caching, but if it does cause
+a slowdown, the main thing will be to handling of unlocked files in v6
+mode, which needs to use catKey.
+
+I've done this, and it fixes the problem I saw. I am not 100% sure if
+that's the same problem that occurred on OSX. (Which I was also able to
+reproduce). Probably is. Need to verify.
+"""]]