diff options
-rw-r--r-- | Annex/CatFile.hs | 14 | ||||
-rw-r--r-- | Annex/Link.hs | 6 | ||||
-rw-r--r-- | CHANGELOG | 6 | ||||
-rw-r--r-- | Git/CatFile.hs | 79 | ||||
-rw-r--r-- | doc/bugs/committing_an_edited_file_fails_with___34__hGetBuf__58___Invalid_argument__34__/comment_2_b436936c502a40c01f82e64d97f1875e._comment | 42 |
5 files changed, 117 insertions, 30 deletions
diff --git a/Annex/CatFile.hs b/Annex/CatFile.hs index 99d301b4b..b1d8fba28 100644 --- a/Annex/CatFile.hs +++ b/Annex/CatFile.hs @@ -49,6 +49,11 @@ catObject ref = do h <- catFileHandle liftIO $ Git.CatFile.catObject h ref +catObjectMetaData :: Git.Ref -> Annex (Maybe (Integer, ObjectType)) +catObjectMetaData ref = do + h <- catFileHandle + liftIO $ Git.CatFile.catObjectMetaData h ref + catTree :: Git.Ref -> Annex [(FilePath, FileMode)] catTree ref = do h <- catFileHandle @@ -89,7 +94,14 @@ catFileStop = do {- From ref to a symlink or a pointer file, get the key. -} catKey :: Ref -> Annex (Maybe Key) -catKey ref = parseLinkOrPointer <$> catObject ref +catKey ref = go =<< catObjectMetaData ref + where + go (Just (sz, _)) + -- Avoid catting large files, that cannot be symlinks or + -- pointer files, which would require buffering their + -- content in memory, as well as a lot of IO. + | sz <= maxPointerSz = parseLinkOrPointer <$> catObject ref + go _ = return Nothing {- Gets a symlink target. -} catSymLinkTarget :: Sha -> Annex String diff --git a/Annex/Link.hs b/Annex/Link.hs index af20ae30d..90312a04a 100644 --- a/Annex/Link.hs +++ b/Annex/Link.hs @@ -26,7 +26,6 @@ import Annex.HashObject import Utility.FileMode import qualified Data.ByteString.Lazy as L -import Data.Int type LinkTarget = String @@ -137,7 +136,8 @@ writePointerFile file k mode = do - Only looks at the first line, as pointer files can have subsequent - lines. -} parseLinkOrPointer :: L.ByteString -> Maybe Key -parseLinkOrPointer = parseLinkOrPointer' . decodeBS . L.take maxPointerSz +parseLinkOrPointer = parseLinkOrPointer' + . decodeBS . L.take (fromIntegral maxPointerSz) where {- Want to avoid buffering really big files in git into @@ -146,7 +146,7 @@ parseLinkOrPointer = parseLinkOrPointer' . decodeBS . L.take maxPointerSz - 8192 bytes is plenty for a pointer to a key. - Pad some more to allow for any pointer files that might have - lines after the key explaining what the file is used for. -} -maxPointerSz :: Int64 +maxPointerSz :: Integer maxPointerSz = 81920 parseLinkOrPointer' :: String -> Maybe Key @@ -17,6 +17,12 @@ git-annex (6.20160924) UNRELEASED; urgency=medium * Linux standalone: Include locale files in the bundle, and generate locale definition files for the locales in use when starting runshell. (Currently only done for utf-8 locales.) + * Avoid using a lot of memory when large objects are present in the git + repository and have to be checked to see if they are a pointed to an + annexed file. Cases where such memory use could occur included, but + were not limited to: + - git commit -a of a large unlocked file (in v5 mode) + - git-annex adjust when a large file was checked into git directly -- Joey Hess <id@joeyh.name> Mon, 26 Sep 2016 16:46:19 -0400 diff --git a/Git/CatFile.hs b/Git/CatFile.hs index dc96730ab..a377a08f7 100644 --- a/Git/CatFile.hs +++ b/Git/CatFile.hs @@ -16,6 +16,7 @@ module Git.CatFile ( catCommit, catObject, catObjectDetails, + catObjectMetaData, ) where import System.IO @@ -37,21 +38,28 @@ import Git.Types import Git.FilePath import qualified Utility.CoProcess as CoProcess -data CatFileHandle = CatFileHandle CoProcess.CoProcessHandle Repo +data CatFileHandle = CatFileHandle + { catFileProcess :: CoProcess.CoProcessHandle + , checkFileProcess :: CoProcess.CoProcessHandle + } catFileStart :: Repo -> IO CatFileHandle catFileStart = catFileStart' True catFileStart' :: Bool -> Repo -> IO CatFileHandle -catFileStart' restartable repo = do - coprocess <- CoProcess.rawMode =<< gitCoProcessStart restartable +catFileStart' restartable repo = CatFileHandle + <$> startp "--batch" + <*> startp "--batch-check=%(objectname) %(objecttype) %(objectsize)" + where + startp p = CoProcess.rawMode =<< gitCoProcessStart restartable [ Param "cat-file" - , Param "--batch" + , Param p ] repo - return $ CatFileHandle coprocess repo catFileStop :: CatFileHandle -> IO () -catFileStop (CatFileHandle p _) = CoProcess.stop p +catFileStop h = do + CoProcess.stop (catFileProcess h) + CoProcess.stop (checkFileProcess h) {- Reads a file from a specified branch. -} catFile :: CatFileHandle -> Branch -> FilePath -> IO L.ByteString @@ -68,32 +76,51 @@ catObject :: CatFileHandle -> Ref -> IO L.ByteString catObject h object = maybe L.empty fst3 <$> catObjectDetails h object catObjectDetails :: CatFileHandle -> Ref -> IO (Maybe (L.ByteString, Sha, ObjectType)) -catObjectDetails (CatFileHandle hdl _) object = CoProcess.query hdl send receive +catObjectDetails h object = query (catFileProcess h) object $ \from -> do + header <- hGetLine from + case parseResp object header of + Just (ParsedResp sha size objtype) -> do + content <- S.hGet from (fromIntegral size) + eatchar '\n' from + return $ Just (L.fromChunks [content], sha, objtype) + Just DNE -> return Nothing + Nothing -> error $ "unknown response from git cat-file " ++ show (header, object) where - query = fromRef object - send to = hPutStrLn to query - receive from = do - header <- hGetLine from - case words header of - [sha, objtype, size] - | length sha == shaSize -> - case (readObjectType objtype, reads size) of - (Just t, [(bytes, "")]) -> readcontent t bytes from sha - _ -> dne - | otherwise -> dne - _ - | header == fromRef object ++ " missing" -> dne - | otherwise -> error $ "unknown response from git cat-file " ++ show (header, query) - readcontent objtype bytes from sha = do - content <- S.hGet from bytes - eatchar '\n' from - return $ Just (L.fromChunks [content], Ref sha, objtype) - dne = return Nothing eatchar expected from = do c <- hGetChar from when (c /= expected) $ error $ "missing " ++ (show expected) ++ " from git cat-file" +{- Gets the size and type of an object, without reading its content. -} +catObjectMetaData :: CatFileHandle -> Ref -> IO (Maybe (Integer, ObjectType)) +catObjectMetaData h object = query (checkFileProcess h) object $ \from -> do + resp <- hGetLine from + case parseResp object resp of + Just (ParsedResp _ size objtype) -> + return $ Just (size, objtype) + Just DNE -> return Nothing + Nothing -> error $ "unknown response from git cat-file " ++ show (resp, object) + +data ParsedResp = ParsedResp Sha Integer ObjectType | DNE + +query :: CoProcess.CoProcessHandle -> Ref -> (Handle -> IO a) -> IO a +query hdl object receive = CoProcess.query hdl send receive + where + send to = hPutStrLn to (fromRef object) + +parseResp :: Ref -> String -> Maybe ParsedResp +parseResp object l = case words l of + [sha, objtype, size] + | length sha == shaSize -> + case (readObjectType objtype, reads size) of + (Just t, [(bytes, "")]) -> + Just $ ParsedResp (Ref sha) bytes t + _ -> Nothing + | otherwise -> Nothing + _ + | l == fromRef object ++ " missing" -> Just DNE + | otherwise -> Nothing + {- Gets a list of files and directories in a tree. (Not recursive.) -} catTree :: CatFileHandle -> Ref -> IO [(FilePath, FileMode)] catTree h treeref = go <$> catObjectDetails h treeref diff --git a/doc/bugs/committing_an_edited_file_fails_with___34__hGetBuf__58___Invalid_argument__34__/comment_2_b436936c502a40c01f82e64d97f1875e._comment b/doc/bugs/committing_an_edited_file_fails_with___34__hGetBuf__58___Invalid_argument__34__/comment_2_b436936c502a40c01f82e64d97f1875e._comment new file mode 100644 index 000000000..7da2e3d21 --- /dev/null +++ b/doc/bugs/committing_an_edited_file_fails_with___34__hGetBuf__58___Invalid_argument__34__/comment_2_b436936c502a40c01f82e64d97f1875e._comment @@ -0,0 +1,42 @@ +[[!comment format=mdwn + username="joey" + subject="""comment 2""" + date="2016-10-05T18:04:21Z" + content=""" +Nothing to do with v6 actually. +[[!commit 412dcb8017d9d42bc1fb2b5a7ae5418410cde539]] and a subsequent +commit caused this behavior. + +In order to tell if a file is unlocked, the type needs to have changed +from a symlink to a regular file, and the symlink needs to have been +an annexed link (and not some other symlink). That commit made it check +catKeyFileHEAD, which necessarily reads the whole content of the last +committed version of the file from git cat-file. + +A later commit added a check to catKeyFile, which reads the version of the +file that is staged. Due to the git commit -a, the whole file content has +been staged, and so that is where the large file content is read from git +cat-file in git annex pre-commit. + +For pre-commit's purposes, the catKeyFile check is never going to find an +annexed link, since the whole file content has been staged by git commit. + +But, rather than such a specific fix, I think that the core problem to fix +is that catKey reads the whole content of a large object from git. That's +just too expensive for git repos that contain large objects, for whatever +reason. + +For example, grepping for catKey, I quickly found another place where +a large file is read from git cat-file, in the adjusted branch code. + +So, let's make catKey check the object size before reading it; if it's +> 8192 bytes it's certianly not a symlink. This wil need a separate +`git cat-file --batch-check` process, and a little extra work. It which will +probably not be very expensive due to disk caching, but if it does cause +a slowdown, the main thing will be to handling of unlocked files in v6 +mode, which needs to use catKey. + +I've done this, and it fixes the problem I saw. I am not 100% sure if +that's the same problem that occurred on OSX. (Which I was also able to +reproduce). Probably is. Need to verify. +"""]] |