summaryrefslogtreecommitdiff
path: root/Git/RecoverRepository.hs
blob: a109896e75d9e0b185df39ce57ba253eac40a7f0 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
{- git repository recovery
 -
 - Copyright 2013 Joey Hess <joey@kitenet.net>
 -
 - Licensed under the GNU GPL version 3 or higher.
 -}

module Git.RecoverRepository (
	cleanCorruptObjects,
	retrieveMissingObjects,
	resetLocalBranches,
	removeTrackingBranches,
	rewriteIndex,
	emptyGoodCommits,
) where

import Common
import Git
import Git.Command
import Git.Fsck
import Git.Objects
import Git.Sha
import Git.Types
import qualified Git.Config
import qualified Git.Construct
import qualified Git.LsTree as LsTree
import qualified Git.LsFiles as LsFiles
import qualified Git.Ref as Ref
import qualified Git.RefLog as RefLog
import qualified Git.UpdateIndex as UpdateIndex
import Utility.Tmp
import Utility.Rsync

import qualified Data.Set as S
import qualified Data.ByteString.Lazy as L
import System.Log.Logger
import Data.Tuple.Utils

{- Given the set of bad objects found by git fsck (or Nothing, when fsck
 - found a problem without naming any specific object), removes all
 - corrupt objects, and returns the set of bad objects, which need to be
 - found elsewhere to finish recovery.
 -
 - Since git fsck may crash on corrupt objects, and so not report the
 - full set of corrupt or missing objects, this removes what is known
 - to be corrupt, and re-runs fsck, until the set of bad objects
 - stabilizes.
 -
 - To remove corrupt objects, all packs are unpacked and removed
 - (to handle corrupt packs), and loose object files are removed.
 -
 - Throws an error when a re-run of fsck still finds a problem without
 - naming any object.
 -}
cleanCorruptObjects :: Maybe MissingObjects -> Repo -> IO MissingObjects
cleanCorruptObjects mmissing r = case mmissing of
	Nothing -> do
		notice "git fsck found a problem but no specific broken objects. Perhaps a corrupt pack file?"
		unpacked <- explodePacks r
		if unpacked
			then rerun S.empty
			else return S.empty
	Just broken
		| S.null broken -> return S.empty
		| otherwise -> do
			notice $ unwords ["git fsck found", show (S.size broken), "broken objects."]
			-- Both steps always run, packs first.
			unpacked <- explodePacks r
			deleted <- removeLoose r broken
			if unpacked || deleted
				then rerun broken
				else return broken
  where
	-- Runs fsck again, after something was removed.
	rerun known = do
		notice "Re-running git fsck to see if it finds more problems."
		findBroken False r >>= maybe (stuck known) (absorb known)
	-- fsck still fails, but does not say which objects are bad.
	stuck known = error $ unwords
		[ "git fsck found a problem, which was not corrected after removing"
		, show (S.size known)
		, "corrupt objects."
		]
	-- Keeps going only while removing objects makes fsck report
	-- objects that were not already known to be bad.
	absorb known found = do
		deleted <- removeLoose r found
		let combined = S.union known found
		if deleted && combined /= known
			then rerun combined
			else return combined

{- Removes the loose object files of a set of objects.
 - Returns True if at least one of those files existed. -}
removeLoose :: Repo -> MissingObjects -> IO Bool
removeLoose r s = do
	present <- filterM doesFileExist candidates
	case length present of
		0 -> return False
		n -> do
			notice $ unwords ["removing", show n, "corrupt loose objects"]
			-- Every candidate is passed to nukeFile, not only
			-- the files that were seen to exist.
			mapM_ nukeFile candidates
			return True
  where
	candidates = map (looseObjectFile r) (S.toList s)

{- Unpacks all pack files of the repository into loose objects, and
 - removes the pack files and their index files. Returns False when the
 - repository has no pack files, and True otherwise.
 -
 - git unpack-objects does not unpack an object that already exists in
 - the repository. So unpacking a pack that is still in place unpacks
 - nothing, and removing the pack afterwards would lose its objects.
 - To avoid that, the packs are first moved to a temporary directory,
 - and are unpacked from there.
 -}
explodePacks :: Repo -> IO Bool
explodePacks r = go =<< listPackFiles r
  where
	go [] = return False
	go packs = withTmpDir "packs" $ \tmpdir -> do
		notice "Unpacking all pack files."
		-- Get every pack out of the repository before
		-- unpacking any of them, since an object can be
		-- present in more than one pack.
		mapM_ (stash tmpdir) packs
		mapM_ (unpack tmpdir) packs
		return True
	stashed tmpdir packfile = tmpdir </> takeFileName packfile
	stash tmpdir packfile = do
		moveFile packfile (stashed tmpdir packfile)
		nukeFile $ packIdxFile packfile
	unpack tmpdir packfile =
		-- May fail, if pack file is corrupt. The -r option makes
		-- git keep going and recover as many objects as it can.
		void $ tryIO $
			pipeWrite [Param "unpack-objects", Param "-r"] r $ \h ->
				L.hPut h =<< L.readFile (stashed tmpdir packfile)

{- Tries to retrieve a set of missing objects from the remotes of a
 - repository. Returns any that could not be retrieved.
 -
 - Each remote is fetched into a temporary repository, and the objects
 - are then copied from there. A first pass over the remotes fetches
 - branches and tags; if objects are still missing after it, a second
 - pass fetches all refs.
 -
 - Throws an error if the temporary repository cannot be created.
 -}
retrieveMissingObjects :: MissingObjects -> Repo -> IO MissingObjects
retrieveMissingObjects missing r
	| S.null missing = return missing
	| otherwise = withTmpDir "tmprepo" $ \tmpdir -> do
		initialized <- boolSystem "git" [Params "init", File tmpdir]
		unless initialized $
			error $ "failed to create temp repository in " ++ tmpdir
		scratch <- Git.Construct.fromAbsPath tmpdir >>= Git.Config.read
		remaining <- tryremotes scratch (remotes r) headsandtags missing
		if S.null remaining
			then return remaining
			else tryremotes scratch (remotes r) everything remaining
  where
	tryremotes _ [] _ wanted = return wanted
	tryremotes scratch (rmt:rest) refspec wanted
		| S.null wanted = return wanted
		| otherwise = do
			notice $ "Trying to recover missing objects from remote " ++ repoDescribe rmt
			fetched <- fetchfrom rmt refspec scratch
			if fetched
				then do
					void $ copyObjects scratch r
					-- See which of the wanted objects
					-- are still missing.
					findMissing (S.toList wanted) r
						>>= tryremotes scratch rest refspec
				else do
					notice $ unwords
						[ "failed to fetch from remote"
						, repoDescribe rmt
						, "(will continue without it, but making this remote available may improve recovery)"
						]
					tryremotes scratch rest refspec wanted
	-- Runs git fetch in the repository given as the last argument.
	fetchfrom rmt refspec = runBool $
		Param "fetch"
		: Param (repoLocation rmt)
		: Params "--force --update-head-ok --quiet"
		: refspec
	-- Fetch branches and tags.
	headsandtags = [Param "+refs/heads/*:refs/heads/*", Param "--tags"]
	-- Fetch all available refs (more likely to fail,
	-- as the remote may have refs it refuses to send).
	everything = [Param "+*:*"]

{- Copies all objects from the src repository to the dest repository.
 - This is done using rsync on the whole objects directory, so it
 - copies all missing objects, and all objects they rely on.
 - Returns the result of running rsync. -}
copyObjects :: Repo -> Repo -> IO Bool
copyObjects srcr destr = rsync $ Param "-qr" : map objdir [srcr, destr]
  where
	-- The directory is passed with a trailing path separator.
	objdir = File . addTrailingPathSeparator . objectsDir

{- To deal with missing objects that cannot be recovered, resets any
 - local branches (refs under refs/heads/) to point to an old commit
 - before the missing objects. A branch for which no such commit can be
 - found is deleted.
 -
 - Returns the branches that were changed, the branches that were
 - deleted, and the updated set of commits known to be good.
 -}
resetLocalBranches :: MissingObjects -> GoodCommits -> Repo -> IO ([Branch], [Branch], GoodCommits)
resetLocalBranches missing goodcommits r = do
	refs <- getAllRefs r
	walk ([], [], goodcommits) (filter islocal refs)
  where
	islocal b = "refs/heads/" `isPrefixOf` show b
	walk acc [] = return acc
	walk (changed, deleted, gcs) (b:rest) = do
		(found, gcs') <- findUncorruptedCommit missing gcs b r
		case found of
			Nothing -> do
				nukeBranchRef b r
				walk (changed, b:deleted, gcs') rest
			Just c
				-- The branch itself came back, so it is
				-- good as-is.
				| c == b -> walk (changed, deleted, gcs') rest
				| otherwise -> do
					repoint b c
					walk (b:changed, deleted, gcs') rest
	-- Removes the ref, and then recreates the branch at the commit.
	repoint b c = do
		nukeBranchRef b r
		void $ runBool
			[ Param "branch"
			, Param (show $ Ref.base b)
			, Param (show c)
			] r

{- To deal with missing objects that cannot be recovered, removes
 - any remote tracking branches (refs under refs/remotes/) that
 - reference them. Returns a list of all removed branches, and the
 - updated set of commits known to be good.
 -}
removeTrackingBranches :: MissingObjects -> GoodCommits -> Repo -> IO ([Branch], GoodCommits)
removeTrackingBranches missing goodcommits r = do
	refs <- getAllRefs r
	prune [] goodcommits (filter istracking refs)
  where
	istracking b = "refs/remotes/" `isPrefixOf` show b
	prune gone gcs [] = return (gone, gcs)
	prune gone gcs (b:rest) = do
		(intact, gcs') <- verifyCommit missing gcs b r
		unless intact $
			nukeBranchRef b r
		prune (if intact then gone else b:gone) gcs' rest

{- Gets all refs, including ones that are corrupt.
 - git show-ref does not output refs to commits that are directly
 - corrupted, so it is not used. Instead, the packed-refs file is
 - parsed (a missing or unreadable file is treated as empty), and the
 - files under the refs directory are listed.
 -
 - The packed refs come first in the result. A ref that is both packed
 - and loose is listed twice.
 -}
getAllRefs :: Repo -> IO [Ref]
getAllRefs r = do
	packedrs <- mapMaybe parsePacked . lines
		<$> catchDefaultIO "" (readFile $ packedRefsFile r)
	loosers <- map toref <$> dirContentsRecursive refdir
	return $ packedrs ++ loosers
  where
	refdir = localGitDir r </> "refs"
	-- Names a loose ref file relative to the git directory
	-- (presumably yielding eg "refs/heads/master"; verify against
	-- relPathDirToFile).
	toref = Ref . relPathDirToFile (localGitDir r)

{- The packed-refs file inside the local git directory of a repository. -}
packedRefsFile :: Repo -> FilePath
packedRefsFile = (</> "packed-refs") . localGitDir

{- Parses a line of the packed-refs file. Only a line that consists of
 - exactly two words, the first of which extractSha accepts, yields a
 - ref (named by the second word). Any other line yields Nothing. -}
parsePacked :: String -> Maybe Ref
parsePacked l = case words l of
	[sha, ref] -> const (Ref ref) <$> extractSha sha
	_ -> Nothing

{- Removes a branch ref.
 -
 - git-branch -d cannot be used to remove a branch that is directly
 - pointing to a corrupt commit. However, it's tried first. When it
 - fails, the ref's file is removed from the git directory, and any
 - line for the ref is filtered out of the packed-refs file. -}
nukeBranchRef :: Branch -> Repo -> IO ()
nukeBranchRef b r = do
	deleted <- runBool
		[ Param "branch"
		, Params "-r -d"
		, Param $ show $ Ref.base b
		] r
	unless deleted manually
  where
	manually = do
		nukeFile $ localGitDir r </> show b
		whenM (doesFileExist packed) $
			-- The filtered file is written to a temp file,
			-- which then replaces packed-refs.
			withTmpFile "packed-refs" $ \tmp h -> do
				kept <- filter keep . lines <$> readFile packed
				hPutStr h (unlines kept)
				hClose h
				renameFile tmp packed
	-- Lines that do not parse as a packed ref are kept too.
	keep l = parsePacked l /= Just b
	packed = packedRefsFile r

{- Finds the most recent commit to a branch that does not need any
 - of the missing objects. If the input branch is good as-is, returns it.
 - Otherwise, the commits listed by git log for the branch are tried in
 - order, followed by the entries of the branch's reflog, and the first
 - one that verifies is returned. Listing the commits might fail, if one
 - of them is corrupt, or if an object at the root of the branch is
 - missing.
 -
 - Returns Nothing when no usable commit is found. The second part of
 - the result is the updated set of commits known to be good.
 -}
findUncorruptedCommit :: MissingObjects -> GoodCommits -> Branch -> Repo -> IO (Maybe Sha, GoodCommits)
findUncorruptedCommit missing goodcommits branch r = do
	(intact, verified) <- verifyCommit missing goodcommits branch r
	if intact
		then return (Just branch, verified)
		else searchhistory
  where
	searchhistory = do
		(ls, cleanup) <- pipeNullSplit
			[ Param "log"
			, Param "-z"
			, Param "--format=%H"
			, Param (show branch)
			] r
		let branchshas = mapMaybe extractSha ls
		reflogshas <- RefLog.get branch r
		-- XXX Could try a bit harder here, and look
		-- for uncorrupted old commits in branches in the
		-- reflog.
		cleanup `after` firstgood goodcommits (branchshas ++ reflogshas)
	firstgood gcs [] = return (Nothing, gcs)
	firstgood gcs (c:cs) = do
		(intact, gcs') <- verifyCommit missing gcs c r
		if intact
			then return (Just c, gcs')
			else firstgood gcs' cs

{- Verifies that none of the missing objects in the set are used by
 - the commit. Also adds to a set of commit shas that have been verified to
 - be good, which can be passed into subsequent calls to avoid
 - redundant work when eg, chasing down branches to find the first
 - uncorrupted commit.
 -
 - The commit is not good when git log outputs nothing for it, outputs
 - a line that cannot be parsed, or fails; or when a tree fails to
 - verify. Trees are checked in the order git log lists the commits,
 - and checking stops at the first commit already known to be good. -}
verifyCommit :: MissingObjects -> GoodCommits -> Sha -> Repo -> IO (Bool, GoodCommits)
verifyCommit missing goodcommits commit r
	| checkGoodCommit commit goodcommits = return (True, goodcommits)
	| otherwise = do
		(ls, cleanup) <- pipeNullSplit
			[ Param "log"
			, Param "-z"
			, Param "--format=%H %T"
			, Param (show commit)
			] r
		case mapM parse ls of
			Just history@(_:_) -> do
				ok <- cleanup <&&> treesok history
				return $ if ok
					then (True, addGoodCommits (map fst history) goodcommits)
					else (False, goodcommits)
			-- Empty output, or a line that did not parse.
			_ -> do
				void cleanup
				return (False, goodcommits)
  where
	-- Each line is a commit sha followed by its tree sha.
	parse l = case words l of
		[c, t] -> (,) <$> extractSha c <*> extractSha t
		_ -> Nothing
	treesok [] = return True
	treesok ((c, tree):rest)
		| checkGoodCommit c goodcommits = return True
		| otherwise = verifyTree missing tree r <&&> treesok rest

{- Verifies that a tree is good, including all trees and blobs
 - referenced by it. The tree is not good when it, or any object that
 - git ls-tree lists for it, is in the set of missing objects, when a
 - listed sha cannot be extracted, or when git ls-tree fails. -}
verifyTree :: MissingObjects -> Sha -> Repo -> IO Bool
verifyTree missing treesha r
	| treesha `S.member` missing = return False
	| otherwise = do
		(ls, cleanup) <- pipeNullSplit (LsTree.lsTreeParams treesha) r
		case mapM objsha ls of
			Just shas
				-- as long as ls-tree succeeded, we're good
				| not (any (`S.member` missing) shas) -> cleanup
			_ -> do
				void cleanup
				return False
  where
	objsha = extractSha . LsTree.sha . LsTree.parseLsTree

{- Rewrites the index file, removing from it any files whose blobs are
 - missing. Returns the list of affected files.
 -
 - Nothing is done for a bare repository. The index is only rewritten
 - when at least one staged file has a missing blob; it is then removed,
 - and the remaining files are staged again. -}
rewriteIndex :: MissingObjects -> Repo -> IO [FilePath]
rewriteIndex missing r
	| repoIsLocalBare r = return []
	| otherwise = do
		(staged, cleanup) <- LsFiles.stagedDetails [Git.repoPath r] r
		let (lost, kept) = partition blobmissing staged
		unless (null lost) $ do
			nukeFile (localGitDir r </> "index")
			restaged <- mapM restage kept
			UpdateIndex.streamUpdateIndex r (catMaybes restaged)
		void cleanup
		return $ map fst3 lost
  where
	-- Entries lacking a sha or a mode are never considered missing.
	blobmissing (_file, Just sha, Just _mode) = S.member sha missing
	blobmissing _ = False
	-- Entries whose mode has no blob type, or that lack a sha or
	-- a mode, are not staged again.
	restage (file, Just sha, Just mode) = case toBlobType mode of
		Nothing -> return Nothing
		Just blobtype -> Just <$>
			UpdateIndex.stageFile sha blobtype file r
	restage _ = return Nothing

{- A set of shas of commits that have been verified to be good. -}
newtype GoodCommits = GoodCommits (S.Set Sha)

{- The starting point, when no commit has been verified yet. -}
emptyGoodCommits :: GoodCommits
emptyGoodCommits = GoodCommits S.empty

{- Checks if a sha is in the set of commits known to be good. -}
checkGoodCommit :: Sha -> GoodCommits -> Bool
checkGoodCommit sha (GoodCommits s) = sha `S.member` s

{- Adds a list of shas to the set of commits known to be good. -}
addGoodCommits :: [Sha] -> GoodCommits -> GoodCommits
addGoodCommits shas (GoodCommits s) = GoodCommits (s `S.union` new)
  where
	new = S.fromList shas

{- Logs a message at notice level, using the "RecoverRepository"
 - logger. -}
notice :: String -> IO ()
notice msg = noticeM "RecoverRepository" msg