Utility/LockFile/PidLock.hs


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239

{- pid-based lock files
 -
 - Copyright 2015 Joey Hess <id@joeyh.name>
 -
 - License: BSD-2-clause
 -}

module Utility.LockFile.PidLock (
	LockHandle,
	tryLock,
	waitLock,
	dropLock,
	LockStatus(..),
	getLockStatus,
	checkLocked,
	checkSaneLock,
) where

import Utility.PartialPrelude
import Utility.Exception
import Utility.Applicative
import Utility.Directory
import Utility.Monad
import Utility.Path
import Utility.FileMode
import Utility.LockFile.LockStatus
import Utility.ThreadScheduler
import qualified Utility.LockFile.Posix as Posix

import System.IO
import System.Posix
import Data.Maybe
import Data.List
import Control.Applicative
import Network.BSD
import System.FilePath
import Data.Hash.MD5
import System.Directory

type LockFile = FilePath

data LockHandle = LockHandle LockFile FileStatus SideLockHandle

type SideLockHandle = Maybe (LockFile, Posix.LockHandle)

data PidLock = PidLock
	{ lockingPid :: ProcessID
	, lockingHost :: HostName
	}
	deriving (Eq, Read, Show)

mkPidLock :: IO PidLock
mkPidLock = PidLock
	<$> getProcessID
	<*> getHostName

readPidLock :: LockFile -> IO (Maybe PidLock)
readPidLock lockfile = (readish =<<) <$> catchMaybeIO (readFile lockfile)

-- To avoid races when taking over a stale pid lock, a side lock is used.
-- This is a regular posix exclusive lock.
trySideLock :: LockFile -> (SideLockHandle -> IO a) -> IO a
trySideLock lockfile a = do
	sidelock <- sideLockFile lockfile
	mlck <- catchDefaultIO Nothing $ 
		withUmask nullFileMode $
			Posix.tryLockExclusive (Just mode) sidelock
	-- Check the lock we just took, in case we opened a side lock file
	-- belonging to another process that will have since deleted it.
	case mlck of
		Nothing -> a Nothing
		Just lck -> ifM (Posix.checkSaneLock sidelock lck)
			( a (Just (sidelock, lck))
			, a Nothing
			)
  where
	-- Let all users write to the lock file in /dev/shm or /tmp,
	-- so that other users can reuse it to take the lock.
	-- Since /dev/shm and /tmp are sticky dirs, a user cannot
	-- delete another user's lock file there, so could not
	-- delete a stale lock.
	mode = combineModes (readModes ++ writeModes)

dropSideLock :: SideLockHandle -> IO ()
dropSideLock Nothing = return ()
dropSideLock (Just (f, h)) = do
	-- Delete the file first, to ensure that any process that is trying
	-- to take the side lock will only succeed once the file is
	-- deleted, and so will be able to immediately see that it's taken
	-- a stale lock.
	_ <- tryIO $ removeFile f 
	Posix.dropLock h

-- The side lock is put in /dev/shm. This will work on most any
-- Linux system, even if its whole root filesystem doesn't support posix
-- locks. /tmp is used as a fallback.
sideLockFile :: LockFile -> IO LockFile
sideLockFile lockfile = do
	f <- absPath lockfile
	let base = intercalate "_" (splitDirectories (makeRelative "/" f))
	let shortbase = reverse $ take 32 $ reverse base
	let md5sum = if base == shortbase then "" else md5s (Str base)
	dir <- ifM (doesDirectoryExist "/dev/shm")
		( return "/dev/shm"
		, return "/tmp"
		)
	return $ dir </> md5sum ++ shortbase ++ ".lck"

-- | Tries to take a lock; does not block when the lock is already held.
--
-- Note that stale locks are automatically detected and broken.
-- However, if the lock file is on a networked file system, and was
-- created on a different host than the current host (determined by hostname),
-- this can't be done and stale locks may persist.
tryLock :: LockFile -> IO (Maybe LockHandle)
tryLock lockfile = trySideLock lockfile $ \sidelock -> do
	lockfile' <- absPath lockfile
	(tmp, h) <- openTempFile (takeDirectory lockfile') "locktmp"
	setFileMode tmp (combineModes readModes)
	hPutStr h . show =<< mkPidLock
	hClose h
	st <- getFileStatus tmp
	let failedlock = do
		dropLock $ LockHandle tmp st sidelock
		return Nothing
	let tooklock = return $ Just $ LockHandle lockfile' st sidelock
	ifM (linkToLock sidelock tmp lockfile')
		( do
			nukeFile tmp
			tooklock
		, do
			v <- readPidLock lockfile'
			hn <- getHostName
			case v of
				Just pl | isJust sidelock && hn == lockingHost pl -> do
					-- Since we have the sidelock,
					-- and are on the same host that
					-- the pidlock was taken on,
					-- we know that the pidlock is
					-- stale, and can take it over.
					rename tmp lockfile'
					tooklock
				_ -> failedlock
		)

-- Linux's open(2) man page recommends linking a pid lock into place,
-- as the most portable atomic operation that will fail if
-- it already exists. 
--
-- open(2) suggests that link can sometimes appear to fail
-- on NFS but have actually succeeded, and the way to find out is to stat
-- the file and check its link count etc.
linkToLock :: SideLockHandle -> FilePath -> FilePath -> IO Bool
linkToLock Nothing _ _ = return False
linkToLock (Just _) src dest = do
	_ <- tryIO $ createLink src dest
	ifM (catchBoolIO checklinked)
		( catchBoolIO $ not <$> checkInsaneLustre dest
		, return False
		)
  where
	checklinked = do
		x <- getSymbolicLinkStatus src
		y <- getSymbolicLinkStatus dest
		return $ and
			[ deviceID x == deviceID y
			, fileID x == fileID y
			, fileMode x == fileMode y
			, fileOwner x == fileOwner y
			, fileGroup x == fileGroup y
			, fileSize x == fileSize y
			, modificationTime x == modificationTime y
			, isRegularFile x == isRegularFile y
			, linkCount x == linkCount y
			, linkCount x == 2
			]

-- On a Lustre filesystem, link has been observed to incorrectly *succeed*,
-- despite the dest already existing. A subsequent stat of the dest
-- looked like it had been replaced with the src. The process proceeded to
-- run and then deleted the dest, and after the process was done, the
-- original file was observed to still be in place.
--
-- We can detect this insanity by getting the directory contents after
-- making the link, and checking to see if 2 copies of the dest file,
-- with the SAME FILENAME exist.
checkInsaneLustre :: FilePath -> IO Bool
checkInsaneLustre dest = do
	fs <- dirContents (takeDirectory dest)
	case length (filter (== dest) fs) of
		1 -> return False -- whew!
		0 -> return True -- wtf?
		_ -> do
			-- Try to clean up the extra copy we made
			-- that has the same name. Egads.
			_ <- tryIO $ removeFile dest
			return True

-- | Waits as necessary to take a lock.
--
-- Uses a 1 second wait-loop.
--
-- May wait untie timeout if the lock file is stale and is on a network file
-- system, or on a system where the side lock cannot be taken.
waitLock :: Seconds -> LockFile -> IO LockHandle
waitLock (Seconds timeout) lockfile = go timeout
  where
	go n
		| n > 0 = maybe (threadDelaySeconds (Seconds 1) >> go (pred n)) return
			=<< tryLock lockfile
		| otherwise = do
			hPutStrLn stderr $ show timeout ++ " second timeout exceeded while waiting for pid lock file " ++ lockfile
			error $ "Gave up waiting for possibly stale pid lock file " ++ lockfile

dropLock :: LockHandle -> IO ()
dropLock (LockHandle lockfile _ sidelock) = do
	-- Drop side lock first, at which point the pid lock will be
	-- considered stale.
	dropSideLock sidelock
	nukeFile lockfile

getLockStatus :: LockFile -> IO LockStatus
getLockStatus = maybe StatusUnLocked (StatusLockedBy . lockingPid) <$$> readPidLock

checkLocked :: LockFile -> IO (Maybe Bool)
checkLocked lockfile = conv <$> getLockStatus lockfile
  where
	conv (StatusLockedBy _) = Just True
	conv _ = Just False

-- Checks that the lock file still exists, and is the same file that was
-- locked to get the LockHandle.
checkSaneLock :: LockFile -> LockHandle -> IO Bool
checkSaneLock lockfile (LockHandle f st _) = 
	go =<< catchMaybeIO (getFileStatus lockfile)
  where
	go Nothing = return False
	go (Just st') = return $
		deviceID st == deviceID st' && fileID st == fileID st'