From 2d0f11008a1381f6212d0ab1f33e26fb1ae22f79 Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Fri, 13 Nov 2015 16:13:43 -0400 Subject: starting to get a handle on how to detect that mad gleam in lustre's eye --- Utility/LockFile/PidLock.hs | 38 +++++++++++++--------- ...nt_14_4dea6eac389bbf5235a3d5d3378e6d04._comment | 33 +++++++++++++++++++ 2 files changed, 55 insertions(+), 16 deletions(-) create mode 100644 doc/bugs/git-annex_doesn__39__t_work_on_lustre:_waitToSetLock:_unsupported_operation___40__Function_not_implemented__41__/comment_14_4dea6eac389bbf5235a3d5d3378e6d04._comment diff --git a/Utility/LockFile/PidLock.hs b/Utility/LockFile/PidLock.hs index 5c5a89cc7..206127efb 100644 --- a/Utility/LockFile/PidLock.hs +++ b/Utility/LockFile/PidLock.hs @@ -132,27 +132,12 @@ tryLock lockfile = trySideLock lockfile $ \sidelock -> do -- open(2) suggests that link can sometimes appear to fail -- on NFS but have actually succeeded, and the way to find out is to stat -- the file and check its link count etc. --- --- On a Lustre filesystem, link has been observed to incorrectly *succeed*, --- despite the dest already existing. A subsequent stat of the dest --- looked like it had been replaced with the src. The process proceeded to --- run and then deleted the dest, and after the process was done, the --- original file was observed to still be in place. This is horrible and we --- can't do anything about such a lying filesystem. --- At least the side lock file will prevent git-annex's running on the same --- host from running concurrently even on such a lying filesystem. linkToLock :: SideLockHandle -> FilePath -> FilePath -> IO Bool linkToLock Nothing _ _ = return False linkToLock (Just _) src dest = do - -- This might make Lustre notice that a lock file that is already - -- there is there? 
- _ <- catchMaybeIO $ readFile dest _ <- tryIO $ createLink src dest ifM (catchBoolIO checklinked) - ( catchBoolIO $ do - srccontent <- readFile src - destcontent <- readFile dest - return (srccontent == destcontent) + ( catchBoolIO $ not <$> checkInsaneLustre dest , return False ) where @@ -173,6 +158,27 @@ linkToLock (Just _) src dest = do , linkCount x == 2 ] +-- On a Lustre filesystem, link has been observed to incorrectly *succeed*, +-- despite the dest already existing. A subsequent stat of the dest +-- looked like it had been replaced with the src. The process proceeded to +-- run and then deleted the dest, and after the process was done, the +-- original file was observed to still be in place. +-- +-- We can detect this insanity by getting the directory contents after +-- making the link, and checking to see if 2 copies of the dest file, +-- with the SAME FILENAME exist. +checkInsaneLustre :: FilePath -> IO Bool +checkInsaneLustre dest = do + fs <- dirContents (takeDirectory dest) + case length (filter (== dest) fs) of + 1 -> return False -- whew! + 0 -> return True -- wtf? + _ -> do + -- Try to clean up the extra copy we made + -- that has the same name. Egads. + tryIO $ removeFile dest + return True + -- | Waits as necessary to take a lock. -- -- Uses a 1 second wait-loop. 
diff --git a/doc/bugs/git-annex_doesn__39__t_work_on_lustre:_waitToSetLock:_unsupported_operation___40__Function_not_implemented__41__/comment_14_4dea6eac389bbf5235a3d5d3378e6d04._comment b/doc/bugs/git-annex_doesn__39__t_work_on_lustre:_waitToSetLock:_unsupported_operation___40__Function_not_implemented__41__/comment_14_4dea6eac389bbf5235a3d5d3378e6d04._comment new file mode 100644 index 000000000..5bd76b6ec --- /dev/null +++ b/doc/bugs/git-annex_doesn__39__t_work_on_lustre:_waitToSetLock:_unsupported_operation___40__Function_not_implemented__41__/comment_14_4dea6eac389bbf5235a3d5d3378e6d04._comment @@ -0,0 +1,33 @@ +[[!comment format=mdwn + username="joey" + subject="""comment 14""" + date="2015-11-13T20:00:48Z" + content=""" +Adding to the crazy Lustre fun, check this out: + + $ ls -l .git/annex/ + total 56 + -rw-rw-r-- 1 hess root 18387 Nov 13 14:35 index + -rw-rw-r-- 1 hess root 41 Nov 13 14:35 index.lck + drwxrwsr-x 2 hess root 12288 Nov 13 14:35 journal + -rw-rw-r-- 1 hess root 0 Nov 13 11:48 journal.lck + drwxrwsr-x 2 hess root 4096 Nov 13 14:35 misctmp + drwxrwsr-x 88 hess root 4096 Nov 13 14:35 objects + -r--r--r-- 1 hess root 70 Nov 13 14:35 pidlock + -r--r--r-- 1 hess root 70 Nov 13 14:35 pidlock + -rw-rw-r-- 1 hess root 0 Nov 13 11:48 sentinal + -rw-rw-r-- 1 hess root 23 Nov 13 11:48 sentinal.cache + +There are 2 pidlock files in that directory listing. 2 files with the same name. +I deleted one of them, and with no other changes, ls shows only 1 now. + + -r--r--r-- 1 hess root 74 Nov 13 14:35 pidlock + +Notice that the file stat has changed too. + +So, Lustre has clearly thrown POSIX out the window, and then defenestrated +sanity for good measure. + +On the plus side, this may show how I can detect when rename() fails to +preserve POSIX semantics.. +"""]] -- cgit v1.2.3