From 0f1ea37a68535b95c56e1e142ecc8db1ac6b43dc Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Sat, 12 Apr 2014 16:32:59 -0400 Subject: remotedaemon: When network connection is lost, close all cached ssh connections. This commit was sponsored by Cedric Staub. --- Annex/Ssh.hs | 55 +++++++++++++++++++++++---------------- Assistant/Threads/NetWatcher.hs | 2 +- RemoteDaemon/Core.hs | 18 +++++++++---- RemoteDaemon/Transport/Ssh.hs | 1 + RemoteDaemon/Types.hs | 3 +++ debian/changelog | 2 ++ doc/design/git-remote-daemon.mdwn | 29 ++++++++++++++++----- 7 files changed, 75 insertions(+), 35 deletions(-) diff --git a/Annex/Ssh.hs b/Annex/Ssh.hs index fab25c462..06e3ac449 100644 --- a/Annex/Ssh.hs +++ b/Annex/Ssh.hs @@ -11,6 +11,7 @@ module Annex.Ssh ( sshCachingOptions, sshCacheDir, sshReadPort, + forceSshCleanup, sshCachingEnv, sshCachingTo, inRepoWithSshCachingTo, @@ -124,21 +125,27 @@ prepSocket socketfile = do liftIO $ createDirectoryIfMissing True $ parentDir socketfile lockFile $ socket2lock socketfile -{- Stop any unused ssh processes. -} +enumSocketFiles :: Annex [FilePath] +enumSocketFiles = go =<< sshCacheDir + where + go Nothing = return [] + go (Just dir) = liftIO $ filter (not . isLock) + <$> catchDefaultIO [] (dirContents dir) + +{- Stop any unused ssh connection caching processes. -} sshCleanup :: Annex () -sshCleanup = go =<< sshCacheDir +sshCleanup = mapM_ cleanup =<< enumSocketFiles where - go Nothing = noop - go (Just dir) = do - sockets <- liftIO $ filter (not . isLock) - <$> catchDefaultIO [] (dirContents dir) - forM_ sockets cleanup cleanup socketfile = do #ifndef mingw32_HOST_OS -- Drop any shared lock we have, and take an -- exclusive lock, without blocking. If the lock -- succeeds, nothing is using this ssh, and it can -- be stopped. + -- + -- After ssh is stopped cannot remove the lock file; + -- other processes may be waiting on our exclusive + -- lock to use it. let lockfile = socket2lock socketfile unlockFile lockfile mode <- annexFileMode @@ -148,24 +155,28 @@ sshCleanup = go =<< sshCacheDir setLock fd (WriteLock, AbsoluteSeek, 0, 0) case v of Left _ -> noop - Right _ -> stopssh socketfile + Right _ -> forceStopSsh socketfile liftIO $ closeFd fd #else - stopssh socketfile + forceStopSsh socketfile #endif - stopssh socketfile = do - let (dir, base) = splitFileName socketfile - let params = sshConnectionCachingParams base - -- "ssh -O stop" is noisy on stderr even with -q - void $ liftIO $ catchMaybeIO $ - withQuietOutput createProcessSuccess $ - (proc "ssh" $ toCommand $ - [ Params "-O stop" - ] ++ params ++ [Param "localhost"]) - { cwd = Just dir } - liftIO $ nukeFile socketfile - -- Cannot remove the lock file; other processes may - -- be waiting on our exclusive lock to use it. + +{- Stop all ssh connection caching processes, even when they're in use. -} +forceSshCleanup :: Annex () +forceSshCleanup = mapM_ forceStopSsh =<< enumSocketFiles + +forceStopSsh :: FilePath -> Annex () +forceStopSsh socketfile = do + let (dir, base) = splitFileName socketfile + let params = sshConnectionCachingParams base + -- "ssh -O stop" is noisy on stderr even with -q + void $ liftIO $ catchMaybeIO $ + withQuietOutput createProcessSuccess $ + (proc "ssh" $ toCommand $ + [ Params "-O stop" + ] ++ params ++ [Param "localhost"]) + { cwd = Just dir } + liftIO $ nukeFile socketfile {- This needs to be as short as possible, due to limitations on the length - of the path to a socket file. At the same time, it needs to be unique diff --git a/Assistant/Threads/NetWatcher.hs b/Assistant/Threads/NetWatcher.hs index 912893b87..0ff605edc 100644 --- a/Assistant/Threads/NetWatcher.hs +++ b/Assistant/Threads/NetWatcher.hs @@ -71,7 +71,7 @@ dbusThread = do ) handleconn = do debug ["detected network connection"] - sendRemoteControl PAUSE + sendRemoteControl LOSTNET notifyNetMessagerRestart handleConnection sendRemoteControl RESUME diff --git a/RemoteDaemon/Core.hs b/RemoteDaemon/Core.hs index 0c2937103..b3cfe67b2 100644 --- a/RemoteDaemon/Core.hs +++ b/RemoteDaemon/Core.hs @@ -18,6 +18,7 @@ import qualified Git.Types as Git import qualified Git.CurrentRepo import Utility.SimpleProtocol import Config +import Annex.Ssh import Control.Concurrent.Async import Control.Concurrent @@ -65,12 +66,19 @@ runController ichan ochan = do let common = M.intersection m m' let new = M.difference m' m let old = M.difference m m' - stoprunning old + broadcast STOP old unless paused $ startrunning new go h paused (M.union common new) + LOSTNET -> do + -- force close all cached ssh connections + -- (done here so that if there are multiple + -- ssh remotes, it's only done once) + liftAnnex h forceSshCleanup + broadcast LOSTNET m + go h True M.empty PAUSE -> do - stoprunning m + broadcast STOP m go h True M.empty RESUME -> do when paused $ @@ -89,9 +97,9 @@ runController ichan ochan = do startrunning m = forM_ (M.elems m) startrunning' startrunning' (transport, _) = void $ async transport - -- Ask the transport nicely to stop. - stoprunning m = forM_ (M.elems m) stoprunning' - stoprunning' (_, c) = writeChan c STOP + broadcast msg m = forM_ (M.elems m) send + where + send (_, c) = writeChan c msg -- Generates a map with a transport for each supported remote in the git repo, -- except those that have annex.sync = false diff --git a/RemoteDaemon/Transport/Ssh.hs b/RemoteDaemon/Transport/Ssh.hs index d6150bbce..ba03a2589 100644 --- a/RemoteDaemon/Transport/Ssh.hs +++ b/RemoteDaemon/Transport/Ssh.hs @@ -84,6 +84,7 @@ transport' r url transporthandle ichan ochan = do msg <- readChan ichan case msg of STOP -> return Stopping + LOSTNET -> return Stopping _ -> handlecontrol -- Old versions of git-annex-shell that do not support diff --git a/RemoteDaemon/Types.hs b/RemoteDaemon/Types.hs index eef7389cc..aff910120 100644 --- a/RemoteDaemon/Types.hs +++ b/RemoteDaemon/Types.hs @@ -42,6 +42,7 @@ data Emitted -- Messages that the deamon consumes. data Consumed = PAUSE + | LOSTNET | RESUME | CHANGED RefList | RELOAD @@ -63,6 +64,7 @@ instance Proto.Sendable Emitted where instance Proto.Sendable Consumed where formatMessage PAUSE = ["PAUSE"] + formatMessage LOSTNET = ["LOSTNET"] formatMessage RESUME = ["RESUME"] formatMessage (CHANGED refs) =["CHANGED", Proto.serialize refs] formatMessage RELOAD = ["RELOAD"] @@ -78,6 +80,7 @@ instance Proto.Receivable Emitted where instance Proto.Receivable Consumed where parseCommand "PAUSE" = Proto.parse0 PAUSE + parseCommand "LOSTNET" = Proto.parse0 LOSTNET parseCommand "RESUME" = Proto.parse0 RESUME parseCommand "CHANGED" = Proto.parse1 CHANGED parseCommand "RELOAD" = Proto.parse0 RELOAD diff --git a/debian/changelog b/debian/changelog index 1b5b39de8..4c1b20df7 100644 --- a/debian/changelog +++ b/debian/changelog @@ -10,6 +10,8 @@ git-annex (5.20140413) UNRELEASED; urgency=medium set up. * sync, assistant, remotedaemon: Use ssh connection caching for git pushes and pulls. + * remotedaemon: When network connection is lost, close all cached ssh + connections. * Improve handling on monthly/yearly scheduling. -- Joey Hess Fri, 11 Apr 2014 21:33:35 -0400 diff --git a/doc/design/git-remote-daemon.mdwn b/doc/design/git-remote-daemon.mdwn index ca3a59fce..a3d07eaa6 100644 --- a/doc/design/git-remote-daemon.mdwn +++ b/doc/design/git-remote-daemon.mdwn @@ -95,18 +95,18 @@ the webapp. * `PAUSE` - This indicates that the network connection has gone down, - or the user has requested a pause. + The user has requested a pause. git-remote-daemon should close connections and idle. - Affects all remotes. +* `LOSTNET` -* `RESUME` + The network connection has been lost. + git-remote-daemon should close connections and idle. - This indicates that the network connection has come back up, or the user - has asked it to run again. Start back up network connections. +* `RESUME` - Affects all remotes. + Undoes PAUSE or DISCONNECTED. + Start back up network connections. * `CHANGED ref ...` @@ -170,6 +170,21 @@ TODO: * Remote system might not be available. Find a smart way to detect it, ideally w/o generating network traffic. One way might be to check if the ssh connection caching control socket exists, for example. +* Now that ssh connection caching is enabled for git push/pull in sync, + there's the possibility that a stale ssh connection may linger when + changing network connections, and so attempts to use it will stall. + (This was already a potential issue with transfers, which already + used the caching.) + + One option is ssh's ServerAliveCountMax, which will make a dead + ssh connection disconnect after approx 45 seconds, per ssh manual. + It would need to be enabled by setting ServerAliveInterval=15. + And this would add network traffic.. + + Another option is to disable all cached connections when the network + connection changes. This would handle *most* cases. The case + not handled is eg, my dialup ppp box getting a new public IP address, + which my laptop won't notice. **done** ## telehash -- cgit v1.2.3