summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Remote/Git.hs11
-rw-r--r--Utility/Process.hs45
-rw-r--r--debian/changelog2
-rw-r--r--doc/todo/speed_up_transfers_over_ssh+rsync_--_directly_reuse_the_same_connection__63__/comment_6_8f9a3dd1c021b862f97934b6b25f6aa9._comment79
4 files changed, 115 insertions, 22 deletions
diff --git a/Remote/Git.hs b/Remote/Git.hs
index 4505c14ff..ad4449b46 100644
--- a/Remote/Git.hs
+++ b/Remote/Git.hs
@@ -58,6 +58,7 @@ import Control.Concurrent
import Control.Concurrent.MSampleVar
import qualified Data.Map as M
import Network.URI
+import System.Log.Logger
remote :: RemoteType
remote = RemoteType {
@@ -200,7 +201,6 @@ tryGitConfigRead :: Bool -> Git.Repo -> Annex Git.Repo
tryGitConfigRead autoinit r
| haveconfig r = return r -- already read
| Git.repoIsSsh r = store $ do
- liftIO $ print autoinit
v <- Ssh.onRemote r (pipedconfig, return (Left $ error "configlist failed")) "configlist" [] configlistfields
case v of
Right r'
@@ -446,9 +446,14 @@ copyFromRemote' r key file dest meterupdate
let feeder = \n -> do
meterupdate n
writeSV v (fromBytesProcessed n)
- let cleanup = do
+
+ -- It can easily take 0.3 seconds to clean up after
+ -- the transferinfo, and all that's involved is shutting
+ -- down the process and associated thread cleanly. So,
+ -- do it in the background.
+ let cleanup = forkIO $ do
void $ tryIO $ killThread tid
- tryNonAsync $
+ void $ tryNonAsync $
maybe noop (void . waitForProcess)
=<< tryTakeMVar pidv
bracketIO noop (const cleanup) (const $ a feeder)
diff --git a/Utility/Process.hs b/Utility/Process.hs
index 469f7659f..205994811 100644
--- a/Utility/Process.hs
+++ b/Utility/Process.hs
@@ -31,6 +31,7 @@ module Utility.Process (
withQuietOutput,
feedWithQuietOutput,
createProcess,
+ waitForProcess,
startInteractiveProcess,
stdinHandle,
stdoutHandle,
@@ -42,7 +43,7 @@ module Utility.Process (
import qualified System.Process
import qualified System.Process as X hiding (CreateProcess(..), createProcess, runInteractiveProcess, readProcess, readProcessWithExitCode, system, rawSystem, runInteractiveCommand, runProcess)
-import System.Process hiding (createProcess, readProcess)
+import System.Process hiding (createProcess, readProcess, waitForProcess)
import System.Exit
import System.IO
import System.Log.Logger
@@ -345,22 +346,6 @@ oeHandles _ = error "expected oeHandles"
processHandle :: (Maybe Handle, Maybe Handle, Maybe Handle, ProcessHandle) -> ProcessHandle
processHandle (_, _, _, pid) = pid
--- | Debugging trace for a CreateProcess.
-debugProcess :: CreateProcess -> IO ()
-debugProcess p = do
- debugM "Utility.Process" $ unwords
- [ action ++ ":"
- , showCmd p
- ]
- where
- action
- | piped (std_in p) && piped (std_out p) = "chat"
- | piped (std_in p) = "feed"
- | piped (std_out p) = "read"
- | otherwise = "call"
- piped Inherit = False
- piped _ = True
-
-- | Shows the command that a CreateProcess will run.
showCmd :: CreateProcess -> String
showCmd = go . cmdspec
@@ -385,9 +370,31 @@ startInteractiveProcess cmd args environ = do
(Just from, Just to, _, pid) <- createProcess p
return (pid, to, from)
--- | Wrapper around 'System.Process.createProcess' from System.Process,
--- that does debug logging.
+-- | Wrapper around 'System.Process.createProcess' that does debug logging.
createProcess :: CreateProcess -> IO (Maybe Handle, Maybe Handle, Maybe Handle, ProcessHandle)
createProcess p = do
debugProcess p
System.Process.createProcess p
+
+-- | Debugging trace for a CreateProcess.
+debugProcess :: CreateProcess -> IO ()
+debugProcess p = do
+ debugM "Utility.Process" $ unwords
+ [ action ++ ":"
+ , showCmd p
+ ]
+ where
+ action
+ | piped (std_in p) && piped (std_out p) = "chat"
+ | piped (std_in p) = "feed"
+ | piped (std_out p) = "read"
+ | otherwise = "call"
+ piped Inherit = False
+ piped _ = True
+
+-- | Wrapper around 'System.Process.waitForProcess' that does debug logging.
+waitForProcess :: ProcessHandle -> IO ExitCode
+waitForProcess h = do
+ r <- System.Process.waitForProcess h
+ debugM "Utility.Process" ("process done " ++ show r)
+ return r
diff --git a/debian/changelog b/debian/changelog
index b8b82512a..cbb2601eb 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -1,6 +1,8 @@
git-annex (5.20150813) UNRELEASED; urgency=medium
* --debug log messages are now timestamped with fractional seconds.
+ * Sped up downloads of files from ssh remotes, reducing the
+ non-data-transfer overhead 6x.
-- Joey Hess <id@joeyh.name> Wed, 12 Aug 2015 14:31:01 -0400
diff --git a/doc/todo/speed_up_transfers_over_ssh+rsync_--_directly_reuse_the_same_connection__63__/comment_6_8f9a3dd1c021b862f97934b6b25f6aa9._comment b/doc/todo/speed_up_transfers_over_ssh+rsync_--_directly_reuse_the_same_connection__63__/comment_6_8f9a3dd1c021b862f97934b6b25f6aa9._comment
new file mode 100644
index 000000000..009a21af0
--- /dev/null
+++ b/doc/todo/speed_up_transfers_over_ssh+rsync_--_directly_reuse_the_same_connection__63__/comment_6_8f9a3dd1c021b862f97934b6b25f6aa9._comment
@@ -0,0 +1,79 @@
+[[!comment format=mdwn
+ username="joey"
+ subject="""measure twice, cut once"""
+ date="2015-08-13T17:22:17Z"
+ content="""
+Debug output has been enhanced with fractional seconds and also shows when
+a command exits so the time spent in a command can be determined.
+
+ [2015-08-13 13:54:29.578099] process done ExitSuccess
+ ok
+ get ook8 (from origin...)
+ [2015-08-13 13:54:29.588827] read: rsync ["--progress","--inplace","--perms","-e","'ssh' '-S' '.git/annex/ssh/localhost' '-o' 'ControlMaster=auto' '-o' 'ControlPersist=yes' '-T' 'localhost' 'git-annex-shell ''sendkey'' ''/home/joey/tmp/r'' ''SHA256E-s30--fc394d2854169d3a85b7ffda59f30b797e915fd98368c30d11588df7a20ee128'' --uuid ba2a51f9-f356-40a7-9600-2ac4193c7d58 ''--'' ''remoteuuid=72078ee3-1150-45f0-b00e-53e971921982'' ''direct='' ''associatedfile=ook8'' ''--'''","--","dummy:",".git/annex/tmp/SHA256E-s30--fc394d2854169d3a85b7ffda59f30b797e915fd98368c30d11588df7a20ee128"]
+ SHA256E-s30--fc394d2854169d3a85b7ffda59f30b797e915fd98368c30d11588df7a20ee128
+ 30 100% 29.30kB/s 0:00:00 (xfr#1, to-chk=0/1)
+ [2015-08-13 13:54:29.635771] transferinfo starting up
+ [2015-08-13 13:54:29.636097] feed: ssh ["-S",".git/annex/ssh/localhost","-o","ControlMaster=auto","-o","ControlPersist=yes","-T","localhost","git-annex-shell 'transferinfo' '/home/joey/tmp/r' 'SHA256E-s30--fc394d2854169d3a85b7ffda59f30b797e915fd98368c30d11588df7a20ee128' --uuid ba2a51f9-f356-40a7-9600-2ac4193c7d58 '--' 'remoteuuid=72078ee3-1150-45f0-b00e-53e971921982' 'associatedfile=ook8' '--'"]
+ [2015-08-13 13:54:29.656803] process done ExitSuccess
+ [2015-08-13 13:54:29.657487] transferinfo shutting down
+ [2015-08-13 13:54:29.987744] process done ExitSuccess
+ ok
+
+Here we can see:
+
+1. 0.409645s -- total time spent on this one file (12 files took 4.124s overall)
+2. 0.010728s -- is spent in between finishing one file and
+ starting the transfer of the next. This includes updating location
+ tracking, figuring out what the next file that needs transfer is
+ (in this case the very next one in the work tree), and other bookkeeping.
+3. 0.067976s -- The rsync transfer of one (small) file, including ssh and
+ git-annex-shell overhead. It would be good to get this part broken down
+ in more detail.
+4. 0.330941 -- This is pure overhead involved in shutting down the
+ transferinfo process. Yikes!
+
+So, that transferinfo is the unexpected meat of the time. Since all
+that does it open another ssh connection back to the remote and tell git-annex-shell
+the progress of the download, for git-annex status or the assistant to display,
+that's really not a justifyable overhead.
+
+Here it is without the transferinfo being done:
+
+ [2015-08-13 13:49:26.860914] process done ExitSuccess
+ ok
+ get ook8 (from origin...)
+ [2015-08-13 13:49:26.865482] read: rsync ["--progress","--inplace","--perms","-e","'ssh' '-S' '.git/annex/ssh/localhost' '-o' 'ControlMaster=auto' '-o' 'ControlPersist=yes' '-T' 'localhost' 'git-annex-shell ''sendkey'' ''/home/joey/tmp/r'' ''SHA256E-s30--fc394d2854169d3a85b7ffda59f30b797e915fd98368c30d11588df7a20ee128'' --uuid ba2a51f9-f356-40a7-9600-2ac4193c7d58 ''--'' ''remoteuuid=72078ee3-1150-45f0-b00e-53e971921982'' ''direct='' ''associatedfile=ook8'' ''--'''","--","dummy:",".git/annex/tmp/SHA256E-s30--fc394d2854169d3a85b7ffda59f30b797e915fd98368c30d11588df7a20ee128"]
+ SHA256E-s30--fc394d2854169d3a85b7ffda59f30b797e915fd98368c30d11588df7a20ee128
+ 30 100% 29.30kB/s 0:00:00 (xfr#1, to-chk=0/1)
+ [2015-08-13 13:49:26.926932] process done ExitSuccess
+ ok
+
+1. 0.066018s -- massive improvement! (12 files took 0.885s overall)
+2. 0.004568s -- Weird that this is half what it was before. I have made enough
+ measurements that I'm pretty sure it is, consistently lower though. Maybe
+ this time reduced because avoiding the transferinfo means less threads and/or
+ less garbage collection time.
+3. 0.061450s -- Much as before, so the concurrent ssh connection made
+ for transferinfo doesn't slow down the rsync appreciably.
+
+So, I made it spin off a new thread to do the transferinfo cleanup.
+This one word change to the code performs almost as well as eliminating
+transferinfo entirely did!
+
+ [2015-08-13 14:11:10.275867] process done ExitSuccess
+ [2015-08-13 14:11:10.276359] transferinfo shutting down
+ ok
+ get ook8 (from origin...)
+ [2015-08-13 14:11:10.282027] read: rsync ["--progress","--inplace","--perms","-e","'ssh' '-S' '.git/annex/ssh/localhost' '-o' 'ControlMaster=auto' '-o' 'ControlPersist=yes' '-T' 'localhost' 'git-annex-shell ''sendkey'' ''/home/joey/tmp/r'' ''SHA256E-s30--fc394d2854169d3a85b7ffda59f30b797e915fd98368c30d11588df7a20ee128'' --uuid ba2a51f9-f356-40a7-9600-2ac4193c7d58 ''--'' ''remoteuuid=72078ee3-1150-45f0-b00e-53e971921982'' ''direct='' ''associatedfile=ook8'' ''--'''","--","dummy:",".git/annex/tmp/SHA256E-s30--fc394d2854169d3a85b7ffda59f30b797e915fd98368c30d11588df7a20ee128"]
+ SHA256E-s30--fc394d2854169d3a85b7ffda59f30b797e915fd98368c30d11588df7a20ee128
+ 30 100% 29.30kB/s 0:00:00 (xfr#1, to-chk=0/1)
+ [2015-08-13 14:11:10.324395] transferinfo starting up
+ [2015-08-13 14:11:10.324624] feed: ssh ["-S",".git/annex/ssh/localhost","-o","ControlMaster=auto","-o","ControlPersist=yes","-T","localhost","git-annex-shell 'transferinfo' '/home/joey/tmp/r' 'SHA256E-s30--fc394d2854169d3a85b7ffda59f30b797e915fd98368c30d11588df7a20ee128' --uuid ba2a51f9-f356-40a7-9600-2ac4193c7d58 '--' 'remoteuuid=72078ee3-1150-45f0-b00e-53e971921982' 'associatedfile=ook8' '--'"]
+ [2015-08-13 14:11:10.34482] process done ExitSuccess
+
+1. 0.068953s -- nearly as good as eliminating transferinfo (12 files took 0.925s overall)
+2. 0.006160 -- little bit higher, perhaps this is thread or GC overhead
+3. 0.062793 -- much as before (possibly slowed a tiny bit by the extra ssh traffic)
+
+Now, with -J4, all 12 files transfer in 0.661s.
+"""]]