summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorGravatar Joey Hess <joeyh@joeyh.name>2016-09-06 12:42:50 -0400
committerGravatar Joey Hess <joeyh@joeyh.name>2016-09-06 12:45:21 -0400
commitd90783a507ef2776cb43e0d55204adb7e91cfea5 (patch)
tree6e059450d0612966e5434db0d8709968618a4d6d
parent14c485bb43f071e8f7ff2614bf3eb3e7a7ab0065 (diff)
get -J: Download different files from different remotes when the remotes have the same costs.
Only done in -J mode because only if there's concurrency can downloading from two remotes be faster. Without concurrency, it's likely the case that sequential downloads from the same remote are faster than switching back and forth between two remotes. There is some hairy MVar code here, but basically it just keeps the activeremotes MVar full except when deciding which remote to assign to a thread. Also affects gets by sync --content -J This commit was sponsored by Jochen Bartl.
-rw-r--r--Annex.hs100
-rw-r--r--Annex/Transfer.hs61
-rw-r--r--CHANGELOG2
-rw-r--r--Command/Get.hs19
-rw-r--r--doc/todo/get_round_robin.mdwn9
-rw-r--r--doc/todo/wishlist__58___do_round_robin_downloading_of_data.mdwn3
6 files changed, 135 insertions, 59 deletions
diff --git a/Annex.hs b/Annex.hs
index 5ab2b748d..9bd6d34b8 100644
--- a/Annex.hs
+++ b/Annex.hs
@@ -77,7 +77,7 @@ import qualified Data.Set as S
- The MVar is not exposed outside this module.
-
- Note that when an Annex action fails and the exception is caught,
- - ny changes the action has made to the AnnexState are retained,
+ - any changes the action has made to the AnnexState are retained,
- due to the use of the MVar to store the state.
-}
newtype Annex a = Annex { runAnnex :: ReaderT (MVar AnnexState) IO a }
@@ -135,56 +135,60 @@ data AnnexState = AnnexState
, desktopnotify :: DesktopNotify
, workers :: [Either AnnexState (Async AnnexState)]
, concurrentjobs :: Maybe Int
+ , activeremotes :: MVar (S.Set (Types.Remote.RemoteA Annex))
, keysdbhandle :: Maybe Keys.DbHandle
, cachedcurrentbranch :: Maybe Git.Branch
}
-newState :: GitConfig -> Git.Repo -> AnnexState
-newState c r = AnnexState
- { repo = r
- , repoadjustment = return
- , gitconfig = c
- , backends = []
- , remotes = []
- , remoteannexstate = M.empty
- , output = def
- , force = False
- , fast = False
- , daemon = False
- , branchstate = startBranchState
- , repoqueue = Nothing
- , catfilehandles = M.empty
- , hashobjecthandle = Nothing
- , checkattrhandle = Nothing
- , checkignorehandle = Nothing
- , forcebackend = Nothing
- , globalnumcopies = Nothing
- , forcenumcopies = Nothing
- , limit = BuildingMatcher []
- , uuidmap = Nothing
- , preferredcontentmap = Nothing
- , requiredcontentmap = Nothing
- , forcetrust = M.empty
- , trustmap = Nothing
- , groupmap = Nothing
- , ciphers = M.empty
- , lockcache = M.empty
- , flags = M.empty
- , fields = M.empty
- , cleanup = M.empty
- , sentinalstatus = Nothing
- , useragent = Nothing
- , errcounter = 0
- , unusedkeys = Nothing
- , tempurls = M.empty
- , quviversion = Nothing
- , existinghooks = M.empty
- , desktopnotify = mempty
- , workers = []
- , concurrentjobs = Nothing
- , keysdbhandle = Nothing
- , cachedcurrentbranch = Nothing
- }
+newState :: GitConfig -> Git.Repo -> IO AnnexState
+newState c r = do
+ emptyactiveremotes <- newMVar S.empty
+ return $ AnnexState
+ { repo = r
+ , repoadjustment = return
+ , gitconfig = c
+ , backends = []
+ , remotes = []
+ , remoteannexstate = M.empty
+ , output = def
+ , force = False
+ , fast = False
+ , daemon = False
+ , branchstate = startBranchState
+ , repoqueue = Nothing
+ , catfilehandles = M.empty
+ , hashobjecthandle = Nothing
+ , checkattrhandle = Nothing
+ , checkignorehandle = Nothing
+ , forcebackend = Nothing
+ , globalnumcopies = Nothing
+ , forcenumcopies = Nothing
+ , limit = BuildingMatcher []
+ , uuidmap = Nothing
+ , preferredcontentmap = Nothing
+ , requiredcontentmap = Nothing
+ , forcetrust = M.empty
+ , trustmap = Nothing
+ , groupmap = Nothing
+ , ciphers = M.empty
+ , lockcache = M.empty
+ , flags = M.empty
+ , fields = M.empty
+ , cleanup = M.empty
+ , sentinalstatus = Nothing
+ , useragent = Nothing
+ , errcounter = 0
+ , unusedkeys = Nothing
+ , tempurls = M.empty
+ , quviversion = Nothing
+ , existinghooks = M.empty
+ , desktopnotify = mempty
+ , workers = []
+ , concurrentjobs = Nothing
+ , activeremotes = emptyactiveremotes
+ , keysdbhandle = Nothing
+ , cachedcurrentbranch = Nothing
+ }
{- Makes an Annex state object for the specified git repo.
- Ensures the config is read, if it was not already, and performs
@@ -193,7 +197,7 @@ new :: Git.Repo -> IO AnnexState
new r = do
r' <- Git.Config.read =<< Git.relPath r
let c = extractGitConfig r'
- newState c <$> fixupRepo r' c
+ newState c =<< fixupRepo r' c
{- Performs an action in the Annex monad from a starting state,
- returning a new state. -}
diff --git a/Annex/Transfer.hs b/Annex/Transfer.hs
index a78d82ef3..6ed8ca761 100644
--- a/Annex/Transfer.hs
+++ b/Annex/Transfer.hs
@@ -1,11 +1,11 @@
{- git-annex transfers
-
- - Copyright 2012-2014 Joey Hess <id@joeyh.name>
+ - Copyright 2012-2016 Joey Hess <id@joeyh.name>
-
- Licensed under the GNU GPL version 3 or higher.
-}
-{-# LANGUAGE CPP, FlexibleInstances #-}
+{-# LANGUAGE CPP, FlexibleInstances, BangPatterns #-}
module Annex.Transfer (
module X,
@@ -15,9 +15,11 @@ module Annex.Transfer (
alwaysRunTransfer,
noRetry,
forwardRetry,
+ pickRemote,
) where
import Annex.Common
+import qualified Annex
import Logs.Transfer as X
import Types.Transfer as X
import Annex.Notification as X
@@ -25,8 +27,10 @@ import Annex.Perms
import Utility.Metered
import Annex.LockPool
import Types.Remote (Verification(..))
+import qualified Types.Remote as Remote
import Control.Concurrent
+import qualified Data.Set as S
class Observable a where
observeBool :: a -> Bool
@@ -166,3 +170,56 @@ noRetry _ _ = False
- to send some data. -}
forwardRetry :: RetryDecider
forwardRetry old new = bytesComplete old < bytesComplete new
+
+{- Picks a remote from the list and tries a transfer to it. If the transfer
+ - does not succeed, goes on to try other remotes from the list.
+ -
+ - The list should already be ordered by remote cost, and is normally
+ - tried in order. However, when concurrent jobs are running, they will
+ - be assigned different remotes of the same cost when possible. This can
+ - increase total transfer speed.
+ -}
+pickRemote :: Observable v => [Remote] -> (Remote -> Annex v) -> Annex v
+pickRemote l a = go l =<< Annex.getState Annex.concurrentjobs
+ where
+ go [] _ = return observeFailure
+ go (r:[]) _ = a r
+ go rs (Just n) | n > 1 = do
+ mv <- Annex.getState Annex.activeremotes
+ active <- liftIO $ takeMVar mv
+ let rs' = sortBy (inactiveFirst active) rs
+ goconcurrent mv active rs'
+ go (r:rs) _ = do
+ ok <- a r
+ if observeBool ok
+ then return ok
+ else go rs Nothing
+ goconcurrent mv active [] = do
+ liftIO $ putMVar mv active
+ return observeFailure
+ goconcurrent mv active (r:rs) = do
+ let !active' = S.insert r active
+ liftIO $ putMVar mv active'
+ let getnewactive = do
+ active'' <- liftIO $ takeMVar mv
+ let !active''' = S.delete r active''
+ return active'''
+ let removeactive = liftIO . putMVar mv =<< getnewactive
+ ok <- a r `onException` removeactive
+ if observeBool ok
+ then do
+ removeactive
+ return ok
+ else do
+ active'' <- getnewactive
+ -- Re-sort the remaining rs
+ -- because other threads could have
+ -- been assigned them in the meantime.
+ let rs' = sortBy (inactiveFirst active'') rs
+ goconcurrent mv active'' rs'
+
+inactiveFirst :: S.Set Remote -> Remote -> Remote -> Ordering
+inactiveFirst active a b
+ | Remote.cost a == Remote.cost b =
+ if a `S.member` active then GT else LT
+ | otherwise = compare a b
diff --git a/CHANGELOG b/CHANGELOG
index 7139aaf59..0310dd3e2 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -11,6 +11,8 @@ git-annex (6.20160809) UNRELEASED; urgency=medium
Fixes a reversion introduced in version 5.20150727.
* Assistant, repair: Filter out git fsck lines about duplicate file
entries in tree objects.
+ * get -J, sync --content -J: Download different files from different
+ remotes when the remotes have the same costs.
-- Joey Hess <id@joeyh.name> Mon, 05 Sep 2016 11:51:49 -0400
diff --git a/Command/Get.hs b/Command/Get.hs
index 3d2914673..70b877065 100644
--- a/Command/Get.hs
+++ b/Command/Get.hs
@@ -89,16 +89,17 @@ getKey' key afile = dispatch
showNote "not available"
showlocs
return False
- dispatch remotes = notifyTransfer Download afile $ trycopy remotes remotes
- trycopy full [] _ = do
- Remote.showTriedRemotes full
- showlocs
- return False
- trycopy full (r:rs) witness =
- ifM (probablyPresent r)
- ( docopy r witness <||> trycopy full rs witness
- , trycopy full rs witness
+ dispatch remotes = notifyTransfer Download afile $ \witness -> do
+ ok <- pickRemote remotes $ \r -> ifM (probablyPresent r)
+ ( docopy r witness
+ , return False
)
+ if ok
+ then return ok
+ else do
+ Remote.showTriedRemotes remotes
+ showlocs
+ return False
showlocs = Remote.showLocations False key []
"No other repository is known to contain the file."
-- This check is to avoid an ugly message if a remote is a
diff --git a/doc/todo/get_round_robin.mdwn b/doc/todo/get_round_robin.mdwn
index 9fad9da9d..25c3a8644 100644
--- a/doc/todo/get_round_robin.mdwn
+++ b/doc/todo/get_round_robin.mdwn
@@ -20,3 +20,12 @@ it might not really be desirable when not run in parallel. In particular,
if A and B are on different spinning disks, then an access pattern of
A,B,A,B might keep the disks idle enough that they spin down in-between
access.
+
+> done for `git annex get -JN` where two remotes have the same cost.
+>
+> Also for `git annex sync --content -JN` when downloading and two remotes
+> have the same cost.
+>
+> Can't think of any other places that apply, except perhaps the assistant,
+> but it does its own much different queuing of transfers. [[done]]
+> --[[Joey]]
diff --git a/doc/todo/wishlist__58___do_round_robin_downloading_of_data.mdwn b/doc/todo/wishlist__58___do_round_robin_downloading_of_data.mdwn
index 2a283b3df..00777faa2 100644
--- a/doc/todo/wishlist__58___do_round_robin_downloading_of_data.mdwn
+++ b/doc/todo/wishlist__58___do_round_robin_downloading_of_data.mdwn
@@ -5,3 +5,6 @@ This of course assumes that we like the idea of "parallel" launching and running
This wish item is probably only useful for the paranoid people who store more than 1 copy of their data.
See [[get_round_robin]] --[[Joey]]
+
+> [[done]], at least for `git annex get -JN` when two remotes have the same
+> cost..