summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorGravatar Robie Basak <robie@justgohome.co.uk>2014-05-15 19:44:00 +0100
committerGravatar Joey Hess <joey@kitenet.net>2014-05-15 16:32:44 -0400
commit9914df2d424beb62a59bd3c82ca57b2d2f146e7c (patch)
tree920d16f734b11d52ce8b483195a64dc110c77097
parent1991541d1e4fdaa205bd8bdda31bb10ab4b49aef (diff)
ddar special remote
-rw-r--r--Remote/Ddar.hs229
-rw-r--r--Remote/List.hs2
-rw-r--r--Types/GitConfig.hs2
-rw-r--r--doc/git-annex.mdwn6
-rw-r--r--doc/special_remotes.mdwn1
-rw-r--r--doc/special_remotes/ddar.mdwn40
-rw-r--r--doc/walkthrough/using_ddar.mdwn32
7 files changed, 312 insertions, 0 deletions
diff --git a/Remote/Ddar.hs b/Remote/Ddar.hs
new file mode 100644
index 000000000..01d483309
--- /dev/null
+++ b/Remote/Ddar.hs
@@ -0,0 +1,229 @@
+{- Using ddar as a remote. Based on bup and rsync remotes.
+ -
+ - Copyright 2011 Joey Hess <joey@kitenet.net>
+ - Copyright 2014 Robie Basak <robie@justgohome.co.uk>
+ -
+ - Licensed under the GNU GPL version 3 or higher.
+ -}
+
+module Remote.Ddar (remote) where
+
+import Control.Exception
+import qualified Data.ByteString.Lazy as L
+import qualified Data.Map as M
+import System.IO.Error
+import System.Process
+
+import Data.String.Utils
+import Common.Annex
+import Types.Remote
+import Types.Key
+import Types.Creds
+import qualified Git
+import Config
+import Config.Cost
+import Remote.Helper.Special
+import Remote.Helper.Encryptable
+import Crypto
+import Annex.Content
+import Annex.Ssh
+import Annex.UUID
+import Utility.Metered
+
+type DdarRepo = String
+
+remote :: RemoteType
+remote = RemoteType {
+ typename = "ddar",
+ enumerate = findSpecialRemotes "ddarrepo",
+ generate = gen,
+ setup = ddarSetup
+}
+
+gen :: Git.Repo -> UUID -> RemoteConfig -> RemoteGitConfig -> Annex (Maybe Remote)
+gen r u c gc = do
+ cst <- remoteCost gc $
+ if ddarLocal ddarrepo
+ then nearlyCheapRemoteCost
+ else expensiveRemoteCost
+
+ let new = Remote
+ { uuid = u
+ , cost = cst
+ , name = Git.repoDescribe r
+ , storeKey = store ddarrepo
+ , retrieveKeyFile = retrieve ddarrepo
+ , retrieveKeyFileCheap = retrieveCheap
+ , removeKey = remove ddarrepo
+ , hasKey = checkPresent ddarrepo
+ , hasKeyCheap = ddarLocal ddarrepo
+ , whereisKey = Nothing
+ , remoteFsck = Nothing
+ , repairRepo = Nothing
+ , config = c
+ , repo = r
+ , gitconfig = gc
+ , localpath = if ddarLocal ddarrepo && not (null ddarrepo)
+ then Just ddarrepo
+ else Nothing
+ , remotetype = remote
+ , availability = if ddarLocal ddarrepo then LocallyAvailable else GloballyAvailable
+ , readonly = False
+ }
+ return $ Just $ encryptableRemote c
+ (storeEncrypted new ddarrepo)
+ (retrieveEncrypted ddarrepo)
+ new
+ where
+ ddarrepo = fromMaybe (error "missing ddarrepo") $ remoteAnnexDdarRepo gc
+
+ddarSetup :: Maybe UUID -> Maybe CredPair -> RemoteConfig -> Annex (RemoteConfig, UUID)
+ddarSetup mu _ c = do
+ u <- maybe (liftIO genUUID) return mu
+
+ -- verify configuration is sane
+ let ddarrepo = fromMaybe (error "Specify ddarrepo=") $
+ M.lookup "ddarrepo" c
+ c' <- encryptionSetup c
+
+ -- The ddarrepo is stored in git config, as well as this repo's
+ -- persistant state, so it can vary between hosts.
+ gitConfigSpecialRemote u c' "ddarrepo" ddarrepo
+
+ return (c', u)
+
+pipeDdar :: [CommandParam] -> Maybe Handle -> Maybe Handle -> IO Bool
+pipeDdar params inh outh = do
+ p <- runProcess "ddar" (toCommand params)
+ Nothing Nothing inh outh Nothing
+ ok <- waitForProcess p
+ case ok of
+ ExitSuccess -> return True
+ _ -> return False
+
+store :: DdarRepo -> Key -> AssociatedFile -> MeterUpdate -> Annex Bool
+store ddarrepo k _f _p = sendAnnex k (void $ remove ddarrepo k) $ \src -> do
+ let params =
+ [ Param "c"
+ , Param "-N"
+ , Param $ key2file k
+ , Param ddarrepo
+ , File src
+ ]
+ liftIO $ boolSystem "ddar" params
+
+storeEncrypted :: Remote -> DdarRepo -> (Cipher, Key) -> Key -> MeterUpdate -> Annex Bool
+storeEncrypted r ddarrepo (cipher, enck) k _p =
+ sendAnnex k (void $ remove ddarrepo k) $ \src ->
+ liftIO $ catchBoolIO $
+ encrypt (getGpgEncParams r) cipher (feedFile src) $ \h ->
+ pipeDdar params (Just h) Nothing
+ where
+ params =
+ [ Param "c"
+ , Param "-N"
+ , Param $ key2file enck
+ , Param ddarrepo
+ , Param "-"
+ ]
+
+{- Convert remote DdarRepo to host and path on remote end -}
+splitRemoteDdarRepo :: DdarRepo -> (String, String)
+splitRemoteDdarRepo ddarrepo =
+ (host, ddarrepo')
+ where
+ (host, remainder) = span (/= ':') ddarrepo
+ ddarrepo' = drop 1 remainder
+
+{- Return the command and parameters to use for a ddar call that may need to be
+ - made on a remote repository. This will call ssh if needed. -}
+
+ddarRemoteCall :: DdarRepo -> Char -> [CommandParam] -> Annex (String, [CommandParam])
+ddarRemoteCall ddarrepo cmd params
+ | ddarLocal ddarrepo = return ("ddar", localParams)
+ | otherwise = do
+ remoteCachingParams <- sshCachingOptions (host, Nothing) []
+ return ("ssh", remoteCachingParams ++ remoteParams)
+ where
+ (host, ddarrepo') = splitRemoteDdarRepo ddarrepo
+ localParams = Param [cmd] : Param ddarrepo : params
+ remoteParams = Param host : Param "ddar" : Param [cmd] : Param ddarrepo' : params
+
+{- Specialized ddarRemoteCall that includes extraction command and flags -}
+
+ddarExtractRemoteCall :: DdarRepo -> Key -> Annex (String, [CommandParam])
+ddarExtractRemoteCall ddarrepo k =
+ ddarRemoteCall ddarrepo 'x' [Param "--force-stdout", Param $ key2file k]
+
+retrieve :: DdarRepo -> Key -> AssociatedFile -> FilePath -> MeterUpdate -> Annex Bool
+retrieve ddarrepo k _f d _p = do
+ (cmd, params) <- ddarExtractRemoteCall ddarrepo k
+ liftIO $ catchBoolIO $ withFile d WriteMode $ \h -> do
+ let p = (proc cmd $ toCommand params){ std_out = UseHandle h }
+ (_, _, _, pid) <- Common.Annex.createProcess p
+ forceSuccessProcess p pid
+ return True
+
+retrieveCheap :: Key -> FilePath -> Annex Bool
+retrieveCheap _ _ = return False
+
+retrieveEncrypted :: DdarRepo -> (Cipher, Key) -> Key -> FilePath -> MeterUpdate -> Annex Bool
+retrieveEncrypted ddarrepo (cipher, enck) _ f _p = do
+ (cmd, params) <- ddarExtractRemoteCall ddarrepo enck
+ let p = proc cmd $ toCommand params
+ liftIO $ catchBoolIO $ withHandle StdoutHandle createProcessSuccess p $ \h -> do
+ decrypt cipher (\toh -> L.hPut toh =<< L.hGetContents h) $
+ readBytes $ L.writeFile f
+ return True
+
+remove :: DdarRepo -> Key -> Annex Bool
+remove ddarrepo key = do
+ (cmd, params) <- ddarRemoteCall ddarrepo 'd' [Param $ key2file key]
+ liftIO $ boolSystem cmd params
+
+ddarDirectoryExists :: DdarRepo -> Annex (Either String Bool)
+ddarDirectoryExists ddarrepo
+ | ddarLocal ddarrepo = do
+ maybeStatus <- liftIO $ tryJust (guard . isDoesNotExistError) $ getFileStatus ddarrepo
+ return $ case maybeStatus of
+ Left _ -> Right False
+ Right status -> Right $ isDirectory status
+ | otherwise = do
+ sshCachingParams <- sshCachingOptions (host, Nothing) []
+ exitCode <- liftIO $ safeSystem "ssh" $ sshCachingParams ++ params
+ case exitCode of
+ ExitSuccess -> return $ Right True
+ ExitFailure 1 -> return $ Right False
+ ExitFailure code -> return $ Left $ "ssh call " ++
+ show (Data.String.Utils.join " " $ toCommand params) ++
+ " failed with status " ++ show code
+ where
+ (host, ddarrepo') = splitRemoteDdarRepo ddarrepo
+ params =
+ [ Param host
+ , Param "test"
+ , Param "-d"
+ , Param ddarrepo'
+ ]
+
+{- Use "ddar t" to determine if a given key is present in a ddar archive -}
+inDdarManifest :: DdarRepo -> Key -> Annex (Either String Bool)
+inDdarManifest ddarrepo k = do
+ (cmd, params) <- ddarRemoteCall ddarrepo 't' []
+ let p = proc cmd $ toCommand params
+ liftIO $ catchMsgIO $ withHandle StdoutHandle createProcessSuccess p $ \h -> do
+ contents <- hGetContents h
+ return $ elem k' $ lines contents
+ where
+ k' = key2file k
+
+checkPresent :: DdarRepo -> Key -> Annex (Either String Bool)
+checkPresent ddarrepo key = do
+ directoryExists <- ddarDirectoryExists ddarrepo
+ case directoryExists of
+ Left e -> return $ Left e
+ Right True -> inDdarManifest ddarrepo key
+ Right False -> return $ Right False
+
+ddarLocal :: DdarRepo -> Bool
+ddarLocal = notElem ':'
diff --git a/Remote/List.hs b/Remote/List.hs
index e3afc939c..221ab9a54 100644
--- a/Remote/List.hs
+++ b/Remote/List.hs
@@ -38,6 +38,7 @@ import qualified Remote.WebDAV
import qualified Remote.Tahoe
#endif
import qualified Remote.Glacier
+import qualified Remote.Ddar
import qualified Remote.Hook
import qualified Remote.External
@@ -59,6 +60,7 @@ remoteTypes =
, Remote.Tahoe.remote
#endif
, Remote.Glacier.remote
+ , Remote.Ddar.remote
, Remote.Hook.remote
, Remote.External.remote
]
diff --git a/Types/GitConfig.hs b/Types/GitConfig.hs
index b49f3d762..71f06ff45 100644
--- a/Types/GitConfig.hs
+++ b/Types/GitConfig.hs
@@ -131,6 +131,7 @@ data RemoteGitConfig = RemoteGitConfig
, remoteAnnexBupSplitOptions :: [String]
, remoteAnnexDirectory :: Maybe FilePath
, remoteAnnexGCrypt :: Maybe String
+ , remoteAnnexDdarRepo :: Maybe String
, remoteAnnexHookType :: Maybe String
, remoteAnnexExternalType :: Maybe String
{- A regular git remote's git repository config. -}
@@ -162,6 +163,7 @@ extractRemoteGitConfig r remotename = RemoteGitConfig
, remoteAnnexBupSplitOptions = getoptions "bup-split-options"
, remoteAnnexDirectory = notempty $ getmaybe "directory"
, remoteAnnexGCrypt = notempty $ getmaybe "gcrypt"
+ , remoteAnnexDdarRepo = getmaybe "ddarrepo"
, remoteAnnexHookType = notempty $ getmaybe "hooktype"
, remoteAnnexExternalType = notempty $ getmaybe "externaltype"
, remoteGitConfig = Nothing
diff --git a/doc/git-annex.mdwn b/doc/git-annex.mdwn
index d5408a2ae..bb741810c 100644
--- a/doc/git-annex.mdwn
+++ b/doc/git-annex.mdwn
@@ -1647,6 +1647,12 @@ Here are all the supported configuration settings.
the location of the bup repository to use. Normally this is automatically
set up by `git annex initremote`, but you can change it if needed.
+* `remote.<name>.ddarrepo`
+
+ Used by ddar special remotes, this configures
+ the location of the ddar repository to use. Normally this is automatically
+ set up by `git annex initremote`, but you can change it if needed.
+
* `remote.<name>.directory`
Used by directory special remotes, this configures
diff --git a/doc/special_remotes.mdwn b/doc/special_remotes.mdwn
index e3004d2cf..cc52dfc55 100644
--- a/doc/special_remotes.mdwn
+++ b/doc/special_remotes.mdwn
@@ -10,6 +10,7 @@ They cannot be used by other git commands though.
* [[S3]] (Amazon S3, and other compatible services)
* [[Amazon_Glacier|glacier]]
* [[bup]]
+* [[ddar]]
* [[gcrypt]] (encrypted git repositories!)
* [[directory]]
* [[rsync]]
diff --git a/doc/special_remotes/ddar.mdwn b/doc/special_remotes/ddar.mdwn
new file mode 100644
index 000000000..8de86e2c7
--- /dev/null
+++ b/doc/special_remotes/ddar.mdwn
@@ -0,0 +1,40 @@
+This special remote type stores file contents in a
+[ddar](https://github.com/basak/ddar) repository. This provides easy
+de-duplication when you use git-annex to manage many files that are similar.
+
+Unlike bup, ddar uses its own storage format, which allows for both creation
+and deletion of de-deduplicated files. In addition to using local storage, ddar
+archives can be remote, providing that ddar is installed on the remote machine
+and ssh is available to it.
+
+See [[walkthrough/using_ddar]] for usage examples.
+
+## encryption
+
+Encryption is nominally supported, but is not useful. Since effective
+encryption necessarily obfuscates file contents, similar areas across different
+files are no longer visible to ddar and cannot be de-duplicated.
+
+## compression
+
+The same caveat with encryption also generally applies to compression, since
+file compression changes file contents such that similar regions across files
+no longer appear similar. An exception is `gzip --rsyncable`, which is
+specifically designed to work around this issue. This is the only compression
+mechanism with which de-duplication remains effective.
+
+## configuration
+
+These parameters can be passed to `git annex initremote` to configure ddar:
+
+* `encryption` - One of "none", "hybrid", "shared", or "pubkey".
+ See [[encryption]]. However, note that encryption renders all de-duplication
+ineffective.
+
+* `keyid` - Specifies the gpg key to use for [[encryption]].
+
+* `ddarrepo` - Required. This is passed to `ddar` as the path to the ddar
+ archive to use. If it doesn't exist, the ddar repository will be created
+ automatically when a file is first copied to it. To use a remote ddar
+ repository, use a colon (`:`) to separate the hostname from the remote path.
+ Example: "ddarrepo=example.com:/big/myddar" or "ddarrepo=/big/myddar"
diff --git a/doc/walkthrough/using_ddar.mdwn b/doc/walkthrough/using_ddar.mdwn
new file mode 100644
index 000000000..1d7cdd749
--- /dev/null
+++ b/doc/walkthrough/using_ddar.mdwn
@@ -0,0 +1,32 @@
+Another [[special_remote|special_remotes]] that git-annex can use is
+a [[special_remotes/ddar]] repository. ddar stores large file contents
+in a directory structure of its own, with deduplication. For remote
+repositories, ddar requires that ssh is available on the remote, with ddar also
+installed remotely. When copying files to the remote, ddar only needs to send
+over the network the parts of the files that are not already present remotely.
+
+Unlike bup, ddar uses its own storage format, which allows for both creation
+and deletion of de-deduplicated files.
+
+Here's how to create a ddar remote, and describe it.
+
+[[!template id=note text="""
+Instead of specifying a remote system, you could choose to make a bup
+remote that is only accessible on the current system, by passing
+"ddarrepo=/big/myddar".
+"""]]
+
+ # git annex initremote myddar type=ddar encryption=none ddarrepo=example.com:/big/myddar
+ initremote ddar (bup init)
+ Initialized empty Git repository in /big/myddar/
+ ok
+ # git annex describe myddar "my bup repository at example.com"
+ describe myddar ok
+
+Now the remote can be used like any other remote.
+
+ # git annex move my_cool_big_file --to myddar
+ move my_cool_big_file (to myddar...)
+ ok
+
+See [[special_remotes/bup]] for details.