summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorGravatar Joey Hess <joey@kitenet.net>2012-07-05 09:12:54 -0600
committerGravatar Joey Hess <joey@kitenet.net>2012-07-05 09:12:54 -0600
commitb0894f00c075e4dd93a692880e8eb0ea865b6c28 (patch)
tree913802df5ed3184f83c82a846c8641719e3583a7
parentb4917bd18fa9e2eacb5fbd916828d30e2ac297b4 (diff)
parent59f8413abe89b9abe5708fc0ab3aba93fa2c0f64 (diff)
Merge branch 'master' into assistant
-rw-r--r--Backend/SHA.hs95
-rw-r--r--Build/Configure.hs9
-rw-r--r--debian/changelog5
-rw-r--r--doc/bugs/Issue_on_OSX_with_some_system_limits/comment_3_18ddf8b5934dd6fb1676cd6adc7d103b._comment19
-rw-r--r--doc/bugs/git_annex_du.mdwn14
-rw-r--r--doc/design/assistant/blog/day_22__horrible_option_parsing_hack.mdwn34
-rw-r--r--doc/design/assistant/blog/day_23__transfer_watching.mdwn25
-rw-r--r--doc/design/assistant/blog/day_24__airport_digressions.mdwn99
-rw-r--r--doc/design/assistant/syncing.mdwn21
-rw-r--r--doc/install/OSX/comment_14_6ef2ddb7b11ce6ad54578ae118ed346e._comment9
-rw-r--r--doc/install/OSX/comment_15_6fd1fad5b6d9f36620e5a0e99edd2f89._comment9
11 files changed, 284 insertions, 55 deletions
diff --git a/Backend/SHA.hs b/Backend/SHA.hs
index 838a97ab8..7abbf8035 100644
--- a/Backend/SHA.hs
+++ b/Backend/SHA.hs
@@ -1,6 +1,6 @@
{- git-annex SHA backend
-
- - Copyright 2011 Joey Hess <joey@kitenet.net>
+ - Copyright 2011,2012 Joey Hess <joey@kitenet.net>
-
- Licensed under the GNU GPL version 3 or higher.
-}
@@ -12,7 +12,10 @@ import qualified Annex
import Types.Backend
import Types.Key
import Types.KeySource
+
import qualified Build.SysConfig as SysConfig
+import Data.Digest.Pure.SHA
+import qualified Data.ByteString.Lazy as L
type SHASize = Int
@@ -25,32 +28,19 @@ backends :: [Backend]
backends = catMaybes $ map genBackend sizes ++ map genBackendE sizes
genBackend :: SHASize -> Maybe Backend
-genBackend size
- | isNothing (shaCommand size) = Nothing
- | otherwise = Just b
- where
- b = Backend
- { name = shaName size
- , getKey = keyValue size
- , fsckKey = Just $ checkKeyChecksum size
- }
+genBackend size = Just $ Backend
+ { name = shaName size
+ , getKey = keyValue size
+ , fsckKey = Just $ checkKeyChecksum size
+ }
genBackendE :: SHASize -> Maybe Backend
-genBackendE size =
- case genBackend size of
- Nothing -> Nothing
- Just b -> Just $ b
- { name = shaNameE size
- , getKey = keyValueE size
- }
-
-shaCommand :: SHASize -> Maybe String
-shaCommand 1 = SysConfig.sha1
-shaCommand 256 = Just SysConfig.sha256
-shaCommand 224 = SysConfig.sha224
-shaCommand 384 = SysConfig.sha384
-shaCommand 512 = SysConfig.sha512
-shaCommand _ = Nothing
+genBackendE size = do
+ b <- genBackend size
+ return $ b
+ { name = shaNameE size
+ , getKey = keyValueE size
+ }
shaName :: SHASize -> String
shaName size = "SHA" ++ show size
@@ -58,27 +48,48 @@ shaName size = "SHA" ++ show size
shaNameE :: SHASize -> String
shaNameE size = shaName size ++ "E"
-shaN :: SHASize -> FilePath -> Annex String
-shaN size file = do
+shaN :: SHASize -> FilePath -> Integer -> Annex String
+shaN shasize file filesize = do
showAction "checksum"
- liftIO $ pOpen ReadFromPipe command (toCommand [File file]) $ \h -> do
- sha <- fst . separate (== ' ') <$> hGetLine h
- if null sha
- then error $ command ++ " parse error"
- else return sha
+ case shaCommand shasize filesize of
+ Left sha -> liftIO $ sha <$> L.readFile file
+ Right command -> liftIO $ runcommand command
+ where
+ runcommand command =
+ pOpen ReadFromPipe command (toCommand [File file]) $ \h -> do
+ sha <- fst . separate (== ' ') <$> hGetLine h
+ if null sha
+ then error $ command ++ " parse error"
+ else return sha
+
+shaCommand :: SHASize -> Integer -> Either (L.ByteString -> String) String
+shaCommand shasize filesize
+ | shasize == 1 = use SysConfig.sha1 sha1
+ | shasize == 256 = use SysConfig.sha256 sha256
+ | shasize == 224 = use SysConfig.sha224 sha224
+ | shasize == 384 = use SysConfig.sha384 sha384
+ | shasize == 512 = use SysConfig.sha512 sha512
+ | otherwise = error $ "bad sha size " ++ show shasize
where
- command = fromJust $ shaCommand size
+ use Nothing sha = Left $ showDigest . sha
+ use (Just c) sha
+ -- use builtin, but slower sha for small files
+ -- benchmarking indicates it's faster up to
+ -- and slightly beyond 50 kb files
+ | filesize < 51200 = use Nothing sha
+ | otherwise = Right c
{- A key is a checksum of its contents. -}
keyValue :: SHASize -> KeySource -> Annex (Maybe Key)
-keyValue size source = do
+keyValue shasize source = do
let file = contentLocation source
- s <- shaN size file
stat <- liftIO $ getFileStatus file
+ let filesize = fromIntegral $ fileSize stat
+ s <- shaN shasize file filesize
return $ Just $ stubKey
{ keyName = s
- , keyBackendName = shaName size
- , keySize = Just $ fromIntegral $ fileSize stat
+ , keyBackendName = shaName shasize
+ , keySize = Just filesize
}
{- Extension preserving keys. -}
@@ -101,10 +112,12 @@ keyValueE size source = keyValue size source >>= maybe (return Nothing) addE
checkKeyChecksum :: SHASize -> Key -> FilePath -> Annex Bool
checkKeyChecksum size key file = do
fast <- Annex.getState Annex.fast
- present <- liftIO $ doesFileExist file
- if not present || fast
- then return True
- else check <$> shaN size file
+ mstat <- liftIO $ catchMaybeIO $ getFileStatus file
+ case (mstat, fast) of
+ (Just stat, False) -> do
+ let filesize = fromIntegral $ fileSize stat
+ check <$> shaN size file filesize
+ _ -> return True
where
check s
| s == dropExtension (keyName key) = True
diff --git a/Build/Configure.hs b/Build/Configure.hs
index 7af53cf10..24743bf61 100644
--- a/Build/Configure.hs
+++ b/Build/Configure.hs
@@ -28,15 +28,14 @@ tests =
, TestCase "gpg" $ testCmd "gpg" "gpg --version >/dev/null"
, TestCase "lsof" $ testCmd "lsof" "lsof -v >/dev/null 2>&1"
, TestCase "ssh connection caching" getSshConnectionCaching
- ] ++ shaTestCases False [1, 512, 224, 384] ++ shaTestCases True [256]
+ ] ++ shaTestCases [1, 256, 512, 224, 384]
-shaTestCases :: Bool -> [Int] -> [TestCase]
-shaTestCases required l = map make l
+shaTestCases :: [Int] -> [TestCase]
+shaTestCases l = map make l
where
- make n = TestCase key $ selector key (shacmds n) "</dev/null"
+ make n = TestCase key $ maybeSelectCmd key (shacmds n) "</dev/null"
where
key = "sha" ++ show n
- selector = if required then selectCmd else maybeSelectCmd
shacmds n = concatMap (\x -> [x, osxpath </> x]) $
map (\x -> "sha" ++ show n ++ x) ["", "sum"]
-- Max OSX puts GNU tools outside PATH, so look in
diff --git a/debian/changelog b/debian/changelog
index 33c850861..1c44f5952 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -4,6 +4,11 @@ git-annex (3.20120630) UNRELEASED; urgency=low
transfer is already in progress by another process.
* status: Lists transfers that are currently in progress.
* Fix passing --uuid to git-annex-shell.
+ * When shaNsum commands cannot be found, use the Haskell SHA library
+ (already a dependency) to do the checksumming. This may be slower,
+ but avoids portability problems.
+ * Use SHA library for files less than 50 kb in size, at which point it's
+ faster than forking the more optimised external program.
-- Joey Hess <joeyh@debian.org> Sun, 01 Jul 2012 15:04:37 -0400
diff --git a/doc/bugs/Issue_on_OSX_with_some_system_limits/comment_3_18ddf8b5934dd6fb1676cd6adc7d103b._comment b/doc/bugs/Issue_on_OSX_with_some_system_limits/comment_3_18ddf8b5934dd6fb1676cd6adc7d103b._comment
new file mode 100644
index 000000000..eb886acf6
--- /dev/null
+++ b/doc/bugs/Issue_on_OSX_with_some_system_limits/comment_3_18ddf8b5934dd6fb1676cd6adc7d103b._comment
@@ -0,0 +1,19 @@
+[[!comment format=mdwn
+ username="http://joeyh.name/"
+ subject="comment 3"
+ date="2012-07-04T12:32:44Z"
+ content="""
+Jimmy, sounds like I could use something like this to get the current limit:
+
+ sysctl kern.maxfilesperproc
+
+Probably prints \"sysctl kern.maxfilesperproc = 256\" or such.. can you verify?
+Once I have the limit, I can make the kqueue code use a subset of it, and print out a message when it needs to be increased, like the inotify code does.
+
+(Also, the kqueue code only opens directories, not files, so unless you have 400000 directories, that's
+a little high.)
+
+---
+
+On file removal not propagating, does this still happen? When you remove a file does a git commit automatically happen, or is that broken with kqueue?
+"""]]
diff --git a/doc/bugs/git_annex_du.mdwn b/doc/bugs/git_annex_du.mdwn
new file mode 100644
index 000000000..2b1315298
--- /dev/null
+++ b/doc/bugs/git_annex_du.mdwn
@@ -0,0 +1,14 @@
+We need a way to calculate space taken by certain files.
+
+Use cases: I want to drop some files from my small disk. I need to figure out things that take most space, and drop them.
+
+Usage examples:
+
+ git annex du -hs *.mp3
+ git annex du -sBm --in=here *.ogg
+
+Would be nice if it was compatible with standard unix `du`.
+
+> `du -L` works.
+>
+> See also: [[forum/Wishlist:_getting_the_disk_used_by_a_subtree_of_files]]
diff --git a/doc/design/assistant/blog/day_22__horrible_option_parsing_hack.mdwn b/doc/design/assistant/blog/day_22__horrible_option_parsing_hack.mdwn
new file mode 100644
index 000000000..9f59d1af9
--- /dev/null
+++ b/doc/design/assistant/blog/day_22__horrible_option_parsing_hack.mdwn
@@ -0,0 +1,34 @@
+Well, sometimes you just have to go for the hack. Trying to find a way
+to add additional options to git-annex-shell without breaking backwards
+compatibility, I noticed that it ignores all options after `--`, because
+those tend to be random rsync options due to the way rsync runs it.
+
+So, I've added a new class of options, that come in between, like
+`-- opt=val opt=val ... --`
+
+The parser for these will not choke on unknown options, unlike normal
+getopt. So this let me add the additional info I needed to
+pass to git-annex-shell to make it record transfer information. And
+if I need to pass more info in the future, that's covered too.
+
+It's ugly, but since only git-annex runs git-annex-shell, this is an
+ugliness only I (and now you, dear reader) have to put up with.
+
+Note to self: Command-line programs are sometimes an API, particularly
+if designed to be called remotely, and so it makes sense to consider
+whether they are, and design expandability into them from day 1.
+
+---
+
+Anyway, we now have full transfer tracking in git-annex! Both sides of
+a transfer know what's being transferred, and from where, and have
+the info necessary to interrupt the transfer.
+
+---
+
+Also did some basic groundwork, adding a queue of transfers to perform,
+and adding to the daemon's status information a map of currently running
+transfers.
+
+Next up: The daemon will use inotify to notice new and deleted transfer
+info files, and update its status info.
diff --git a/doc/design/assistant/blog/day_23__transfer_watching.mdwn b/doc/design/assistant/blog/day_23__transfer_watching.mdwn
new file mode 100644
index 000000000..beaf75bc5
--- /dev/null
+++ b/doc/design/assistant/blog/day_23__transfer_watching.mdwn
@@ -0,0 +1,25 @@
+Starting to travel, so limited time today.
+
+Yet Another Thread added to the assistant, all it does is watch for changes
+to transfer information files, and update the assistant's map of transfers
+currently in progress. Now the assistant will know if some other repository
+has connected to the local repo and is sending or receiving a file's
+content.
+
+This seemed really simple to write, it's just 78 lines of code. It worked
+100% correctly the first time. :) But it's only so easy because I've got
+this shiny new inotify hammer that I keep finding places to use in the
+assistant.
+
+Also, the new thread does some things that caused a similar thread (the
+merger thread) to go into a MVar deadlock. Luckily, I spent much of
+[day 19](day_19__random_improvements) investigating and fixing that
+deadlock, even though it was not a problem at the time.
+
+So, good.. I'm doing things right and getting to a place where rather
+nontrivial features can be added easily.
+
+---
+
+Next up: Enough nonsense with tracking transfers... Time to start actually
+transferring content around!
diff --git a/doc/design/assistant/blog/day_24__airport_digressions.mdwn b/doc/design/assistant/blog/day_24__airport_digressions.mdwn
new file mode 100644
index 000000000..695296974
--- /dev/null
+++ b/doc/design/assistant/blog/day_24__airport_digressions.mdwn
@@ -0,0 +1,99 @@
+In a series of airport layovers all day. Since I woke up at 3:45 am,
+didn't feel up to doing serious new work, so instead I worked through some
+OSX support backlog.
+
+git-annex will now use Haskell's SHA library if the `sha256sum`
+command is not available. That library is slow, but it's guaranteed to be
+available; git-annex already depended on it to calculate HMACs.
+
+Then I decided to see if it makes sense to use the SHA library
+when adding smaller files. At some point, its slower implementation should
+win over needing to fork and parse the output of `sha256sum`. This was
+the first time I tried out Haskell's
+[Criterion](http://hackage.haskell.org/package/criterion) benchmarker,
+and I built this simple benchmark in short order.
+
+[[!format haskell """
+import Data.Digest.Pure.SHA
+import Data.ByteString.Lazy as L
+import Criterion.Main
+import Common
+
+testfile :: FilePath
+testfile = "/tmp/bar" -- on ram disk
+
+main = defaultMain
+ [ bgroup "sha256"
+ [ bench "internal" $ whnfIO internal
+ , bench "external" $ whnfIO external
+ ]
+ ]
+
+internal :: IO String
+internal = showDigest . sha256 <$> L.readFile testfile
+
+external :: IO String
+external = pOpen ReadFromPipe "sha256sum" [testfile] $ \h ->
+ fst . separate (== ' ') <$> hGetLine h
+"""]]
+
+The nice thing about benchmarking in Airports is when you're running a
+benchmark locally, you don't want to do anything else with the computer,
+so can alternate people watching, spacing out, and analyzing results.
+
+100 kb file:
+
+ benchmarking sha256/internal
+ mean: 15.64729 ms, lb 15.29590 ms, ub 16.10119 ms, ci 0.950
+ std dev: 2.032476 ms, lb 1.638016 ms, ub 2.527089 ms, ci 0.950
+
+ benchmarking sha256/external
+ mean: 8.217700 ms, lb 7.931324 ms, ub 8.568805 ms, ci 0.950
+ std dev: 1.614786 ms, lb 1.357791 ms, ub 2.009682 ms, ci 0.950
+
+75 kb file:
+
+ benchmarking sha256/internal
+ mean: 12.16099 ms, lb 11.89566 ms, ub 12.50317 ms, ci 0.950
+ std dev: 1.531108 ms, lb 1.232353 ms, ub 1.929141 ms, ci 0.950
+
+ benchmarking sha256/external
+ mean: 8.818731 ms, lb 8.425744 ms, ub 9.269550 ms, ci 0.950
+ std dev: 2.158530 ms, lb 1.916067 ms, ub 2.487242 ms, ci 0.950
+
+50 kb file:
+
+ benchmarking sha256/internal
+ mean: 7.699274 ms, lb 7.560254 ms, ub 7.876605 ms, ci 0.950
+ std dev: 801.5292 us, lb 655.3344 us, ub 990.4117 us, ci 0.950
+
+ benchmarking sha256/external
+ mean: 8.715779 ms, lb 8.330540 ms, ub 9.102232 ms, ci 0.950
+ std dev: 1.988089 ms, lb 1.821582 ms, ub 2.181676 ms, ci 0.950
+
+10 kb file:
+
+ benchmarking sha256/internal
+ mean: 1.586105 ms, lb 1.574512 ms, ub 1.604922 ms, ci 0.950
+ std dev: 74.07235 us, lb 51.71688 us, ub 108.1348 us, ci 0.950
+
+ benchmarking sha256/external
+ mean: 6.873742 ms, lb 6.582765 ms, ub 7.252911 ms, ci 0.950
+ std dev: 1.689662 ms, lb 1.346310 ms, ub 2.640399 ms, ci 0.950
+
+It's possible to get nice graphical reports out of Criterion, but
+this is clear enough, so I stopped here. 50 kb seems a reasonable
+cutoff point.
+
+I also used this to benchmark the SHA256 in Haskell's Crypto package.
+Surprisingly, it's a *lot* slower than even the Pure.SHA code.
+On a 50 kb file:
+
+ benchmarking sha256/Crypto
+ collecting 100 samples, 1 iterations each, in estimated 6.073809 s
+ mean: 69.89037 ms, lb 69.15831 ms, ub 70.71845 ms, ci 0.950
+ std dev: 3.995397 ms, lb 3.435775 ms, ub 4.721952 ms, ci 0.950
+
+
+There's another Haskell library, [SHA2](http://hackage.haskell.org/package/SHA2),
+which I should try some time.
diff --git a/doc/design/assistant/syncing.mdwn b/doc/design/assistant/syncing.mdwn
index e3fdca316..aa3c6066b 100644
--- a/doc/design/assistant/syncing.mdwn
+++ b/doc/design/assistant/syncing.mdwn
@@ -10,14 +10,15 @@ all the other git clones, at both the git level and the key/value level.
* transfer info for git-annex-shell **done**
* update files as transfers proceed. See [[progressbars]]
(updating for downloads is easy; for uploads is hard)
-* add Transfer queue TChan
+* add Transfer queue TChan **done**
+* add TransferInfo Map to DaemonStatus for tracking transfers in progress.
+ **done**
+* Poll transfer in progress info files for changes (use inotify again!
+ wow! hammer, meet nail..), and update the TransferInfo Map **done**
* enqueue Transfers (Uploads) as new files are added to the annex by
Watcher.
* enqueue Tranferrs (Downloads) as new dangling symlinks are noticed by
Watcher.
-* add TransferInfo Map to DaemonStatus for tracking transfers in progress.
-* Poll transfer in progress info files for changes (use inotify again!
- wow! hammer, meet nail..), and update the TransferInfo Map
* Write basic Transfer handling thread. Multiple such threads need to be
able to be run at once. Each will need its own independant copy of the
Annex state monad.
@@ -52,6 +53,9 @@ all the other git clones, at both the git level and the key/value level.
signaling a change out of band.
4. Add a hook, so when there's a change to sync, a program can be run
and do its own signaling.
+5. --debug will show often unnecessary work being done. Optimise.
+6. It would be nice if, when a USB drive is connected,
+ syncing starts automatically. Use dbus on Linux?
## misc todo
@@ -89,13 +93,12 @@ anyway.
that lack content.
* Transfer threads started/stopped as necessary to move data.
(May sometimes want multiple threads downloading, or uploading, or even both.)
-
- type TransferQueue = TChan [Transfer]
- -- add (M.Map Transfer TransferInfo) to DaemonStatus
- startTransfer :: Transfer -> Annex TransferID
+ startTransfer :: TransferQueue -> Transfer -> Annex ()
+ startTransfer q transfer = error "TODO"
- stopTransfer :: TransferID -> IO ()
+ stopTransfer :: TransferQueue -> TransferID -> Annex ()
+ stopTransfer q transfer = error "TODO"
The assistant needs to find out when `git-annex-shell` is receiving or
sending (triggered by another remote), so it can add data for those too.
diff --git a/doc/install/OSX/comment_14_6ef2ddb7b11ce6ad54578ae118ed346e._comment b/doc/install/OSX/comment_14_6ef2ddb7b11ce6ad54578ae118ed346e._comment
new file mode 100644
index 000000000..35e0bb6ed
--- /dev/null
+++ b/doc/install/OSX/comment_14_6ef2ddb7b11ce6ad54578ae118ed346e._comment
@@ -0,0 +1,9 @@
+[[!comment format=mdwn
+ username="http://joeyh.name/"
+ subject="comment 14"
+ date="2012-07-04T12:43:54Z"
+ content="""
+@Damien, hmm, it should not be using any cp options, unless when it was built there was a cp in the path that supported some option like -p. Can you check with --debug what cp parameters it's trying to use?
+
+
+"""]]
diff --git a/doc/install/OSX/comment_15_6fd1fad5b6d9f36620e5a0e99edd2f89._comment b/doc/install/OSX/comment_15_6fd1fad5b6d9f36620e5a0e99edd2f89._comment
new file mode 100644
index 000000000..0005328c4
--- /dev/null
+++ b/doc/install/OSX/comment_15_6fd1fad5b6d9f36620e5a0e99edd2f89._comment
@@ -0,0 +1,9 @@
+[[!comment format=mdwn
+ username="http://joeyh.name/"
+ subject="comment 15"
+ date="2012-07-04T13:14:00Z"
+ content="""
+git-annex will now fall back to slower pure Haskell hashing code if `sha256sum`, etc programs are not in PATH. I'd still recommend installing the coreutils, as they're probably faster.
+
+(The `shasum` command seems to come from a perl library, so I have not tried to make git-annex use that one.)
+"""]]