diff options
author | Joey Hess <joey@kitenet.net> | 2012-07-05 09:12:54 -0600 |
---|---|---|
committer | Joey Hess <joey@kitenet.net> | 2012-07-05 09:12:54 -0600 |
commit | b0894f00c075e4dd93a692880e8eb0ea865b6c28 (patch) | |
tree | 913802df5ed3184f83c82a846c8641719e3583a7 | |
parent | b4917bd18fa9e2eacb5fbd916828d30e2ac297b4 (diff) | |
parent | 59f8413abe89b9abe5708fc0ab3aba93fa2c0f64 (diff) |
Merge branch 'master' into assistant
-rw-r--r-- | Backend/SHA.hs | 95 | ||||
-rw-r--r-- | Build/Configure.hs | 9 | ||||
-rw-r--r-- | debian/changelog | 5 | ||||
-rw-r--r-- | doc/bugs/Issue_on_OSX_with_some_system_limits/comment_3_18ddf8b5934dd6fb1676cd6adc7d103b._comment | 19 | ||||
-rw-r--r-- | doc/bugs/git_annex_du.mdwn | 14 | ||||
-rw-r--r-- | doc/design/assistant/blog/day_22__horrible_option_parsing_hack.mdwn | 34 | ||||
-rw-r--r-- | doc/design/assistant/blog/day_23__transfer_watching.mdwn | 25 | ||||
-rw-r--r-- | doc/design/assistant/blog/day_24__airport_digressions.mdwn | 99 | ||||
-rw-r--r-- | doc/design/assistant/syncing.mdwn | 21 | ||||
-rw-r--r-- | doc/install/OSX/comment_14_6ef2ddb7b11ce6ad54578ae118ed346e._comment | 9 | ||||
-rw-r--r-- | doc/install/OSX/comment_15_6fd1fad5b6d9f36620e5a0e99edd2f89._comment | 9 |
11 files changed, 284 insertions, 55 deletions
diff --git a/Backend/SHA.hs b/Backend/SHA.hs index 838a97ab8..7abbf8035 100644 --- a/Backend/SHA.hs +++ b/Backend/SHA.hs @@ -1,6 +1,6 @@ {- git-annex SHA backend - - - Copyright 2011 Joey Hess <joey@kitenet.net> + - Copyright 2011,2012 Joey Hess <joey@kitenet.net> - - Licensed under the GNU GPL version 3 or higher. -} @@ -12,7 +12,10 @@ import qualified Annex import Types.Backend import Types.Key import Types.KeySource + import qualified Build.SysConfig as SysConfig +import Data.Digest.Pure.SHA +import qualified Data.ByteString.Lazy as L type SHASize = Int @@ -25,32 +28,19 @@ backends :: [Backend] backends = catMaybes $ map genBackend sizes ++ map genBackendE sizes genBackend :: SHASize -> Maybe Backend -genBackend size - | isNothing (shaCommand size) = Nothing - | otherwise = Just b - where - b = Backend - { name = shaName size - , getKey = keyValue size - , fsckKey = Just $ checkKeyChecksum size - } +genBackend size = Just $ Backend + { name = shaName size + , getKey = keyValue size + , fsckKey = Just $ checkKeyChecksum size + } genBackendE :: SHASize -> Maybe Backend -genBackendE size = - case genBackend size of - Nothing -> Nothing - Just b -> Just $ b - { name = shaNameE size - , getKey = keyValueE size - } - -shaCommand :: SHASize -> Maybe String -shaCommand 1 = SysConfig.sha1 -shaCommand 256 = Just SysConfig.sha256 -shaCommand 224 = SysConfig.sha224 -shaCommand 384 = SysConfig.sha384 -shaCommand 512 = SysConfig.sha512 -shaCommand _ = Nothing +genBackendE size = do + b <- genBackend size + return $ b + { name = shaNameE size + , getKey = keyValueE size + } shaName :: SHASize -> String shaName size = "SHA" ++ show size @@ -58,27 +48,48 @@ shaName size = "SHA" ++ show size shaNameE :: SHASize -> String shaNameE size = shaName size ++ "E" -shaN :: SHASize -> FilePath -> Annex String -shaN size file = do +shaN :: SHASize -> FilePath -> Integer -> Annex String +shaN shasize file filesize = do showAction "checksum" - liftIO $ pOpen ReadFromPipe command (toCommand 
[File file]) $ \h -> do - sha <- fst . separate (== ' ') <$> hGetLine h - if null sha - then error $ command ++ " parse error" - else return sha + case shaCommand shasize filesize of + Left sha -> liftIO $ sha <$> L.readFile file + Right command -> liftIO $ runcommand command + where + runcommand command = + pOpen ReadFromPipe command (toCommand [File file]) $ \h -> do + sha <- fst . separate (== ' ') <$> hGetLine h + if null sha + then error $ command ++ " parse error" + else return sha + +shaCommand :: SHASize -> Integer -> Either (L.ByteString -> String) String +shaCommand shasize filesize + | shasize == 1 = use SysConfig.sha1 sha1 + | shasize == 256 = use SysConfig.sha256 sha256 + | shasize == 224 = use SysConfig.sha224 sha224 + | shasize == 384 = use SysConfig.sha384 sha384 + | shasize == 512 = use SysConfig.sha512 sha512 + | otherwise = error $ "bad sha size " ++ show shasize where - command = fromJust $ shaCommand size + use Nothing sha = Left $ showDigest . sha + use (Just c) sha + -- use builtin, but slower sha for small files + -- benchmarking indicates it's faster up to + -- and slightly beyond 50 kb files + | filesize < 51200 = use Nothing sha + | otherwise = Right c {- A key is a checksum of its contents. -} keyValue :: SHASize -> KeySource -> Annex (Maybe Key) -keyValue size source = do +keyValue shasize source = do let file = contentLocation source - s <- shaN size file stat <- liftIO $ getFileStatus file + let filesize = fromIntegral $ fileSize stat + s <- shaN shasize file filesize return $ Just $ stubKey { keyName = s - , keyBackendName = shaName size - , keySize = Just $ fromIntegral $ fileSize stat + , keyBackendName = shaName shasize + , keySize = Just filesize } {- Extension preserving keys. 
-} @@ -101,10 +112,12 @@ keyValueE size source = keyValue size source >>= maybe (return Nothing) addE checkKeyChecksum :: SHASize -> Key -> FilePath -> Annex Bool checkKeyChecksum size key file = do fast <- Annex.getState Annex.fast - present <- liftIO $ doesFileExist file - if not present || fast - then return True - else check <$> shaN size file + mstat <- liftIO $ catchMaybeIO $ getFileStatus file + case (mstat, fast) of + (Just stat, False) -> do + let filesize = fromIntegral $ fileSize stat + check <$> shaN size file filesize + _ -> return True where check s | s == dropExtension (keyName key) = True diff --git a/Build/Configure.hs b/Build/Configure.hs index 7af53cf10..24743bf61 100644 --- a/Build/Configure.hs +++ b/Build/Configure.hs @@ -28,15 +28,14 @@ tests = , TestCase "gpg" $ testCmd "gpg" "gpg --version >/dev/null" , TestCase "lsof" $ testCmd "lsof" "lsof -v >/dev/null 2>&1" , TestCase "ssh connection caching" getSshConnectionCaching - ] ++ shaTestCases False [1, 512, 224, 384] ++ shaTestCases True [256] + ] ++ shaTestCases [1, 256, 512, 224, 384] -shaTestCases :: Bool -> [Int] -> [TestCase] -shaTestCases required l = map make l +shaTestCases :: [Int] -> [TestCase] +shaTestCases l = map make l where - make n = TestCase key $ selector key (shacmds n) "</dev/null" + make n = TestCase key $ maybeSelectCmd key (shacmds n) "</dev/null" where key = "sha" ++ show n - selector = if required then selectCmd else maybeSelectCmd shacmds n = concatMap (\x -> [x, osxpath </> x]) $ map (\x -> "sha" ++ show n ++ x) ["", "sum"] -- Max OSX puts GNU tools outside PATH, so look in diff --git a/debian/changelog b/debian/changelog index 33c850861..1c44f5952 100644 --- a/debian/changelog +++ b/debian/changelog @@ -4,6 +4,11 @@ git-annex (3.20120630) UNRELEASED; urgency=low transfer is already in progress by another process. * status: Lists transfers that are currently in progress. * Fix passing --uuid to git-annex-shell. 
+ * When shaNsum commands cannot be found, use the Haskell SHA library + (already a dependency) to do the checksumming. This may be slower, + but avoids portability problems. + * Use SHA library for files less than 50 kb in size, at which point it's + faster than forking the more optimised external program. -- Joey Hess <joeyh@debian.org> Sun, 01 Jul 2012 15:04:37 -0400 diff --git a/doc/bugs/Issue_on_OSX_with_some_system_limits/comment_3_18ddf8b5934dd6fb1676cd6adc7d103b._comment b/doc/bugs/Issue_on_OSX_with_some_system_limits/comment_3_18ddf8b5934dd6fb1676cd6adc7d103b._comment new file mode 100644 index 000000000..eb886acf6 --- /dev/null +++ b/doc/bugs/Issue_on_OSX_with_some_system_limits/comment_3_18ddf8b5934dd6fb1676cd6adc7d103b._comment @@ -0,0 +1,19 @@ +[[!comment format=mdwn + username="http://joeyh.name/" + subject="comment 3" + date="2012-07-04T12:32:44Z" + content=""" +Jimmy, sounds like I could use something like this to get the current limit: + + sysctl kern.maxfilesperproc + +Probably prints \"sysctl kern.maxfilesperproc = 256\" or such.. can you verify? +Once I have the limit, I can make the kqueue code use a subset of it, and print out a message when it needs to be increased, like the inotify code does. + +(Also, the kqueue code only opens directories, not files, so unless you have 400000 directories, that's +a little high.) + +--- + +On file removal not propagating, does this still happen? When you remove a file does a git commit automatically happen, or is that broken with kqueue? +"""]] diff --git a/doc/bugs/git_annex_du.mdwn b/doc/bugs/git_annex_du.mdwn new file mode 100644 index 000000000..2b1315298 --- /dev/null +++ b/doc/bugs/git_annex_du.mdwn @@ -0,0 +1,14 @@ +We need a way to calculate space taken by certain files. + +Use cases: I want to drop some files from my small disk. I need to figure out things that take most space, and drop them. 
+ +Usage examples: + + git annex du -hs *.mp3 + git annex du -sBm --in=here *.ogg + +Would be nice if it was compatible with standard unix `df`. + +> `du -L` works. +> +> See also: [[forum/Wishlist:_getting_the_disk_used_by_a_subtree_of_files]] diff --git a/doc/design/assistant/blog/day_22__horrible_option_parsing_hack.mdwn b/doc/design/assistant/blog/day_22__horrible_option_parsing_hack.mdwn new file mode 100644 index 000000000..9f59d1af9 --- /dev/null +++ b/doc/design/assistant/blog/day_22__horrible_option_parsing_hack.mdwn @@ -0,0 +1,34 @@ +Well, sometimes you just have to go for the hack. Trying to find a way +to add additional options to git-annex-shell without breaking backwards +compatibility, I noticed that it ignores all options after `--`, because +those tend to be random rsync options due to the way rsync runs it. + +So, I've added a new class of options, that come in between, like +`-- opt=val opt=val ... --` + +The parser for these will not choke on unknown options, unlike normal +getopt. So this let me add the additional info I needed to +pass to git-annex-shell to make it record transfer information. And +if I need to pass more info in the future, that's covered too. + +It's ugly, but since only git-annex runs git-annex-shell, this is an +ugliness only I (and now you, dear reader) have to put up with. + +Note to self: Command-line programs are sometimes an API, particularly +if designed to be called remotely, and so it makes sense to consider +whether they are, and design expandability into them from day 1. + +--- + +Anyway, we now have full transfer tracking in git-annex! Both sides of +a transfer know what's being transferred, and from where, and have +the info necessary to interrupt the transfer. + +--- + +Also did some basic groundwork, adding a queue of transfers to perform, +and adding to the daemon's status information a map of currently running +transfers. 
+ +Next up: The daemon will use inotify to notice new and deleted transfer +info files, and update its status info. diff --git a/doc/design/assistant/blog/day_23__transfer_watching.mdwn b/doc/design/assistant/blog/day_23__transfer_watching.mdwn new file mode 100644 index 000000000..beaf75bc5 --- /dev/null +++ b/doc/design/assistant/blog/day_23__transfer_watching.mdwn @@ -0,0 +1,25 @@ +Starting to travel, so limited time today. + +Yet Another Thread added to the assistant, all it does is watch for changes +to transfer information files, and update the assistant's map of transfers +currently in progress. Now the assistant will know if some other repository +has connected to the local repo and is sending or receiving a file's +content. + +This seemed really simple to write, it's just 78 lines of code. It worked +100% correctly the first time. :) But it's only so easy because I've got +this shiny new inotify hammer that I keep finding places to use in the +assistant. + +Also, the new thread does some things that caused a similar thread (the +merger thread) to go into a MVar deadlock. Luckily, I spent much of +[day 19](day_19__random_improvements) investigating and fixing that +deadlock, even though it was not a problem at the time. + +So, good.. I'm doing things right and getting to a place where rather +nontrivial features can be added easily. + +-- + +Next up: Enough nonsense with tracking transfers... Time to start actually +transferring content around! diff --git a/doc/design/assistant/blog/day_24__airport_digressions.mdwn b/doc/design/assistant/blog/day_24__airport_digressions.mdwn new file mode 100644 index 000000000..695296974 --- /dev/null +++ b/doc/design/assistant/blog/day_24__airport_digressions.mdwn @@ -0,0 +1,99 @@ +In a series of airport layovers all day. Since I woke up at 3:45 am, +didn't feel up to doing serious new work, so instead I worked through some +OSX support backlog. 
+ +git-annex will now use Haskell's SHA library if the `sha256sum` +command is not available. That library is slow, but it's guaranteed to be +available; git-annex already depended on it to calculate HMACs. + +Then I decided to see if it makes sense to use the SHA library +when adding smaller files. At some point, its slower implementation should +win over needing to fork and parse the output of `sha256sum`. This was +the first time I tried out Haskell's +[Criterion](http://hackage.haskell.org/package/criterion) benchmarker, +and I built this simple benchmark in short order. + +[[!format haskell """ +import Data.Digest.Pure.SHA +import Data.ByteString.Lazy as L +import Criterion.Main +import Common + +testfile :: FilePath +testfile = "/tmp/bar" -- on ram disk + +main = defaultMain + [ bgroup "sha256" + [ bench "internal" $ whnfIO internal + , bench "external" $ whnfIO external + ] + ] + +internal :: IO String +internal = showDigest . sha256 <$> L.readFile testfile + +external :: IO String +external = pOpen ReadFromPipe "sha256sum" [testfile] $ \h -> + fst . separate (== ' ') <$> hGetLine h +"""]] + +The nice thing about benchmarking in Airports is when you're running a +benchmark locally, you don't want to do anything else with the computer, +so can alternate people watching, spacing out, and analyzing results. 
+ +100 kb file: + + benchmarking sha256/internal + mean: 15.64729 ms, lb 15.29590 ms, ub 16.10119 ms, ci 0.950 + std dev: 2.032476 ms, lb 1.638016 ms, ub 2.527089 ms, ci 0.950 + + benchmarking sha256/external + mean: 8.217700 ms, lb 7.931324 ms, ub 8.568805 ms, ci 0.950 + std dev: 1.614786 ms, lb 1.357791 ms, ub 2.009682 ms, ci 0.950 + +75 kb file: + + benchmarking sha256/internal + mean: 12.16099 ms, lb 11.89566 ms, ub 12.50317 ms, ci 0.950 + std dev: 1.531108 ms, lb 1.232353 ms, ub 1.929141 ms, ci 0.950 + + benchmarking sha256/external + mean: 8.818731 ms, lb 8.425744 ms, ub 9.269550 ms, ci 0.950 + std dev: 2.158530 ms, lb 1.916067 ms, ub 2.487242 ms, ci 0.950 + +50 kb file: + + benchmarking sha256/internal + mean: 7.699274 ms, lb 7.560254 ms, ub 7.876605 ms, ci 0.950 + std dev: 801.5292 us, lb 655.3344 us, ub 990.4117 us, ci 0.950 + + benchmarking sha256/external + mean: 8.715779 ms, lb 8.330540 ms, ub 9.102232 ms, ci 0.950 + std dev: 1.988089 ms, lb 1.821582 ms, ub 2.181676 ms, ci 0.950 + +10 kb file: + + benchmarking sha256/internal + mean: 1.586105 ms, lb 1.574512 ms, ub 1.604922 ms, ci 0.950 + std dev: 74.07235 us, lb 51.71688 us, ub 108.1348 us, ci 0.950 + + benchmarking sha256/external + mean: 6.873742 ms, lb 6.582765 ms, ub 7.252911 ms, ci 0.950 + std dev: 1.689662 ms, lb 1.346310 ms, ub 2.640399 ms, ci 0.950 + +It's possible to get nice graphical reports out of Criterion, but +this is clear enough, so I stopped here. 50 kb seems a reasonable +cutoff point. + +I also used this to benchmark the SHA256 in Haskell's Crypto package. +Surprisingly, it's a *lot* slower than even the Pure.SHA code. +On a 50 kb file: + + benchmarking sha256/Crypto + collecting 100 samples, 1 iterations each, in estimated 6.073809 s + mean: 69.89037 ms, lb 69.15831 ms, ub 70.71845 ms, ci 0.950 + std dev: 3.995397 ms, lb 3.435775 ms, ub 4.721952 ms, ci 0.950 + + +There's another Haskell library, [SHA2](http://hackage.haskell.org/package/SHA2), +which I should try some time. 
diff --git a/doc/design/assistant/syncing.mdwn b/doc/design/assistant/syncing.mdwn index e3fdca316..aa3c6066b 100644 --- a/doc/design/assistant/syncing.mdwn +++ b/doc/design/assistant/syncing.mdwn @@ -10,14 +10,15 @@ all the other git clones, at both the git level and the key/value level. * transfer info for git-annex-shell **done** * update files as transfers proceed. See [[progressbars]] (updating for downloads is easy; for uploads is hard) -* add Transfer queue TChan +* add Transfer queue TChan **done** +* add TransferInfo Map to DaemonStatus for tracking transfers in progress. + **done** +* Poll transfer in progress info files for changes (use inotify again! + wow! hammer, meet nail..), and update the TransferInfo Map **done** * enqueue Transfers (Uploads) as new files are added to the annex by Watcher. * enqueue Transfers (Downloads) as new dangling symlinks are noticed by Watcher. -* add TransferInfo Map to DaemonStatus for tracking transfers in progress. -* Poll transfer in progress info files for changes (use inotify again! - wow! hammer, meet nail..), and update the TransferInfo Map * Write basic Transfer handling thread. Multiple such threads need to be able to be run at once. Each will need its own independent copy of the Annex state monad. @@ -52,6 +53,9 @@ all the other git clones, at both the git level and the key/value level. signaling a change out of band. 4. Add a hook, so when there's a change to sync, a program can be run and do its own signaling. +5. --debug will show often unnecessary work being done. Optimise. +6. It would be nice if, when a USB drive is connected, + syncing starts automatically. Use dbus on Linux? ## misc todo @@ -89,13 +93,12 @@ anyway. that lack content. * Transfer threads started/stopped as necessary to move data. (May sometimes want multiple threads downloading, or uploading, or even both.) 
- - type TransferQueue = TChan [Transfer] - -- add (M.Map Transfer TransferInfo) to DaemonStatus - startTransfer :: Transfer -> Annex TransferID + startTransfer :: TransferQueue -> Transfer -> Annex () + startTransfer q transfer = error "TODO" - stopTransfer :: TransferID -> IO () + stopTransfer :: TransferQueue -> TransferID -> Annex () + stopTransfer q transfer = error "TODO" The assistant needs to find out when `git-annex-shell` is receiving or sending (triggered by another remote), so it can add data for those too. diff --git a/doc/install/OSX/comment_14_6ef2ddb7b11ce6ad54578ae118ed346e._comment b/doc/install/OSX/comment_14_6ef2ddb7b11ce6ad54578ae118ed346e._comment new file mode 100644 index 000000000..35e0bb6ed --- /dev/null +++ b/doc/install/OSX/comment_14_6ef2ddb7b11ce6ad54578ae118ed346e._comment @@ -0,0 +1,9 @@ +[[!comment format=mdwn + username="http://joeyh.name/" + subject="comment 14" + date="2012-07-04T12:43:54Z" + content=""" +@Damien, hmm, it should not be using any cp options, unless when it was built there was a cp in the path that supported some option like -p. Can you check with --debug what cp parameters it's trying to use? + + +"""]] diff --git a/doc/install/OSX/comment_15_6fd1fad5b6d9f36620e5a0e99edd2f89._comment b/doc/install/OSX/comment_15_6fd1fad5b6d9f36620e5a0e99edd2f89._comment new file mode 100644 index 000000000..0005328c4 --- /dev/null +++ b/doc/install/OSX/comment_15_6fd1fad5b6d9f36620e5a0e99edd2f89._comment @@ -0,0 +1,9 @@ +[[!comment format=mdwn + username="http://joeyh.name/" + subject="comment 15" + date="2012-07-04T13:14:00Z" + content=""" +git-annex will now fall back to slower pure Haskell hashing code if `sha256sum`, etc programs are not in PATH. I'd still recommend installing the coreutils, as they're probably faster. + +(The `shasum` command seems to come from a perl library, so I have not tried to make git-annex use that one.) +"""]] |