summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorGravatar Joey Hess <joeyh@joeyh.name>2017-05-15 18:10:13 -0400
committerGravatar Joey Hess <joeyh@joeyh.name>2017-05-15 21:36:03 -0400
commit973180b077e60b5d12d7c57d926878d11d7f2105 (patch)
tree7d3bc5e651d330c1945c81f356b59eca90e73ea2
parent0d85a42333484e1acb8e4942a619087768bc62fb (diff)
stop using MissingH for MD5
Cryptonite is faster and allocates less, and I want to get rid of MissingH use. Note that the new dependency on memory is free; it's a dependency of cryptonite. This commit was supported by the NSF-funded DataLad project.
-rw-r--r--Annex/DirHashes.hs24
-rw-r--r--Annex/Ssh.hs4
-rw-r--r--Annex/VariantFile.hs5
-rw-r--r--Backend/Utilities.hs6
-rw-r--r--Utility/FileSystemEncoding.hs6
-rw-r--r--Utility/Hash.hs6
-rw-r--r--Utility/LockFile/PidLock.hs7
-rw-r--r--Utility/Tuple.hs15
-rw-r--r--debian/control1
-rw-r--r--doc/todo/make_copy_--fast__faster/comment_11_1ca8d9765e6e3a18ae09df74bc390a00._comment49
-rw-r--r--git-annex.cabal3
11 files changed, 97 insertions, 29 deletions
diff --git a/Annex/DirHashes.hs b/Annex/DirHashes.hs
index 82d751eee..f8438484d 100644
--- a/Annex/DirHashes.hs
+++ b/Annex/DirHashes.hs
@@ -1,6 +1,6 @@
{- git-annex file locations
-
- - Copyright 2010-2015 Joey Hess <id@joeyh.name>
+ - Copyright 2010-2017 Joey Hess <id@joeyh.name>
-
- Licensed under the GNU GPL version 3 or higher.
-}
@@ -19,14 +19,15 @@ module Annex.DirHashes (
import Data.Bits
import Data.Word
-import Data.Hash.MD5
import Data.Default
+import qualified Data.ByteArray
import Common
import Key
import Types.GitConfig
import Types.Difference
import Utility.FileSystemEncoding
+import Utility.Hash
type Hasher = Key -> FilePath
@@ -62,15 +63,24 @@ hashDirs :: HashLevels -> Int -> String -> FilePath
hashDirs (HashLevels 1) sz s = addTrailingPathSeparator $ take sz s
hashDirs _ sz s = addTrailingPathSeparator $ take sz s </> drop sz s
+hashDirLower :: HashLevels -> Hasher
+hashDirLower n k = hashDirs n 3 $ take 6 $ show $ md5 $
+ encodeBS $ key2file $ nonChunkKey k
+
+{- This was originally using Data.Hash.MD5 from MissingH. This new version
+- is faster, but ugly as it has to replicate the 4 Word32's that produced. -}
hashDirMixed :: HashLevels -> Hasher
-hashDirMixed n k = hashDirs n 2 $ take 4 $ display_32bits_as_dir =<< [a,b,c,d]
+hashDirMixed n k = hashDirs n 2 $ take 4 $ concatMap display_32bits_as_dir $
+ encodeWord32 $ map fromIntegral $ Data.ByteArray.unpack $
+ Utility.Hash.md5 $ encodeBS $ key2file $ nonChunkKey k
where
- ABCD (a,b,c,d) = md5 $ md5FilePath $ key2file $ nonChunkKey k
-
-hashDirLower :: HashLevels -> Hasher
-hashDirLower n k = hashDirs n 3 $ take 6 $ md5s $ md5FilePath $ key2file $ nonChunkKey k
+ encodeWord32 (b1:b2:b3:b4:rest) =
+ (shiftL b4 24 .|. shiftL b3 16 .|. shiftL b2 8 .|. b1)
+ : encodeWord32 rest
+ encodeWord32 _ = []
{- modified version of display_32bits_as_hex from Data.Hash.MD5
+ - in MissingH
- Copyright (C) 2001 Ian Lynagh
- License: Either BSD or GPL
-}
diff --git a/Annex/Ssh.hs b/Annex/Ssh.hs
index c53802941..a9ff91751 100644
--- a/Annex/Ssh.hs
+++ b/Annex/Ssh.hs
@@ -33,6 +33,7 @@ import Config
import Annex.Path
import Utility.Env
import Utility.FileSystemEncoding
+import Utility.Hash
import Types.CleanupActions
import Types.Concurrency
import Git.Env
@@ -42,7 +43,6 @@ import Annex.Perms
import Annex.LockPool
#endif
-import Data.Hash.MD5
import Control.Concurrent.STM
{- Some ssh commands are fed stdin on a pipe and so should be allowed to
@@ -287,7 +287,7 @@ hostport2socket host Nothing = hostport2socket' host
hostport2socket host (Just port) = hostport2socket' $ host ++ "!" ++ show port
hostport2socket' :: String -> FilePath
hostport2socket' s
- | length s > lengthofmd5s = md5s (Str s)
+ | length s > lengthofmd5s = show $ md5 $ encodeBS s
| otherwise = s
where
lengthofmd5s = 32
diff --git a/Annex/VariantFile.hs b/Annex/VariantFile.hs
index 17658a9c6..8365073d5 100644
--- a/Annex/VariantFile.hs
+++ b/Annex/VariantFile.hs
@@ -9,8 +9,7 @@ module Annex.VariantFile where
import Annex.Common
import Utility.FileSystemEncoding
-
-import Data.Hash.MD5
+import Utility.Hash
variantMarker :: String
variantMarker = ".variant-"
@@ -42,4 +41,4 @@ variantFile file key
doubleconflict = variantMarker `isInfixOf` file
shortHash :: String -> String
-shortHash = take 4 . md5s . md5FilePath
+shortHash = take 4 . show . md5 . encodeBS
diff --git a/Backend/Utilities.hs b/Backend/Utilities.hs
index d1fb94f2a..1691fa2b2 100644
--- a/Backend/Utilities.hs
+++ b/Backend/Utilities.hs
@@ -7,10 +7,9 @@
module Backend.Utilities where
-import Data.Hash.MD5
-
import Annex.Common
import Utility.FileSystemEncoding
+import Utility.Hash
{- Generates a keyName from an input string. Takes care of sanitizing it.
- If it's not too long, the full string is used as the keyName.
@@ -20,7 +19,8 @@ genKeyName :: String -> String
genKeyName s
-- Avoid making keys longer than the length of a SHA256 checksum.
| bytelen > sha256len =
- truncateFilePath (sha256len - md5len - 1) s' ++ "-" ++ md5s (Str s)
+ truncateFilePath (sha256len - md5len - 1) s' ++ "-" ++
+ show (md5 (encodeBS s))
| otherwise = s'
where
s' = preSanitizeKeyName s
diff --git a/Utility/FileSystemEncoding.hs b/Utility/FileSystemEncoding.hs
index ae3bd35d7..785b078ef 100644
--- a/Utility/FileSystemEncoding.hs
+++ b/Utility/FileSystemEncoding.hs
@@ -12,7 +12,6 @@ module Utility.FileSystemEncoding (
useFileSystemEncoding,
fileEncoding,
withFilePath,
- md5FilePath,
decodeBS,
encodeBS,
decodeW8,
@@ -27,7 +26,6 @@ import qualified GHC.IO.Encoding as Encoding
import Foreign.C
import System.IO
import System.IO.Unsafe
-import qualified Data.Hash.MD5 as MD5
import Data.Word
import Data.Bits.Utils
import Data.List
@@ -101,10 +99,6 @@ _encodeFilePath fp = unsafePerformIO $ do
GHC.withCString enc fp (GHC.peekCString Encoding.char8)
`catchNonAsync` (\_ -> return fp)
-{- Encodes a FilePath into a Md5.Str, applying the filesystem encoding. -}
-md5FilePath :: FilePath -> MD5.Str
-md5FilePath = MD5.Str . _encodeFilePath
-
{- Decodes a ByteString into a FilePath, applying the filesystem encoding. -}
decodeBS :: L.ByteString -> FilePath
#ifndef mingw32_HOST_OS
diff --git a/Utility/Hash.hs b/Utility/Hash.hs
index b6bf996f8..70f826b7a 100644
--- a/Utility/Hash.hs
+++ b/Utility/Hash.hs
@@ -1,8 +1,4 @@
-{- Convenience wrapper around cryptohash/cryptonite.
- -
- - SHA3 hashes are currently only enabled when using cryptonite,
- - because of https://github.com/vincenthz/hs-cryptohash/issues/36
- -}
+{- Convenience wrapper around cryptonite's hashing. -}
module Utility.Hash (
sha1,
diff --git a/Utility/LockFile/PidLock.hs b/Utility/LockFile/PidLock.hs
index 87c11c01c..23560fa57 100644
--- a/Utility/LockFile/PidLock.hs
+++ b/Utility/LockFile/PidLock.hs
@@ -25,6 +25,8 @@ import Utility.Path
import Utility.FileMode
import Utility.LockFile.LockStatus
import Utility.ThreadScheduler
+import Utility.Hash
+import Utility.FileSystemEncoding
import qualified Utility.LockFile.Posix as Posix
import System.IO
@@ -33,7 +35,6 @@ import Data.Maybe
import Data.List
import Network.BSD
import System.FilePath
-import Data.Hash.MD5
import Control.Applicative
import Prelude
@@ -99,7 +100,9 @@ sideLockFile lockfile = do
f <- absPath lockfile
let base = intercalate "_" (splitDirectories (makeRelative "/" f))
let shortbase = reverse $ take 32 $ reverse base
- let md5sum = if base == shortbase then "" else md5s (Str base)
+ let md5sum = if base == shortbase
+ then ""
+ else show (md5 (encodeBS base))
dir <- ifM (doesDirectoryExist "/dev/shm")
( return "/dev/shm"
, return "/tmp"
diff --git a/Utility/Tuple.hs b/Utility/Tuple.hs
new file mode 100644
index 000000000..78dd5d0c9
--- /dev/null
+++ b/Utility/Tuple.hs
@@ -0,0 +1,15 @@
+{- tuple utility functions
+ -
+ - Copyright 2017 Joey Hess <id@joeyh.name>
+ -
+ - License: BSD-2-clause
+ -}
+
+fst3 :: (a,b,c) -> a
+fst3 (a,b,c) = a
+
+snd3 :: (a,b,c) -> b
+snd3 (a,b,c) = b
+
+thd3 :: (a,b,c) -> c
+thd3 (a,b,c) = c
diff --git a/debian/control b/debian/control
index fa68c87fe..1d012cea0 100644
--- a/debian/control
+++ b/debian/control
@@ -11,6 +11,7 @@ Build-Depends:
libghc-hslogger-dev,
libghc-pcre-light-dev,
libghc-cryptonite-dev,
+ libghc-memory-dev,
libghc-sandi-dev,
libghc-utf8-string-dev,
libghc-aws-dev (>= 0.9.2-2~),
diff --git a/doc/todo/make_copy_--fast__faster/comment_11_1ca8d9765e6e3a18ae09df74bc390a00._comment b/doc/todo/make_copy_--fast__faster/comment_11_1ca8d9765e6e3a18ae09df74bc390a00._comment
new file mode 100644
index 000000000..619351d4c
--- /dev/null
+++ b/doc/todo/make_copy_--fast__faster/comment_11_1ca8d9765e6e3a18ae09df74bc390a00._comment
@@ -0,0 +1,49 @@
+[[!comment format=mdwn
+ username="joey"
+ subject="""comment 11"""
+ date="2017-05-15T21:56:52Z"
+ content="""
+Switched from MissingH to cryptonite for md5. It did move md5 out of the top CPU spot but
+the overall runtime didn't change much. Memory allocations did go down by a
+good amount.
+
+Updated profiles:
+
+ git-annex +RTS -p -RTS find
+
+ total time = 1.63 secs (1629 ticks @ 1000 us, 1 processor)
+ total alloc = 1,496,336,496 bytes (excludes profiling overheads)
+
+ COST CENTRE MODULE SRC %time %alloc
+
+ catchIO Utility.Exception Utility/Exception.hs:79:1-17 14.1 15.1
+ inAnnex'.checkindirect Annex.Content Annex/Content.hs:(108,9)-(119,39) 10.6 4.8
+ catches Control.Monad.Catch src/Control/Monad/Catch.hs:(432,1)-(436,76) 8.6 6.9
+ spanList Data.List.Utils src/Data/List/Utils.hs:(150,1)-(155,36) 6.7 11.1
+ isAnnexLink Annex.Link Annex/Link.hs:35:1-85 5.0 10.2
+ keyFile Annex.Locations Annex/Locations.hs:(456,1)-(462,19) 5.0 7.0
+ readish Utility.PartialPrelude Utility/PartialPrelude.hs:(48,1)-(50,20) 3.8 2.0
+ startswith Data.List.Utils src/Data/List/Utils.hs:103:1-23 3.6 2.3
+ splitc Utility.Misc Utility/Misc.hs:(52,1)-(54,25) 3.4 6.5
+ s2w8 Data.Bits.Utils src/Data/Bits/Utils.hs:65:1-15 2.6 6.4
+ keyPath Annex.Locations Annex/Locations.hs:(492,1)-(494,23) 2.5 4.4
+ fileKey.unesc Annex.Locations Annex/Locations.hs:(469,9)-(474,39) 2.0 3.5
+ copyAndFreeze Data.ByteArray.Methods Data/ByteArray/Methods.hs:(224,1)-(227,21) 1.8 0.5
+
+ git-annex +RTS -p -RTS find --not --in web
+
+ total time = 5.33 secs (5327 ticks @ 1000 us, 1 processor)
+ total alloc = 2,908,489,000 bytes (excludes profiling overheads)
+
+ COST CENTRE MODULE SRC %time %alloc
+
+ catObjectDetails.\ Git.CatFile Git/CatFile.hs:(80,72)-(88,97) 7.8 2.8
+ catchIO Utility.Exception Utility/Exception.hs:79:1-17 7.6 8.3
+ spanList Data.List.Utils src/Data/List/Utils.hs:(150,1)-(155,36) 5.8 9.1
+ readish Utility.PartialPrelude Utility/PartialPrelude.hs:(48,1)-(50,20) 4.5 4.0
+ parseResp Git.CatFile Git/CatFile.hs:(113,1)-(124,28) 4.4 2.9
+ readFileStrict Utility.Misc Utility/Misc.hs:33:1-59 3.7 1.6
+ catches Control.Monad.Catch src/Control/Monad/Catch.hs:(432,1)-(436,76) 3.1 3.6
+ encodeW8 Utility.FileSystemEncoding Utility/FileSystemEncoding.hs:(131,1)-(133,70) 3.1 2.3
+
+"""]]
diff --git a/git-annex.cabal b/git-annex.cabal
index 545fab3f7..8d8255027 100644
--- a/git-annex.cabal
+++ b/git-annex.cabal
@@ -361,7 +361,8 @@ Executable git-annex
stm-chans,
securemem,
crypto-api,
- cryptonite
+ cryptonite,
+ memory
CC-Options: -Wall
GHC-Options: -Wall -fno-warn-tabs
Extensions: PackageImports