From 973180b077e60b5d12d7c57d926878d11d7f2105 Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Mon, 15 May 2017 18:10:13 -0400 Subject: stop using MissingH for MD5 Cryptonite is faster and allocates less, and I want to get rid of MissingH use. Note that the new dependency on memory is free; it's a dependency of cryptonite. This commit was supported by the NSF-funded DataLad project. --- Annex/DirHashes.hs | 24 +++++++---- Annex/Ssh.hs | 4 +- Annex/VariantFile.hs | 5 +-- Backend/Utilities.hs | 6 +-- Utility/FileSystemEncoding.hs | 6 --- Utility/Hash.hs | 6 +-- Utility/LockFile/PidLock.hs | 7 +++- Utility/Tuple.hs | 15 +++++++ debian/control | 1 + ...nt_11_1ca8d9765e6e3a18ae09df74bc390a00._comment | 49 ++++++++++++++++++++++ git-annex.cabal | 3 +- 11 files changed, 97 insertions(+), 29 deletions(-) create mode 100644 Utility/Tuple.hs create mode 100644 doc/todo/make_copy_--fast__faster/comment_11_1ca8d9765e6e3a18ae09df74bc390a00._comment diff --git a/Annex/DirHashes.hs b/Annex/DirHashes.hs index 82d751eee..f8438484d 100644 --- a/Annex/DirHashes.hs +++ b/Annex/DirHashes.hs @@ -1,6 +1,6 @@ {- git-annex file locations - - - Copyright 2010-2015 Joey Hess + - Copyright 2010-2017 Joey Hess - - Licensed under the GNU GPL version 3 or higher. -} @@ -19,14 +19,15 @@ module Annex.DirHashes ( import Data.Bits import Data.Word -import Data.Hash.MD5 import Data.Default +import qualified Data.ByteArray import Common import Key import Types.GitConfig import Types.Difference import Utility.FileSystemEncoding +import Utility.Hash type Hasher = Key -> FilePath @@ -62,15 +63,24 @@ hashDirs :: HashLevels -> Int -> String -> FilePath hashDirs (HashLevels 1) sz s = addTrailingPathSeparator $ take sz s hashDirs _ sz s = addTrailingPathSeparator $ take sz s drop sz s +hashDirLower :: HashLevels -> Hasher +hashDirLower n k = hashDirs n 3 $ take 6 $ show $ md5 $ + encodeBS $ key2file $ nonChunkKey k + +{- This was originally using Data.Hash.MD5 from MissingH. This new version +- is faster, but ugly as it has to replicate the 4 Word32's that produced. -} hashDirMixed :: HashLevels -> Hasher -hashDirMixed n k = hashDirs n 2 $ take 4 $ display_32bits_as_dir =<< [a,b,c,d] +hashDirMixed n k = hashDirs n 2 $ take 4 $ concatMap display_32bits_as_dir $ + encodeWord32 $ map fromIntegral $ Data.ByteArray.unpack $ + Utility.Hash.md5 $ encodeBS $ key2file $ nonChunkKey k where - ABCD (a,b,c,d) = md5 $ md5FilePath $ key2file $ nonChunkKey k - -hashDirLower :: HashLevels -> Hasher -hashDirLower n k = hashDirs n 3 $ take 6 $ md5s $ md5FilePath $ key2file $ nonChunkKey k + encodeWord32 (b1:b2:b3:b4:rest) = + (shiftL b4 24 .|. shiftL b3 16 .|. shiftL b2 8 .|. b1) + : encodeWord32 rest + encodeWord32 _ = [] {- modified version of display_32bits_as_hex from Data.Hash.MD5 + - in MissingH - Copyright (C) 2001 Ian Lynagh - License: Either BSD or GPL -} diff --git a/Annex/Ssh.hs b/Annex/Ssh.hs index c53802941..a9ff91751 100644 --- a/Annex/Ssh.hs +++ b/Annex/Ssh.hs @@ -33,6 +33,7 @@ import Config import Annex.Path import Utility.Env import Utility.FileSystemEncoding +import Utility.Hash import Types.CleanupActions import Types.Concurrency import Git.Env @@ -42,7 +43,6 @@ import Annex.Perms import Annex.LockPool #endif -import Data.Hash.MD5 import Control.Concurrent.STM {- Some ssh commands are fed stdin on a pipe and so should be allowed to @@ -287,7 +287,7 @@ hostport2socket host Nothing = hostport2socket' host hostport2socket host (Just port) = hostport2socket' $ host ++ "!" ++ show port hostport2socket' :: String -> FilePath hostport2socket' s - | length s > lengthofmd5s = md5s (Str s) + | length s > lengthofmd5s = show $ md5 $ encodeBS s | otherwise = s where lengthofmd5s = 32 diff --git a/Annex/VariantFile.hs b/Annex/VariantFile.hs index 17658a9c6..8365073d5 100644 --- a/Annex/VariantFile.hs +++ b/Annex/VariantFile.hs @@ -9,8 +9,7 @@ module Annex.VariantFile where import Annex.Common import Utility.FileSystemEncoding - -import Data.Hash.MD5 +import Utility.Hash variantMarker :: String variantMarker = ".variant-" @@ -42,4 +41,4 @@ variantFile file key doubleconflict = variantMarker `isInfixOf` file shortHash :: String -> String -shortHash = take 4 . md5s . md5FilePath +shortHash = take 4 . show . md5 . encodeBS diff --git a/Backend/Utilities.hs b/Backend/Utilities.hs index d1fb94f2a..1691fa2b2 100644 --- a/Backend/Utilities.hs +++ b/Backend/Utilities.hs @@ -7,10 +7,9 @@ module Backend.Utilities where -import Data.Hash.MD5 - import Annex.Common import Utility.FileSystemEncoding +import Utility.Hash {- Generates a keyName from an input string. Takes care of sanitizing it. - If it's not too long, the full string is used as the keyName. @@ -20,7 +19,8 @@ genKeyName :: String -> String genKeyName s -- Avoid making keys longer than the length of a SHA256 checksum. | bytelen > sha256len = - truncateFilePath (sha256len - md5len - 1) s' ++ "-" ++ md5s (Str s) + truncateFilePath (sha256len - md5len - 1) s' ++ "-" ++ + show (md5 (encodeBS s)) | otherwise = s' where s' = preSanitizeKeyName s diff --git a/Utility/FileSystemEncoding.hs b/Utility/FileSystemEncoding.hs index ae3bd35d7..785b078ef 100644 --- a/Utility/FileSystemEncoding.hs +++ b/Utility/FileSystemEncoding.hs @@ -12,7 +12,6 @@ module Utility.FileSystemEncoding ( useFileSystemEncoding, fileEncoding, withFilePath, - md5FilePath, decodeBS, encodeBS, decodeW8, @@ -27,7 +26,6 @@ import qualified GHC.IO.Encoding as Encoding import Foreign.C import System.IO import System.IO.Unsafe -import qualified Data.Hash.MD5 as MD5 import Data.Word import Data.Bits.Utils import Data.List @@ -101,10 +99,6 @@ _encodeFilePath fp = unsafePerformIO $ do GHC.withCString enc fp (GHC.peekCString Encoding.char8) `catchNonAsync` (\_ -> return fp) -{- Encodes a FilePath into a Md5.Str, applying the filesystem encoding. -} -md5FilePath :: FilePath -> MD5.Str -md5FilePath = MD5.Str . _encodeFilePath - {- Decodes a ByteString into a FilePath, applying the filesystem encoding. -} decodeBS :: L.ByteString -> FilePath #ifndef mingw32_HOST_OS diff --git a/Utility/Hash.hs b/Utility/Hash.hs index b6bf996f8..70f826b7a 100644 --- a/Utility/Hash.hs +++ b/Utility/Hash.hs @@ -1,8 +1,4 @@ -{- Convenience wrapper around cryptohash/cryptonite. - - - - SHA3 hashes are currently only enabled when using cryptonite, - - because of https://github.com/vincenthz/hs-cryptohash/issues/36 - -} +{- Convenience wrapper around cryptonite's hashing. -} module Utility.Hash ( sha1, diff --git a/Utility/LockFile/PidLock.hs b/Utility/LockFile/PidLock.hs index 87c11c01c..23560fa57 100644 --- a/Utility/LockFile/PidLock.hs +++ b/Utility/LockFile/PidLock.hs @@ -25,6 +25,8 @@ import Utility.Path import Utility.FileMode import Utility.LockFile.LockStatus import Utility.ThreadScheduler +import Utility.Hash +import Utility.FileSystemEncoding import qualified Utility.LockFile.Posix as Posix import System.IO @@ -33,7 +35,6 @@ import Data.Maybe import Data.List import Network.BSD import System.FilePath -import Data.Hash.MD5 import Control.Applicative import Prelude @@ -99,7 +100,9 @@ sideLockFile lockfile = do f <- absPath lockfile let base = intercalate "_" (splitDirectories (makeRelative "/" f)) let shortbase = reverse $ take 32 $ reverse base - let md5sum = if base == shortbase then "" else md5s (Str base) + let md5sum = if base == shortbase + then "" + else show (md5 (encodeBS base)) dir <- ifM (doesDirectoryExist "/dev/shm") ( return "/dev/shm" , return "/tmp" diff --git a/Utility/Tuple.hs b/Utility/Tuple.hs new file mode 100644 index 000000000..78dd5d0c9 --- /dev/null +++ b/Utility/Tuple.hs @@ -0,0 +1,15 @@ +{- tuple utility functions + - + - Copyright 2017 Joey Hess + - + - License: BSD-2-clause + -} + +fst3 :: (a,b,c) -> a +fst3 (a,b,c) = a + +snd3 :: (a,b,c) -> b +snd3 (a,b,c) = b + +thd3 :: (a,b,c) -> c +thd3 (a,b,c) = c diff --git a/debian/control b/debian/control index fa68c87fe..1d012cea0 100644 --- a/debian/control +++ b/debian/control @@ -11,6 +11,7 @@ Build-Depends: libghc-hslogger-dev, libghc-pcre-light-dev, libghc-cryptonite-dev, + libghc-memory-dev, libghc-sandi-dev, libghc-utf8-string-dev, libghc-aws-dev (>= 0.9.2-2~), diff --git a/doc/todo/make_copy_--fast__faster/comment_11_1ca8d9765e6e3a18ae09df74bc390a00._comment b/doc/todo/make_copy_--fast__faster/comment_11_1ca8d9765e6e3a18ae09df74bc390a00._comment new file mode 100644 index 000000000..619351d4c --- /dev/null +++ b/doc/todo/make_copy_--fast__faster/comment_11_1ca8d9765e6e3a18ae09df74bc390a00._comment @@ -0,0 +1,49 @@ +[[!comment format=mdwn + username="joey" + subject="""comment 11""" + date="2017-05-15T21:56:52Z" + content=""" +Switched from MissingH to cryptonite for md5. It did move md5 out of the top CPU spot but +the overall runtime didn't change much. Memory allocations did go down by a +good amount. + +Updated profiles: + + git-annex +RTS -p -RTS find + + total time = 1.63 secs (1629 ticks @ 1000 us, 1 processor) + total alloc = 1,496,336,496 bytes (excludes profiling overheads) + + COST CENTRE MODULE SRC %time %alloc + + catchIO Utility.Exception Utility/Exception.hs:79:1-17 14.1 15.1 + inAnnex'.checkindirect Annex.Content Annex/Content.hs:(108,9)-(119,39) 10.6 4.8 + catches Control.Monad.Catch src/Control/Monad/Catch.hs:(432,1)-(436,76) 8.6 6.9 + spanList Data.List.Utils src/Data/List/Utils.hs:(150,1)-(155,36) 6.7 11.1 + isAnnexLink Annex.Link Annex/Link.hs:35:1-85 5.0 10.2 + keyFile Annex.Locations Annex/Locations.hs:(456,1)-(462,19) 5.0 7.0 + readish Utility.PartialPrelude Utility/PartialPrelude.hs:(48,1)-(50,20) 3.8 2.0 + startswith Data.List.Utils src/Data/List/Utils.hs:103:1-23 3.6 2.3 + splitc Utility.Misc Utility/Misc.hs:(52,1)-(54,25) 3.4 6.5 + s2w8 Data.Bits.Utils src/Data/Bits/Utils.hs:65:1-15 2.6 6.4 + keyPath Annex.Locations Annex/Locations.hs:(492,1)-(494,23) 2.5 4.4 + fileKey.unesc Annex.Locations Annex/Locations.hs:(469,9)-(474,39) 2.0 3.5 + copyAndFreeze Data.ByteArray.Methods Data/ByteArray/Methods.hs:(224,1)-(227,21) 1.8 0.5 + + git-annex +RTS -p -RTS find --not --in web + + total time = 5.33 secs (5327 ticks @ 1000 us, 1 processor) + total alloc = 2,908,489,000 bytes (excludes profiling overheads) + + COST CENTRE MODULE SRC %time %alloc + + catObjectDetails.\ Git.CatFile Git/CatFile.hs:(80,72)-(88,97) 7.8 2.8 + catchIO Utility.Exception Utility/Exception.hs:79:1-17 7.6 8.3 + spanList Data.List.Utils src/Data/List/Utils.hs:(150,1)-(155,36) 5.8 9.1 + readish Utility.PartialPrelude Utility/PartialPrelude.hs:(48,1)-(50,20) 4.5 4.0 + parseResp Git.CatFile Git/CatFile.hs:(113,1)-(124,28) 4.4 2.9 + readFileStrict Utility.Misc Utility/Misc.hs:33:1-59 3.7 1.6 + catches Control.Monad.Catch src/Control/Monad/Catch.hs:(432,1)-(436,76) 3.1 3.6 + encodeW8 Utility.FileSystemEncoding Utility/FileSystemEncoding.hs:(131,1)-(133,70) 3.1 2.3 + +"""]] diff --git a/git-annex.cabal b/git-annex.cabal index 545fab3f7..8d8255027 100644 --- a/git-annex.cabal +++ b/git-annex.cabal @@ -361,7 +361,8 @@ Executable git-annex stm-chans, securemem, crypto-api, - cryptonite + cryptonite, + memory CC-Options: -Wall GHC-Options: -Wall -fno-warn-tabs Extensions: PackageImports -- cgit v1.2.3