diff options
-rw-r--r-- | CmdLine/GitAnnex.hs | 6 | ||||
-rw-r--r-- | Command/Benchmark.hs | 106 | ||||
-rw-r--r-- | Common.hs | 2 | ||||
-rw-r--r-- | Database/Keys.hs | 123 | ||||
-rw-r--r-- | Database/Keys/SQL.hs | 108 | ||||
-rw-r--r-- | doc/bugs/cannot_add_a_files_with_an_accent_in_it/comment_1_d465d7f88c8f11b4b636fba56711d745._comment | 23 | ||||
-rw-r--r-- | doc/git-annex.mdwn | 5 | ||||
-rw-r--r-- | doc/todo/smudge.mdwn | 8 | ||||
-rw-r--r-- | git-annex.cabal | 8 |
9 files changed, 281 insertions, 108 deletions
diff --git a/CmdLine/GitAnnex.hs b/CmdLine/GitAnnex.hs index ba7689f70..0383dada3 100644 --- a/CmdLine/GitAnnex.hs +++ b/CmdLine/GitAnnex.hs @@ -115,6 +115,9 @@ import qualified Command.Test import qualified Command.FuzzTest import qualified Command.TestRemote #endif +#ifdef WITH_BENCHMARK +import qualified Command.Benchmark +#endif #ifdef WITH_EKG import System.Remote.Monitoring #endif @@ -221,6 +224,9 @@ cmds testoptparser testrunner = , Command.FuzzTest.cmd , Command.TestRemote.cmd #endif +#ifdef WITH_BENCHMARK + , Command.Benchmark.cmd +#endif ] run :: Parser TestOptions -> Maybe TestRunner -> [String] -> IO () diff --git a/Command/Benchmark.hs b/Command/Benchmark.hs new file mode 100644 index 000000000..d4585fdb8 --- /dev/null +++ b/Command/Benchmark.hs @@ -0,0 +1,106 @@ +{- git-annex benchmark + - + - Copyright 2016 Joey Hess <id@joeyh.name> + - + - Licensed under the GNU GPL version 3 or higher. + -} + +{-# OPTIONS_GHC -fno-warn-orphans #-} + +module Command.Benchmark where + +import Command +import Database.Types +import qualified Database.Keys.SQL as SQL +import qualified Database.Queue as H +import Utility.Tmp +import Git.FilePath + +import Criterion.Main +import Criterion.Internal (runAndAnalyse) +import Criterion.Monad +import Control.Monad.IO.Class (liftIO) +import Control.Monad +import Control.DeepSeq +import System.FilePath +import System.Random + +cmd :: Command +cmd = noRepo (withParams benchmark) $ + dontCheck repoExists $ + command "benchmark" SectionTesting + "run benchmarks" + paramNothing + (withParams (liftIO . benchmark)) + +benchmark :: CmdParams -> IO () +benchmark _ = withTmpDirIn "." "benchmark" $ \tmpdir -> do + -- benchmark different sizes of databases + dbs <- mapM (benchDb tmpdir) + [ 1000 + , 10000 + -- , 100000 + ] + -- can't use Criterion's defaultMain here because it looks at + -- command-line parameters + withConfig defaultConfig $ runAndAnalyse (const True) $ + bgroup "keys database" $ flip concatMap dbs $ \db -> + [ getAssociatedFilesHitBench db + , getAssociatedFilesMissBench db + , getAssociatedKeyHitBench db + , getAssociatedKeyMissBench db + ] + +getAssociatedFilesHitBench :: BenchDb -> Benchmark +getAssociatedFilesHitBench ( BenchDb h num) = bench ("getAssociatedFiles from " ++ show num ++ " (hit)") $ nfIO $ do + n <- getStdRandom (randomR (1,num)) + SQL.getAssociatedFiles (keyN n) (SQL.ReadHandle h) + +getAssociatedFilesMissBench :: BenchDb -> Benchmark +getAssociatedFilesMissBench ( BenchDb h num) = bench ("getAssociatedFiles from " ++ show num ++ " (miss)") $ nfIO $ + SQL.getAssociatedFiles keyMiss (SQL.ReadHandle h) + +getAssociatedKeyHitBench :: BenchDb -> Benchmark +getAssociatedKeyHitBench (BenchDb h num) = bench ("getAssociatedKey from " ++ show num ++ " (hit)") $ nfIO $ do + n <- getStdRandom (randomR (1,num)) + SQL.getAssociatedKey (fileN n) (SQL.ReadHandle h) + +getAssociatedKeyMissBench :: BenchDb -> Benchmark +getAssociatedKeyMissBench (BenchDb h num) = bench ("getAssociatedKey from " ++ show num ++ " (miss)") $ nfIO $ + SQL.getAssociatedKey fileMiss (SQL.ReadHandle h) + +populateAssociatedFiles :: H.DbQueue -> Int -> IO () +populateAssociatedFiles h num = do + forM_ [1..num] $ \n -> + SQL.addAssociatedFile (keyN n) (fileN n) (SQL.WriteHandle h) + H.flushDbQueue h + +keyN :: Int -> SKey +keyN n = SKey ("key" ++ show n) + +fileN :: Int -> TopFilePath +fileN n = asTopFilePath ("file" ++ show n) + +keyMiss :: SKey +keyMiss = keyN 0 -- 0 is never stored + +fileMiss :: TopFilePath +fileMiss = fileN 0 -- 0 is never stored + +data BenchDb = BenchDb H.DbQueue Int + +benchDb :: FilePath -> Int -> IO BenchDb +benchDb tmpdir num = do + putStrLn $ "setting up database with " ++ show num + H.initDb f SQL.createTables + h <- H.openDbQueue f SQL.containedTable + populateAssociatedFiles h num + return (BenchDb h num) + where + f = tmpdir </> "db" ++ show num + +instance NFData TopFilePath where + rnf = rnf . getTopFilePath + +instance NFData SKey where + rnf (SKey s) = rnf s @@ -5,7 +5,7 @@ module Common (module X) where import Control.Monad as X import Control.Monad.IfElse as X import Control.Applicative as X -import "mtl" Control.Monad.State.Strict as X (liftIO) +import Control.Monad.IO.Class as X (liftIO) import Data.Maybe as X import Data.List as X hiding (head, tail, init, last) diff --git a/Database/Keys.hs b/Database/Keys.hs index 4c4c65850..faf34132a 100644 --- a/Database/Keys.hs +++ b/Database/Keys.hs @@ -21,10 +21,9 @@ module Database.Keys ( addInodeCaches, getInodeCaches, removeInodeCaches, - AssociatedId, - ContentId, ) where +import qualified Database.Keys.SQL as SQL import Database.Types import Database.Keys.Handle import qualified Database.Queue as H @@ -42,24 +41,7 @@ import Git.Ref import Git.FilePath import Annex.CatFile -import Database.Persist.TH import Database.Esqueleto hiding (Key) -import Data.Time.Clock - -share [mkPersist sqlSettings, mkMigrate "migrateKeysDb"] [persistLowerCase| -Associated - key SKey - file FilePath - KeyFileIndex key file -Content - key SKey - cache SInodeCache - KeyCacheIndex key cache -|] - -newtype ReadHandle = ReadHandle H.DbQueue - -type Reader v = ReadHandle -> Annex v {- Runs an action that reads from the database. - @@ -72,7 +54,7 @@ type Reader v = ReadHandle -> Annex v - - Any queued writes will be flushed before the read. -} -runReader :: Monoid v => Reader v -> Annex v +runReader :: Monoid v => (SQL.ReadHandle -> Annex v) -> Annex v runReader a = do h <- getDbHandle withDbState h go @@ -80,50 +62,39 @@ runReader a = do go DbEmpty = return (mempty, DbEmpty) go st@(DbOpen qh) = do liftIO $ H.flushDbQueue qh - v <- a (ReadHandle qh) + v <- a (SQL.ReadHandle qh) return (v, st) go DbClosed = do st' <- openDb False DbClosed v <- case st' of - (DbOpen qh) -> a (ReadHandle qh) + (DbOpen qh) -> a (SQL.ReadHandle qh) _ -> return mempty return (v, st') -readDb :: SqlPersistM a -> ReadHandle -> Annex a -readDb a (ReadHandle h) = liftIO $ H.queryDbQueue h a - -newtype WriteHandle = WriteHandle H.DbQueue - -type Writer = WriteHandle -> Annex () +runReaderIO :: Monoid v => (SQL.ReadHandle -> IO v) -> Annex v +runReaderIO a = runReader (liftIO . a) {- Runs an action that writes to the database. Typically this is used to - queue changes, which will be flushed at a later point. - - The database is created if it doesn't exist yet. -} -runWriter :: Writer -> Annex () +runWriter :: (SQL.WriteHandle -> Annex ()) -> Annex () runWriter a = do h <- getDbHandle withDbState h go where go st@(DbOpen qh) = do - v <- a (WriteHandle qh) + v <- a (SQL.WriteHandle qh) return (v, st) go st = do st' <- openDb True st v <- case st' of - DbOpen qh -> a (WriteHandle qh) + DbOpen qh -> a (SQL.WriteHandle qh) _ -> error "internal" return (v, st') -queueDb :: SqlPersistM () -> WriteHandle -> Annex () -queueDb a (WriteHandle h) = liftIO $ H.queueDb h checkcommit a - where - -- commit queue after 1000 changes or 5 minutes, whichever comes first - checkcommit sz lastcommittime - | sz > 1000 = return True - | otherwise = do - now <- getCurrentTime - return $ diffUTCTime lastcommittime now > 300 +runWriterIO :: (SQL.WriteHandle -> IO ()) -> Annex () +runWriterIO a = runWriter (liftIO . a) {- Gets the handle cached in Annex state; creates a new one if it's not yet - available, but doesn't open the database. -} @@ -156,60 +127,30 @@ openDb createdb _ = withExclusiveLock gitAnnexKeysDbLock $ do (False, True) -> do liftIO $ do createDirectoryIfMissing True dbdir - H.initDb db $ void $ - runMigrationSilent migrateKeysDb + H.initDb db SQL.createTables setAnnexDirPerm dbdir setAnnexFilePerm db open db (False, False) -> return DbEmpty where - open db = liftIO $ DbOpen <$> H.openDbQueue db "content" + open db = liftIO $ DbOpen <$> H.openDbQueue db SQL.containedTable addAssociatedFile :: Key -> TopFilePath -> Annex () -addAssociatedFile k f = runWriter $ addAssociatedFile' k f - -addAssociatedFile' :: Key -> TopFilePath -> Writer -addAssociatedFile' k f = queueDb $ do - -- If the same file was associated with a different key before, - -- remove that. - delete $ from $ \r -> do - where_ (r ^. AssociatedFile ==. val (getTopFilePath f) &&. not_ (r ^. AssociatedKey ==. val sk)) - void $ insertUnique $ Associated sk (getTopFilePath f) - where - sk = toSKey k +addAssociatedFile k f = runWriterIO $ SQL.addAssociatedFile (toSKey k) f {- Note that the files returned were once associated with the key, but - some of them may not be any longer. -} getAssociatedFiles :: Key -> Annex [TopFilePath] -getAssociatedFiles = runReader . getAssociatedFiles' . toSKey - -getAssociatedFiles' :: SKey -> Reader [TopFilePath] -getAssociatedFiles' sk = readDb $ do - l <- select $ from $ \r -> do - where_ (r ^. AssociatedKey ==. val sk) - return (r ^. AssociatedFile) - return $ map (asTopFilePath . unValue) l +getAssociatedFiles = runReaderIO . SQL.getAssociatedFiles . toSKey {- Gets any keys that are on record as having a particular associated file. - (Should be one or none but the database doesn't enforce that.) -} getAssociatedKey :: TopFilePath -> Annex [Key] -getAssociatedKey = runReader . getAssociatedKey' - -getAssociatedKey' :: TopFilePath -> Reader [Key] -getAssociatedKey' f = readDb $ do - l <- select $ from $ \r -> do - where_ (r ^. AssociatedFile ==. val (getTopFilePath f)) - return (r ^. AssociatedKey) - return $ map (fromSKey . unValue) l +getAssociatedKey = map fromSKey <$$> runReaderIO . SQL.getAssociatedKey removeAssociatedFile :: Key -> TopFilePath -> Annex () -removeAssociatedFile k = runWriter . removeAssociatedFile' (toSKey k) +removeAssociatedFile k = runWriterIO . SQL.removeAssociatedFile (toSKey k) -removeAssociatedFile' :: SKey -> TopFilePath -> Writer -removeAssociatedFile' sk f = queueDb $ - delete $ from $ \r -> do - where_ (r ^. AssociatedKey ==. val sk &&. r ^. AssociatedFile ==. val (getTopFilePath f)) - {- Find all unlocked associated files. This is expensive, and so normally - the associated files are updated incrementally when changes are noticed. -} scanAssociatedFiles :: Annex () @@ -224,12 +165,12 @@ scanAssociatedFiles = whenM (isJust <$> inRepo Git.Branch.current) $ =<< catKey (Git.LsTree.sha i) liftIO $ void cleanup where - dropallassociated = queueDb $ - delete $ from $ \(_r :: SqlExpr (Entity Associated)) -> + dropallassociated h = liftIO $ flip SQL.queueDb h $ + delete $ from $ \(_r :: SqlExpr (Entity SQL.Associated)) -> return () isregfile i = Git.Types.toBlobType (Git.LsTree.mode i) == Just Git.Types.FileBlob - add h i k = flip queueDb h $ - void $ insertUnique $ Associated + add h i k = liftIO $ flip SQL.queueDb h $ + void $ insertUnique $ SQL.Associated (toSKey k) (getTopFilePath $ Git.LsTree.file i) @@ -239,28 +180,12 @@ storeInodeCaches k fs = withTSDelta $ \d -> addInodeCaches k . catMaybes =<< liftIO (mapM (`genInodeCache` d) fs) addInodeCaches :: Key -> [InodeCache] -> Annex () -addInodeCaches k is = runWriter $ addInodeCaches' (toSKey k) is - -addInodeCaches' :: SKey -> [InodeCache] -> Writer -addInodeCaches' sk is = queueDb $ - forM_ is $ \i -> insertUnique $ Content sk (toSInodeCache i) +addInodeCaches k is = runWriterIO $ SQL.addInodeCaches (toSKey k) is {- A key may have multiple InodeCaches; one for the annex object, and one - for each pointer file that is a copy of it. -} getInodeCaches :: Key -> Annex [InodeCache] -getInodeCaches = runReader . getInodeCaches' . toSKey - -getInodeCaches' :: SKey -> Reader [InodeCache] -getInodeCaches' sk = readDb $ do - l <- select $ from $ \r -> do - where_ (r ^. ContentKey ==. val sk) - return (r ^. ContentCache) - return $ map (fromSInodeCache . unValue) l +getInodeCaches = runReaderIO . SQL.getInodeCaches . toSKey removeInodeCaches :: Key -> Annex () -removeInodeCaches = runWriter . removeInodeCaches' . toSKey - -removeInodeCaches' :: SKey -> Writer -removeInodeCaches' sk = queueDb $ - delete $ from $ \r -> do - where_ (r ^. ContentKey ==. val sk) +removeInodeCaches = runWriterIO . SQL.removeInodeCaches . toSKey diff --git a/Database/Keys/SQL.hs b/Database/Keys/SQL.hs new file mode 100644 index 000000000..79230b60d --- /dev/null +++ b/Database/Keys/SQL.hs @@ -0,0 +1,108 @@ +{- Sqlite database of information about Keys + - + - Copyright 2015-2016 Joey Hess <id@joeyh.name> + - + - Licensed under the GNU GPL version 3 or higher. + -} + +{-# LANGUAGE QuasiQuotes, TypeFamilies, TemplateHaskell #-} +{-# LANGUAGE OverloadedStrings, GADTs, FlexibleContexts #-} +{-# LANGUAGE MultiParamTypeClasses, GeneralizedNewtypeDeriving #-} +{-# LANGUAGE RankNTypes, ScopedTypeVariables #-} + +module Database.Keys.SQL where + +import Database.Types +import Database.Handle +import qualified Database.Queue as H +import Utility.InodeCache +import Git.FilePath + +import Database.Persist.TH +import Database.Esqueleto hiding (Key) +import Data.Time.Clock +import Control.Monad + +share [mkPersist sqlSettings, mkMigrate "migrateKeysDb"] [persistLowerCase| +Associated + key SKey + file FilePath + KeyFileIndex key file + FileKeyIndex file key +Content + key SKey + cache SInodeCache + KeyCacheIndex key cache +|] + +containedTable :: TableName +containedTable = "content" + +createTables :: SqlPersistM () +createTables = void $ runMigrationSilent migrateKeysDb + +newtype ReadHandle = ReadHandle H.DbQueue + +readDb :: SqlPersistM a -> ReadHandle -> IO a +readDb a (ReadHandle h) = H.queryDbQueue h a + +newtype WriteHandle = WriteHandle H.DbQueue + +queueDb :: SqlPersistM () -> WriteHandle -> IO () +queueDb a (WriteHandle h) = H.queueDb h checkcommit a + where + -- commit queue after 1000 changes or 5 minutes, whichever comes first + checkcommit sz lastcommittime + | sz > 1000 = return True + | otherwise = do + now <- getCurrentTime + return $ diffUTCTime lastcommittime now > 300 + +addAssociatedFile :: SKey -> TopFilePath -> WriteHandle -> IO () +addAssociatedFile sk f = queueDb $ do + -- If the same file was associated with a different key before, + -- remove that. + delete $ from $ \r -> do + where_ (r ^. AssociatedFile ==. val (getTopFilePath f) &&. not_ (r ^. AssociatedKey ==. val sk)) + void $ insertUnique $ Associated sk (getTopFilePath f) + +{- Note that the files returned were once associated with the key, but + - some of them may not be any longer. -} +getAssociatedFiles :: SKey -> ReadHandle -> IO [TopFilePath] +getAssociatedFiles sk = readDb $ do + l <- select $ from $ \r -> do + where_ (r ^. AssociatedKey ==. val sk) + return (r ^. AssociatedFile) + return $ map (asTopFilePath . unValue) l + +{- Gets any keys that are on record as having a particular associated file. + - (Should be one or none but the database doesn't enforce that.) -} +getAssociatedKey :: TopFilePath -> ReadHandle -> IO [SKey] +getAssociatedKey f = readDb $ do + l <- select $ from $ \r -> do + where_ (r ^. AssociatedFile ==. val (getTopFilePath f)) + return (r ^. AssociatedKey) + return $ map unValue l + +removeAssociatedFile :: SKey -> TopFilePath -> WriteHandle -> IO () +removeAssociatedFile sk f = queueDb $ + delete $ from $ \r -> do + where_ (r ^. AssociatedKey ==. val sk &&. r ^. AssociatedFile ==. val (getTopFilePath f)) + +addInodeCaches :: SKey -> [InodeCache] -> WriteHandle -> IO () +addInodeCaches sk is = queueDb $ + forM_ is $ \i -> insertUnique $ Content sk (toSInodeCache i) + +{- A key may have multiple InodeCaches; one for the annex object, and one + - for each pointer file that is a copy of it. -} +getInodeCaches :: SKey -> ReadHandle -> IO [InodeCache] +getInodeCaches sk = readDb $ do + l <- select $ from $ \r -> do + where_ (r ^. ContentKey ==. val sk) + return (r ^. ContentCache) + return $ map (fromSInodeCache . unValue) l + +removeInodeCaches :: SKey -> WriteHandle -> IO () +removeInodeCaches sk = queueDb $ + delete $ from $ \r -> do + where_ (r ^. ContentKey ==. val sk) diff --git a/doc/bugs/cannot_add_a_files_with_an_accent_in_it/comment_1_d465d7f88c8f11b4b636fba56711d745._comment b/doc/bugs/cannot_add_a_files_with_an_accent_in_it/comment_1_d465d7f88c8f11b4b636fba56711d745._comment new file mode 100644 index 000000000..0d47449a0 --- /dev/null +++ b/doc/bugs/cannot_add_a_files_with_an_accent_in_it/comment_1_d465d7f88c8f11b4b636fba56711d745._comment @@ -0,0 +1,23 @@ +[[!comment format=mdwn + username="joey" + subject="""comment 1""" + date="2016-01-11T16:46:42Z" + content=""" +I can't even type an e with an accent in it into the bash prompt on OSX. +However, I copied over such a file, and had no problem adding it: + + bash-3.2$ ls + e?? + bash-3.2$ git annex add . + add ë ok + bash-3.2$ git annex status + A ë + +I also tried tab-completing a file named "testë" and that also works: + + oberon:gd joeyh$ git annex add teste\314\210 + add testë ok + (recording state in git...) + +So, I can't reproduce your problem. +"""]] diff --git a/doc/git-annex.mdwn b/doc/git-annex.mdwn index 299428d1e..329fb8932 100644 --- a/doc/git-annex.mdwn +++ b/doc/git-annex.mdwn @@ -672,6 +672,11 @@ subdirectories). See [[git-annex-fuzztest]](1) for details. +* `benchmark` + + This runs git-annex's built-in benchmarks, if it was built with + benchmarking support. + # COMMON OPTIONS These common options are accepted by all git-annex commands, and diff --git a/doc/todo/smudge.mdwn b/doc/todo/smudge.mdwn index 6498863e4..5f3d521bf 100644 --- a/doc/todo/smudge.mdwn +++ b/doc/todo/smudge.mdwn @@ -32,14 +32,6 @@ git-annex should use smudge/clean filters. when pushing changes committed in such a repo. Ideally, should avoid committing implicit unlocks, or should prevent such commits leaking out in pushes. -* Optimisation: See if the database schema can be improved to speed things - up. Are there enough indexes? getAssociatedKey in particular does a - reverse lookup and might benefit from an index. -* Optimisation: Reads from the Keys database avoid doing anything if the - database doesn't exist. This makes v5 repos, or v6 with all locked files - faster. However, if a v6 repo unlocks and then re-locks a file, its - database will exist, and so this optimisation will no longer apply. - Could try to detect when the database is empty, and remove it or avoid reads. * Eventually (but not yet), make v6 the default for new repositories. Note that the assistant forces repos into direct mode; that will need to diff --git a/git-annex.cabal b/git-annex.cabal index 5c09a0c61..289e7a255 100644 --- a/git-annex.cabal +++ b/git-annex.cabal @@ -88,6 +88,10 @@ Flag EKG Description: Enable use of EKG to monitor git-annex as it runs (at http://localhost:4242/) Default: False +Flag Benchmark + Description: Enable benchmarking + Default: False + Flag network-uri Description: Get Network.URI from the network-uri package Default: True @@ -260,6 +264,10 @@ Executable git-annex Build-Depends: ekg GHC-Options: -with-rtsopts=-T CPP-Options: -DWITH_EKG + + if flag(Benchmark) + Build-Depends: criterion, deepseq + CPP-Options: -DWITH_BENCHMARK source-repository head type: git |