9 files changed, 281 insertions, 108 deletions
diff --git a/CmdLine/GitAnnex.hs b/CmdLine/GitAnnex.hs
index ba7689f70..0383dada3 100644
--- a/CmdLine/GitAnnex.hs
+++ b/CmdLine/GitAnnex.hs
@@ -115,6 +115,9 @@ import qualified Command.Test
 import qualified Command.FuzzTest
 import qualified Command.TestRemote
 #endif
+#ifdef WITH_BENCHMARK
+import qualified Command.Benchmark
+#endif
 #ifdef WITH_EKG
 import System.Remote.Monitoring
 #endif
@@ -221,6 +224,9 @@ cmds testoptparser testrunner =
 	, Command.FuzzTest.cmd
 	, Command.TestRemote.cmd
 #endif
+#ifdef WITH_BENCHMARK
+	, Command.Benchmark.cmd
+#endif
 	]
 
 run :: Parser TestOptions -> Maybe TestRunner -> [String] -> IO ()
diff --git a/Command/Benchmark.hs b/Command/Benchmark.hs
new file mode 100644
index 000000000..d4585fdb8
--- /dev/null
+++ b/Command/Benchmark.hs
@@ -0,0 +1,106 @@
+{- git-annex benchmark
+ -
+ - Copyright 2016 Joey Hess <id@joeyh.name>
+ -
+ - Licensed under the GNU GPL version 3 or higher.
+ -}
+
+{-# OPTIONS_GHC -fno-warn-orphans #-}
+
+module Command.Benchmark where
+
+import Command
+import Database.Types
+import qualified Database.Keys.SQL as SQL
+import qualified Database.Queue as H
+import Utility.Tmp
+import Git.FilePath
+
+import Criterion.Main
+import Criterion.Internal (runAndAnalyse)
+import Criterion.Monad
+import Control.Monad.IO.Class (liftIO)
+import Control.Monad
+import Control.DeepSeq
+import System.FilePath
+import System.Random
+
+cmd :: Command
+cmd = noRepo (withParams benchmark) $
+	dontCheck repoExists $
+		command "benchmark" SectionTesting
+			"run benchmarks"
+			paramNothing
+			(withParams (liftIO . benchmark))
+
+benchmark :: CmdParams -> IO ()
+benchmark _ = withTmpDirIn "." "benchmark" $ \tmpdir -> do
+	-- benchmark different sizes of databases
+	dbs <- mapM (benchDb tmpdir)
+		[ 1000
+		, 10000
+		-- , 100000
+		]
+	-- can't use Criterion's defaultMain here because it looks at
+	-- command-line parameters
+	withConfig defaultConfig $ runAndAnalyse (const True) $
+		bgroup "keys database" $ flip concatMap dbs $ \db ->
+			[ getAssociatedFilesHitBench db
+			, getAssociatedFilesMissBench db
+			, getAssociatedKeyHitBench db
+			, getAssociatedKeyMissBench db
+			]
+
+getAssociatedFilesHitBench :: BenchDb -> Benchmark
+getAssociatedFilesHitBench ( BenchDb h num) = bench ("getAssociatedFiles from " ++ show num ++ " (hit)") $ nfIO $ do
+	n <- getStdRandom (randomR (1,num))
+	SQL.getAssociatedFiles (keyN n) (SQL.ReadHandle h)
+
+getAssociatedFilesMissBench :: BenchDb -> Benchmark
+getAssociatedFilesMissBench ( BenchDb h num) = bench ("getAssociatedFiles from " ++ show num ++ " (miss)") $ nfIO $
+	SQL.getAssociatedFiles keyMiss (SQL.ReadHandle h)
+
+getAssociatedKeyHitBench :: BenchDb -> Benchmark
+getAssociatedKeyHitBench (BenchDb h num) = bench ("getAssociatedKey from " ++ show num ++ " (hit)") $ nfIO $ do
+	n <- getStdRandom (randomR (1,num))
+	SQL.getAssociatedKey (fileN n) (SQL.ReadHandle h)
+
+getAssociatedKeyMissBench :: BenchDb -> Benchmark
+getAssociatedKeyMissBench (BenchDb h num) = bench ("getAssociatedKey from " ++ show num ++ " (miss)") $ nfIO $
+	SQL.getAssociatedKey fileMiss (SQL.ReadHandle h)
+
+populateAssociatedFiles :: H.DbQueue -> Int -> IO ()
+populateAssociatedFiles h num = do
+	forM_ [1..num] $ \n ->
+		SQL.addAssociatedFile (keyN n) (fileN n) (SQL.WriteHandle h)
+	H.flushDbQueue h
+
+keyN :: Int -> SKey
+keyN n = SKey ("key" ++ show n)
+
+fileN :: Int -> TopFilePath
+fileN n = asTopFilePath ("file" ++ show n)
+
+keyMiss :: SKey
+keyMiss = keyN 0 -- 0 is never stored
+
+fileMiss :: TopFilePath
+fileMiss = fileN 0 -- 0 is never stored
+
+data BenchDb = BenchDb H.DbQueue Int
+
+benchDb :: FilePath -> Int -> IO BenchDb
+benchDb tmpdir num = do
+	putStrLn $ "setting up database with " ++ show num
+	H.initDb f SQL.createTables
+	h <- H.openDbQueue f SQL.containedTable
+	populateAssociatedFiles h num
+	return (BenchDb h num)
+  where
+	f = tmpdir </> "db" ++ show num
+
+instance NFData TopFilePath where
+	rnf = rnf . getTopFilePath
+
+instance NFData SKey where
+	rnf (SKey s) = rnf s
diff --git a/Common.hs b/Common.hs
index 8272043c2..2b8b9a805 100644
--- a/Common.hs
+++ b/Common.hs
@@ -5,7 +5,7 @@ module Common (module X) where
 import Control.Monad as X
 import Control.Monad.IfElse as X
 import Control.Applicative as X
-import "mtl" Control.Monad.State.Strict as X (liftIO)
+import Control.Monad.IO.Class as X (liftIO)
 
 import Data.Maybe as X
 import Data.List as X hiding (head, tail, init, last)
diff --git a/Database/Keys.hs b/Database/Keys.hs
index 4c4c65850..faf34132a 100644
--- a/Database/Keys.hs
+++ b/Database/Keys.hs
@@ -21,10 +21,9 @@ module Database.Keys (
 	addInodeCaches,
 	getInodeCaches,
 	removeInodeCaches,
-	AssociatedId,
-	ContentId,
 ) where
 
+import qualified Database.Keys.SQL as SQL
 import Database.Types
 import Database.Keys.Handle
 import qualified Database.Queue as H
@@ -42,24 +41,7 @@ import Git.Ref
 import Git.FilePath
 import Annex.CatFile
 
-import Database.Persist.TH
 import Database.Esqueleto hiding (Key)
-import Data.Time.Clock
-
-share [mkPersist sqlSettings, mkMigrate "migrateKeysDb"] [persistLowerCase|
-Associated
-  key SKey
-  file FilePath
-  KeyFileIndex key file
-Content
-  key SKey
-  cache SInodeCache
-  KeyCacheIndex key cache
-|]
-
-newtype ReadHandle = ReadHandle H.DbQueue
-
-type Reader v = ReadHandle -> Annex v
 
 {- Runs an action that reads from the database.
  -
@@ -72,7 +54,7 @@ type Reader v = ReadHandle -> Annex v
  -
  - Any queued writes will be flushed before the read.
  -}
-runReader :: Monoid v => Reader v -> Annex v
+runReader :: Monoid v => (SQL.ReadHandle -> Annex v) -> Annex v
 runReader a = do
 	h <- getDbHandle
 	withDbState h go
@@ -80,50 +62,39 @@ runReader a = do
 	go DbEmpty = return (mempty, DbEmpty)
 	go st@(DbOpen qh) = do
 		liftIO $ H.flushDbQueue qh
-		v <- a (ReadHandle qh)
+		v <- a (SQL.ReadHandle qh)
 		return (v, st)
 	go DbClosed = do
 		st' <- openDb False DbClosed
 		v <- case st' of
-			(DbOpen qh) -> a (ReadHandle qh)
+			(DbOpen qh) -> a (SQL.ReadHandle qh)
 			_ -> return mempty
 		return (v, st')
 
-readDb :: SqlPersistM a -> ReadHandle -> Annex a
-readDb a (ReadHandle h) = liftIO $ H.queryDbQueue h a
-
-newtype WriteHandle = WriteHandle H.DbQueue
-
-type Writer = WriteHandle -> Annex ()
+runReaderIO :: Monoid v => (SQL.ReadHandle -> IO v) -> Annex v
+runReaderIO a = runReader (liftIO . a)
 
 {- Runs an action that writes to the database. Typically this is used to
  - queue changes, which will be flushed at a later point.
  -
  - The database is created if it doesn't exist yet. -}
-runWriter :: Writer -> Annex ()
+runWriter :: (SQL.WriteHandle -> Annex ()) -> Annex ()
 runWriter a = do
 	h <- getDbHandle
 	withDbState h go
   where
 	go st@(DbOpen qh) = do
-		v <- a (WriteHandle qh)
+		v <- a (SQL.WriteHandle qh)
 		return (v, st)
 	go st = do
 		st' <- openDb True st
 		v <- case st' of
-			DbOpen qh -> a (WriteHandle qh)
+			DbOpen qh -> a (SQL.WriteHandle qh)
 			_ -> error "internal"
 		return (v, st')
 
-queueDb :: SqlPersistM () -> WriteHandle -> Annex ()
-queueDb a (WriteHandle h) = liftIO $ H.queueDb h checkcommit a
-  where
-	-- commit queue after 1000 changes or 5 minutes, whichever comes first
-	checkcommit sz lastcommittime
-		| sz > 1000 = return True
-		| otherwise = do
-			now <- getCurrentTime
-			return $ diffUTCTime lastcommittime now > 300
+runWriterIO :: (SQL.WriteHandle -> IO ()) -> Annex ()
+runWriterIO a = runWriter (liftIO . a)
 
 {- Gets the handle cached in Annex state; creates a new one if it's not yet
  - available, but doesn't open the database. -}
@@ -156,60 +127,30 @@ openDb createdb _ = withExclusiveLock gitAnnexKeysDbLock $ do
 		(False, True) -> do
 			liftIO $ do
 				createDirectoryIfMissing True dbdir
-				H.initDb db $ void $
-					runMigrationSilent migrateKeysDb
+				H.initDb db SQL.createTables
 			setAnnexDirPerm dbdir
 			setAnnexFilePerm db
 			open db
 		(False, False) -> return DbEmpty
   where
-	open db = liftIO $ DbOpen <$> H.openDbQueue db "content"
+	open db = liftIO $ DbOpen <$> H.openDbQueue db SQL.containedTable
 
 addAssociatedFile :: Key -> TopFilePath -> Annex ()
-addAssociatedFile k f = runWriter $ addAssociatedFile' k f
-
-addAssociatedFile' :: Key -> TopFilePath -> Writer
-addAssociatedFile' k f = queueDb $ do
-	-- If the same file was associated with a different key before,
-	-- remove that.
-	delete $ from $ \r -> do
-		where_ (r ^. AssociatedFile ==. val (getTopFilePath f) &&. not_ (r ^. AssociatedKey ==. val sk))
-	void $ insertUnique $ Associated sk (getTopFilePath f)
-  where
-	sk = toSKey k
+addAssociatedFile k f = runWriterIO $ SQL.addAssociatedFile (toSKey k) f
 
 {- Note that the files returned were once associated with the key, but
  - some of them may not be any longer. -}
 getAssociatedFiles :: Key -> Annex [TopFilePath]
-getAssociatedFiles = runReader . getAssociatedFiles' . toSKey
-
-getAssociatedFiles' :: SKey -> Reader [TopFilePath]
-getAssociatedFiles' sk = readDb $ do
-	l <- select $ from $ \r -> do
-		where_ (r ^. AssociatedKey ==. val sk)
-		return (r ^. AssociatedFile)
-	return $ map (asTopFilePath . unValue) l
+getAssociatedFiles = runReaderIO . SQL.getAssociatedFiles . toSKey
 
 {- Gets any keys that are on record as having a particular associated file.
  - (Should be one or none but the database doesn't enforce that.) -}
 getAssociatedKey :: TopFilePath -> Annex [Key]
-getAssociatedKey = runReader . getAssociatedKey'
-
-getAssociatedKey' :: TopFilePath -> Reader [Key]
-getAssociatedKey' f = readDb $ do
-	l <- select $ from $ \r -> do
-		where_ (r ^. AssociatedFile ==. val (getTopFilePath f))
-		return (r ^. AssociatedKey)
-	return $ map (fromSKey . unValue) l
+getAssociatedKey = map fromSKey <$$> runReaderIO . SQL.getAssociatedKey
 
 removeAssociatedFile :: Key -> TopFilePath -> Annex ()
-removeAssociatedFile k = runWriter . removeAssociatedFile' (toSKey k)
+removeAssociatedFile k = runWriterIO . SQL.removeAssociatedFile (toSKey k)
 
-removeAssociatedFile' :: SKey -> TopFilePath -> Writer
-removeAssociatedFile' sk f = queueDb $ 
-	delete $ from $ \r -> do
-		where_ (r ^. AssociatedKey ==. val sk &&. r ^. AssociatedFile ==. val (getTopFilePath f))
-	
 {- Find all unlocked associated files. This is expensive, and so normally
  - the associated files are updated incrementally when changes are noticed. -}
 scanAssociatedFiles :: Annex ()
@@ -224,12 +165,12 @@ scanAssociatedFiles = whenM (isJust <$> inRepo Git.Branch.current) $
 					=<< catKey (Git.LsTree.sha i)
 		liftIO $ void cleanup
   where
-	dropallassociated = queueDb $
-		delete $ from $ \(_r :: SqlExpr (Entity Associated)) ->
+	dropallassociated h = liftIO $ flip SQL.queueDb h $
+		delete $ from $ \(_r :: SqlExpr (Entity SQL.Associated)) ->
 			return ()
 	isregfile i = Git.Types.toBlobType (Git.LsTree.mode i) == Just Git.Types.FileBlob
-	add h i k = flip queueDb h $ 
-		void $ insertUnique $ Associated
+	add h i k = liftIO $ flip SQL.queueDb h $ 
+		void $ insertUnique $ SQL.Associated
 			(toSKey k)
 			(getTopFilePath $ Git.LsTree.file i)
 
@@ -239,28 +180,12 @@ storeInodeCaches k fs = withTSDelta $ \d ->
 	addInodeCaches k . catMaybes =<< liftIO (mapM (`genInodeCache` d) fs)
 
 addInodeCaches :: Key -> [InodeCache] -> Annex ()
-addInodeCaches k is = runWriter $ addInodeCaches' (toSKey k) is
-
-addInodeCaches' :: SKey -> [InodeCache] -> Writer
-addInodeCaches' sk is = queueDb $
-	forM_ is $ \i -> insertUnique $ Content sk (toSInodeCache i)
+addInodeCaches k is = runWriterIO $ SQL.addInodeCaches (toSKey k) is
 
 {- A key may have multiple InodeCaches; one for the annex object, and one
  - for each pointer file that is a copy of it. -}
 getInodeCaches :: Key -> Annex [InodeCache]
-getInodeCaches = runReader . getInodeCaches' . toSKey
-
-getInodeCaches' :: SKey -> Reader [InodeCache]
-getInodeCaches' sk = readDb $ do
-	l <- select $ from $ \r -> do
-		where_ (r ^. ContentKey ==. val sk)
-		return (r ^. ContentCache)
-	return $ map (fromSInodeCache . unValue) l
+getInodeCaches = runReaderIO . SQL.getInodeCaches . toSKey
 
 removeInodeCaches :: Key -> Annex ()
-removeInodeCaches = runWriter . removeInodeCaches' . toSKey
-
-removeInodeCaches' :: SKey -> Writer
-removeInodeCaches' sk = queueDb $ 
-	delete $ from $ \r -> do
-		where_ (r ^. ContentKey ==. val sk)
+removeInodeCaches = runWriterIO . SQL.removeInodeCaches . toSKey
diff --git a/Database/Keys/SQL.hs b/Database/Keys/SQL.hs
new file mode 100644
index 000000000..79230b60d
--- /dev/null
+++ b/Database/Keys/SQL.hs
@@ -0,0 +1,108 @@
+{- Sqlite database of information about Keys
+ -
+ - Copyright 2015-2016 Joey Hess <id@joeyh.name>
+ -
+ - Licensed under the GNU GPL version 3 or higher.
+ -}
+
+{-# LANGUAGE QuasiQuotes, TypeFamilies, TemplateHaskell #-}
+{-# LANGUAGE OverloadedStrings, GADTs, FlexibleContexts #-}
+{-# LANGUAGE MultiParamTypeClasses, GeneralizedNewtypeDeriving #-}
+{-# LANGUAGE RankNTypes, ScopedTypeVariables #-}
+
+module Database.Keys.SQL where
+
+import Database.Types
+import Database.Handle
+import qualified Database.Queue as H
+import Utility.InodeCache
+import Git.FilePath
+
+import Database.Persist.TH
+import Database.Esqueleto hiding (Key)
+import Data.Time.Clock
+import Control.Monad
+
+share [mkPersist sqlSettings, mkMigrate "migrateKeysDb"] [persistLowerCase|
+Associated
+  key SKey
+  file FilePath
+  KeyFileIndex key file
+  FileKeyIndex file key
+Content
+  key SKey
+  cache SInodeCache
+  KeyCacheIndex key cache
+|]
+
+containedTable :: TableName
+containedTable = "content"
+
+createTables :: SqlPersistM ()
+createTables = void $ runMigrationSilent migrateKeysDb
+
+newtype ReadHandle = ReadHandle H.DbQueue
+
+readDb :: SqlPersistM a -> ReadHandle -> IO a
+readDb a (ReadHandle h) = H.queryDbQueue h a
+
+newtype WriteHandle = WriteHandle H.DbQueue
+
+queueDb :: SqlPersistM () -> WriteHandle -> IO ()
+queueDb a (WriteHandle h) = H.queueDb h checkcommit a
+  where
+	-- commit queue after 1000 changes or 5 minutes, whichever comes first
+	checkcommit sz lastcommittime
+		| sz > 1000 = return True
+		| otherwise = do
+			now <- getCurrentTime
+			return $ diffUTCTime lastcommittime now > 300
+
+addAssociatedFile :: SKey -> TopFilePath -> WriteHandle -> IO ()
+addAssociatedFile sk f = queueDb $ do
+	-- If the same file was associated with a different key before,
+	-- remove that.
+	delete $ from $ \r -> do
+		where_ (r ^. AssociatedFile ==. val (getTopFilePath f) &&. not_ (r ^. AssociatedKey ==. val sk))
+	void $ insertUnique $ Associated sk (getTopFilePath f)
+
+{- Note that the files returned were once associated with the key, but
+ - some of them may not be any longer. -}
+getAssociatedFiles :: SKey -> ReadHandle -> IO [TopFilePath]
+getAssociatedFiles sk = readDb $ do
+	l <- select $ from $ \r -> do
+		where_ (r ^. AssociatedKey ==. val sk)
+		return (r ^. AssociatedFile)
+	return $ map (asTopFilePath . unValue) l
+
+{- Gets any keys that are on record as having a particular associated file.
+ - (Should be one or none but the database doesn't enforce that.) -}
+getAssociatedKey :: TopFilePath -> ReadHandle -> IO [SKey]
+getAssociatedKey f = readDb $ do
+	l <- select $ from $ \r -> do
+		where_ (r ^. AssociatedFile ==. val (getTopFilePath f))
+		return (r ^. AssociatedKey)
+	return $ map unValue l
+
+removeAssociatedFile :: SKey -> TopFilePath -> WriteHandle -> IO ()
+removeAssociatedFile sk f = queueDb $ 
+	delete $ from $ \r -> do
+		where_ (r ^. AssociatedKey ==. val sk &&. r ^. AssociatedFile ==. val (getTopFilePath f))
+
+addInodeCaches :: SKey -> [InodeCache] -> WriteHandle -> IO ()
+addInodeCaches sk is = queueDb $
+	forM_ is $ \i -> insertUnique $ Content sk (toSInodeCache i)
+
+{- A key may have multiple InodeCaches; one for the annex object, and one
+ - for each pointer file that is a copy of it. -}
+getInodeCaches :: SKey -> ReadHandle -> IO [InodeCache]
+getInodeCaches sk = readDb $ do
+	l <- select $ from $ \r -> do
+		where_ (r ^. ContentKey ==. val sk)
+		return (r ^. ContentCache)
+	return $ map (fromSInodeCache . unValue) l
+
+removeInodeCaches :: SKey -> WriteHandle -> IO ()
+removeInodeCaches sk = queueDb $ 
+	delete $ from $ \r -> do
+		where_ (r ^. ContentKey ==. val sk)
diff --git a/doc/bugs/cannot_add_a_files_with_an_accent_in_it/comment_1_d465d7f88c8f11b4b636fba56711d745._comment b/doc/bugs/cannot_add_a_files_with_an_accent_in_it/comment_1_d465d7f88c8f11b4b636fba56711d745._comment
new file mode 100644
index 000000000..0d47449a0
--- /dev/null
+++ b/doc/bugs/cannot_add_a_files_with_an_accent_in_it/comment_1_d465d7f88c8f11b4b636fba56711d745._comment
@@ -0,0 +1,23 @@
+[[!comment format=mdwn
+ username="joey"
+ subject="""comment 1"""
+ date="2016-01-11T16:46:42Z"
+ content="""
+I can't even type an e with an accent in it into the bash prompt on OSX.
+However, I copied over such a file, and had no problem adding it:
+
+	bash-3.2$ ls
+	e??
+	bash-3.2$ git annex add .
+	add ë ok
+	bash-3.2$ git annex status
+	A ë
+
+I also tried tab-completing a file named "testë" and that also works:
+
+	oberon:gd joeyh$ git annex add teste\314\210 
+	add testë ok
+	(recording state in git...)
+
+So, I can't reproduce your problem.
+"""]]
diff --git a/doc/git-annex.mdwn b/doc/git-annex.mdwn
index 299428d1e..329fb8932 100644
--- a/doc/git-annex.mdwn
+++ b/doc/git-annex.mdwn
@@ -672,6 +672,11 @@ subdirectories).
   
   See [[git-annex-fuzztest]](1) for details.
 
+* `benchmark`
+
+  This runs git-annex's built-in benchmarks, if it was built with
+  benchmarking support.
+
 # COMMON OPTIONS
 
 These common options are accepted by all git-annex commands, and
diff --git a/doc/todo/smudge.mdwn b/doc/todo/smudge.mdwn
index 6498863e4..5f3d521bf 100644
--- a/doc/todo/smudge.mdwn
+++ b/doc/todo/smudge.mdwn
@@ -32,14 +32,6 @@ git-annex should use smudge/clean filters.
   when pushing changes committed in such a repo. Ideally, should avoid
   committing implicit unlocks, or should prevent such commits leaking out
   in pushes.
-* Optimisation: See if the database schema can be improved to speed things
-  up. Are there enough indexes? getAssociatedKey in particular does a
-  reverse lookup and might benefit from an index.
-* Optimisation: Reads from the Keys database avoid doing anything if the
-  database doesn't exist. This makes v5 repos, or v6 with all locked files
-  faster. However, if a v6 repo unlocks and then re-locks a file, its
-  database will exist, and so this optimisation will no longer apply.
-  Could try to detect when the database is empty, and remove it or avoid reads.
 
 * Eventually (but not yet), make v6 the default for new repositories.
   Note that the assistant forces repos into direct mode; that will need to
diff --git a/git-annex.cabal b/git-annex.cabal
index 5c09a0c61..289e7a255 100644
--- a/git-annex.cabal
+++ b/git-annex.cabal
@@ -88,6 +88,10 @@ Flag EKG
   Description: Enable use of EKG to monitor git-annex as it runs (at http://localhost:4242/)
   Default: False
 
+Flag Benchmark
+  Description: Enable benchmarking
+  Default: False
+
 Flag network-uri
   Description: Get Network.URI from the network-uri package
   Default: True
@@ -260,6 +264,10 @@ Executable git-annex
     Build-Depends: ekg
     GHC-Options: -with-rtsopts=-T
     CPP-Options: -DWITH_EKG
+  
+  if flag(Benchmark)
+    Build-Depends: criterion, deepseq
+    CPP-Options: -DWITH_BENCHMARK
 
 source-repository head
   type: git