summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorGravatar Joey Hess <joey@kitenet.net>2011-09-20 18:57:05 -0400
committerGravatar Joey Hess <joey@kitenet.net>2011-09-20 18:57:05 -0400
commit9f5c7a246b786e350671551cafae0f9678d83648 (patch)
tree1ed7e550c7a0fce4a2bb1da456d66a44df788cb8
parentcabbefd9d2d16b52b28f69a8410a9eb84e506666 (diff)
status: Massively sped up; remove --fast mode.
Using Sets is the right thing; they have constant size lookup like my SizeList, and logn insertation, which beats nub to death. Runs faster than --fast mode did before, and gives accurate counts. 13 seconds total runtime with a warm cache in a repository with 40 thousand keys.
-rw-r--r--Command/Status.hs98
-rw-r--r--debian/changelog3
-rw-r--r--doc/git-annex.mdwn5
3 files changed, 40 insertions, 66 deletions
diff --git a/Command/Status.hs b/Command/Status.hs
index 067128f62..d06865b6a 100644
--- a/Command/Status.hs
+++ b/Command/Status.hs
@@ -13,8 +13,9 @@ import Data.Maybe
import System.IO
import Data.List
import qualified Data.Map as M
+import qualified Data.Set as S
+import Data.Set (Set)
-import qualified Annex
import qualified Types.Backend as B
import qualified Types.Remote as R
import qualified Remote
@@ -23,33 +24,23 @@ import qualified Git
import Command
import Types
import Utility.DataUnits
-import Utility.Conditional
import Content
import Types.Key
import Locations
import Backend
-import Messages
-- a named computation that produces a statistic
-type Stat = StatState (Maybe (String, Bool, StatState String))
+type Stat = StatState (Maybe (String, StatState String))
-- cached info that multiple Stats may need
data StatInfo = StatInfo
- { keysPresentCache :: Maybe (SizeList Key)
- , keysReferencedCache :: Maybe (SizeList Key)
+ { keysPresentCache :: Maybe (Set Key)
+ , keysReferencedCache :: Maybe (Set Key)
}
-- a state monad for running Stats in
type StatState = StateT StatInfo Annex
--- a list with a known length
--- (Integer is used for the length to avoid
--- blowing up if someone annexed billions of files..)
-type SizeList a = ([a], Integer)
-
-sizeList :: [a] -> SizeList a
-sizeList l = (l, genericLength l)
-
command :: [Command]
command = [repoCommand "status" paramNothing seek
"shows status information about the annex"]
@@ -76,15 +67,10 @@ stats =
start :: CommandStart
start = do
evalStateT (mapM_ showStat stats) (StatInfo Nothing Nothing)
- fastmode_note
stop
-fastmode_note :: Annex ()
-fastmode_note = whenM (Annex.getState Annex.fast) $
- showLongNote "(*) approximate due to fast mode"
-
-stat :: String -> Bool -> StatState String -> Stat
-stat desc approx a = return $ Just (desc, approx, a)
+stat :: String -> StatState String -> Stat
+stat desc a = return $ Just (desc, a)
nostat :: Stat
nostat = return Nothing
@@ -92,37 +78,35 @@ nostat = return Nothing
showStat :: Stat -> StatState ()
showStat s = calc =<< s
where
- calc (Just (desc, approx, a)) = do
- fast <- lift $ Annex.getState Annex.fast
- let star = if fast && approx then "(*)" else ""
- liftIO $ putStr $ desc ++ star ++ ": "
+ calc (Just (desc, a)) = do
+ liftIO $ putStr $ desc ++ ": "
liftIO $ hFlush stdout
liftIO . putStrLn =<< a
calc Nothing = return ()
supported_backends :: Stat
-supported_backends = stat "supported backends" False $
+supported_backends = stat "supported backends" $
return $ unwords $ map B.name Backend.list
supported_remote_types :: Stat
-supported_remote_types = stat "supported remote types" False $
+supported_remote_types = stat "supported remote types" $
return $ unwords $ map R.typename Remote.remoteTypes
local_annex_size :: Stat
-local_annex_size = stat "local annex size" False $
+local_annex_size = stat "local annex size" $
cachedKeysPresent >>= keySizeSum
total_annex_size :: Stat
-total_annex_size = stat "total annex size" True $
+total_annex_size = stat "total annex size" $
cachedKeysReferenced >>= keySizeSum
local_annex_keys :: Stat
-local_annex_keys = stat "local annex keys" False $
- show . snd <$> cachedKeysPresent
+local_annex_keys = stat "local annex keys" $
+ show . S.size <$> cachedKeysPresent
total_annex_keys :: Stat
-total_annex_keys = stat "total annex keys" True $
- show . snd <$> cachedKeysReferenced
+total_annex_keys = stat "total annex keys" $
+ show . S.size <$> cachedKeysReferenced
tmp_size :: Stat
tmp_size = staleSize "temporary directory size" gitAnnexTmpDir
@@ -131,9 +115,9 @@ bad_data_size :: Stat
bad_data_size = staleSize "bad keys size" gitAnnexBadDir
backend_usage :: Stat
-backend_usage = stat "backend usage" True $ usage <$> cachedKeysReferenced
+backend_usage = stat "backend usage" $ usage <$> cachedKeysReferenced
where
- usage (ks, _) = pp "" $ sort $ map swap $ splits ks
+ usage ks = pp "" $ sort $ map swap $ splits $ S.toList ks
splits :: [Key] -> [(String, Integer)]
splits ks = M.toList $ M.fromListWith (+) $ map tcount ks
tcount k = (keyBackendName k, 1)
@@ -141,48 +125,44 @@ backend_usage = stat "backend usage" True $ usage <$> cachedKeysReferenced
pp c [] = c
pp c ((n, b):xs) = "\n\t" ++ b ++ ": " ++ show n ++ pp c xs
-cachedKeysPresent :: StatState (SizeList Key)
+cachedKeysPresent :: StatState (Set Key)
cachedKeysPresent = do
s <- get
case keysPresentCache s of
Just v -> return v
Nothing -> do
- keys <- lift getKeysPresent
- let v = sizeList keys
- put s { keysPresentCache = Just v }
- return v
+ keys <- S.fromList <$> lift getKeysPresent
+ put s { keysPresentCache = Just keys }
+ return keys
-cachedKeysReferenced :: StatState (SizeList Key)
+cachedKeysReferenced :: StatState (Set Key)
cachedKeysReferenced = do
s <- get
case keysReferencedCache s of
Just v -> return v
Nothing -> do
- -- A given key may be referenced repeatedly,
- -- so nub is needed for accuracy, but is slow.
- keys <- lift Command.Unused.getKeysReferenced
- fast <- lift $ Annex.getState Annex.fast
- let v = sizeList $ if fast then keys else nub keys
- put s { keysReferencedCache = Just v }
- return v
-
-keySizeSum :: SizeList Key -> StatState String
-keySizeSum (keys, len) = do
- let knownsizes = mapMaybe keySize keys
- let total = roughSize storageUnits False $ sum knownsizes
- let missing = len - genericLength knownsizes
+ keys <- S.fromList <$> lift Command.Unused.getKeysReferenced
+ put s { keysReferencedCache = Just keys }
+ return keys
+
+keySizeSum :: Set Key -> StatState String
+keySizeSum s = do
+ let (sizes, unknownsizes) = S.partition isJust $ S.map keySize s
+ let total = roughSize storageUnits False $
+ fromJust $ S.fold (liftM2 (+)) (Just 0) sizes
+ let num = S.size unknownsizes
return $ total ++
- if missing > 0
- then aside $ "but " ++ show missing ++ " keys have unknown size"
- else ""
+ if num == 0
+ then ""
+ else aside $ "but " ++ show num ++ " keys have unknown size"
staleSize :: String -> (Git.Repo -> FilePath) -> Stat
staleSize label dirspec = do
keys <- lift (Command.Unused.staleKeys dirspec)
if null keys
then nostat
- else stat label False $ do
- s <- keySizeSum $ sizeList keys
+ else stat label $ do
+ s <- keySizeSum $ S.fromList keys
return $ s ++ aside "clean up with git-annex unused"
aside :: String -> String
diff --git a/debian/changelog b/debian/changelog
index 938e2feaf..4807956ad 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -12,8 +12,7 @@ git-annex (3.20110916) UNRELEASED; urgency=low
match the specified conditions.
* Note that this is a behavior change for git-annex find! Old behavior
can be gotten by using: git-annex find --in .
- * status: In --fast mode, all status info is displayed now; but some
- of it is only approximate, and is marked as such.
+ * status: Massively sped up; remove --fast mode.
-- Joey Hess <joeyh@debian.org> Sun, 18 Sep 2011 18:25:51 -0400
diff --git a/doc/git-annex.mdwn b/doc/git-annex.mdwn
index 26cfb2c61..59da9dd8c 100644
--- a/doc/git-annex.mdwn
+++ b/doc/git-annex.mdwn
@@ -238,11 +238,6 @@ subdirectories).
Displays some statistics and other information, including how much data
is in the annex.
- Some of the statistics can take a while to generate, and those
- come last. You can ctrl-c this command once it's displayed the
- information you wanted to see. Or, use --fast to produce statistics
- more quickly, but possibly less accurately.
-
* map
Helps you keep track of your repositories, and the connections between them,