summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorGravatar Joey Hess <joeyh@joeyh.name>2015-04-05 12:50:02 -0400
committerGravatar Joey Hess <joeyh@joeyh.name>2015-04-05 12:50:02 -0400
commit2ccd9c5e7cf3266d2270741e1535d78350afcee9 (patch)
tree3e933e4ccf7f53f7f9972fc6c01d6de83d643d28
parentb855580614852c7558fb2aad387609d15c4b0c6b (diff)
rethought distributed fsck; instead add activity.log and expire command
This is much more space efficient!
-rw-r--r--CmdLine/GitAnnex.hs2
-rw-r--r--Command/Expire.hs100
-rw-r--r--Command/Fsck.hs114
-rw-r--r--Logs.hs7
-rw-r--r--Logs/Activity.hs37
-rw-r--r--Logs/Location.hs1
-rw-r--r--Logs/Presence.hs4
-rw-r--r--debian/changelog6
-rw-r--r--doc/git-annex-expire.mdwn61
-rw-r--r--doc/git-annex-fsck.mdwn38
-rw-r--r--doc/git-annex.mdwn5
-rw-r--r--doc/internals.mdwn8
12 files changed, 239 insertions, 144 deletions
diff --git a/CmdLine/GitAnnex.hs b/CmdLine/GitAnnex.hs
index 6dedd94f5..6aeefda05 100644
--- a/CmdLine/GitAnnex.hs
+++ b/CmdLine/GitAnnex.hs
@@ -45,6 +45,7 @@ import qualified Command.Describe
import qualified Command.InitRemote
import qualified Command.EnableRemote
import qualified Command.Fsck
+import qualified Command.Expire
import qualified Command.Repair
import qualified Command.Unused
import qualified Command.DropUnused
@@ -169,6 +170,7 @@ cmds = concat
, Command.VCycle.cmd
, Command.Fix.cmd
, Command.Fsck.cmd
+ , Command.Expire.cmd
, Command.Repair.cmd
, Command.Unused.cmd
, Command.DropUnused.cmd
diff --git a/Command/Expire.hs b/Command/Expire.hs
new file mode 100644
index 000000000..d01d3a965
--- /dev/null
+++ b/Command/Expire.hs
@@ -0,0 +1,100 @@
+{- git-annex command
+ -
+ - Copyright 2015 Joey Hess <id@joeyh.name>
+ -
+ - Licensed under the GNU GPL version 3 or higher.
+ -}
+
+module Command.Expire where
+
+import Common.Annex
+import Command
+import Logs.Activity
+import Logs.UUID
+import Logs.MapLog
+import Logs.Trust
+import Annex.UUID
+import qualified Remote
+import Utility.HumanTime
+
+import Data.Time.Clock.POSIX
+import qualified Data.Map as M
+
+cmd :: [Command]
+cmd = [withOptions [activityOption] $ command "expire" paramExpire seek
+ SectionMaintenance "expire inactive repositories"]
+
+paramExpire :: String
+paramExpire = (paramRepeating $ paramOptional paramRemote ++ ":" ++ paramTime)
+
+activityOption :: Option
+activityOption = fieldOption [] "activity" "Name" "specify activity"
+
+seek :: CommandSeek
+seek ps = do
+ expire <- parseExpire ps
+ wantact <- getOptionField activityOption (pure . parseActivity)
+ actlog <- lastActivities wantact
+ u <- getUUID
+ us <- filter (/= u) . M.keys <$> uuidMap
+ descs <- uuidMap
+ seekActions $ pure $ map (start expire actlog descs) us
+
+start :: Expire -> Log Activity -> M.Map UUID String -> UUID -> CommandStart
+start (Expire expire) actlog descs u =
+ case lastact of
+ Just ent | notexpired ent -> checktrust (== DeadTrusted) $ do
+ showStart "unexpire" desc
+ showNote =<< whenactive
+ trustSet u SemiTrusted
+ _ -> checktrust (/= DeadTrusted) $ do
+ showStart "expire" desc
+ showNote =<< whenactive
+ trustSet u DeadTrusted
+ where
+ lastact = changed <$> M.lookup u actlog
+ whenactive = case lastact of
+ Just (Date t) -> do
+ d <- liftIO $ durationSince $ posixSecondsToUTCTime t
+ return $ "last active: " ++ fromDuration d ++ " ago"
+ _ -> return "no activity"
+ desc = fromUUID u ++ " " ++ fromMaybe "" (M.lookup u descs)
+ notexpired ent = case ent of
+ Unknown -> False
+ Date t -> case lookupexpire of
+ Just (Just expiretime) -> t >= expiretime
+ _ -> True
+ lookupexpire = headMaybe $ catMaybes $
+ map (`M.lookup` expire) [Just u, Nothing]
+ checktrust want a = ifM (want <$> lookupTrust u)
+ ( do
+ void a
+ next $ next $ return True
+ , stop
+ )
+
+data Expire = Expire (M.Map (Maybe UUID) (Maybe POSIXTime))
+
+parseExpire :: [String] -> Annex Expire
+parseExpire [] = error "Specify an expire time."
+parseExpire ps = do
+ now <- liftIO getPOSIXTime
+ Expire . M.fromList <$> mapM (parse now) ps
+ where
+ parse now s = case separate (== ':') s of
+ (t, []) -> return (Nothing, parsetime now t)
+ (n, t) -> do
+ r <- Remote.nameToUUID n
+ return (Just r, parsetime now t)
+ parsetime _ "never" = Nothing
+ parsetime now s = case parseDuration s of
+ Nothing -> error $ "bad expire time: " ++ s
+ Just d -> Just (now - durationToPOSIXTime d)
+
+parseActivity :: Maybe String -> Maybe Activity
+parseActivity Nothing = Nothing
+parseActivity (Just s) = case readish s of
+ Nothing -> error $ "Unknown activity. Choose from: " ++
+ unwords (map show [minBound..maxBound :: Activity])
+ Just v -> Just v
+
diff --git a/Command/Fsck.hs b/Command/Fsck.hs
index ec89a4351..08753b612 100644
--- a/Command/Fsck.hs
+++ b/Command/Fsck.hs
@@ -22,8 +22,8 @@ import Annex.Direct
import Annex.Perms
import Annex.Link
import Logs.Location
-import Logs.Presence
import Logs.Trust
+import Logs.Activity
import Config.NumCopies
import Annex.UUID
import Utility.DataUnits
@@ -39,7 +39,6 @@ import Data.Time.Clock.POSIX
import Data.Time
import System.Posix.Types (EpochTime)
import System.Locale
-import qualified Data.Map as M
cmd :: [Command]
cmd = [withOptions fsckOptions $ command "fsck" paramPaths seek
@@ -58,22 +57,12 @@ incrementalScheduleOption :: Option
incrementalScheduleOption = fieldOption [] "incremental-schedule" paramTime
"schedule incremental fscking"
-distributedOption :: Option
-distributedOption = flagOption [] "distributed" "distributed fsck mode"
-
-expireOption :: Option
-expireOption = fieldOption [] "expire"
- (paramRepeating $ paramOptional paramRemote ++ ":" ++ paramTime)
- "distributed expire mode"
-
fsckOptions :: [Option]
fsckOptions =
[ fsckFromOption
, startIncrementalOption
, moreIncrementalOption
, incrementalScheduleOption
- , distributedOption
- , expireOption
] ++ keyOptions ++ annexedMatchingOptions
seek :: CommandSeek
@@ -81,28 +70,28 @@ seek ps = do
from <- getOptionField fsckFromOption Remote.byNameWithUUID
u <- maybe getUUID (pure . Remote.uuid) from
i <- getIncremental u
- d <- getDistributed
withKeyOptions False
- (\k -> startKey i d k =<< getNumCopies)
- (withFilesInGit $ whenAnnexed $ start from i d)
+ (\k -> startKey i k =<< getNumCopies)
+ (withFilesInGit $ whenAnnexed $ start from i)
ps
withFsckDb i FsckDb.closeDb
+ recordActivity Fsck u
-start :: Maybe Remote -> Incremental -> Distributed -> FilePath -> Key -> CommandStart
-start from inc dist file key = do
+start :: Maybe Remote -> Incremental -> FilePath -> Key -> CommandStart
+start from inc file key = do
v <- Backend.getBackend file key
case v of
Nothing -> stop
Just backend -> do
numcopies <- getFileNumCopies file
case from of
- Nothing -> go $ perform dist key file backend numcopies
- Just r -> go $ performRemote dist key file backend numcopies r
+ Nothing -> go $ perform key file backend numcopies
+ Just r -> go $ performRemote key file backend numcopies r
where
go = runFsck inc file key
-perform :: Distributed -> Key -> FilePath -> Backend -> NumCopies -> Annex Bool
-perform dist key file backend numcopies = check
+perform :: Key -> FilePath -> Backend -> NumCopies -> Annex Bool
+perform key file backend numcopies = check
-- order matters
[ fixLink key file
, verifyLocationLog key file
@@ -110,14 +99,13 @@ perform dist key file backend numcopies = check
, verifyDirectMode key file
, checkKeySize key
, checkBackend backend key (Just file)
- , checkDistributed dist key Nothing
, checkKeyNumCopies key file numcopies
]
{- To fsck a remote, the content is retrieved to a tmp file,
- and checked locally. -}
-performRemote :: Distributed -> Key -> FilePath -> Backend -> NumCopies -> Remote -> Annex Bool
-performRemote dist key file backend numcopies remote =
+performRemote :: Key -> FilePath -> Backend -> NumCopies -> Remote -> Annex Bool
+performRemote key file backend numcopies remote =
dispatch =<< Remote.hasKey remote key
where
dispatch (Left err) = do
@@ -136,7 +124,6 @@ performRemote dist key file backend numcopies remote =
[ verifyLocationLogRemote key file remote present
, checkKeySizeRemote key remote localcopy
, checkBackendRemote backend key remote localcopy
- , checkDistributed dist key (Just $ Remote.uuid remote)
, checkKeyNumCopies key file numcopies
]
withtmp a = do
@@ -157,19 +144,18 @@ performRemote dist key file backend numcopies remote =
)
dummymeter _ = noop
-startKey :: Incremental -> Distributed -> Key -> NumCopies -> CommandStart
-startKey inc dist key numcopies =
+startKey :: Incremental -> Key -> NumCopies -> CommandStart
+startKey inc key numcopies =
case Backend.maybeLookupBackendName (Types.Key.keyBackendName key) of
Nothing -> stop
Just backend -> runFsck inc (key2file key) key $
- performKey dist key backend numcopies
+ performKey key backend numcopies
-performKey :: Distributed -> Key -> Backend -> NumCopies -> Annex Bool
-performKey dist key backend numcopies = check
+performKey :: Key -> Backend -> NumCopies -> Annex Bool
+performKey key backend numcopies = check
[ verifyLocationLog key (key2file key)
, checkKeySize key
, checkBackend backend key Nothing
- , checkDistributed dist key Nothing
, checkKeyNumCopies key (key2file key) numcopies
]
@@ -513,69 +499,3 @@ getIncremental u = do
when (now - realToFrac started >= durationToPOSIXTime delta) $
resetStartTime u
return True
-
-data Distributed
- = NonDistributed
- | Distributed POSIXTime
- | DistributedExpire POSIXTime (M.Map (Maybe UUID) (Maybe POSIXTime))
- deriving (Show)
-
-getDistributed :: Annex Distributed
-getDistributed = go =<< getOptionField expireOption parseexpire
- where
- go (Just m) = DistributedExpire <$> liftIO getPOSIXTime <*> pure m
- go Nothing = ifM (getOptionFlag distributedOption)
- ( Distributed <$> liftIO getPOSIXTime
- , return NonDistributed
- )
-
- parseexpire Nothing = return Nothing
- parseexpire (Just s) = do
- now <- liftIO getPOSIXTime
- Just . M.fromList <$> mapM (parseexpire' now) (words s)
- parseexpire' now s = case separate (== ':') s of
- (t, []) -> return (Nothing, parsetime now t)
- (n, t) -> do
- r <- Remote.nameToUUID n
- return (Just r, parsetime now t)
- parsetime _ "never" = Nothing
- parsetime now s = case parseDuration s of
- Nothing -> error $ "bad expire time: " ++ s
- Just d -> Just (now - durationToPOSIXTime d)
-
-checkDistributed :: Distributed -> Key -> Maybe UUID -> Annex Bool
-checkDistributed d k mu = do
- go d
- return True
- where
- go NonDistributed = noop
-
- -- This is called after fsck has checked the key's content, so
- -- if the key is present in the annex now, we just need to update
- -- the location log with the timestamp of the start of the fsck.
- --
- -- Note that reusing this timestamp means that the same log line
- -- is generated for each key, which keeps the size increase
- -- of the git-annex branch down.
- go (Distributed ts) = whenM (inAnnex k) $ do
- u <- maybe getUUID return mu
- logChange' (logThen ts) k u InfoPresent
-
- -- Get the location log for the key, and expire all entries
- -- that are older than their uuid's listed expiration date.
- -- (Except for the local repository.)
- go (DistributedExpire ts m) = do
- ls <- locationLog k
- hereu <- getUUID
- forM_ ls $ \l -> do
- let u = toUUID (info l)
- unless (u == hereu) $
- case lookupexpire u of
- Just (Just expiretime)
- | date l < expiretime ->
- logChange' (logThen ts) k u InfoMissing
- _ -> noop
- where
- lookupexpire u = headMaybe $ catMaybes $
- map (`M.lookup` m) [Just u, Nothing]
-
diff --git a/Logs.hs b/Logs.hs
index 1f8cf9f9c..252b5814d 100644
--- a/Logs.hs
+++ b/Logs.hs
@@ -1,6 +1,6 @@
{- git-annex log file names
-
- - Copyright 2013-2014 Joey Hess <id@joeyh.name>
+ - Copyright 2013-2015 Joey Hess <id@joeyh.name>
-
- Licensed under the GNU GPL version 3 or higher.
-}
@@ -40,6 +40,7 @@ topLevelUUIDBasedLogs =
, preferredContentLog
, requiredContentLog
, scheduleLog
+ , activityLog
, differenceLog
]
@@ -84,9 +85,13 @@ groupPreferredContentLog = "group-preferred-content.log"
scheduleLog :: FilePath
scheduleLog = "schedule.log"
+activityLog :: FilePath
+activityLog = "activity.log"
+
differenceLog :: FilePath
differenceLog = "difference.log"
+
{- The pathname of the location log file for a given key. -}
locationLogFile :: GitConfig -> Key -> String
locationLogFile config key = branchHashDir config key </> keyFile key ++ ".log"
diff --git a/Logs/Activity.hs b/Logs/Activity.hs
new file mode 100644
index 000000000..45262a633
--- /dev/null
+++ b/Logs/Activity.hs
@@ -0,0 +1,37 @@
+{- git-annex activity log
+ -
+ - Copyright 2015 Joey Hess <id@joeyh.name>
+ -
+ - Licensed under the GNU GPL version 3 or higher.
+ -}
+
+module Logs.Activity (
+ Log,
+ Activity(..),
+ recordActivity,
+ lastActivities,
+) where
+
+import Data.Time.Clock.POSIX
+
+import Common.Annex
+import qualified Annex.Branch
+import Logs
+import Logs.UUIDBased
+
+data Activity = Fsck
+ deriving (Eq, Read, Show, Enum, Bounded)
+
+recordActivity :: Activity -> UUID -> Annex ()
+recordActivity act uuid = do
+ ts <- liftIO getPOSIXTime
+ Annex.Branch.change activityLog $
+ showLog id . changeLog ts uuid (show act) . parseLog readish
+
+lastActivities :: Maybe Activity -> Annex (Log Activity)
+lastActivities wantact = parseLog onlywanted <$> Annex.Branch.get activityLog
+ where
+ onlywanted s = case readish s of
+ Just a | wanted a -> Just a
+ _ -> Nothing
+ wanted a = maybe True (a ==) wantact
diff --git a/Logs/Location.hs b/Logs/Location.hs
index 59375a512..391edb680 100644
--- a/Logs/Location.hs
+++ b/Logs/Location.hs
@@ -17,7 +17,6 @@ module Logs.Location (
LogStatus(..),
logStatus,
logChange,
- logChange',
loggedLocations,
loggedLocationsHistorical,
locationLog,
diff --git a/Logs/Presence.hs b/Logs/Presence.hs
index 469ed8de9..60e0c542a 100644
--- a/Logs/Presence.hs
+++ b/Logs/Presence.hs
@@ -16,7 +16,6 @@ module Logs.Presence (
addLog,
readLog,
logNow,
- logThen,
currentLog,
currentLogInfo,
historicalLogInfo,
@@ -44,9 +43,6 @@ logNow s i = do
now <- liftIO getPOSIXTime
return $ LogLine now s i
-logThen :: POSIXTime -> LogStatus -> String -> Annex LogLine
-logThen t s i = return $ LogLine t s i
-
{- Reads a log and returns only the info that is still in effect. -}
currentLogInfo :: FilePath -> Annex [String]
currentLogInfo file = map info <$> currentLog file
diff --git a/debian/changelog b/debian/changelog
index f0aaf9b8e..eea9bab0b 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -1,4 +1,4 @@
-git-annex (5.20150328) UNRELEASED; urgency=medium
+git-annex (5.20150405) UNRELEASED; urgency=medium
* Prevent git-ls-files from double-expanding wildcards when an
unexpanded wildcard is passed to a git-annex command like add or find.
@@ -20,8 +20,8 @@ git-annex (5.20150328) UNRELEASED; urgency=medium
multiple files.
* import: --deduplicate and --cleanduplicates now output the keys
corresponding to duplicated files they process.
- * fsck: Added --distributed and --expire options,
- for distributed fsck.
+ * expire: New command, for expiring inactive repositories.
+ * fsck: Record fsck activity for use by expire command.
* Fix truncation of parameters that could occur when using xargs git-annex.
* Significantly sped up processing of large numbers of directories
passed to a single git-annex command.
diff --git a/doc/git-annex-expire.mdwn b/doc/git-annex-expire.mdwn
new file mode 100644
index 000000000..ff167804a
--- /dev/null
+++ b/doc/git-annex-expire.mdwn
@@ -0,0 +1,61 @@
+# NAME
+
+git-annex expire - expire inactive repositories
+
+# SYNOPSIS
+
+git annex expire `[repository:]time ...`
+
+# DESCRIPTION
+
+This command expires repositories that have not performed some activity
+within a specified time period. A repository is expired by marking it as
+dead. De-expiration is also done; if a dead repository performed some
+activity recently, it is marked as semitrusted again.
+
+This can be useful when it's not possible to keep track of the state
+of repositories manually. For example, a distributed network of
+repositories where nobody can directly access all the repositories to
+check their status.
+
+The repository can be specified using the name of a remote,
+or the description or uuid of the repository.
+
+The time is in the form "60d" or "1y". A time of "never" will disable
+expiration.
+
+If a time is specified without a repository, it is used as the default
+value for all repositories. Note that the current repository is never
+expired.
+
+# OPTIONS
+
+* `--activity=Name`
+
+ Specify the activity that a repository must have performed to avoid being
+ expired. The default is any activity.
+
+ Currently, the only activity that can be performed to avoid expiration
+ is `git annex fsck`. Note that fscking a remote updates the
+ expiration of the remote repository, not the local repository.
+
+ The first version of git-annex that recorded fsck activity was
+ 5.20150405.
+
+# SEE ALSO
+
+[[git-annex]](1)
+
+[[git-annex-fsck]](1)
+
+[[git-annex-schedule]](1)
+
+[[git-annex-dead]](1)
+
+[[git-annex-semitrust]](1)
+
+# AUTHOR
+
+Joey Hess <id@joeyh.name>
+
+Warning: Automatically converted into a man page by mdwn2man. Edit with care.
diff --git a/doc/git-annex-fsck.mdwn b/doc/git-annex-fsck.mdwn
index 1f5d75f3e..cb27fe452 100644
--- a/doc/git-annex-fsck.mdwn
+++ b/doc/git-annex-fsck.mdwn
@@ -53,44 +53,6 @@ With parameters, only the specified files are checked.
git annex fsck --incremental-schedule 30d --time-limit 5h
-* `--distributed`
-
- Normally, fsck only fixes the git-annex location logs when an inconsistecy
- is detected. In distributed mode, each file that is checked will result
- in a location log update noting the time that it was present.
-
- This is useful in situations where repositories cannot be trusted to
- continue to exist. By running a periodic distributed fsck, those
- repositories can verify that they still exist and that the information
- about their contents is still accurate.
-
- This is not the default mode, because each distributed fsck increases
- the size of the git-annex branch. While it takes care to log identical
- location tracking lines for all keys, which will delta-compress well,
- there is still overhead in committing the changes. If this causes
- the git-annex branch to grow too big, it can be pruned using
- [[git-annex-forget]](1)
-
-* `--expire="[repository:]time`..."
-
- This option makes the fsck check for location logs of the specified
- repository that have not been updated by a distributed fsck within the
- specified time period. Such stale location logs are then thrown out, so
- git-annex will no longer think that a repository contains data, if it is
- not participating in distributed fscking.
-
- The repository can be specified using the name of a remote,
- or the description or uuid of the repository. If a time is specified
- without a repository, it is used as the default value for all
- repositories. Note that location logs for the current repository are
- never expired, since they can be verified directly.
-
- The time is in the form "60d" or "1y". A time of "never" will disable
- expiration.
-
- Note that a remote can always run `fsck` later on to re-update the
- location log if it was expired in error.
-
* `--numcopies=N`
Override the normally configured number of copies.
diff --git a/doc/git-annex.mdwn b/doc/git-annex.mdwn
index 27439cd3a..11fbc6e01 100644
--- a/doc/git-annex.mdwn
+++ b/doc/git-annex.mdwn
@@ -302,6 +302,11 @@ subdirectories).
See [[git-annex-fsck]](1) for details.
+* `expire [repository:]time ...`
+
+ Expires repositories that have not recently performed an activity
+ (such as a fsck).
+
* `unused`
Checks the annex for data that does not correspond to any files present
diff --git a/doc/internals.mdwn b/doc/internals.mdwn
index 824655a92..7e8f4b19a 100644
--- a/doc/internals.mdwn
+++ b/doc/internals.mdwn
@@ -247,6 +247,14 @@ Example:
42bf2035-0636-461d-a367-49e9dfd361dd fsck self 30m every day at any time; fsck 4b3ebc86-0faf-4892-83c5-ce00cbe30f0a 1h every year at any time timestamp=1385646997.053162s
+## `activity.log`
+
+Used to record the times of activities, such as fscks.
+
+Example:
+
+ 42bf2035-0636-461d-a367-49e9dfd361dd Fsck timestamp=1422387398.30395s
+
## `transitions.log`
Used to record transitions, eg by `git annex forget`