aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorGravatar Joey Hess <joey@kitenet.net>2014-01-22 22:48:56 -0400
committerGravatar Joey Hess <joey@kitenet.net>2014-01-22 22:53:18 -0400
commit43e3f104239eda7d484cfacc0c13ab6066997845 (patch)
tree09f2d8133cd720394e1bc882594fc44a59e20588
parentd1c13779bb4606cdb6d80a6e1dfd2cdb11478199 (diff)
assistant unused file handling
Make sanity checker run git annex unused daily, and queue up transfers of unused files to any remotes that will have them. The transfer retrying code works for us here, so eg when a backup disk remote is plugged in, any transfers to it are done. Once the unused files reach a remote, they'll be removed locally as unwanted. If the setup does not cause unused files to go to a remote, they'll pile up, and the sanity checker detects this using some heuristics that are pretty good -- 1000 unused files, or 10% of disk used by unused files, or more disk wasted by unused files than is left free. Once it detects this, it pops up an alert in the webapp, with a button to take action. TODO: Webapp UI to configure this, and also the ability to launch an immediate cleanup of all unused files. This commit was sponsored by Simon Michael.
-rw-r--r--Assistant.hs2
-rw-r--r--Assistant/Alert.hs19
-rw-r--r--Assistant/Threads/SanityChecker.hs105
-rw-r--r--Assistant/Threads/WebApp.hs1
-rw-r--r--Assistant/TransferQueue.hs2
-rw-r--r--Assistant/Types/Alert.hs1
-rw-r--r--Logs/Unused.hs1
-rw-r--r--Types/GitConfig.hs5
-rw-r--r--Types/StandardGroups.hs2
-rw-r--r--debian/changelog9
-rw-r--r--doc/git-annex.mdwn13
-rw-r--r--doc/preferred_content.mdwn4
-rw-r--r--doc/todo/Limit_file_revision_history.mdwn7
13 files changed, 157 insertions, 14 deletions
diff --git a/Assistant.hs b/Assistant.hs
index d4786f99a..800a3ef78 100644
--- a/Assistant.hs
+++ b/Assistant.hs
@@ -145,7 +145,7 @@ startDaemon assistant foreground startdelay cannotrun listenhost startbrowser =
, assist $ transferPollerThread
, assist $ transfererThread
, assist $ daemonStatusThread
- , assist $ sanityCheckerDailyThread
+ , assist $ sanityCheckerDailyThread urlrenderer
, assist $ sanityCheckerHourlyThread
, assist $ problemFixerThread urlrenderer
#ifdef WITH_CLIBS
diff --git a/Assistant/Alert.hs b/Assistant/Alert.hs
index 055e66de5..b5ee706b8 100644
--- a/Assistant/Alert.hs
+++ b/Assistant/Alert.hs
@@ -260,6 +260,25 @@ upgradeFailedAlert :: String -> Alert
upgradeFailedAlert msg = (errorAlert msg [])
{ alertHeader = Just $ fromString "Upgrade failed." }
+unusedFilesAlert :: [AlertButton] -> String -> Alert
+unusedFilesAlert buttons message = Alert
+ { alertHeader = Just $ fromString $ unwords
+ [ "Old and deleted files are piling up --"
+ , message
+ ]
+ , alertIcon = Just InfoIcon
+ , alertPriority = High
+ , alertButtons = buttons
+ , alertClosable = True
+ , alertClass = Message
+ , alertMessageRender = renderData
+ , alertCounter = 0
+ , alertBlockDisplay = True
+ , alertName = Just UnusedFilesAlert
+ , alertCombiner = Just $ fullCombiner $ \new _old -> new
+ , alertData = []
+ }
+
brokenRepositoryAlert :: [AlertButton] -> Alert
brokenRepositoryAlert = errorAlert "Serious problems have been detected with your repository. This needs your immediate attention!"
diff --git a/Assistant/Threads/SanityChecker.hs b/Assistant/Threads/SanityChecker.hs
index a1be8dffc..07d46ff84 100644
--- a/Assistant/Threads/SanityChecker.hs
+++ b/Assistant/Threads/SanityChecker.hs
@@ -5,6 +5,8 @@
- Licensed under the GNU GPL version 3 or higher.
-}
+{-# LANGUAGE CPP #-}
+
module Assistant.Threads.SanityChecker (
sanityCheckerStartupThread,
sanityCheckerDailyThread,
@@ -16,7 +18,10 @@ import Assistant.DaemonStatus
import Assistant.Alert
import Assistant.Repair
import Assistant.Ssh
+import Assistant.TransferQueue
+import Assistant.Types.UrlRenderer
import qualified Annex.Branch
+import qualified Git
import qualified Git.LsFiles
import qualified Git.Command
import qualified Git.Config
@@ -25,12 +30,26 @@ import qualified Assistant.Threads.Watcher as Watcher
import Utility.LogFile
import Utility.Batch
import Utility.NotificationBroadcaster
+import Utility.DiskFree
import Config
import Utility.HumanTime
+import Utility.DataUnits
import Git.Repair
import Git.Index
+import Logs.Unused
+import Logs.Location
+import Logs.Transfer
+import Annex.Content
+import Config.Files
+import Types.Key
+import qualified Annex
+#ifdef WITH_WEBAPP
+import Assistant.WebApp.Types
+#endif
import Data.Time.Clock.POSIX
+import qualified Data.Map as M
+import qualified Data.Text as T
{- This thread runs once at startup, and most other threads wait for it
- to finish. (However, the webapp thread does not, to prevent the UI
@@ -78,8 +97,8 @@ sanityCheckerHourlyThread = namedThread "SanityCheckerHourly" $ forever $ do
hourlyCheck
{- This thread wakes up daily to make sure the tree is in good shape. -}
-sanityCheckerDailyThread :: NamedThread
-sanityCheckerDailyThread = namedThread "SanityCheckerDaily" $ forever $ do
+sanityCheckerDailyThread :: UrlRenderer -> NamedThread
+sanityCheckerDailyThread urlrenderer = namedThread "SanityCheckerDaily" $ forever $ do
waitForNextCheck
debug ["starting sanity check"]
@@ -90,7 +109,8 @@ sanityCheckerDailyThread = namedThread "SanityCheckerDaily" $ forever $ do
modifyDaemonStatus_ $ \s -> s { sanityCheckRunning = True }
now <- liftIO getPOSIXTime -- before check started
- r <- either showerr return =<< (tryIO . batch) <~> dailyCheck
+ r <- either showerr return
+ =<< (tryIO . batch) <~> dailyCheck urlrenderer
modifyDaemonStatus_ $ \s -> s
{ sanityCheckRunning = False
@@ -119,8 +139,8 @@ waitForNextCheck = do
{- It's important to stay out of the Annex monad as much as possible while
- running potentially expensive parts of this check, since remaining in it
- will block the watcher. -}
-dailyCheck :: Assistant Bool
-dailyCheck = do
+dailyCheck :: UrlRenderer -> Assistant Bool
+dailyCheck urlrenderer = do
g <- liftAnnex gitRepo
batchmaker <- liftIO getBatchCommandMaker
@@ -147,6 +167,22 @@ dailyCheck = do
, Param "--auto"
] g
+ {- Check if the unused files found last time have been dealt with. -}
+ checkOldUnused urlrenderer
+
+ {- Run git-annex unused once per day. This is run as a separate
+ - process to stay out of the annex monad and so it can run as a
+ - batch job. -}
+ program <- liftIO readProgramFile
+ let (program', params') = batchmaker (program, [Param "unused"])
+ void $ liftIO $ boolSystem program' params'
+ {- Invalidate unused keys cache, and queue transfers of all unused
+ - keys. -}
+ unused <- liftAnnex unusedKeys'
+ void $ liftAnnex $ setUnusedKeys unused
+ forM_ unused $ \k ->
+ queueTransfers "unused" Later k Nothing Upload
+
return True
where
toonew timestamp now = now < (realToFrac (timestamp + slop) :: POSIXTime)
@@ -160,7 +196,8 @@ dailyCheck = do
insanity $ "found unstaged symlink: " ++ file
hourlyCheck :: Assistant ()
-hourlyCheck = checkLogSize 0
+hourlyCheck = do
+ checkLogSize 0
{- Rotate logs until log file size is < 1 mb. -}
checkLogSize :: Int -> Assistant ()
@@ -185,3 +222,59 @@ oneHour = 60 * 60
oneDay :: Int
oneDay = 24 * oneHour
+{- If annex.expireunused is set, find any keys that have lingered unused
+ - for the specified duration, and remove them.
+ -
+ - Otherwise, check to see if unused keys are piling up, and let the user
+ - know. This uses heuristics: 1000 unused keys, or more unused keys
+ - than the remaining free disk space, or more than 1/10th the total
+ - disk space being unused keys all suggest a problem. -}
+checkOldUnused :: UrlRenderer -> Assistant ()
+checkOldUnused urlrenderer = go =<< annexExpireUnused <$> liftAnnex Annex.getGitConfig
+ where
+ go (Just expireunused) = do
+ m <- liftAnnex $ readUnusedLog ""
+ now <- liftIO getPOSIXTime
+ let duration = durationToPOSIXTime expireunused
+ let oldkeys = M.keys $ M.filter (tooold now duration) m
+ forM_ oldkeys $ \k -> do
+ debug ["removing old unused key", key2file k]
+ liftAnnex $ do
+ removeAnnex k
+ logStatus k InfoMissing
+ go Nothing = maybe noop prompt
+ =<< toobig =<< liftAnnex (readUnusedLog "")
+
+ tooold now duration (_, mt) =
+ maybe False (\t -> now - t >= duration) mt
+
+ toobig m = do
+ let num = M.size m
+ let diskused = foldl' sumkeysize 0 (M.keys m)
+ df <- forpath getDiskFree
+ disksize <- forpath getDiskSize
+ return $ if moreused df diskused || tenthused disksize diskused
+ then Just $ roughSize storageUnits False diskused ++
+ " is used by old files"
+ else if num > 1000
+ then Just $ show num ++ " old files exist"
+ else Nothing
+
+ moreused Nothing _ = False
+ moreused (Just df) used = df <= used
+
+ tenthused Nothing _ = False
+ tenthused (Just disksize) used = used >= disksize `div` 10
+
+ sumkeysize s k = s + fromMaybe 0 (keySize k)
+
+ forpath a = liftAnnex $ inRepo $ liftIO . a . Git.repoPath
+
+ prompt msg =
+#ifdef WITH_WEBAPP
+ do
+ button <- mkAlertButton True (T.pack "Fix This") urlrenderer ConfigUnusedR
+ void $ addAlert $ unusedFilesAlert [button] msg
+#else
+ debug [msg]
+#endif
diff --git a/Assistant/Threads/WebApp.hs b/Assistant/Threads/WebApp.hs
index a7f9cc5b2..d2c2afd47 100644
--- a/Assistant/Threads/WebApp.hs
+++ b/Assistant/Threads/WebApp.hs
@@ -27,6 +27,7 @@ import Assistant.WebApp.Configurators.IA
import Assistant.WebApp.Configurators.WebDAV
import Assistant.WebApp.Configurators.XMPP
import Assistant.WebApp.Configurators.Preferences
+import Assistant.WebApp.Configurators.Unused
import Assistant.WebApp.Configurators.Edit
import Assistant.WebApp.Configurators.Delete
import Assistant.WebApp.Configurators.Fsck
diff --git a/Assistant/TransferQueue.hs b/Assistant/TransferQueue.hs
index 98fb2f06c..6d8e72852 100644
--- a/Assistant/TransferQueue.hs
+++ b/Assistant/TransferQueue.hs
@@ -1,6 +1,6 @@
{- git-annex assistant pending transfer queue
-
- - Copyright 2012 Joey Hess <joey@kitenet.net>
+ - Copyright 2012-2014 Joey Hess <joey@kitenet.net>
-
- Licensed under the GNU GPL version 3 or higher.
-}
diff --git a/Assistant/Types/Alert.hs b/Assistant/Types/Alert.hs
index e6fbe86d3..19fe55e6e 100644
--- a/Assistant/Types/Alert.hs
+++ b/Assistant/Types/Alert.hs
@@ -32,6 +32,7 @@ data AlertName
| SyncAlert
| NotFsckedAlert
| UpgradeAlert
+ | UnusedFilesAlert
deriving (Eq)
{- The first alert is the new alert, the second is an old alert.
diff --git a/Logs/Unused.hs b/Logs/Unused.hs
index 74f46b85e..cdfcd2165 100644
--- a/Logs/Unused.hs
+++ b/Logs/Unused.hs
@@ -21,6 +21,7 @@ module Logs.Unused (
readUnusedLog,
readUnusedMap,
unusedKeys,
+ setUnusedKeys,
unusedKeys'
) where
diff --git a/Types/GitConfig.hs b/Types/GitConfig.hs
index e19fdc42f..f8e3be98e 100644
--- a/Types/GitConfig.hs
+++ b/Types/GitConfig.hs
@@ -1,6 +1,6 @@
{- git-annex configuration
-
- - Copyright 2012 Joey Hess <joey@kitenet.net>
+ - Copyright 2012-2014 Joey Hess <joey@kitenet.net>
-
- Licensed under the GNU GPL version 3 or higher.
-}
@@ -20,6 +20,7 @@ import Config.Cost
import Types.Distribution
import Types.Availability
import Types.NumCopies
+import Utility.HumanTime
{- Main git-annex settings. Each setting corresponds to a git-config key
- such as annex.foo -}
@@ -46,6 +47,7 @@ data GitConfig = GitConfig
, annexLargeFiles :: Maybe String
, annexFsckNudge :: Bool
, annexAutoUpgrade :: AutoUpgrade
+ , annexExpireUnused :: Maybe Duration
, coreSymlinks :: Bool
, gcryptId :: Maybe String
}
@@ -75,6 +77,7 @@ extractGitConfig r = GitConfig
, annexLargeFiles = getmaybe (annex "largefiles")
, annexFsckNudge = getbool (annex "fscknudge") True
, annexAutoUpgrade = toAutoUpgrade $ getmaybe (annex "autoupgrade")
+ , annexExpireUnused = parseDuration =<< getmaybe (annex "expireunused")
, coreSymlinks = getbool "core.symlinks" True
, gcryptId = getmaybe "core.gcrypt-id"
}
diff --git a/Types/StandardGroups.hs b/Types/StandardGroups.hs
index f89b4e424..434156a0e 100644
--- a/Types/StandardGroups.hs
+++ b/Types/StandardGroups.hs
@@ -75,7 +75,7 @@ associatedDirectory _ _ = Nothing
{- See doc/preferred_content.mdwn for explanations of these expressions. -}
preferredContent :: StandardGroup -> PreferredContentExpression
preferredContent ClientGroup = lastResort $
- "(exclude=*/archive/* and exclude=archive/*) or (" ++ notArchived ++ ")"
+ "((exclude=*/archive/* and exclude=archive/*) or (" ++ notArchived ++ ")) and not unused"
preferredContent TransferGroup = lastResort $
"not (inallgroup=client and copies=client:2) and (" ++ preferredContent ClientGroup ++ ")"
preferredContent BackupGroup = "include=*"
diff --git a/debian/changelog b/debian/changelog
index 900d84984..420ae603b 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -18,6 +18,15 @@ git-annex (5.20140118) UNRELEASED; urgency=medium
preferred content expressions.
* Client, transfer, incremental backup, and archive repositories
now want to get content that does not yet have enough copies.
+ * Client, transfer, and source repositories now do not want to retain
+ unused file contents.
+ * assistant: Checks daily for unused file contents, and when possible
+ moves them to a repository (such as a backup repository) that
+ wants to retain them.
+ * assistant: annex.expireunused can be configured to cause unused
+ file contents to be deleted after some period of time.
+ * webapp: Nudge user to see if they want to expire old unused file
+ contents when a lot of them seem to be piling up in the repository.
* repair: Check git version at run time.
* assistant: Run the periodic git gc in batch mode.
diff --git a/doc/git-annex.mdwn b/doc/git-annex.mdwn
index 279fa24dd..3e239f7d6 100644
--- a/doc/git-annex.mdwn
+++ b/doc/git-annex.mdwn
@@ -1228,6 +1228,19 @@ Here are all the supported configuration settings.
to close it. On Mac OSX, when not using direct mode this defaults to
1 second, to work around a bad interaction with software there.
+* `annex.expireunused`
+
+ Controls what the assistant does about unused file contents
+ that are stored in the repository.
+
+ The default is `false`, which causes
+ all old and unused file contents to be retained, unless the assistant
+ is able to move them to some other repository (such as a backup repository).
+
+ Can be set to a time specification, like "7d" or "1m", and then
+ file contents that have been known to be unused for a week or a
+ month will be deleted.
+
* `annex.fscknudge`
When set to false, prevents the webapp from reminding you when using
diff --git a/doc/preferred_content.mdwn b/doc/preferred_content.mdwn
index 039df3878..92250c034 100644
--- a/doc/preferred_content.mdwn
+++ b/doc/preferred_content.mdwn
@@ -111,9 +111,9 @@ any repository that can will back it up.)
### client
All content is preferred, unless it's for a file in a "archive" directory,
-which has reached an archive repository.
+which has reached an archive repository, or is unused.
-`((exclude=*/archive/* and exclude=archive/*) or (not (copies=archive:1 or copies=smallarchive:1))) or roughlylackingcopies=1`
+`(((exclude=*/archive/* and exclude=archive/*) or (not (copies=archive:1 or copies=smallarchive:1))) or roughlylackingcopies=1) and not unused`
### transfer
diff --git a/doc/todo/Limit_file_revision_history.mdwn b/doc/todo/Limit_file_revision_history.mdwn
index c00b555b1..901d6f052 100644
--- a/doc/todo/Limit_file_revision_history.mdwn
+++ b/doc/todo/Limit_file_revision_history.mdwn
@@ -51,12 +51,13 @@ Finally, how to specify a feature request for git-annex?
> `and (not unused)`. Transfer repositories too, because typically
> only client repos connect to them, and so otherwise unused files
> would build up there. Backup repos would want unused files. I
-> think that archive repos would too.
+> think that archive repos would too. **done**
> * Make the assistant check for unused files periodically. Exactly
> how often may need to be tuned, but once per day seems reasonable
> for most repos. Note that the assistant could also notice on the
> fly when files are removed and mark their keys as unused if that was
> the last associated file. (Only currently possible in direct mode.)
+> **done**
> * After scanning for unused files, it makes sense for the
> assistant to queue transfers of unused files to any remotes that
> do want them (eg, backup remotes). If the files can successfully be
@@ -75,7 +76,9 @@ Finally, how to specify a feature request for git-annex?
> is not set, and there is some significant quantity of unused files
> (eg, more than 1000, or more than 1 gb, or more than the amount of
> remaining free disk space),
-> it can pop up a webapp alert asking to configure it.
+> it can pop up a webapp alert asking to configure it. **done**
+> * Webapp interface to configure annex.expireunused. Reasonable values
+> are no expiring, or any number of days.
>
> This does not cover every use case that was requested.
> But I don't see a cheap way to ensure it keeps eg the past 10 versions of