diff options
author | Joey Hess <joey@kitenet.net> | 2014-01-21 18:46:39 -0400 |
---|---|---|
committer | Joey Hess <joey@kitenet.net> | 2014-01-21 18:49:25 -0400 |
commit | 7c3f0eae77b07ccc65c4e30d9eb1288781bd0c02 (patch) | |
tree | af6b86482c4f67f710349677a48917ac29fa404f | |
parent | d71baf07108c4903c444175ca482af1ed4cca1b4 (diff) |
benchmarked numcopies .gitattributes in preferred content
Checking .gitattributes adds a full minute to a git annex find looking for
files that don't have enough copies. 2:25 increases to 3:27. I feel this is
too much of a slowdown to justify making it the default. So, exposed two
versions of the preferred content expression, a slow one and a fast but
approximate one.
I'm using the approximate one in the default preferred content expressions
to avoid slowing down the assistant.
-rw-r--r-- | Annex/FileMatcher.hs | 3 | ||||
-rw-r--r-- | GitAnnex/Options.hs | 4 | ||||
-rw-r--r-- | Limit.hs | 41 | ||||
-rw-r--r-- | Types/StandardGroups.hs | 6 | ||||
-rw-r--r-- | debian/changelog | 2 | ||||
-rw-r--r-- | doc/git-annex.mdwn | 10 | ||||
-rw-r--r-- | doc/preferred_content.mdwn | 8 | ||||
-rw-r--r-- | doc/todo/preferred_content_numcopies_check.mdwn | 4 |
8 files changed, 42 insertions, 36 deletions
diff --git a/Annex/FileMatcher.hs b/Annex/FileMatcher.hs index 6ec0bace9..b26a0d7ac 100644 --- a/Annex/FileMatcher.hs +++ b/Annex/FileMatcher.hs @@ -70,7 +70,8 @@ parseToken checkpresent checkpreferreddir groupmap t [ ("include", limitInclude) , ("exclude", limitExclude) , ("copies", limitCopies) - , ("numcopiesneeded", limitNumCopiesNeeded) + , ("lackingcopies", limitLackingCopies False) + , ("approxlackingcopies", limitLackingCopies True) , ("inbackend", limitInBackend) , ("largerthan", limitSize (>)) , ("smallerthan", limitSize (<)) diff --git a/GitAnnex/Options.hs b/GitAnnex/Options.hs index 10fcc0073..dc9a0be31 100644 --- a/GitAnnex/Options.hs +++ b/GitAnnex/Options.hs @@ -42,8 +42,10 @@ options = Option.common ++ "match files present in a remote" , Option ['C'] ["copies"] (ReqArg Limit.addCopies paramNumber) "skip files with fewer copies" - , Option [] ["numcopiesneeded"] (ReqArg Limit.addNumCopiesNeeded paramNumber) + , Option [] ["lackingcopies"] (ReqArg (Limit.addLackingCopies False) paramNumber) "match files that need more copies" + , Option [] ["approxlackingcopies"] (ReqArg (Limit.addLackingCopies True) paramNumber) + "match files that need more copies (faster)" , Option ['B'] ["inbackend"] (ReqArg Limit.addInBackend paramName) "match files using a key-value backend" , Option [] ["inallgroup"] (ReqArg Limit.addInAllGroup paramGroup) @@ -178,29 +178,26 @@ limitCopies want = case split ":" want of | "+" `isSuffixOf` s = (>=) <$> readTrustLevel (beginning s) | otherwise = (==) <$> readTrustLevel s -{- Adds a limit to match files that need more copies made. - - - - Does not look at annex.numcopies .gitattributes, because that - - would require querying git check-attr every time a preferred content - - expression is checked, which would probably be quite slow. - -} -addNumCopiesNeeded :: String -> Annex () -addNumCopiesNeeded = addLimit . 
limitNumCopiesNeeded - -limitNumCopiesNeeded :: MkLimit -limitNumCopiesNeeded want = case readish want of - Just needed -> Right $ \notpresent -> checkKey $ - handle needed notpresent - Nothing -> Left "bad value for numcopiesneeded" +{- Adds a limit to match files that need more copies made. -} +addLackingCopies :: Bool -> String -> Annex () +addLackingCopies approx = addLimit . limitLackingCopies approx + +limitLackingCopies :: Bool -> MkLimit +limitLackingCopies approx want = case readish want of + Just needed -> Right $ \notpresent mi -> flip checkKey mi $ + handle mi needed notpresent + Nothing -> Left "bad value for number of lacking copies" where - handle needed notpresent key = do - gv <- getGlobalNumCopies - case gv of - Nothing -> return False - Just (NumCopies numcopies) -> do - us <- filter (`S.notMember` notpresent) - <$> (trustExclude UnTrusted =<< Remote.keyLocations key) - return $ numcopies - length us >= needed + handle mi needed notpresent key = do + NumCopies numcopies <- if approx + then approxNumCopies + else case mi of + MatchingKey _ -> approxNumCopies + MatchingFile fi -> getGlobalFileNumCopies $ matchFile fi + us <- filter (`S.notMember` notpresent) + <$> (trustExclude UnTrusted =<< Remote.keyLocations key) + return $ numcopies - length us >= needed + approxNumCopies = fromMaybe defaultNumCopies <$> getGlobalNumCopies {- Adds a limit to skip files not believed to be present in all - repositories in the specified group. -} diff --git a/Types/StandardGroups.hs b/Types/StandardGroups.hs index c4c3ba9f3..f89b4e424 100644 --- a/Types/StandardGroups.hs +++ b/Types/StandardGroups.hs @@ -93,6 +93,8 @@ notArchived :: String notArchived = "not (copies=archive:1 or copies=smallarchive:1)" {- Most repositories want any content that is only on untrusted - - or dead repositories, or that otherwise does not have enough copies. -} + - or dead repositories, or that otherwise does not have enough copies. 
+ - Does not look at .gitattributes since that is quite a lot slower. + -} lastResort :: String -> PreferredContentExpression -lastResort s = "(" ++ s ++ ") or numcopiesneeded=1" +lastResort s = "(" ++ s ++ ") or approxlackingcopies=1" diff --git a/debian/changelog b/debian/changelog index d41d2aac1..aba8a5d3f 100644 --- a/debian/changelog +++ b/debian/changelog @@ -14,7 +14,7 @@ git-annex (5.20140118) UNRELEASED; urgency=medium command is used to set the global number of copies, any annex.numcopies git configs will be ignored. * assistant: Make the prefs page set the global numcopies. - * Add numcopiesneeded preferred content expression. + * Add lackingcopies and approxlackingcopies preferred content expressions. * Client, transfer, incremental backup, and archive repositories now want to get content that does not yet have enough copies. * repair: Check git version at run time. diff --git a/doc/git-annex.mdwn b/doc/git-annex.mdwn index 6e7a6ed55..279fa24dd 100644 --- a/doc/git-annex.mdwn +++ b/doc/git-annex.mdwn @@ -1022,14 +1022,16 @@ file contents are present at either of two repositories. copies, on remotes in the specified group. For example, `--copies=archive:2` -* `--numcopiesneeded=number` +* `--lackingcopies=number` Matches only files that git-annex believes need the specified number or more additional copies to be made in order to satisfy their numcopies - setting, as configured by the global numcopies setting of the repository. + settings. - Note that for various reasons, including speed, this does not look - at the annex.numcopies .gitattributes settings of files. +* `--approxlackingcopies=number` + + Like lackingcopies, but does not look at .gitattributes annex.numcopies + settings. This makes it significantly faster. 
* `--inbackend=name` diff --git a/doc/preferred_content.mdwn b/doc/preferred_content.mdwn index b18f46c33..039df3878 100644 --- a/doc/preferred_content.mdwn +++ b/doc/preferred_content.mdwn @@ -113,7 +113,7 @@ any repository that can will back it up.) All content is preferred, unless it's for a file in a "archive" directory, which has reached an archive repository. -`((exclude=*/archive/* and exclude=archive/*) or (not (copies=archive:1 or copies=smallarchive:1))) or numcopiesneeded=1` +`((exclude=*/archive/* and exclude=archive/*) or (not (copies=archive:1 or copies=smallarchive:1))) or roughlylackingcopies=1` ### transfer @@ -147,20 +147,20 @@ All content is preferred. Only prefers content that's not already backed up to another backup or incremental backup repository. -`(include=* and (not copies=backup:1) and (not copies=incrementalbackup:1)) or numcopiesneeded=1` +`(include=* and (not copies=backup:1) and (not copies=incrementalbackup:1)) or approxlackingcopies=1` ### small archive Only prefers content that's located in an "archive" directory, and only if it's not already been archived somewhere else. -`((include=*/archive/* or include=archive/*) and not (copies=archive:1 or copies=smallarchive:1)) or numcopiesneeded=1` +`((include=*/archive/* or include=archive/*) and not (copies=archive:1 or copies=smallarchive:1)) or approxlackingcopies=1` ### full archive All content is preferred, unless it's already been archived somewhere else. 
-`(not (copies=archive:1 or copies=smallarchive:1)) or numcopiesneeded=1` +`(not (copies=archive:1 or copies=smallarchive:1)) or approxlackingcopies=1` Note that if you want to archive multiple copies (not a bad idea!), you should instead configure all your archive repositories with a diff --git a/doc/todo/preferred_content_numcopies_check.mdwn b/doc/todo/preferred_content_numcopies_check.mdwn index 8aa736a04..2e007460f 100644 --- a/doc/todo/preferred_content_numcopies_check.mdwn +++ b/doc/todo/preferred_content_numcopies_check.mdwn @@ -59,7 +59,9 @@ Conclusion: to instead end with "or numcopiesneeded=1" **done** * See if "numcopiesneeded=N" can check .gitattributes without getting a lot slower. If now, perhaps add a "numcopiesneededaccurate=N" that - checks it. + checks it. **done** + +[[done]] ## Stability analysis |