summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorGravatar Joey Hess <joey@kitenet.net>2014-01-21 18:46:39 -0400
committerGravatar Joey Hess <joey@kitenet.net>2014-01-21 18:49:25 -0400
commit7c3f0eae77b07ccc65c4e30d9eb1288781bd0c02 (patch)
treeaf6b86482c4f67f710349677a48917ac29fa404f
parentd71baf07108c4903c444175ca482af1ed4cca1b4 (diff)
benchmarked numcopies .gitattributes in preferred content
Checking .gitattributes adds a full minute to a git annex find looking for files that don't have enough copies. 2:25 increases to 3:27. I feel this is too much of a slowdown to justify making it the default. So, exposed two versions of the preferred content expression, a slow one and a fast but approximate one. I'm using the approximate one in the default preferred content expressions to avoid slowing down the assistant.
-rw-r--r--Annex/FileMatcher.hs3
-rw-r--r--GitAnnex/Options.hs4
-rw-r--r--Limit.hs41
-rw-r--r--Types/StandardGroups.hs6
-rw-r--r--debian/changelog2
-rw-r--r--doc/git-annex.mdwn10
-rw-r--r--doc/preferred_content.mdwn8
-rw-r--r--doc/todo/preferred_content_numcopies_check.mdwn4
8 files changed, 42 insertions, 36 deletions
diff --git a/Annex/FileMatcher.hs b/Annex/FileMatcher.hs
index 6ec0bace9..b26a0d7ac 100644
--- a/Annex/FileMatcher.hs
+++ b/Annex/FileMatcher.hs
@@ -70,7 +70,8 @@ parseToken checkpresent checkpreferreddir groupmap t
[ ("include", limitInclude)
, ("exclude", limitExclude)
, ("copies", limitCopies)
- , ("numcopiesneeded", limitNumCopiesNeeded)
+ , ("lackingcopies", limitLackingCopies False)
+ , ("approxlackingcopies", limitLackingCopies True)
, ("inbackend", limitInBackend)
, ("largerthan", limitSize (>))
, ("smallerthan", limitSize (<))
diff --git a/GitAnnex/Options.hs b/GitAnnex/Options.hs
index 10fcc0073..dc9a0be31 100644
--- a/GitAnnex/Options.hs
+++ b/GitAnnex/Options.hs
@@ -42,8 +42,10 @@ options = Option.common ++
"match files present in a remote"
, Option ['C'] ["copies"] (ReqArg Limit.addCopies paramNumber)
"skip files with fewer copies"
- , Option [] ["numcopiesneeded"] (ReqArg Limit.addNumCopiesNeeded paramNumber)
+ , Option [] ["lackingcopies"] (ReqArg (Limit.addLackingCopies False) paramNumber)
"match files that need more copies"
+ , Option [] ["approxlackingcopies"] (ReqArg (Limit.addLackingCopies True) paramNumber)
+ "match files that need more copies (faster)"
, Option ['B'] ["inbackend"] (ReqArg Limit.addInBackend paramName)
"match files using a key-value backend"
, Option [] ["inallgroup"] (ReqArg Limit.addInAllGroup paramGroup)
diff --git a/Limit.hs b/Limit.hs
index 471a0c278..6ce444325 100644
--- a/Limit.hs
+++ b/Limit.hs
@@ -178,29 +178,26 @@ limitCopies want = case split ":" want of
| "+" `isSuffixOf` s = (>=) <$> readTrustLevel (beginning s)
| otherwise = (==) <$> readTrustLevel s
-{- Adds a limit to match files that need more copies made.
- -
- - Does not look at annex.numcopies .gitattributes, because that
- - would require querying git check-attr every time a preferred content
- - expression is checked, which would probably be quite slow.
- -}
-addNumCopiesNeeded :: String -> Annex ()
-addNumCopiesNeeded = addLimit . limitNumCopiesNeeded
-
-limitNumCopiesNeeded :: MkLimit
-limitNumCopiesNeeded want = case readish want of
- Just needed -> Right $ \notpresent -> checkKey $
- handle needed notpresent
- Nothing -> Left "bad value for numcopiesneeded"
+{- Adds a limit to match files that need more copies made. -}
+addLackingCopies :: Bool -> String -> Annex ()
+addLackingCopies approx = addLimit . limitLackingCopies approx
+
+limitLackingCopies :: Bool -> MkLimit
+limitLackingCopies approx want = case readish want of
+ Just needed -> Right $ \notpresent mi -> flip checkKey mi $
+ handle mi needed notpresent
+ Nothing -> Left "bad value for number of lacking copies"
where
- handle needed notpresent key = do
- gv <- getGlobalNumCopies
- case gv of
- Nothing -> return False
- Just (NumCopies numcopies) -> do
- us <- filter (`S.notMember` notpresent)
- <$> (trustExclude UnTrusted =<< Remote.keyLocations key)
- return $ numcopies - length us >= needed
+ handle mi needed notpresent key = do
+ NumCopies numcopies <- if approx
+ then approxNumCopies
+ else case mi of
+ MatchingKey _ -> approxNumCopies
+ MatchingFile fi -> getGlobalFileNumCopies $ matchFile fi
+ us <- filter (`S.notMember` notpresent)
+ <$> (trustExclude UnTrusted =<< Remote.keyLocations key)
+ return $ numcopies - length us >= needed
+ approxNumCopies = fromMaybe defaultNumCopies <$> getGlobalNumCopies
{- Adds a limit to skip files not believed to be present in all
- repositories in the specified group. -}
diff --git a/Types/StandardGroups.hs b/Types/StandardGroups.hs
index c4c3ba9f3..f89b4e424 100644
--- a/Types/StandardGroups.hs
+++ b/Types/StandardGroups.hs
@@ -93,6 +93,8 @@ notArchived :: String
notArchived = "not (copies=archive:1 or copies=smallarchive:1)"
{- Most repositories want any content that is only on untrusted
- - or dead repositories, or that otherwise does not have enough copies. -}
+ - or dead repositories, or that otherwise does not have enough copies.
+ - Does not look at .gitattributes since that is quite a lot slower.
+ -}
lastResort :: String -> PreferredContentExpression
-lastResort s = "(" ++ s ++ ") or numcopiesneeded=1"
+lastResort s = "(" ++ s ++ ") or approxlackingcopies=1"
diff --git a/debian/changelog b/debian/changelog
index d41d2aac1..aba8a5d3f 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -14,7 +14,7 @@ git-annex (5.20140118) UNRELEASED; urgency=medium
command is used to set the global number of copies, any annex.numcopies
git configs will be ignored.
* assistant: Make the prefs page set the global numcopies.
- * Add numcopiesneeded preferred content expression.
+ * Add lackingcopies and approxlackingcopies preferred content expressions.
* Client, transfer, incremental backup, and archive repositories
now want to get content that does not yet have enough copies.
* repair: Check git version at run time.
diff --git a/doc/git-annex.mdwn b/doc/git-annex.mdwn
index 6e7a6ed55..279fa24dd 100644
--- a/doc/git-annex.mdwn
+++ b/doc/git-annex.mdwn
@@ -1022,14 +1022,16 @@ file contents are present at either of two repositories.
copies, on remotes in the specified group. For example,
`--copies=archive:2`
-* `--numcopiesneeded=number`
+* `--lackingcopies=number`
Matches only files that git-annex believes need the specified number or
more additional copies to be made in order to satisfy their numcopies
- setting, as configured by the global numcopies setting of the repository.
+ settings.
- Note that for various reasons, including speed, this does not look
- at the annex.numcopies .gitattributes settings of files.
+* `--approxlackingcopies=number`
+
+ Like lackingcopies, but does not look at .gitattributes annex.numcopies
+ settings. This makes it significantly faster.
* `--inbackend=name`
diff --git a/doc/preferred_content.mdwn b/doc/preferred_content.mdwn
index b18f46c33..039df3878 100644
--- a/doc/preferred_content.mdwn
+++ b/doc/preferred_content.mdwn
@@ -113,7 +113,7 @@ any repository that can will back it up.)
All content is preferred, unless it's for a file in a "archive" directory,
which has reached an archive repository.
-`((exclude=*/archive/* and exclude=archive/*) or (not (copies=archive:1 or copies=smallarchive:1))) or numcopiesneeded=1`
+`((exclude=*/archive/* and exclude=archive/*) or (not (copies=archive:1 or copies=smallarchive:1))) or approxlackingcopies=1`
### transfer
@@ -147,20 +147,20 @@ All content is preferred.
Only prefers content that's not already backed up to another backup
or incremental backup repository.
-`(include=* and (not copies=backup:1) and (not copies=incrementalbackup:1)) or numcopiesneeded=1`
+`(include=* and (not copies=backup:1) and (not copies=incrementalbackup:1)) or approxlackingcopies=1`
### small archive
Only prefers content that's located in an "archive" directory, and
only if it's not already been archived somewhere else.
-`((include=*/archive/* or include=archive/*) and not (copies=archive:1 or copies=smallarchive:1)) or numcopiesneeded=1`
+`((include=*/archive/* or include=archive/*) and not (copies=archive:1 or copies=smallarchive:1)) or approxlackingcopies=1`
### full archive
All content is preferred, unless it's already been archived somewhere else.
-`(not (copies=archive:1 or copies=smallarchive:1)) or numcopiesneeded=1`
+`(not (copies=archive:1 or copies=smallarchive:1)) or approxlackingcopies=1`
Note that if you want to archive multiple copies (not a bad idea!),
you should instead configure all your archive repositories with a
diff --git a/doc/todo/preferred_content_numcopies_check.mdwn b/doc/todo/preferred_content_numcopies_check.mdwn
index 8aa736a04..2e007460f 100644
--- a/doc/todo/preferred_content_numcopies_check.mdwn
+++ b/doc/todo/preferred_content_numcopies_check.mdwn
@@ -59,7 +59,9 @@ Conclusion:
to instead end with "or numcopiesneeded=1" **done**
* See if "numcopiesneeded=N" can check .gitattributes without getting
a lot slower. If not, perhaps add a "numcopiesneededaccurate=N" that
- checks it.
+ checks it. **done**
+
+[[done]]
## Stability analysis