author    Joey Hess <joeyh@joeyh.name>  2015-03-31 13:29:51 -0400
committer Joey Hess <joeyh@joeyh.name>  2015-03-31 13:30:13 -0400
commit    23e90f6f3cb5a570e8bf9eb3d2798c27a5ba2537 (patch)
tree      6d6eab72231cb43bce52f0b332315b044b91341f
parent    41f7627baee212ce4a4a75126109d9569c790dd0 (diff)
importfeed: Avoid downloading a redundant feed item whose guid has already been downloaded, even when its url has changed.
To support this, always store the itemid in metadata; previously this was only done when annex.genmetadata was set.
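In outline, the change keeps a set of known item guids alongside the existing set of known urls, and skips an item when either matches. A minimal sketch of that check with simplified types; the hypothetical alreadyKnown wrapper stands in for the checkknown/knownitemid logic in Command/ImportFeed.hs below:

import qualified Data.Set as S

type URLString = String
type ItemId = String

data Cache = Cache
	{ knownurls :: S.Set URLString
	, knownitems :: S.Set ItemId
	}

-- Skip when the url is known, or when the item's guid is known and the
-- feed marks it as a permalink (the Bool that getItemId returns).
alreadyKnown :: Cache -> URLString -> Maybe (Bool, ItemId) -> Bool
alreadyKnown cache url guid = knownitemid || S.member url (knownurls cache)
  where
	knownitemid = case guid of
		Just (True, itemid) -> S.member itemid (knownitems cache)
		_ -> False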
-rw-r--r--  Command/ImportFeed.hs                             49
-rw-r--r--  Logs/Web.hs                                        8
-rw-r--r--  debian/changelog                                   4
-rw-r--r--  doc/git-annex-importfeed.mdwn                      2
-rw-r--r--  doc/git-annex.mdwn                                 3
-rw-r--r--  doc/todo/podcatching_handling_updated_files.mdwn   2
6 files changed, 54 insertions(+), 14 deletions(-)
diff --git a/Command/ImportFeed.hs b/Command/ImportFeed.hs
index 231c921c3..b60627cfe 100644
--- a/Command/ImportFeed.hs
+++ b/Command/ImportFeed.hs
@@ -73,7 +73,7 @@ perform opts cache url = do
 	v <- findDownloads url
 	case v of
 		[] -> do
-			feedProblem url "bad feed content"
+			feedProblem url "bad feed content; no enclosures to download"
 			next $ return True
 		l -> do
 			ok <- and <$> mapM (performDownload opts cache) l
@@ -96,21 +96,32 @@ data ToDownload = ToDownload
 
 data DownloadLocation = Enclosure URLString | QuviLink URLString
 
+type ItemId = String
+
 data Cache = Cache
 	{ knownurls :: S.Set URLString
+	, knownitems :: S.Set ItemId
 	, template :: Utility.Format.Format
 	}
 
 getCache :: Maybe String -> Annex Cache
 getCache opttemplate = ifM (Annex.getState Annex.force)
-	( ret S.empty
+	( ret S.empty S.empty
 	, do
 		showSideAction "checking known urls"
-		ret =<< S.fromList <$> knownUrls
+		(is, us) <- unzip <$> (mapM knownItems =<< knownUrls)
+		ret (S.fromList us) (S.fromList (concat is))
 	)
   where
 	tmpl = Utility.Format.gen $ fromMaybe defaultTemplate opttemplate
-	ret s = return $ Cache s tmpl
+	ret us is = return $ Cache us is tmpl
+
+knownItems :: (Key, URLString) -> Annex ([ItemId], URLString)
+knownItems (k, u) = do
+	itemids <- S.toList . S.filter (/= noneValue) . S.map fromMetaValue
+		. currentMetaDataValues itemIdField
+		<$> getCurrentMetaData k
+	return (itemids, u)
 
 findDownloads :: URLString -> Annex [ToDownload]
 findDownloads u = go =<< downloadFeed u
@@ -191,12 +202,18 @@ performDownload opts cache todownload = case location todownload of
   where
 	forced = Annex.getState Annex.force
 
-	{- Avoids downloading any urls that are already known to be
+	{- Avoids downloading any items that are already known to be
 	 - associated with a file in the annex, unless forced. -}
 	checkknown url a
-		| S.member url (knownurls cache) = ifM forced (a, return True)
+		| knownitemid || S.member url (knownurls cache)
+			= ifM forced (a, return True)
 		| otherwise = a
 
+	knownitemid = case getItemId (item todownload) of
+		-- only when it's a permalink
+		Just (True, itemid) -> S.member itemid (knownitems cache)
+		_ -> False
+
 	rundownload url extension getter = do
 		dest <- makeunique url (1 :: Integer) $
 			feedFile (template cache) todownload extension
@@ -211,8 +228,10 @@ performDownload opts cache todownload = case location todownload of
 						checkFeedBroken (feedurl todownload)
 					else do
 						forM_ ks $ \key ->
-							whenM (annexGenMetaData <$> Annex.getGitConfig) $
-								addMetaData key $ extractMetaData todownload
+							ifM (annexGenMetaData <$> Annex.getGitConfig)
+								( addMetaData key $ extractMetaData todownload
+								, addMetaData key $ minimalMetaData todownload
+								)
 						showEndOk
 						return True
@@ -275,6 +294,12 @@ extractMetaData i = meta
 	tometa (k, v) = (mkMetaFieldUnchecked k, S.singleton (toMetaValue v))
 	meta = MetaData $ M.fromList $ map tometa $ extractFields i
 
+minimalMetaData :: ToDownload -> MetaData
+minimalMetaData i = case getItemId (item i) of
+	Nothing -> emptyMetaData
+	Just (_, itemid) -> MetaData $ M.singleton itemIdField
+		(S.singleton $ toMetaValue itemid)
+
 {- Extract fields from the feed and item, that are both used as metadata,
  - and to generate the filename. -}
 extractFields :: ToDownload -> [(String, String)]
@@ -296,12 +321,18 @@ extractFields i = map (uncurry extractField)
 	feedauthor = getFeedAuthor $ feed i
 	itemauthor = getItemAuthor $ item i
 
+itemIdField :: MetaField
+itemIdField = mkMetaFieldUnchecked "itemid"
+
 extractField :: String -> [Maybe String] -> (String, String)
-extractField k [] = (k, "none")
+extractField k [] = (k, noneValue)
 extractField k (Just v:_)
 	| not (null v) = (k, v)
 extractField k (_:rest) = extractField k rest
 
+noneValue :: String
+noneValue = "none"
+
 {- Called when there is a problem with a feed.
  - Throws an error if the feed is broken, otherwise shows a warning. -}
 feedProblem :: URLString -> String -> Annex ()
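The itemid round-trip above spans two hunks: minimalMetaData writes the guid into the "itemid" field when a file is added, and knownItems later reads it back, discarding the "none" placeholder that extractField emits for missing values. A pure sketch of that round-trip, with a plain Map standing in for git-annex's MetaData machinery (storeItemId and readItemIds are hypothetical names):

import qualified Data.Map as M
import qualified Data.Set as S

type ItemId = String
type MetaData = M.Map String (S.Set String)  -- simplified stand-in

noneValue :: String
noneValue = "none"

-- Write side: store the feed item's guid, when there is one.
storeItemId :: Maybe ItemId -> MetaData
storeItemId Nothing = M.empty
storeItemId (Just itemid) = M.singleton "itemid" (S.singleton itemid)

-- Read side: recover stored guids, ignoring the "none" placeholder,
-- mirroring the S.filter (/= noneValue) in knownItems.
readItemIds :: MetaData -> [ItemId]
readItemIds m = S.toList $ S.filter (/= noneValue) $
	M.findWithDefault S.empty "itemid" m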
diff --git a/Logs/Web.hs b/Logs/Web.hs
index ed2f575bd..02d60170f 100644
--- a/Logs/Web.hs
+++ b/Logs/Web.hs
@@ -21,7 +21,6 @@ module Logs.Web (
 import qualified Data.ByteString.Lazy.Char8 as L
 import qualified Data.Map as M
-import Data.Tuple.Utils
 
 import Common.Annex
 import qualified Annex
@@ -70,7 +69,7 @@ setUrlMissing uuid key url = do
 	logChange key uuid InfoMissing
 
 {- Finds all known urls. -}
-knownUrls :: Annex [URLString]
+knownUrls :: Annex [(Key, URLString)]
 knownUrls = do
 	{- Ensure the git-annex branch's index file is up-to-date and
 	 - any journaled changes are reflected in it, since we're going
@@ -80,10 +79,13 @@ knownUrls = do
 	Annex.Branch.withIndex $ do
 		top <- fromRepo Git.repoPath
 		(l, cleanup) <- inRepo $ Git.LsFiles.stagedDetails [top]
-		r <- mapM (geturls . snd3) $ filter (isUrlLog . fst3) l
+		r <- mapM getkeyurls l
 		void $ liftIO cleanup
 		return $ concat r
   where
+	getkeyurls (f, s, _) = case urlLogFileKey f of
+		Just k -> zip (repeat k) <$> geturls s
+		Nothing -> return []
 	geturls Nothing = return []
 	geturls (Just logsha) = getLog . L.unpack <$> catObject logsha
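The Logs.Web change makes knownUrls return each url paired with the key whose url log recorded it, which is what lets getCache read per-key itemid metadata. A pure sketch of the pairing, assuming a simplified urlLogFileKey that just strips a "<key>.log.web" suffix (the real parser in Logs.Web also decodes git-annex's key escaping):

import Data.List (isSuffixOf)

type Key = String
type URLString = String

-- Simplified stand-in: recover the key from a "<key>.log.web" path.
urlLogFileKey :: FilePath -> Maybe Key
urlLogFileKey f
	| ext `isSuffixOf` f = Just (take (length f - length ext) f)
	| otherwise = Nothing
  where
	ext = ".log.web"

-- Pair every logged url with its key, mirroring the patch's
-- zip (repeat k) <$> geturls s
keyUrls :: [(FilePath, [URLString])] -> [(Key, URLString)]
keyUrls = concatMap go
  where
	go (f, us) = case urlLogFileKey f of
		Just k -> zip (repeat k) us
		Nothing -> []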
diff --git a/debian/changelog b/debian/changelog
index e9d2558c1..196743bd7 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -8,6 +8,10 @@ git-annex (5.20150328) UNRELEASED; urgency=medium
   * Fix GETURLS in external special remote protocol to strip
     downloader prefix from logged url info before checking for the
     specified prefix.
+  * importfeed: Avoid downloading a redundant feed item whose guid has
+    already been downloaded, even when its url has changed.
+  * importfeed: Always store the itemid in metadata; previously this
+    was only done when annex.genmetadata was set.
 
  -- Joey Hess <id@joeyh.name>  Fri, 27 Mar 2015 16:04:43 -0400
diff --git a/doc/git-annex-importfeed.mdwn b/doc/git-annex-importfeed.mdwn
index 69c841667..3bb19d25b 100644
--- a/doc/git-annex-importfeed.mdwn
+++ b/doc/git-annex-importfeed.mdwn
@@ -21,7 +21,7 @@ importing e.g., youtube playlists.
 
 * `--force`
 
-  Force downoading urls it's seen before.
+  Force downloading items it has seen before.
 
 * `--template`
diff --git a/doc/git-annex.mdwn b/doc/git-annex.mdwn
index 316d33537..d8f3fb6fb 100644
--- a/doc/git-annex.mdwn
+++ b/doc/git-annex.mdwn
@@ -792,7 +792,8 @@ Here are all the supported configuration settings.
 
   In particular, it stores year and month metadata, from the file's
   modification date.
 
-  When importfeed is used, it stores additional metadata from the feed.
+  When importfeed is used, it stores additional metadata from the feed,
+  such as the author, title, etc.
 
 * `annex.queuesize`
diff --git a/doc/todo/podcatching_handling_updated_files.mdwn b/doc/todo/podcatching_handling_updated_files.mdwn
index 998b084a6..2d9800283 100644
--- a/doc/todo/podcatching_handling_updated_files.mdwn
+++ b/doc/todo/podcatching_handling_updated_files.mdwn
@@ -11,6 +11,8 @@ known items, it could instead build a `Map (Either URlString GUID) Key`.
 
 This would at least prevent the duplication, when the feed has guids.
 
+> [[done]] --[[Joey]]
+
 It would be even nicer if the old file could be updated with the new
 content. But, since files can be moved around, deleted, tagged, etc,
 that only seems practical at all if the file is still in the directory
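For reference, the `Map (Either URlString GUID) Key` proposed above could be sketched like this, with simplified types; note that the commit as landed keeps two separate Sets in Command/ImportFeed.hs rather than one map:

import Control.Applicative ((<|>))
import qualified Data.Map as M

type URLString = String
type GUID = String
type Key = String  -- simplified stand-in for git-annex's Key type

type Known = M.Map (Either URLString GUID) Key

-- Prefer a guid match when the item has one; fall back to its url.
lookupKnown :: Known -> Maybe GUID -> URLString -> Maybe Key
lookupKnown m mguid url =
	(mguid >>= \g -> M.lookup (Right g) m) <|> M.lookup (Left url) m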