summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorGravatar Joey Hess <joey@kitenet.net>2013-07-28 15:27:36 -0400
committerGravatar Joey Hess <joey@kitenet.net>2013-07-28 16:55:42 -0400
commit217068206893864ed05911c3b06d8fdb802750a1 (patch)
tree3bfa87ee63197405f3aa18138eb8affd3ce7c7e7
parent468e2c789371f924ec4586148e4e9e5618c58303 (diff)
importfeed: git-annex becomes a podcatcher in 150 LOC
-rw-r--r--Annex/Branch.hs1
-rw-r--r--Annex/Direct.hs2
-rw-r--r--Command/AddUrl.hs54
-rw-r--r--Command/ImportFeed.hs148
-rw-r--r--Command/Indirect.hs2
-rw-r--r--Git/LsFiles.hs21
-rw-r--r--GitAnnex.hs6
-rw-r--r--Logs/Web.hs47
-rw-r--r--Utility/Url.hs3
-rw-r--r--debian/changelog1
-rw-r--r--debian/control1
-rw-r--r--doc/git-annex.mdwn13
-rw-r--r--doc/install/fromscratch.mdwn1
-rw-r--r--doc/tips/downloading_podcasts.mdwn44
-rw-r--r--git-annex.cabal7
15 files changed, 319 insertions, 32 deletions
diff --git a/Annex/Branch.hs b/Annex/Branch.hs
index 1c260ff7e..bc3736a9a 100644
--- a/Annex/Branch.hs
+++ b/Annex/Branch.hs
@@ -21,6 +21,7 @@ module Annex.Branch (
change,
commit,
files,
+ withIndex,
) where
import qualified Data.ByteString.Lazy.Char8 as L
diff --git a/Annex/Direct.hs b/Annex/Direct.hs
index cf5806ad6..d2e2cdc00 100644
--- a/Annex/Direct.hs
+++ b/Annex/Direct.hs
@@ -35,7 +35,7 @@ stageDirect :: Annex Bool
stageDirect = do
Annex.Queue.flush
top <- fromRepo Git.repoPath
- (l, cleanup) <- inRepo $ Git.LsFiles.stagedDetails [top]
+ (l, cleanup) <- inRepo $ Git.LsFiles.stagedOthersDetails [top]
forM_ l go
void $ liftIO cleanup
staged <- Annex.Queue.size
diff --git a/Command/AddUrl.hs b/Command/AddUrl.hs
index f45d00cc6..5c8c224f2 100644
--- a/Command/AddUrl.hs
+++ b/Command/AddUrl.hs
@@ -1,6 +1,6 @@
{- git-annex command
-
- - Copyright 2011 Joey Hess <joey@kitenet.net>
+ - Copyright 2011-2013 Joey Hess <joey@kitenet.net>
-
- Licensed under the GNU GPL version 3 or higher.
-}
@@ -61,10 +61,7 @@ start relaxed optfile pathdepth s = go $ fromMaybe bad $ parseURI s
perform :: Bool -> String -> FilePath -> CommandPerform
perform relaxed url file = ifAnnexed file addurl geturl
where
- geturl = do
- liftIO $ createDirectoryIfMissing True (parentDir file)
- ifM (Annex.getState Annex.fast <||> pure relaxed)
- ( nodownload relaxed url file , download url file )
+ geturl = next $ addUrlFile relaxed url file
addurl (key, _backend)
| relaxed = do
setUrlPresent key url
@@ -80,22 +77,35 @@ perform relaxed url file = ifAnnexed file addurl geturl
stop
)
-download :: String -> FilePath -> CommandPerform
+addUrlFile :: Bool -> String -> FilePath -> Annex Bool
+addUrlFile relaxed url file = do
+ liftIO $ createDirectoryIfMissing True (parentDir file)
+ ifM (Annex.getState Annex.fast <||> pure relaxed)
+ ( nodownload relaxed url file
+ , do
+ showAction $ "downloading " ++ url ++ " "
+ download url file
+ )
+
+download :: String -> FilePath -> Annex Bool
download url file = do
- showAction $ "downloading " ++ url ++ " "
dummykey <- genkey
tmp <- fromRepo $ gitAnnexTmpLocation dummykey
- stopUnless (runtransfer dummykey tmp) $ do
- backend <- chooseBackend file
- let source = KeySource
- { keyFilename = file
- , contentLocation = tmp
- , inodeCache = Nothing
- }
- k <- genKey source backend
- case k of
- Nothing -> stop
- Just (key, _) -> next $ cleanup url file key (Just tmp)
+ showOutput
+ ifM (runtransfer dummykey tmp)
+ ( do
+ backend <- chooseBackend file
+ let source = KeySource
+ { keyFilename = file
+ , contentLocation = tmp
+ , inodeCache = Nothing
+ }
+ k <- genKey source backend
+ case k of
+ Nothing -> return False
+ Just (key, _) -> cleanup url file key (Just tmp)
+ , return False
+ )
where
{- Generate a dummy key to use for this download, before we can
- examine the file and find its real key. This allows resuming
@@ -119,7 +129,7 @@ download url file = do
downloadUrl [url] tmp
-cleanup :: String -> FilePath -> Key -> Maybe FilePath -> CommandCleanup
+cleanup :: String -> FilePath -> Key -> Maybe FilePath -> Annex Bool
cleanup url file key mtmp = do
when (isJust mtmp) $
logStatus key InfoPresent
@@ -133,7 +143,7 @@ cleanup url file key mtmp = do
maybe noop (moveAnnex key) mtmp
return True
-nodownload :: Bool -> String -> FilePath -> CommandPerform
+nodownload :: Bool -> String -> FilePath -> Annex Bool
nodownload relaxed url file = do
headers <- getHttpHeaders
(exists, size) <- if relaxed
@@ -142,10 +152,10 @@ nodownload relaxed url file = do
if exists
then do
let key = Backend.URL.fromUrl url size
- next $ cleanup url file key Nothing
+ cleanup url file key Nothing
else do
warning $ "unable to access url: " ++ url
- stop
+ return False
url2file :: URI -> Maybe Int -> FilePath
url2file url pathdepth = case pathdepth of
diff --git a/Command/ImportFeed.hs b/Command/ImportFeed.hs
new file mode 100644
index 000000000..a4a5cfd4c
--- /dev/null
+++ b/Command/ImportFeed.hs
@@ -0,0 +1,148 @@
+{- git-annex command
+ -
+ - Copyright 2013 Joey Hess <joey@kitenet.net>
+ -
+ - Licensed under the GNU GPL version 3 or higher.
+ -}
+
+module Command.ImportFeed where
+
+import Text.Feed.Import
+import Text.Feed.Query
+import Text.Feed.Types
+import qualified Data.Set as S
+import qualified Data.Map as M
+import Data.Char
+
+import Common.Annex
+import Command
+import qualified Utility.Url as Url
+import Logs.Web
+import qualified Option
+import qualified Utility.Format
+import Utility.Tmp
+import Command.AddUrl (addUrlFile, relaxedOption)
+
+data ToDownload = ToDownload
+ { feed :: Feed
+ , item :: Item
+ , location :: URLString
+ }
+
+mkToDownload :: Feed -> Item -> Maybe ToDownload
+mkToDownload f i = case getItemEnclosure i of
+ Nothing -> Nothing
+ Just (enclosureurl, _, _) -> Just $ ToDownload f i enclosureurl
+
+def :: [Command]
+def = [notBareRepo $ withOptions [templateOption, relaxedOption] $
+ command "importfeed" (paramRepeating paramUrl) seek
+ SectionCommon "import files from podcast feeds"]
+
+templateOption :: Option
+templateOption = Option.field [] "template" paramFormat "template for filenames"
+
+seek :: [CommandSeek]
+seek = [withField templateOption return $ \tmpl ->
+ withFlag relaxedOption $ \relaxed ->
+ withWords $ start relaxed tmpl]
+
+start :: Bool -> Maybe String -> [URLString] -> CommandStart
+start relaxed opttemplate = go Nothing
+ where
+ go _ [] = stop
+ go cache (url:urls) = do
+ showStart "importfeed" url
+ v <- findEnclosures url
+ if isJust v then showEndOk else showEndFail
+ case v of
+ Just l | not (null l) -> do
+ knownurls <- getknownurls cache
+ mapM_ (downloadEnclosure relaxed template knownurls) l
+ go (Just knownurls) urls
+ _ -> go cache urls
+
+ defaulttemplate = "${feedtitle}/${itemtitle}.${extension}"
+ template = Utility.Format.gen $ fromMaybe defaulttemplate opttemplate
+
+ {- This is expensive, so avoid running it more than once. -}
+ getknownurls (Just cached) = return cached
+ getknownurls Nothing = S.fromList <$> knownUrls
+
+findEnclosures :: URLString -> Annex (Maybe [ToDownload])
+findEnclosures url = go =<< downloadFeed url
+ where
+ go Nothing = do
+ warning $ "failed to parse feed " ++ url
+ return Nothing
+ go (Just f) = return $ Just $
+ mapMaybe (mkToDownload f) (feedItems f)
+
+{- Feeds change, so a feed download cannot be resumed. -}
+downloadFeed :: URLString -> Annex (Maybe Feed)
+downloadFeed url = do
+ showOutput
+ liftIO $ withTmpFile "feed" $ \f h -> do
+ ifM (Url.download url [] [] f)
+ ( parseFeedString <$> hGetContentsStrict h
+ , return Nothing
+ )
+
+{- Avoids downloading any urls that are already known to be associated
+ - with a file in the annex. -}
+downloadEnclosure :: Bool -> Utility.Format.Format -> S.Set URLString -> ToDownload -> Annex ()
+downloadEnclosure relaxed template knownurls enclosure
+ | S.member url knownurls = noop
+ | otherwise = do
+ dest <- liftIO $ feedFile template enclosure
+ showStart "addurl" dest
+ ifM (addUrlFile relaxed url dest)
+ ( showEndOk
+ , showEndFail
+ )
+ where
+ url = location enclosure
+
+{- Generate a unique filename for the feed item by filling
+ - out the template.
+ -
+ - Since each feed url is only downloaded once,
+ - if the file already exists, two items with different urls
+ - has the same title. A number is added to disambiguate.
+ -}
+feedFile :: Utility.Format.Format -> ToDownload -> IO FilePath
+feedFile template i = makeUnique 0 $
+ Utility.Format.format template $ M.fromList
+ [ field "feedtitle" $ getFeedTitle $ feed i
+ , fieldMaybe "itemtitle" $ getItemTitle $ item i
+ , fieldMaybe "feedauthor" $ getFeedAuthor $ feed i
+ , fieldMaybe "itemauthor" $ getItemAuthor $ item i
+ , fieldMaybe "itemsummary" $ getItemSummary $ item i
+ , fieldMaybe "itemdescription" $ getItemDescription $ item i
+ , fieldMaybe "itemrights" $ getItemRights $ item i
+ , fieldMaybe "itemid" $ snd <$> getItemId (item i)
+ , field "extension" $ takeExtension $ location i
+ ]
+ where
+ field k v =
+ let s = map sanitize v in
+ if null s then (k, "none") else (k, s)
+ fieldMaybe k Nothing = (k, "none")
+ fieldMaybe k (Just v) = field k v
+
+ sanitize c
+ | isSpace c || isPunctuation c || c == '/' = '_'
+ | otherwise = c
+
+makeUnique :: Integer -> FilePath -> IO FilePath
+makeUnique n file =
+ ifM (isJust <$> catchMaybeIO (getSymbolicLinkStatus f))
+ ( makeUnique (n + 1) file
+ , return file
+ )
+ where
+ f = if n == 0
+ then file
+ else
+ let (d, base) = splitFileName file
+ in d </> show n ++ "_" ++ base
diff --git a/Command/Indirect.hs b/Command/Indirect.hs
index bf1509944..e63c4cb8a 100644
--- a/Command/Indirect.hs
+++ b/Command/Indirect.hs
@@ -59,7 +59,7 @@ perform = do
setDirect False
top <- fromRepo Git.repoPath
- (l, clean) <- inRepo $ Git.LsFiles.stagedDetails [top]
+ (l, clean) <- inRepo $ Git.LsFiles.stagedOthersDetails [top]
forM_ l go
void $ liftIO clean
next cleanup
diff --git a/Git/LsFiles.hs b/Git/LsFiles.hs
index 82ce0edaf..f4e467215 100644
--- a/Git/LsFiles.hs
+++ b/Git/LsFiles.hs
@@ -12,6 +12,7 @@ module Git.LsFiles (
modified,
staged,
stagedNotDeleted,
+ stagedOthersDetails,
stagedDetails,
typeChanged,
typeChangedStaged,
@@ -69,16 +70,24 @@ staged' ps l = pipeNullSplit $ prefix ++ ps ++ suffix
prefix = [Params "diff --cached --name-only -z"]
suffix = Param "--" : map File l
-{- Returns details about files that are staged in the index
- - (including the Sha of their staged contents),
- - as well as files not yet in git. -}
+{- Returns details about files that are staged in the index,
+ - as well as files not yet in git. Skips ignored files. -}
+stagedOthersDetails :: [FilePath] -> Repo -> IO ([(FilePath, Maybe Sha)], IO Bool)
+stagedOthersDetails = stagedDetails' [Params "--others --exclude-standard"]
+
+{- Returns details about all files that are staged in the index. -}
stagedDetails :: [FilePath] -> Repo -> IO ([(FilePath, Maybe Sha)], IO Bool)
-stagedDetails l repo = do
+stagedDetails = stagedDetails' []
+
+{- Gets details about staged files, including the Sha of their staged
+ - contents. -}
+stagedDetails' :: [CommandParam] -> [FilePath] -> Repo -> IO ([(FilePath, Maybe Sha)], IO Bool)
+stagedDetails' ps l repo = do
(ls, cleanup) <- pipeNullSplit params repo
return (map parse ls, cleanup)
where
- params = [Params "ls-files --others --exclude-standard --stage -z --"] ++
- map File l
+ params = Params "ls-files --stage -z" : ps ++
+ Param "--" : map File l
parse s
| null file = (s, Nothing)
| otherwise = (file, extractSha $ take shaSize $ drop 7 metadata)
diff --git a/GitAnnex.hs b/GitAnnex.hs
index e6e24777c..1f6b1c177 100644
--- a/GitAnnex.hs
+++ b/GitAnnex.hs
@@ -57,6 +57,9 @@ import qualified Command.Ungroup
import qualified Command.Vicfg
import qualified Command.Sync
import qualified Command.AddUrl
+#ifdef WITH_FEED
+import qualified Command.ImportFeed
+#endif
import qualified Command.RmUrl
import qualified Command.Import
import qualified Command.Map
@@ -91,6 +94,9 @@ cmds = concat
, Command.Lock.def
, Command.Sync.def
, Command.AddUrl.def
+#ifdef WITH_FEED
+ , Command.ImportFeed.def
+#endif
, Command.RmUrl.def
, Command.Import.def
, Command.Init.def
diff --git a/Logs/Web.hs b/Logs/Web.hs
index 0ed537a8e..cbce7a36e 100644
--- a/Logs/Web.hs
+++ b/Logs/Web.hs
@@ -1,6 +1,6 @@
{- Web url logs.
-
- - Copyright 2011 Joey Hess <joey@kitenet.net>
+ - Copyright 2011, 2013 Joey Hess <joey@kitenet.net>
-
- Licensed under the GNU GPL version 3 or higher.
-}
@@ -11,12 +11,21 @@ module Logs.Web (
getUrls,
setUrlPresent,
setUrlMissing,
+ urlLog,
+ urlLogKey,
+ knownUrls
) where
+import qualified Data.ByteString.Lazy.Char8 as L
+
import Common.Annex
import Logs.Presence
import Logs.Location
import Types.Key
+import qualified Annex.Branch
+import Annex.CatFile
+import qualified Git
+import qualified Git.LsFiles
type URLString = String
@@ -24,8 +33,24 @@ type URLString = String
webUUID :: UUID
webUUID = UUID "00000000-0000-0000-0000-000000000001"
+urlLogExt :: String
+urlLogExt = ".log.web"
+
urlLog :: Key -> FilePath
-urlLog key = hashDirLower key </> keyFile key ++ ".log.web"
+urlLog key = hashDirLower key </> keyFile key ++ urlLogExt
+
+{- Converts a url log file into a key.
+ - (Does not work on oldurlLogs.) -}
+urlLogKey :: FilePath -> Maybe Key
+urlLogKey file
+ | ext == urlLogExt = fileKey base
+ | otherwise = Nothing
+ where
+ (base, ext) = splitAt (length file - extlen) file
+ extlen = length urlLogExt
+
+isUrlLog :: FilePath -> Bool
+isUrlLog file = urlLogExt `isSuffixOf` file
{- Used to store the urls elsewhere. -}
oldurlLogs :: Key -> [FilePath]
@@ -58,3 +83,21 @@ setUrlMissing key url = do
addLog (urlLog key) =<< logNow InfoMissing url
whenM (null <$> getUrls key) $
logChange key webUUID InfoMissing
+
+{- Finds all known urls. -}
+knownUrls :: Annex [URLString]
+knownUrls = do
+ {- Ensure the git-annex branch's index file is up-to-date and
+ - any journaled changes are reflected in it, since we're going
+ - to query its index directly. -}
+ Annex.Branch.update
+ Annex.Branch.commit "update"
+ Annex.Branch.withIndex $ do
+ top <- fromRepo Git.repoPath
+ (l, cleanup) <- inRepo $ Git.LsFiles.stagedDetails [top]
+ r <- mapM (geturls . snd) $ filter (isUrlLog . fst) l
+ void $ liftIO cleanup
+ return $ concat r
+ where
+ geturls Nothing = return []
+ geturls (Just logsha) = getLog . L.unpack <$> catObject logsha
diff --git a/Utility/Url.hs b/Utility/Url.hs
index c8020c814..508b9eeb4 100644
--- a/Utility/Url.hs
+++ b/Utility/Url.hs
@@ -130,6 +130,9 @@ download' quiet url headers options file =
-
- This does its own redirect following because Browser's is buggy for HEAD
- requests.
+ -
+ - Unfortunately, does not handle https, so should only be used
+ - when curl is not available.
-}
request :: URI -> Headers -> RequestMethod -> IO (Response String)
request url headers requesttype = go 5 url
diff --git a/debian/changelog b/debian/changelog
index e191930da..93aece109 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -7,6 +7,7 @@ git-annex (4.20130724) UNRELEASED; urgency=low
Like drop, dropunused checks remotes, and honors the global
annex.numcopies setting. (However, .gitattributes settings cannot
apply to unused files.)
+ * importfeed can be used to import files from podcast feeds.
* Add status message to XMPP presence tag, to identify to others that
the client is a git-annex client. Closes: #717652
* webapp: When creating a repository on a removable drive, set
diff --git a/debian/control b/debian/control
index 42c43f123..52580b1ed 100644
--- a/debian/control
+++ b/debian/control
@@ -48,6 +48,7 @@ Build-Depends:
libghc-xml-types-dev,
libghc-async-dev,
libghc-http-dev,
+ libghc-feed-dev
ikiwiki,
perlmagick,
git,
diff --git a/doc/git-annex.mdwn b/doc/git-annex.mdwn
index 00407e949..7b2a0bce5 100644
--- a/doc/git-annex.mdwn
+++ b/doc/git-annex.mdwn
@@ -190,6 +190,19 @@ subdirectories).
git annex import /media/camera/DCIM/
+* importfeed [url ...]
+
+ Imports the contents of podcast feeds. Only downloads files whose
+ urls have not already been added to the repository before, so you can
+ delete, rename, etc the resulting files and repeated runs won't duplicate
+ them.
+
+ Use --template to control where the files are stored.
+ The default template is '${feedtitle}/${itemtitle}${extension}'
+ (Other available variables: feedauthor, itemauthor, itemsummary, itemdescription, itemrights, itemid)
+
+ The --relaxed and --fast options behave the same as they do in addurl.
+
* watch
Watches for changes to files in the current directory and its subdirectories,
diff --git a/doc/install/fromscratch.mdwn b/doc/install/fromscratch.mdwn
index 8af51327e..63d4b1cbb 100644
--- a/doc/install/fromscratch.mdwn
+++ b/doc/install/fromscratch.mdwn
@@ -21,6 +21,7 @@ quite a lot.
* [UUID](http://hackage.haskell.org/package/uuid)
* [regex-tdfa](http://hackage.haskell.org/package/regex-tdfa)
* [extensible-exceptions](http://hackage.haskell.org/package/extensible-exceptions)
+ * [feed](http://hackage.haskell.org/package/feed)
* Optional haskell stuff, used by the [[assistant]] and its webapp
* [stm](http://hackage.haskell.org/package/stm)
(version 2.3 or newer)
diff --git a/doc/tips/downloading_podcasts.mdwn b/doc/tips/downloading_podcasts.mdwn
new file mode 100644
index 000000000..99ba9aa94
--- /dev/null
+++ b/doc/tips/downloading_podcasts.mdwn
@@ -0,0 +1,44 @@
+You can use git-annex as a podcatcher, to download podcast contents.
+No additional software is required, but your git-annex must be built
+with the Feeds feature (run `git annex version` to check).
+
+All you need to do is put something like this in a cron job:
+
+`cd somerepo && git annex importfeed http://url/to/podcast http://other/podcast/url`
+
+This downloads the urls, and parses them as RSS, Atom, or RDF feeds.
+All enclosures are downloaded and added to the repository, the same as if you
+had manually run `git annex addurl` on each of them.
+
+git-annex will avoid downloading a file from a feed if its url has already
+been stored in the repository before. So once a file is downloaded,
+you can move it around, delete it, `git annex drop` its content, etc,
+and it will not be downloaded again by repeated runs of
+`git annex importfeed`. Just how a podcatcher should behave.
+
+## templates
+
+To control the filenames used for items downloaded from a feed,
+there's a --template option. The default is
+`--template='${feedtitle}/${itemtitle}${extension}'`
+
+Other available template variables:
+feedauthor, itemauthor, itemsummary, itemdescription, itemrights, itemid
+
+## catching up
+
+To catch up on a feed without downloading its contents,
+use `git annex importfeed --relaxed`, and delete the symlinks it creates.
+Next time you run `git annex addurl` it will only fetch any new items.
+
+## fast mode
+
+To add a feed without downloading its contents right now,
+use `git annex importfeed --fast`. Then you can use `git annex get` as
+usual to download the content of an item.
+
+## distributed podcastching
+
+A nice benefit of using git-annex as a podcatcher is that you can
+run `git annex importfeed` on the same url in different clones
+of a repository, and `git annex sync` will sync it all up.
diff --git a/git-annex.cabal b/git-annex.cabal
index b9715b847..da4a8dbe5 100644
--- a/git-annex.cabal
+++ b/git-annex.cabal
@@ -65,6 +65,9 @@ Flag TestSuite
Flag TDFA
Description: Use regex-tdfa for wildcards
+Flag Feed
+ Description: Enable podcast feed support
+
Executable git-annex
Main-Is: git-annex.hs
Build-Depends: MissingH, hslogger, directory, filepath,
@@ -154,6 +157,10 @@ Executable git-annex
Build-Depends: dns
CPP-Options: -DWITH_DNS
+ if flag(Feed)
+ Build-Depends: feed
+ CPP-Options: -DWITH_FEED
+
source-repository head
type: git
location: git://git-annex.branchable.com/