diff options
author | Joey Hess <joey@kitenet.net> | 2013-07-28 15:27:36 -0400 |
---|---|---|
committer | Joey Hess <joey@kitenet.net> | 2013-07-28 16:55:42 -0400 |
commit | 217068206893864ed05911c3b06d8fdb802750a1 (patch) | |
tree | 3bfa87ee63197405f3aa18138eb8affd3ce7c7e7 | |
parent | 468e2c789371f924ec4586148e4e9e5618c58303 (diff) |
importfeed: git-annex becomes a podcatcher in 150 LOC
-rw-r--r-- | Annex/Branch.hs | 1 | ||||
-rw-r--r-- | Annex/Direct.hs | 2 | ||||
-rw-r--r-- | Command/AddUrl.hs | 54 | ||||
-rw-r--r-- | Command/ImportFeed.hs | 148 | ||||
-rw-r--r-- | Command/Indirect.hs | 2 | ||||
-rw-r--r-- | Git/LsFiles.hs | 21 | ||||
-rw-r--r-- | GitAnnex.hs | 6 | ||||
-rw-r--r-- | Logs/Web.hs | 47 | ||||
-rw-r--r-- | Utility/Url.hs | 3 | ||||
-rw-r--r-- | debian/changelog | 1 | ||||
-rw-r--r-- | debian/control | 1 | ||||
-rw-r--r-- | doc/git-annex.mdwn | 13 | ||||
-rw-r--r-- | doc/install/fromscratch.mdwn | 1 | ||||
-rw-r--r-- | doc/tips/downloading_podcasts.mdwn | 44 | ||||
-rw-r--r-- | git-annex.cabal | 7 |
15 files changed, 319 insertions, 32 deletions
diff --git a/Annex/Branch.hs b/Annex/Branch.hs index 1c260ff7e..bc3736a9a 100644 --- a/Annex/Branch.hs +++ b/Annex/Branch.hs @@ -21,6 +21,7 @@ module Annex.Branch ( change, commit, files, + withIndex, ) where import qualified Data.ByteString.Lazy.Char8 as L diff --git a/Annex/Direct.hs b/Annex/Direct.hs index cf5806ad6..d2e2cdc00 100644 --- a/Annex/Direct.hs +++ b/Annex/Direct.hs @@ -35,7 +35,7 @@ stageDirect :: Annex Bool stageDirect = do Annex.Queue.flush top <- fromRepo Git.repoPath - (l, cleanup) <- inRepo $ Git.LsFiles.stagedDetails [top] + (l, cleanup) <- inRepo $ Git.LsFiles.stagedOthersDetails [top] forM_ l go void $ liftIO cleanup staged <- Annex.Queue.size diff --git a/Command/AddUrl.hs b/Command/AddUrl.hs index f45d00cc6..5c8c224f2 100644 --- a/Command/AddUrl.hs +++ b/Command/AddUrl.hs @@ -1,6 +1,6 @@ {- git-annex command - - - Copyright 2011 Joey Hess <joey@kitenet.net> + - Copyright 2011-2013 Joey Hess <joey@kitenet.net> - - Licensed under the GNU GPL version 3 or higher. -} @@ -61,10 +61,7 @@ start relaxed optfile pathdepth s = go $ fromMaybe bad $ parseURI s perform :: Bool -> String -> FilePath -> CommandPerform perform relaxed url file = ifAnnexed file addurl geturl where - geturl = do - liftIO $ createDirectoryIfMissing True (parentDir file) - ifM (Annex.getState Annex.fast <||> pure relaxed) - ( nodownload relaxed url file , download url file ) + geturl = next $ addUrlFile relaxed url file addurl (key, _backend) | relaxed = do setUrlPresent key url @@ -80,22 +77,35 @@ perform relaxed url file = ifAnnexed file addurl geturl stop ) -download :: String -> FilePath -> CommandPerform +addUrlFile :: Bool -> String -> FilePath -> Annex Bool +addUrlFile relaxed url file = do + liftIO $ createDirectoryIfMissing True (parentDir file) + ifM (Annex.getState Annex.fast <||> pure relaxed) + ( nodownload relaxed url file + , do + showAction $ "downloading " ++ url ++ " " + download url file + ) + +download :: String -> FilePath -> Annex Bool download url file = do - showAction $ "downloading " ++ url ++ " " dummykey <- genkey tmp <- fromRepo $ gitAnnexTmpLocation dummykey - stopUnless (runtransfer dummykey tmp) $ do - backend <- chooseBackend file - let source = KeySource - { keyFilename = file - , contentLocation = tmp - , inodeCache = Nothing - } - k <- genKey source backend - case k of - Nothing -> stop - Just (key, _) -> next $ cleanup url file key (Just tmp) + showOutput + ifM (runtransfer dummykey tmp) + ( do + backend <- chooseBackend file + let source = KeySource + { keyFilename = file + , contentLocation = tmp + , inodeCache = Nothing + } + k <- genKey source backend + case k of + Nothing -> return False + Just (key, _) -> cleanup url file key (Just tmp) + , return False + ) where {- Generate a dummy key to use for this download, before we can - examine the file and find its real key. This allows resuming @@ -119,7 +129,7 @@ download url file = do downloadUrl [url] tmp -cleanup :: String -> FilePath -> Key -> Maybe FilePath -> CommandCleanup +cleanup :: String -> FilePath -> Key -> Maybe FilePath -> Annex Bool cleanup url file key mtmp = do when (isJust mtmp) $ logStatus key InfoPresent @@ -133,7 +143,7 @@ cleanup url file key mtmp = do maybe noop (moveAnnex key) mtmp return True -nodownload :: Bool -> String -> FilePath -> CommandPerform +nodownload :: Bool -> String -> FilePath -> Annex Bool nodownload relaxed url file = do headers <- getHttpHeaders (exists, size) <- if relaxed @@ -142,10 +152,10 @@ nodownload relaxed url file = do if exists then do let key = Backend.URL.fromUrl url size - next $ cleanup url file key Nothing + cleanup url file key Nothing else do warning $ "unable to access url: " ++ url - stop + return False url2file :: URI -> Maybe Int -> FilePath url2file url pathdepth = case pathdepth of diff --git a/Command/ImportFeed.hs b/Command/ImportFeed.hs new file mode 100644 index 000000000..a4a5cfd4c --- /dev/null +++ b/Command/ImportFeed.hs @@ -0,0 +1,148 @@ +{- git-annex command + - + - Copyright 2013 Joey Hess <joey@kitenet.net> + - + - Licensed under the GNU GPL version 3 or higher. + -} + +module Command.ImportFeed where + +import Text.Feed.Import +import Text.Feed.Query +import Text.Feed.Types +import qualified Data.Set as S +import qualified Data.Map as M +import Data.Char + +import Common.Annex +import Command +import qualified Utility.Url as Url +import Logs.Web +import qualified Option +import qualified Utility.Format +import Utility.Tmp +import Command.AddUrl (addUrlFile, relaxedOption) + +data ToDownload = ToDownload + { feed :: Feed + , item :: Item + , location :: URLString + } + +mkToDownload :: Feed -> Item -> Maybe ToDownload +mkToDownload f i = case getItemEnclosure i of + Nothing -> Nothing + Just (enclosureurl, _, _) -> Just $ ToDownload f i enclosureurl + +def :: [Command] +def = [notBareRepo $ withOptions [templateOption, relaxedOption] $ + command "importfeed" (paramRepeating paramUrl) seek + SectionCommon "import files from podcast feeds"] + +templateOption :: Option +templateOption = Option.field [] "template" paramFormat "template for filenames" + +seek :: [CommandSeek] +seek = [withField templateOption return $ \tmpl -> + withFlag relaxedOption $ \relaxed -> + withWords $ start relaxed tmpl] + +start :: Bool -> Maybe String -> [URLString] -> CommandStart +start relaxed opttemplate = go Nothing + where + go _ [] = stop + go cache (url:urls) = do + showStart "importfeed" url + v <- findEnclosures url + if isJust v then showEndOk else showEndFail + case v of + Just l | not (null l) -> do + knownurls <- getknownurls cache + mapM_ (downloadEnclosure relaxed template knownurls) l + go (Just knownurls) urls + _ -> go cache urls + + defaulttemplate = "${feedtitle}/${itemtitle}.${extension}" + template = Utility.Format.gen $ fromMaybe defaulttemplate opttemplate + + {- This is expensive, so avoid running it more than once. -} + getknownurls (Just cached) = return cached + getknownurls Nothing = S.fromList <$> knownUrls + +findEnclosures :: URLString -> Annex (Maybe [ToDownload]) +findEnclosures url = go =<< downloadFeed url + where + go Nothing = do + warning $ "failed to parse feed " ++ url + return Nothing + go (Just f) = return $ Just $ + mapMaybe (mkToDownload f) (feedItems f) + +{- Feeds change, so a feed download cannot be resumed. -} +downloadFeed :: URLString -> Annex (Maybe Feed) +downloadFeed url = do + showOutput + liftIO $ withTmpFile "feed" $ \f h -> do + ifM (Url.download url [] [] f) + ( parseFeedString <$> hGetContentsStrict h + , return Nothing + ) + +{- Avoids downloading any urls that are already known to be associated + - with a file in the annex. -} +downloadEnclosure :: Bool -> Utility.Format.Format -> S.Set URLString -> ToDownload -> Annex () +downloadEnclosure relaxed template knownurls enclosure + | S.member url knownurls = noop + | otherwise = do + dest <- liftIO $ feedFile template enclosure + showStart "addurl" dest + ifM (addUrlFile relaxed url dest) + ( showEndOk + , showEndFail + ) + where + url = location enclosure + +{- Generate a unique filename for the feed item by filling + - out the template. + - + - Since each feed url is only downloaded once, + - if the file already exists, two items with different urls + - has the same title. A number is added to disambiguate. + -} +feedFile :: Utility.Format.Format -> ToDownload -> IO FilePath +feedFile template i = makeUnique 0 $ + Utility.Format.format template $ M.fromList + [ field "feedtitle" $ getFeedTitle $ feed i + , fieldMaybe "itemtitle" $ getItemTitle $ item i + , fieldMaybe "feedauthor" $ getFeedAuthor $ feed i + , fieldMaybe "itemauthor" $ getItemAuthor $ item i + , fieldMaybe "itemsummary" $ getItemSummary $ item i + , fieldMaybe "itemdescription" $ getItemDescription $ item i + , fieldMaybe "itemrights" $ getItemRights $ item i + , fieldMaybe "itemid" $ snd <$> getItemId (item i) + , field "extension" $ takeExtension $ location i + ] + where + field k v = + let s = map sanitize v in + if null s then (k, "none") else (k, s) + fieldMaybe k Nothing = (k, "none") + fieldMaybe k (Just v) = field k v + + sanitize c + | isSpace c || isPunctuation c || c == '/' = '_' + | otherwise = c + +makeUnique :: Integer -> FilePath -> IO FilePath +makeUnique n file = + ifM (isJust <$> catchMaybeIO (getSymbolicLinkStatus f)) + ( makeUnique (n + 1) file + , return file + ) + where + f = if n == 0 + then file + else + let (d, base) = splitFileName file + in d </> show n ++ "_" ++ base diff --git a/Command/Indirect.hs b/Command/Indirect.hs index bf1509944..e63c4cb8a 100644 --- a/Command/Indirect.hs +++ b/Command/Indirect.hs @@ -59,7 +59,7 @@ perform = do setDirect False top <- fromRepo Git.repoPath - (l, clean) <- inRepo $ Git.LsFiles.stagedDetails [top] + (l, clean) <- inRepo $ Git.LsFiles.stagedOthersDetails [top] forM_ l go void $ liftIO clean next cleanup diff --git a/Git/LsFiles.hs b/Git/LsFiles.hs index 82ce0edaf..f4e467215 100644 --- a/Git/LsFiles.hs +++ b/Git/LsFiles.hs @@ -12,6 +12,7 @@ module Git.LsFiles ( modified, staged, stagedNotDeleted, + stagedOthersDetails, stagedDetails, typeChanged, typeChangedStaged, @@ -69,16 +70,24 @@ staged' ps l = pipeNullSplit $ prefix ++ ps ++ suffix prefix = [Params "diff --cached --name-only -z"] suffix = Param "--" : map File l -{- Returns details about files that are staged in the index - - (including the Sha of their staged contents), - - as well as files not yet in git. -} +{- Returns details about files that are staged in the index, + - as well as files not yet in git. Skips ignored files. -} +stagedOthersDetails :: [FilePath] -> Repo -> IO ([(FilePath, Maybe Sha)], IO Bool) +stagedOthersDetails = stagedDetails' [Params "--others --exclude-standard"] + +{- Returns details about all files that are staged in the index. -} stagedDetails :: [FilePath] -> Repo -> IO ([(FilePath, Maybe Sha)], IO Bool) -stagedDetails l repo = do +stagedDetails = stagedDetails' [] + +{- Gets details about staged files, including the Sha of their staged + - contents. -} +stagedDetails' :: [CommandParam] -> [FilePath] -> Repo -> IO ([(FilePath, Maybe Sha)], IO Bool) +stagedDetails' ps l repo = do (ls, cleanup) <- pipeNullSplit params repo return (map parse ls, cleanup) where - params = [Params "ls-files --others --exclude-standard --stage -z --"] ++ - map File l + params = Params "ls-files --stage -z" : ps ++ + Param "--" : map File l parse s | null file = (s, Nothing) | otherwise = (file, extractSha $ take shaSize $ drop 7 metadata) diff --git a/GitAnnex.hs b/GitAnnex.hs index e6e24777c..1f6b1c177 100644 --- a/GitAnnex.hs +++ b/GitAnnex.hs @@ -57,6 +57,9 @@ import qualified Command.Ungroup import qualified Command.Vicfg import qualified Command.Sync import qualified Command.AddUrl +#ifdef WITH_FEED +import qualified Command.ImportFeed +#endif import qualified Command.RmUrl import qualified Command.Import import qualified Command.Map @@ -91,6 +94,9 @@ cmds = concat , Command.Lock.def , Command.Sync.def , Command.AddUrl.def +#ifdef WITH_FEED + , Command.ImportFeed.def +#endif , Command.RmUrl.def , Command.Import.def , Command.Init.def diff --git a/Logs/Web.hs b/Logs/Web.hs index 0ed537a8e..cbce7a36e 100644 --- a/Logs/Web.hs +++ b/Logs/Web.hs @@ -1,6 +1,6 @@ {- Web url logs. - - - Copyright 2011 Joey Hess <joey@kitenet.net> + - Copyright 2011, 2013 Joey Hess <joey@kitenet.net> - - Licensed under the GNU GPL version 3 or higher. -} @@ -11,12 +11,21 @@ module Logs.Web ( getUrls, setUrlPresent, setUrlMissing, + urlLog, + urlLogKey, + knownUrls ) where +import qualified Data.ByteString.Lazy.Char8 as L + import Common.Annex import Logs.Presence import Logs.Location import Types.Key +import qualified Annex.Branch +import Annex.CatFile +import qualified Git +import qualified Git.LsFiles type URLString = String @@ -24,8 +33,24 @@ type URLString = String webUUID :: UUID webUUID = UUID "00000000-0000-0000-0000-000000000001" +urlLogExt :: String +urlLogExt = ".log.web" + urlLog :: Key -> FilePath -urlLog key = hashDirLower key </> keyFile key ++ ".log.web" +urlLog key = hashDirLower key </> keyFile key ++ urlLogExt + +{- Converts a url log file into a key. + - (Does not work on oldurlLogs.) -} +urlLogKey :: FilePath -> Maybe Key +urlLogKey file + | ext == urlLogExt = fileKey base + | otherwise = Nothing + where + (base, ext) = splitAt (length file - extlen) file + extlen = length urlLogExt + +isUrlLog :: FilePath -> Bool +isUrlLog file = urlLogExt `isSuffixOf` file {- Used to store the urls elsewhere. -} oldurlLogs :: Key -> [FilePath] @@ -58,3 +83,21 @@ setUrlMissing key url = do addLog (urlLog key) =<< logNow InfoMissing url whenM (null <$> getUrls key) $ logChange key webUUID InfoMissing + +{- Finds all known urls. -} +knownUrls :: Annex [URLString] +knownUrls = do + {- Ensure the git-annex branch's index file is up-to-date and + - any journaled changes are reflected in it, since we're going + - to query its index directly. -} + Annex.Branch.update + Annex.Branch.commit "update" + Annex.Branch.withIndex $ do + top <- fromRepo Git.repoPath + (l, cleanup) <- inRepo $ Git.LsFiles.stagedDetails [top] + r <- mapM (geturls . snd) $ filter (isUrlLog . fst) l + void $ liftIO cleanup + return $ concat r + where + geturls Nothing = return [] + geturls (Just logsha) = getLog . L.unpack <$> catObject logsha diff --git a/Utility/Url.hs b/Utility/Url.hs index c8020c814..508b9eeb4 100644 --- a/Utility/Url.hs +++ b/Utility/Url.hs @@ -130,6 +130,9 @@ download' quiet url headers options file = - - This does its own redirect following because Browser's is buggy for HEAD - requests. + - + - Unfortunately, does not handle https, so should only be used + - when curl is not available. -} request :: URI -> Headers -> RequestMethod -> IO (Response String) request url headers requesttype = go 5 url diff --git a/debian/changelog b/debian/changelog index e191930da..93aece109 100644 --- a/debian/changelog +++ b/debian/changelog @@ -7,6 +7,7 @@ git-annex (4.20130724) UNRELEASED; urgency=low Like drop, dropunused checks remotes, and honors the global annex.numcopies setting. (However, .gitattributes settings cannot apply to unused files.) + * importfeed can be used to import files from podcast feeds. * Add status message to XMPP presence tag, to identify to others that the client is a git-annex client. Closes: #717652 * webapp: When creating a repository on a removable drive, set diff --git a/debian/control b/debian/control index 42c43f123..52580b1ed 100644 --- a/debian/control +++ b/debian/control @@ -48,6 +48,7 @@ Build-Depends: libghc-xml-types-dev, libghc-async-dev, libghc-http-dev, + libghc-feed-dev ikiwiki, perlmagick, git, diff --git a/doc/git-annex.mdwn b/doc/git-annex.mdwn index 00407e949..7b2a0bce5 100644 --- a/doc/git-annex.mdwn +++ b/doc/git-annex.mdwn @@ -190,6 +190,19 @@ subdirectories). git annex import /media/camera/DCIM/ +* importfeed [url ...] + + Imports the contents of podcast feeds. Only downloads files whose + urls have not already been added to the repository before, so you can + delete, rename, etc the resulting files and repeated runs won't duplicate + them. + + Use --template to control where the files are stored. + The default template is '${feedtitle}/${itemtitle}${extension}' + (Other available variables: feedauthor, itemauthor, itemsummary, itemdescription, itemrights, itemid) + + The --relaxed and --fast options behave the same as they do in addurl. + * watch Watches for changes to files in the current directory and its subdirectories, diff --git a/doc/install/fromscratch.mdwn b/doc/install/fromscratch.mdwn index 8af51327e..63d4b1cbb 100644 --- a/doc/install/fromscratch.mdwn +++ b/doc/install/fromscratch.mdwn @@ -21,6 +21,7 @@ quite a lot. * [UUID](http://hackage.haskell.org/package/uuid) * [regex-tdfa](http://hackage.haskell.org/package/regex-tdfa) * [extensible-exceptions](http://hackage.haskell.org/package/extensible-exceptions) + * [feed](http://hackage.haskell.org/package/feed) * Optional haskell stuff, used by the [[assistant]] and its webapp * [stm](http://hackage.haskell.org/package/stm) (version 2.3 or newer) diff --git a/doc/tips/downloading_podcasts.mdwn b/doc/tips/downloading_podcasts.mdwn new file mode 100644 index 000000000..99ba9aa94 --- /dev/null +++ b/doc/tips/downloading_podcasts.mdwn @@ -0,0 +1,44 @@ +You can use git-annex as a podcatcher, to download podcast contents. +No additional software is required, but your git-annex must be built +with the Feeds feature (run `git annex version` to check). + +All you need to do is put something like this in a cron job: + +`cd somerepo && git annex importfeed http://url/to/podcast http://other/podcast/url` + +This downloads the urls, and parses them as RSS, Atom, or RDF feeds. +All enclosures are downloaded and added to the repository, the same as if you +had manually run `git annex addurl` on each of them. + +git-annex will avoid downloading a file from a feed if its url has already +been stored in the repository before. So once a file is downloaded, +you can move it around, delete it, `git annex drop` its content, etc, +and it will not be downloaded again by repeated runs of +`git annex importfeed`. Just how a podcatcher should behave. + +## templates + +To control the filenames used for items downloaded from a feed, +there's a --template option. The default is +`--template='${feedtitle}/${itemtitle}${extension}'` + +Other available template variables: +feedauthor, itemauthor, itemsummary, itemdescription, itemrights, itemid + +## catching up + +To catch up on a feed without downloading its contents, +use `git annex importfeed --relaxed`, and delete the symlinks it creates. +Next time you run `git annex addurl` it will only fetch any new items. + +## fast mode + +To add a feed without downloading its contents right now, +use `git annex importfeed --fast`. Then you can use `git annex get` as +usual to download the content of an item. + +## distributed podcastching + +A nice benefit of using git-annex as a podcatcher is that you can +run `git annex importfeed` on the same url in different clones +of a repository, and `git annex sync` will sync it all up. diff --git a/git-annex.cabal b/git-annex.cabal index b9715b847..da4a8dbe5 100644 --- a/git-annex.cabal +++ b/git-annex.cabal @@ -65,6 +65,9 @@ Flag TestSuite Flag TDFA Description: Use regex-tdfa for wildcards +Flag Feed + Description: Enable podcast feed support + Executable git-annex Main-Is: git-annex.hs Build-Depends: MissingH, hslogger, directory, filepath, @@ -154,6 +157,10 @@ Executable git-annex Build-Depends: dns CPP-Options: -DWITH_DNS + if flag(Feed) + Build-Depends: feed + CPP-Options: -DWITH_FEED + source-repository head type: git location: git://git-annex.branchable.com/ |