diff options
-rw-r--r-- | Annex/Quvi.hs | 20 | ||||
-rw-r--r-- | Build/Configure.hs | 1 | ||||
-rw-r--r-- | Command/AddUrl.hs | 60 | ||||
-rw-r--r-- | Command/ImportFeed.hs | 10 | ||||
-rw-r--r-- | Logs/Web.hs | 22 | ||||
-rw-r--r-- | Remote/Web.hs | 22 | ||||
-rw-r--r-- | Types/GitConfig.hs | 2 | ||||
-rw-r--r-- | Utility/Path.hs | 16 | ||||
-rw-r--r-- | Utility/Quvi.hs | 83 | ||||
-rw-r--r-- | debian/changelog | 5 | ||||
-rw-r--r-- | debian/control | 3 | ||||
-rw-r--r-- | doc/git-annex.mdwn | 8 | ||||
-rw-r--r-- | doc/install/fromscratch.mdwn | 1 | ||||
-rw-r--r-- | doc/tips/using_the_web_as_a_special_remote.mdwn | 52 | ||||
-rw-r--r-- | doc/todo/wishlist:_special-case_handling_of_Youtube_URLs_in_Web_special_remote.mdwn | 2 | ||||
-rw-r--r-- | git-annex.cabal | 4 |
16 files changed, 278 insertions, 33 deletions
diff --git a/Annex/Quvi.hs b/Annex/Quvi.hs new file mode 100644 index 000000000..a79b17d61 --- /dev/null +++ b/Annex/Quvi.hs @@ -0,0 +1,20 @@ +{- quvi options for git-annex + - + - Copyright 2013 Joey Hess <joey@kitenet.net> + - + - Licensed under the GNU GPL version 3 or higher. + -} + +{-# LANGUAGE Rank2Types #-} + +module Annex.Quvi where + +import Common.Annex +import qualified Annex +import Utility.Quvi +import Utility.Url + +withQuviOptions :: forall a. (Query a) -> [CommandParam] -> URLString -> Annex a +withQuviOptions a ps url = do + opts <- map Param . annexQuviOptions <$> Annex.getGitConfig + liftIO $ a (ps++opts) url diff --git a/Build/Configure.hs b/Build/Configure.hs index 15b90ebe3..31fbbf1dd 100644 --- a/Build/Configure.hs +++ b/Build/Configure.hs @@ -32,6 +32,7 @@ tests = , TestCase "curl" $ testCmd "curl" "curl --version >/dev/null" , TestCase "wget" $ testCmd "wget" "wget --version >/dev/null" , TestCase "bup" $ testCmd "bup" "bup --version >/dev/null" + , TestCase "quvi" $ testCmd "quvi" "quvi --version >/dev/null" , TestCase "ionice" $ testCmd "ionice" "ionice -c3 true >/dev/null" , TestCase "gpg" $ maybeSelectCmd "gpg" [ ("gpg", "--version >/dev/null") diff --git a/Command/AddUrl.hs b/Command/AddUrl.hs index d172a6869..04aa46d29 100644 --- a/Command/AddUrl.hs +++ b/Command/AddUrl.hs @@ -27,6 +27,8 @@ import Annex.Content.Direct import Logs.Location import qualified Logs.Transfer as Transfer import Utility.Daemon (checkDaemon) +import Annex.Quvi +import qualified Utility.Quvi as Quvi def :: [Command] def = [notBareRepo $ withOptions [fileOption, pathdepthOption, relaxedOption] $ @@ -51,15 +53,51 @@ seek = [withField fileOption return $ \f -> start :: Bool -> Maybe FilePath -> Maybe Int -> String -> CommandStart start relaxed optfile pathdepth s = go $ fromMaybe bad $ parseURI s where - bad = fromMaybe (error $ "bad url " ++ s) $ - parseURI $ escapeURIString isUnescapedInURI s - go url = do - pathmax <- liftIO $ fileNameLengthLimit "." - let file = fromMaybe (url2file url pathdepth pathmax) optfile + (s', downloader) = getDownloader s + bad = fromMaybe (error $ "bad url " ++ s') $ + parseURI $ escapeURIString isUnescapedInURI s' + badquvi = error $ "quvi does not know how to download url " ++ s' + choosefile = flip fromMaybe optfile + go url + | downloader == QuviDownloader = usequvi + | otherwise = ifM (liftIO $ Quvi.supported s') + ( usequvi + , do + pathmax <- liftIO $ fileNameLengthLimit "." + let file = choosefile $ url2file url pathdepth pathmax + showStart "addurl" file + next $ perform relaxed s' file + ) + usequvi = do + page <- fromMaybe badquvi + <$> withQuviOptions Quvi.forceQuery [Quvi.quiet, Quvi.httponly] s' + let link = fromMaybe badquvi $ headMaybe $ Quvi.pageLinks page + let file = choosefile $ sanitizeFilePath $ + Quvi.pageTitle page ++ "." ++ Quvi.linkSuffix link showStart "addurl" file - next $ perform relaxed s file + next $ performQuvi relaxed s' (Quvi.linkUrl link) file + +performQuvi :: Bool -> URLString -> URLString -> FilePath -> CommandPerform +performQuvi relaxed pageurl videourl file = ifAnnexed file addurl geturl + where + quviurl = setDownloader pageurl QuviDownloader + addurl (key, _backend) = next $ cleanup quviurl file key Nothing + geturl = do + key <- Backend.URL.fromUrl quviurl Nothing + ifM (pure relaxed <||> Annex.getState Annex.fast) + ( next $ cleanup quviurl file key Nothing + , do + tmp <- fromRepo $ gitAnnexTmpLocation key + showOutput + ok <- Transfer.download webUUID key (Just file) Transfer.forwardRetry $ const $ do + liftIO $ createDirectoryIfMissing True (parentDir tmp) + downloadUrl [videourl] tmp + if ok + then next $ cleanup quviurl file key (Just tmp) + else stop + ) -perform :: Bool -> String -> FilePath -> CommandPerform +perform :: Bool -> URLString -> FilePath -> CommandPerform perform relaxed url file = ifAnnexed file addurl geturl where geturl = next $ addUrlFile relaxed url file @@ -78,7 +116,7 @@ perform relaxed url file = ifAnnexed file addurl geturl stop ) -addUrlFile :: Bool -> String -> FilePath -> Annex Bool +addUrlFile :: Bool -> URLString -> FilePath -> Annex Bool addUrlFile relaxed url file = do liftIO $ createDirectoryIfMissing True (parentDir file) ifM (Annex.getState Annex.fast <||> pure relaxed) @@ -88,7 +126,7 @@ addUrlFile relaxed url file = do download url file ) -download :: String -> FilePath -> Annex Bool +download :: URLString -> FilePath -> Annex Bool download url file = do dummykey <- genkey tmp <- fromRepo $ gitAnnexTmpLocation dummykey @@ -130,7 +168,7 @@ download url file = do downloadUrl [url] tmp -cleanup :: String -> FilePath -> Key -> Maybe FilePath -> Annex Bool +cleanup :: URLString -> FilePath -> Key -> Maybe FilePath -> Annex Bool cleanup url file key mtmp = do when (isJust mtmp) $ logStatus key InfoPresent @@ -144,7 +182,7 @@ cleanup url file key mtmp = do maybe noop (moveAnnex key) mtmp return True -nodownload :: Bool -> String -> FilePath -> Annex Bool +nodownload :: Bool -> URLString -> FilePath -> Annex Bool nodownload relaxed url file = do headers <- getHttpHeaders (exists, size) <- if relaxed diff --git a/Command/ImportFeed.hs b/Command/ImportFeed.hs index 5ad568647..816865e8c 100644 --- a/Command/ImportFeed.hs +++ b/Command/ImportFeed.hs @@ -12,7 +12,6 @@ import Text.Feed.Query import Text.Feed.Types import qualified Data.Set as S import qualified Data.Map as M -import Data.Char import Data.Time.Clock import Common.Annex @@ -172,20 +171,15 @@ feedFile tmpl i = Utility.Format.format tmpl $ M.fromList , fieldMaybe "itemdescription" $ getItemDescription $ item i , fieldMaybe "itemrights" $ getItemRights $ item i , fieldMaybe "itemid" $ snd <$> getItemId (item i) - , ("extension", map sanitize $ takeExtension $ location i) + , ("extension", sanitizeFilePath $ takeExtension $ location i) ] where field k v = - let s = map sanitize v in + let s = sanitizeFilePath v in if null s then (k, "none") else (k, s) fieldMaybe k Nothing = (k, "none") fieldMaybe k (Just v) = field k v - sanitize c - | c == '.' = c - | isSpace c || isPunctuation c || c == '/' = '_' - | otherwise = c - {- Called when there is a problem with a feed. - Throws an error if the feed is broken, otherwise shows a warning. -} feedProblem :: URLString -> String -> Annex () diff --git a/Logs/Web.hs b/Logs/Web.hs index cbce7a36e..44d511267 100644 --- a/Logs/Web.hs +++ b/Logs/Web.hs @@ -13,7 +13,10 @@ module Logs.Web ( setUrlMissing, urlLog, urlLogKey, - knownUrls + knownUrls, + Downloader(..), + getDownloader, + setDownloader, ) where import qualified Data.ByteString.Lazy.Char8 as L @@ -101,3 +104,20 @@ knownUrls = do where geturls Nothing = return [] geturls (Just logsha) = getLog . L.unpack <$> catObject logsha + +data Downloader = DefaultDownloader | QuviDownloader + deriving (Eq) + +{- Determines the downloader for an URL. + - + - Some URLs are not downloaded by normal means, and this is indicated + - by prefixing them with downloader: when they are recorded in the url + - logs. -} +getDownloader :: URLString -> (URLString, Downloader) +getDownloader u = case separate (== ':') u of + ("quvi", u') -> (u', QuviDownloader) + _ -> (u, DefaultDownloader) + +setDownloader :: URLString -> Downloader -> URLString +setDownloader u DefaultDownloader = u +setDownloader u QuviDownloader = "quvi:" ++ u diff --git a/Remote/Web.hs b/Remote/Web.hs index 2c59528ef..b49ab0d1e 100644 --- a/Remote/Web.hs +++ b/Remote/Web.hs @@ -15,9 +15,11 @@ import Annex.Content import Config import Config.Cost import Logs.Web -import qualified Utility.Url as Url import Types.Key import Utility.Metered +import qualified Utility.Url as Url +import Annex.Quvi +import qualified Utility.Quvi as Quvi import qualified Data.Map as M @@ -67,7 +69,12 @@ downloadKey key _file dest _p = get =<< getUrls key return False get urls = do showOutput -- make way for download progress bar - downloadUrl urls dest + untilTrue urls $ \u -> do + let (u', downloader) = getDownloader u + case downloader of + QuviDownloader -> flip downloadUrl dest + =<< withQuviOptions Quvi.queryLinks [Quvi.httponly, Quvi.quiet] u' + _ -> downloadUrl [u] dest downloadKeyCheap :: Key -> FilePath -> Annex Bool downloadKeyCheap _ _ = return False @@ -90,6 +97,11 @@ checkKey key = do else return . Right =<< checkKey' key us checkKey' :: Key -> [URLString] -> Annex Bool checkKey' key us = untilTrue us $ \u -> do - showAction $ "checking " ++ u - headers <- getHttpHeaders - liftIO $ Url.check u headers (keySize key) + let (u', downloader) = getDownloader u + showAction $ "checking " ++ u' + case downloader of + QuviDownloader -> + withQuviOptions Quvi.check [Quvi.httponly, Quvi.quiet] u' + _ -> do + headers <- getHttpHeaders + liftIO $ Url.check u' headers (keySize key) diff --git a/Types/GitConfig.hs b/Types/GitConfig.hs index d5d234ca9..4f2e91331 100644 --- a/Types/GitConfig.hs +++ b/Types/GitConfig.hs @@ -37,6 +37,7 @@ data GitConfig = GitConfig , annexAutoCommit :: Bool , annexDebug :: Bool , annexWebOptions :: [String] + , annexQuviOptions :: [String] , annexWebDownloadCommand :: Maybe String , annexCrippledFileSystem :: Bool , annexLargeFiles :: Maybe String @@ -62,6 +63,7 @@ extractGitConfig r = GitConfig , annexAutoCommit = getbool (annex "autocommit") True , annexDebug = getbool (annex "debug") False , annexWebOptions = getwords (annex "web-options") + , annexQuviOptions = getwords (annex "quvi-options") , annexWebDownloadCommand = getmaybe (annex "web-download-command") , annexCrippledFileSystem = getbool (annex "crippledfilesystem") False , annexLargeFiles = getmaybe (annex "largefiles") diff --git a/Utility/Path.hs b/Utility/Path.hs index 79e8e8089..b6214b247 100644 --- a/Utility/Path.hs +++ b/Utility/Path.hs @@ -14,6 +14,7 @@ import System.FilePath import System.Directory import Data.List import Data.Maybe +import Data.Char import Control.Applicative #ifdef mingw32_HOST_OS @@ -236,3 +237,18 @@ fileNameLengthLimit dir = do else return $ minimum [l, 255] where #endif + +{- Given a string that we'd like to use as the basis for FilePath, but that + - was provided by a third party and is not to be trusted, returns the closest + - sane FilePath. + - + - All spaces and punctuation are replaced with '_', except for '.' + - "../" will thus turn into ".._", which is safe. + -} +sanitizeFilePath :: String -> FilePath +sanitizeFilePath = map sanitize + where + sanitize c + | c == '.' = c + | isSpace c || isPunctuation c || c == '/' = '_' + | otherwise = c diff --git a/Utility/Quvi.hs b/Utility/Quvi.hs new file mode 100644 index 000000000..68cc4a3f6 --- /dev/null +++ b/Utility/Quvi.hs @@ -0,0 +1,83 @@ +{- querying quvi (import qualified) + - + - Copyright 2013 Joey Hess <joey@kitenet.net> + - + - Licensed under the GNU GPL version 3 or higher. + -} + +{-# LANGUAGE OverloadedStrings #-} + +module Utility.Quvi where + +import Common +import Utility.Url + +import Data.Aeson +import Data.ByteString.Lazy.UTF8 (fromString) + +data Page = Page + { pageTitle :: String + , pageLinks :: [Link] + } deriving (Show) + +data Link = Link + { linkSuffix :: String + , linkUrl :: URLString + } deriving (Show) + +instance FromJSON Page where + parseJSON (Object v) = Page + <$> v .: "page_title" + <*> v .: "link" + parseJSON _ = mzero + +instance FromJSON Link where + parseJSON (Object v) = Link + <$> v .: "file_suffix" + <*> v .: "url" + parseJSON _ = mzero + +type Query a = [CommandParam] -> URLString -> IO a + +{- Throws an error when quvi is not installed. -} +forceQuery :: Query (Maybe Page) +forceQuery ps url = flip catchNonAsync (const notinstalled) (query' ps url) + where + notinstalled = error "quvi failed, or is not installed" + +{- Returns Nothing if the page is not a video page, or quvi is not + - installed. -} +query :: Query (Maybe Page) +query ps url = flip catchNonAsync (const $ return Nothing) (query' ps url) + +query' :: Query (Maybe Page) +query' ps url = decode . fromString + <$> readProcess "quvi" (toCommand $ ps ++ [Param url]) + +queryLinks :: Query [URLString] +queryLinks ps url = maybe [] (map linkUrl . pageLinks) <$> query ps url + +{- Checks if quvi can still find a download link for an url. + - If quvi is not installed, returns False. -} +check :: Query Bool +check ps url = maybe False (not . null . pageLinks) <$> query ps url + +{- Checks if an url is supported by quvi, without hitting it, or outputting + - anything. Also returns False if quvi is not installed. -} +supported :: URLString -> IO Bool +supported url = boolSystem "quvi" [Params "-v mute --support", Param url] + +quiet :: CommandParam +quiet = Params "-v quiet" + +noredir :: CommandParam +noredir = Params "-e -resolve" + +{- Only return http results, not streaming protocols. -} +httponly :: CommandParam +httponly = Params "-c http" + +{- Avoids error messages being printed to stderr, instead they are + - put in the JSON. -} +hideerrors :: CommandParam +hideerrors = Params "-l +errors" diff --git a/debian/changelog b/debian/changelog index a2317ff9f..9df2ba364 100644 --- a/debian/changelog +++ b/debian/changelog @@ -1,5 +1,10 @@ git-annex (4.20130816) UNRELEASED; urgency=low + * Youtube support! (And 53 other video hosts). When quvi is installed, + git-annex addurl automatically uses it to detect when an page is + a video, and downloads the video file. + * web special remote: Also support using quvi, for getting files, + or checking if files exist in the web. * Debian: Run the builtin test suite as an autopkgtest. * Debian: Recommend ssh-askpass, which ssh will use when the assistant is run w/o a tty. Closes: #719832 diff --git a/debian/control b/debian/control index 041584ba1..9ee2453dc 100644 --- a/debian/control +++ b/debian/control @@ -21,6 +21,7 @@ Build-Depends: libghc-dlist-dev, libghc-uuid-dev, libghc-json-dev, + libghc-aeson-dev, libghc-ifelse-dev, libghc-bloomfilter-dev, libghc-edit-distance-dev, @@ -71,7 +72,7 @@ Depends: ${misc:Depends}, ${shlibs:Depends}, wget, curl, openssh-client (>= 1:5.6p1) -Recommends: lsof, gnupg, bind9-host, ssh-askpass +Recommends: lsof, gnupg, bind9-host, ssh-askpass, quvi Suggests: graphviz, bup, libnss-mdns Description: manage files with git, without checking their contents into git git-annex allows managing files with git, without checking the file diff --git a/doc/git-annex.mdwn b/doc/git-annex.mdwn index 00c7b2e50..7cac9087d 100644 --- a/doc/git-annex.mdwn +++ b/doc/git-annex.mdwn @@ -195,6 +195,9 @@ subdirectories). alternate locations from which the file can be downloaded. In this mode, addurl can be used both to add new files, or to add urls to existing files. + When quvi is installed, urls are automatically tested to see if they + are on a video hosting site, and the video is downloaded instead. + * rmurl file url Record that the file is no longer available at the url. @@ -1112,6 +1115,11 @@ Here are all the supported configuration settings. (wget is always used in preference to curl if available.) For example, to force ipv4 only, set it to "-4" +* `annex.quvi-options` + + Options to pass to quvi when using it to find the url to download for a + video. + * `annex.http-headers` HTTP headers to send when downloading from the web. Multiple lines of diff --git a/doc/install/fromscratch.mdwn b/doc/install/fromscratch.mdwn index 63d4b1cbb..17444a4fd 100644 --- a/doc/install/fromscratch.mdwn +++ b/doc/install/fromscratch.mdwn @@ -11,6 +11,7 @@ quite a lot. * [monad-control](http://hackage.haskell.org/package/monad-control) * [QuickCheck 2](http://hackage.haskell.org/package/QuickCheck) * [json](http://hackage.haskell.org/package/json) + * [aeson](http://hackage.haskell.org/package/aeson) * [IfElse](http://hackage.haskell.org/package/IfElse) * [dlist](http://hackage.haskell.org/package/dlist) * [bloomfilter](http://hackage.haskell.org/package/bloomfilter) diff --git a/doc/tips/using_the_web_as_a_special_remote.mdwn b/doc/tips/using_the_web_as_a_special_remote.mdwn index 3ce02a56a..227d8e75c 100644 --- a/doc/tips/using_the_web_as_a_special_remote.mdwn +++ b/doc/tips/using_the_web_as_a_special_remote.mdwn @@ -8,10 +8,16 @@ The web can be used as a [[special_remote|special_remotes]] too. Now the file is downloaded, and has been added to the annex like any other file. So it can be renamed, copied to other repositories, and so on. +To add a lot of urls at once, just list them all as parameters to +`git annex addurl`. + +## trust issues + Note that git-annex assumes that, if the web site does not 404, and has the right file size, the file is still present on the web, and this counts as -one [[copy|copies]] of the file. So it will let you remove your last copy, -trusting it can be downloaded again: +one [[copy|copies]] of the file. If the file still seems to be present +on the web, it will let you remove your last copy, trusting it can be +downloaded again: # git annex drop example.com_video.mpeg drop example.com_video.mpeg (checking http://example.com/video.mpeg) ok @@ -31,7 +37,9 @@ With the result that it will hang onto files: (Use --force to override this check, or adjust annex.numcopies.) failed -You can also add urls to any file already in the annex: +## attaching urls to existing files + +You can also attach urls to any file already in the annex: # git annex addurl --file my_cool_big_file http://example.com/cool_big_file addurl my_cool_big_file ok @@ -40,8 +48,10 @@ You can also add urls to any file already in the annex: 00000000-0000-0000-0000-000000000001 -- web 27a9510c-760a-11e1-b9a0-c731d2b77df9 -- here -To add a lot of urls at once, just list them all as parameters to -`git annex addurl`. +## configuring filenames + +By default, `addurl` will generate a filename for you. You can use +`--file=` to specify the filename to use. If you're adding a bunch of related files to a directory, or just don't like the default filenames generated by `addurl`, you can use `--pathdepth` @@ -55,3 +65,35 @@ number takes that many paths from the end. addurl 2012_01_video.mpeg (downloading http://example.com/videos/2012/01/video.mpeg) # git annex addurl http://example.com/videos/2012/01/video.mpeg --pathdepth=-2 addurl 01_video.mpeg (downloading http://example.com/videos/2012/01/video.mpeg) + +## videos + +There's support for downloading videos from sites like YouTube, Vimeo, +and many more. This relies on [quvi](http://quvi.sourceforge.net/) to find +urls to the actual videos files. + +When you have quvi installed, you can just +`git annex addurl http://youtube.com/foo` and it will detect that +it is a video and download the video content for offline viewing. + +Later, in another clone of the repository, you can run `git annex get` on +the file and it will also be downloaded with the help of quvi. This works +even if the video host has transcoded or otherwise changed the video +in the meantime; the assumption is that these video files are equivilant. + +There is an `annex.quvi-options` configuration setting that can be used +to pass parameters to quvi. For example, you could set `git config +annex.quvi-options "--format low"` to configure it to download low +quality videos from YouTube. + +Note that for performance reasons, the url is not checked for redirects, +so shortened urls to sites like youtu.be will not be detected. You can +either load the short url in a browser to get the full url, or you +can force use of quvi with redirect detection, by prepending "quvi:" to the +url. For example, `git annex addurl quvi:http://youtu.be/foo` + +Downloading whole YouTube playlists is not currently supported by quvi. + +## podcasts + +This is done using `git annex importfeed`. See [[downloading podcasts]]. diff --git a/doc/todo/wishlist:_special-case_handling_of_Youtube_URLs_in_Web_special_remote.mdwn b/doc/todo/wishlist:_special-case_handling_of_Youtube_URLs_in_Web_special_remote.mdwn index cfe07324e..229dc258b 100644 --- a/doc/todo/wishlist:_special-case_handling_of_Youtube_URLs_in_Web_special_remote.mdwn +++ b/doc/todo/wishlist:_special-case_handling_of_Youtube_URLs_in_Web_special_remote.mdwn @@ -18,3 +18,5 @@ The [[Web special remote|special remotes/web]] could possibly be improved by det > > URL may yield different file contents depending on the quality > > chosen. Also, it seems that the URLs guessed by quvi may be > > ephemeral. --[[anarcat]] + +> [[done]]!!! --[[Joey]] diff --git a/git-annex.cabal b/git-annex.cabal index a06ecc39b..3d343374a 100644 --- a/git-annex.cabal +++ b/git-annex.cabal @@ -76,7 +76,7 @@ Executable git-annex extensible-exceptions, dataenc, SHA, process, json, base (>= 4.5 && < 4.8), monad-control, MonadCatchIO-transformers, IfElse, text, QuickCheck >= 2.1, bloomfilter, edit-distance, process, - SafeSemaphore, uuid, random, dlist, unix-compat + SafeSemaphore, uuid, random, dlist, unix-compat, aeson -- Need to list these because they're generated from .hsc files. Other-Modules: Utility.Touch Utility.Mounts Include-Dirs: Utility @@ -141,7 +141,7 @@ Executable git-annex Build-Depends: yesod, yesod-default, yesod-static, yesod-form, yesod-core, case-insensitive, http-types, transformers, wai, wai-logger, warp, - blaze-builder, crypto-api, hamlet, clientsession, aeson, + blaze-builder, crypto-api, hamlet, clientsession, template-haskell, data-default CPP-Options: -DWITH_WEBAPP |