diff options
-rw-r--r-- | CHANGELOG | 4 | ||||
-rw-r--r-- | Command/AddUrl.hs | 172 | ||||
-rw-r--r-- | Logs/Web.hs | 11 | ||||
-rw-r--r-- | Utility/HtmlDetect.hs | 4 | ||||
-rw-r--r-- | doc/git-annex-addurl.mdwn | 18 | ||||
-rw-r--r-- | doc/todo/switch_from_quvi_to_youtube-dl.mdwn | 22 |
6 files changed, 150 insertions, 81 deletions
@@ -3,6 +3,10 @@ git-annex (6.20171125) UNRELEASED; urgency=medium * Use youtube-dl rather than quvi to download media from web pages, since quvi is not being actively developed and youtube-dl supports many more sites. + * addurl --relaxed no longer checks for embedded media, since youtube-dl + does not allow doing so without hitting the network, which would make + this no faster than addurl --fast. Use addurl --fast instead if you + want embedded media to be downloaded. -- Joey Hess <id@joeyh.name> Tue, 28 Nov 2017 13:48:44 -0400 diff --git a/Command/AddUrl.hs b/Command/AddUrl.hs index 866bfc463..da51a6f29 100644 --- a/Command/AddUrl.hs +++ b/Command/AddUrl.hs @@ -28,9 +28,8 @@ import Annex.FileMatcher import Logs.Location import Utility.Metered import Utility.FileSystemEncoding +import Utility.HtmlDetect import qualified Annex.Transfer as Transfer -import Annex.Quvi -import qualified Utility.Quvi as Quvi cmd :: Command cmd = notBareRepo $ withGlobalOptions [jobsOption, jsonOption, jsonProgressOption] $ @@ -85,7 +84,7 @@ parseRelaxedOption = switch parseRawOption :: Parser Bool parseRawOption = switch ( long "raw" - <> help "disable special handling for torrents, quvi, etc" + <> help "disable special handling for torrents, youtube-dl, etc" ) seek :: AddUrlOptions -> CommandSeek @@ -121,7 +120,7 @@ checkUrl r o u = do where go _ (Left e) = void $ commandAction $ do - showStart "addurl" u + showStart' "addurl" (Just u) warning (show e) next $ next $ return False go deffile (Right (UrlContents sz mf)) = do @@ -144,8 +143,9 @@ startRemote :: Remote -> Bool -> FilePath -> URLString -> Maybe Integer -> Comma startRemote r relaxed file uri sz = do pathmax <- liftIO $ fileNameLengthLimit "." let file' = joinPath $ map (truncateFilePath pathmax) $ splitDirectories file - showStart "addurl" file' + showStart' "addurl" (Just uri) showNote $ "from " ++ Remote.name r + showDestinationFile file' next $ performRemote r relaxed uri file' sz performRemote :: Remote -> Bool -> URLString -> FilePath -> Maybe Integer -> CommandPerform @@ -181,19 +181,13 @@ downloadRemoteFile r relaxed uri file sz = checkCanAdd file $ do where loguri = setDownloader uri OtherDownloader -startWeb :: AddUrlOptions -> String -> CommandStart -startWeb o s = go $ fromMaybe bad $ parseURI urlstring +startWeb :: AddUrlOptions -> URLString -> CommandStart +startWeb o urlstring = go $ fromMaybe bad $ parseURI urlstring where - (urlstring, downloader) = getDownloader s bad = fromMaybe (giveup $ "bad url " ++ urlstring) $ Url.parseURIRelaxed $ urlstring - go url = case downloader of - QuviDownloader -> usequvi - _ -> ifM (quviSupported urlstring) - ( usequvi - , regulardownload url - ) - regulardownload url = do + go url = do + showStart' "addurl" (Just urlstring) pathmax <- liftIO $ fileNameLengthLimit "." urlinfo <- if relaxedOption o then pure Url.assumeUrlExists @@ -209,25 +203,14 @@ startWeb o s = go $ fromMaybe bad $ parseURI urlstring ( pure $ url2file url (pathdepthOption o) pathmax , pure f ) - showStart "addurl" file - next $ performWeb (relaxedOption o) urlstring file urlinfo - badquvi = giveup $ "quvi does not know how to download url " ++ urlstring - usequvi = do - page <- fromMaybe badquvi - <$> withQuviOptions Quvi.forceQuery [Quvi.quiet, Quvi.httponly] urlstring - let link = fromMaybe badquvi $ headMaybe $ Quvi.pageLinks page - pathmax <- liftIO $ fileNameLengthLimit "." - let file = adjustFile o $ flip fromMaybe (fileOption o) $ - truncateFilePath pathmax $ sanitizeFilePath $ - Quvi.pageTitle page ++ "." ++ fromMaybe "m" (Quvi.linkSuffix link) - showStart "addurl" file - next $ performQuvi (relaxedOption o) urlstring (Quvi.linkUrl link) file - -performWeb :: Bool -> URLString -> FilePath -> Url.UrlInfo -> CommandPerform -performWeb relaxed url file urlinfo = ifAnnexed file addurl geturl + next $ performWeb o urlstring file urlinfo + +performWeb :: AddUrlOptions -> URLString -> FilePath -> Url.UrlInfo -> CommandPerform +performWeb o url file urlinfo = ifAnnexed file addurl geturl where - geturl = next $ isJust <$> addUrlFile relaxed url urlinfo file - addurl = addUrlChecked relaxed url webUUID $ \k -> return $ + geturl = next $ isJust <$> addUrlFile (relaxedOption o) url urlinfo file + -- TODO youtube-dl + addurl = addUrlChecked (relaxedOption o) url webUUID $ \k -> return $ (Url.urlExists urlinfo, Url.urlSize urlinfo == keySize k) performQuvi :: Bool -> URLString -> URLString -> FilePath -> CommandPerform @@ -290,53 +273,111 @@ addUrlChecked relaxed url u checkexistssize key stop ) +{- Adds an url, normally to the specified FilePath. But, if youtube-dl + - supports the url, it will be written to a different file, based on the + - title of the media. + -} addUrlFile :: Bool -> URLString -> Url.UrlInfo -> FilePath -> Annex (Maybe Key) -addUrlFile relaxed url urlinfo file = checkCanAdd file $ do - liftIO $ createDirectoryIfMissing True (parentDir file) - ifM (Annex.getState Annex.fast <||> pure relaxed) - ( nodownload url urlinfo file +addUrlFile relaxed url urlinfo file + | relaxed = checkCanAdd file $ do + liftIO $ createDirectoryIfMissing True (parentDir file) + nodownload url urlinfo file + | otherwise = ifM (Annex.getState Annex.fast) + ( checkCanAdd file $ do + liftIO $ createDirectoryIfMissing True (parentDir file) + nodownload url urlinfo file , downloadWeb url urlinfo file ) downloadWeb :: URLString -> Url.UrlInfo -> FilePath -> Annex (Maybe Key) -downloadWeb url urlinfo file = do - let dummykey = addSizeUrlKey urlinfo $ Backend.URL.fromUrl url Nothing - let downloader f p = do +downloadWeb url urlinfo file = + go =<< downloadWith' downloader dummykey webUUID url (AssociatedFile (Just file)) + where + dummykey = addSizeUrlKey urlinfo $ Backend.URL.fromUrl url Nothing + downloader f p = do showOutput downloadUrl dummykey p [url] f - showAction $ "downloading " ++ url ++ " " - downloadWith downloader dummykey webUUID url file + go Nothing = return Nothing + -- If we downloaded a html file, try to use youtube-dl to + -- extract embedded media. + go (Just tmp) = ifM (liftIO $ isHtml <$> readFile tmp) + ( do + -- TODO need a directory based on dummykey, + -- which unused needs to clean up like + -- it does gitAnnexTmpObjectLocation + tmpdir <- undefined + liftIO $ createDirectoryIfMissing True tmpdir + mf <- youtubeDl url tmpdir + case mf of + Just mediafile -> do + liftIO $ nukeFile tmp + let mediaurl = setDownloader url YoutubeDownloader + let key = Backend.URL.fromUrl mediaurl Nothing + let dest = takeFileName mediafile + showDestinationFile dest + cleanup webUUID mediaurl dest key (Just mediafile) + return (Just key) + Nothing -> normalfinish tmp + , normalfinish tmp + ) + normalfinish tmp = do + showDestinationFile file + liftIO $ createDirectoryIfMissing True (parentDir file) + finishDownloadWith tmp webUUID url file + +youtubeDl :: URLString -> FilePath -> Annex (Maybe FilePath) +youtubeDl = undefined -- TODO + +showDestinationFile :: FilePath -> Annex () +showDestinationFile file = do + showNote ("to " ++ file) + maybeShowJSON $ JSONChunk [("file", file)] {- The Key should be a dummy key, based on the URL, which is used - for this download, before we can examine the file and find its real key. - For resuming downloads to work, the dummy key for a given url should be - - stable. -} + - stable. For disk space checking to work, the dummy key should have + - the size of the url already set. + - + - Downloads the url, sets up the worktree file, and returns the + - real key. + -} downloadWith :: (FilePath -> MeterUpdate -> Annex Bool) -> Key -> UUID -> URLString -> FilePath -> Annex (Maybe Key) downloadWith downloader dummykey u url file = - checkDiskSpaceToGet dummykey Nothing $ do - tmp <- fromRepo $ gitAnnexTmpObjectLocation dummykey - ifM (runtransfer tmp) - ( do - backend <- chooseBackend file - let source = KeySource - { keyFilename = file - , contentLocation = tmp - , inodeCache = Nothing - } - k <- genKey source backend - case k of - Nothing -> return Nothing - Just (key, _) -> do - cleanup u url file key (Just tmp) - return (Just key) - , return Nothing - ) + go =<< downloadWith' downloader dummykey u url afile where - runtransfer tmp = Transfer.notifyTransfer Transfer.Download afile $ - Transfer.download u dummykey afile Transfer.forwardRetry $ \p -> do - liftIO $ createDirectoryIfMissing True (parentDir tmp) - downloader tmp p afile = AssociatedFile (Just file) + go Nothing = return Nothing + go (Just tmp) = finishDownloadWith tmp u url file + +{- Like downloadWith, but leaves the dummy key content in + - the returned location. -} +downloadWith' :: (FilePath -> MeterUpdate -> Annex Bool) -> Key -> UUID -> URLString -> AssociatedFile -> Annex (Maybe FilePath) +downloadWith' downloader dummykey u url afile = + checkDiskSpaceToGet dummykey Nothing $ do + tmp <- fromRepo $ gitAnnexTmpObjectLocation dummykey + ok <- Transfer.notifyTransfer Transfer.Download url $ + Transfer.download u dummykey afile Transfer.forwardRetry $ \p -> do + liftIO $ createDirectoryIfMissing True (parentDir tmp) + downloader tmp p + if ok + then return (Just tmp) + else return Nothing + +finishDownloadWith :: FilePath -> UUID -> URLString -> FilePath -> Annex (Maybe Key) +finishDownloadWith tmp u url file = do + backend <- chooseBackend file + let source = KeySource + { keyFilename = file + , contentLocation = tmp + , inodeCache = Nothing + } + k <- genKey source backend + case k of + Nothing -> return Nothing + Just (key, _) -> do + cleanup u url file key (Just tmp) + return (Just key) {- Adds the url size to the Key. -} addSizeUrlKey :: Url.UrlInfo -> Key -> Key @@ -369,6 +410,7 @@ cleanup u url file key mtmp = case mtmp of , liftIO $ maybe noop nukeFile mtmp ) +-- TODO youtube-dl nodownload :: URLString -> Url.UrlInfo -> FilePath -> Annex (Maybe Key) nodownload url urlinfo file | Url.urlExists urlinfo = do diff --git a/Logs/Web.hs b/Logs/Web.hs index ba71cb17d..abea00db6 100644 --- a/Logs/Web.hs +++ b/Logs/Web.hs @@ -100,15 +100,15 @@ removeTempUrl :: Key -> Annex () removeTempUrl key = Annex.changeState $ \s -> s { Annex.tempurls = M.delete key (Annex.tempurls s) } -data Downloader = WebDownloader | QuviDownloader | OtherDownloader +data Downloader = WebDownloader | YoutubeDownloader | QuviDownloader | OtherDownloader deriving (Eq, Show) {- To keep track of how an url is downloaded, it's mangled slightly in - - the log. For quvi, "quvi:" is prefixed. For urls that are handled by - - some other remote, ":" is prefixed. -} + - the log, with a prefix indicating when a Downloader is used. -} setDownloader :: URLString -> Downloader -> String setDownloader u WebDownloader = u setDownloader u QuviDownloader = "quvi:" ++ u +setDownloader u YoutubeDownloader = "yt:" ++ u setDownloader u OtherDownloader = ":" ++ u setDownloader' :: URLString -> Remote -> String @@ -118,6 +118,9 @@ setDownloader' u r getDownloader :: URLString -> (URLString, Downloader) getDownloader u = case separate (== ':') u of - ("quvi", u') -> (u', QuviDownloader) + ("yt", u') -> (u', YoutubeDownloader) + -- quvi is not used any longer; youtube-dl should be able to handle + -- all urls it did. + ("quvi", u') -> (u', YoutubeDownloader) ("", u') -> (u', OtherDownloader) _ -> (u, WebDownloader) diff --git a/Utility/HtmlDetect.hs b/Utility/HtmlDetect.hs index ca516e960..57a56c95f 100644 --- a/Utility/HtmlDetect.hs +++ b/Utility/HtmlDetect.hs @@ -18,11 +18,11 @@ import Data.Char -- Html fragments like "<p>this</p>" are not detected as being html, -- although some browsers may chose to render them as html. isHtml :: String -> Bool -isHtml = evaluate . canonicalizeTags . parseTags . truncate +isHtml = evaluate . canonicalizeTags . parseTags . shorten where -- We only care about the beginning of the file, -- so although tagsoup parses lazily anyway, truncate it. - truncate = take 16384 + shorten = take 16384 evaluate (TagOpen "!DOCTYPE" ((t, _):_):_) = map toLower t == "html" evaluate (TagOpen "html" _:_) = True -- Allow some leading whitespace before the tag. diff --git a/doc/git-annex-addurl.mdwn b/doc/git-annex-addurl.mdwn index 28740dfef..1457b26b4 100644 --- a/doc/git-annex-addurl.mdwn +++ b/doc/git-annex-addurl.mdwn @@ -10,8 +10,8 @@ git annex addurl `[url ...]` Downloads each url to its own file, which is added to the annex. -When `youtube-dl` is installed, and the url is to a web page, -it's used to download any video that the web page embeds. +When `youtube-dl` is installed, it's used to download videos +embedded on web pages. Urls to torrent files (including magnet links) will cause the content of the torrent to be downloaded, using `aria2c`. @@ -28,10 +28,20 @@ be used to get better filenames. Avoid immediately downloading the url. The url is still checked (via HEAD) to verify that it exists, and to get its size if possible. + When `youtube-dl` is installed, videos embedded on web pages + will be added. To avoid the extra work of checking for videos, + add the `--raw` option. + * `--relaxed` - Avoid storing the size of the url's content, and accept whatever - content is there at a future point. (Implies `--fast`.) + Don't immediately download the url, and avoid storing the size of the + url's content. This makes git-annex accept whatever content is there + at a future point. This is also the fastest option, since it does not + hit the network at all. + + Note that this does *not* check for embedded videos using `youtube-dl`, + although it used to in previous versions of git-annex. + Use --fast instead if you want to do that. * `--raw` diff --git a/doc/todo/switch_from_quvi_to_youtube-dl.mdwn b/doc/todo/switch_from_quvi_to_youtube-dl.mdwn index aa2f6955d..8703b810d 100644 --- a/doc/todo/switch_from_quvi_to_youtube-dl.mdwn +++ b/doc/todo/switch_from_quvi_to_youtube-dl.mdwn @@ -16,12 +16,12 @@ urls, see for example http://bugs.debian.org/874321) So, switching to youtube-dl would probably need a new switch, like `git annex addurl --rip` that enables using it. -Currently `git annex importfeed` automatically tests for video urls with -quvi; it would also need to support `--rip`. +(Importfeed only treats links in the feed as video urls, not enclosures, +so this problem does not affect it and it would not need a new switch.) -Both of those changes would need changes to user's workflows and cron jobs. -git-annex could keep supporting quvi for some time, and warn when it uses -quvi, to help with the transition. +That would need changes to users' workflows. git-annex could keep +supporting quvi for some time, and warn when it uses quvi, to +help with the transition. > Alternatively, git-annex addurl could download the url first, and then > check the file to see if it looks like html. If so, run youtube-dl (which @@ -30,12 +30,22 @@ quvi, to help with the transition. > overhead, and the redundant download is fairly small compared to ripping > the media. Only the unusual case where addurl is being used on html that > does not contain media becomes more expensive. +> +> However, for --relaxed, running youtube-dl --get-filename would be +> significantly more expensive since it hits the network. It seems that +> --relaxed would need to change to not rip videos; users who want that +> could use --fast. +> +> --fast already hits the network, but +> if it uses youtube-dl --get-filename, it would fall afoul of +> bugs like <http://bugs.debian.org/874321>, although those can be worked +> around (/dev/null stderr in cast youtube-dl crashes) Another gotcha is playlists. youtube-dl downloads playlists automatically. But, git-annex needs to record an url that downloads a single file so that `git annex get` works right. So, playlists will need to be disabled when git-annex runs youtube-dl. But, `--no-playlist` does not always disable -playlists. Best option seems to be `--playlist-items 0` which works for +playlists. Best option seems to be `--no-playlist --playlist-items 0` which works for non-playlists, and downloads only 1 item from playlists (hopefully a fairly stable item, but who knows..). |