summaryrefslogtreecommitdiff
path: root/Annex
diff options
context:
space:
mode:
authorGravatar Joey Hess <joeyh@joeyh.name>2017-12-06 13:16:06 -0400
committerGravatar Joey Hess <joeyh@joeyh.name>2017-12-06 13:22:31 -0400
commit9ec6bdfb526fa6b75a264b6417b24aa7f01adc25 (patch)
treefc6d7acac70a1835117e3f3c5296b71f1ebf7970 /Annex
parent30671447e071cee943701c8e9d72571ce2d6699d (diff)
fix regression in addurl --file caused by youtube-dl support
Now youtubeDlCheck downloads the beginning of the url's content and checks whether it is html; only when it is does it pass the url off to youtube-dl to check if youtube-dl supports it. This means more work is done for urls that youtube-dl does support, but it is probably more efficient for other urls, since only the first chunk of content is downloaded, while youtube-dl probably downloads more. As well as the reported bug, this also fixes behavior when a url was added with youtube-dl, but the url content has since changed from an html page to something else. Remote.Web.checkKey used to wrongly succeed in that situation, since youtube-dl reported that it could download that something else. This commit was supported by the NSF-funded DataLad project.
Diffstat (limited to 'Annex')
-rw-r--r--Annex/YoutubeDl.hs15
1 files changed, 13 insertions, 2 deletions
diff --git a/Annex/YoutubeDl.hs b/Annex/YoutubeDl.hs
index 4a820cede..071ab1e93 100644
--- a/Annex/YoutubeDl.hs
+++ b/Annex/YoutubeDl.hs
@@ -10,8 +10,10 @@ module Annex.YoutubeDl where
import Annex.Common
import qualified Annex
import Annex.Content
+import Annex.Url
import Utility.Url (URLString)
import Utility.DiskFree
+import Utility.HtmlDetect
import Logs.Transfer
-- Runs youtube-dl in a work directory, to download a single media file
@@ -108,10 +110,19 @@ youtubeDlSupported :: URLString -> Annex Bool
youtubeDlSupported url = either (const False) id <$> youtubeDlCheck url
-- Check if youtube-dl can find media in an url.
+--
+-- youtube-dl supports downloading urls that are not html pages,
+-- but we don't want to use it for such urls, since they can be downloaded
+-- without it. So, this first downloads part of the content and checks
+-- if it's a html page; only then is youtube-dl used.
youtubeDlCheck :: URLString -> Annex (Either String Bool)
youtubeDlCheck url = catchMsgIO $ do
- opts <- youtubeDlOpts [ Param url, Param "--simulate" ]
- liftIO $ snd <$> processTranscript "youtube-dl" (toCommand opts) Nothing
+ uo <- getUrlOptions
+ liftIO (downloadPartial url uo htmlPrefixLength) >>= \case
+ Just bs | isHtmlBs bs -> do
+ opts <- youtubeDlOpts [ Param url, Param "--simulate" ]
+ liftIO $ snd <$> processTranscript "youtube-dl" (toCommand opts) Nothing
+ _ -> return False
-- Ask youtube-dl for the filename of media in an url.
--