From 9ec6bdfb526fa6b75a264b6417b24aa7f01adc25 Mon Sep 17 00:00:00 2001
From: Joey Hess <joeyh@joeyh.name>
Date: Wed, 6 Dec 2017 13:16:06 -0400
Subject: fix regression in addurl --file caused by youtube-dl support

Now youtubeDlCheck downloads the beginning of the url's content and
checks if it's html; only when it is does it pass the url off to
youtube-dl to check if it supports it.

This means more work is done for urls that youtube-dl does support,
but it is probably more efficient for other urls, since it only
downloads the first chunk of content, while youtube-dl probably
downloads more.

As well as the reported bug, this also fixes behavior when an url was
added with youtube-dl, but the url's content has now changed from a
html page to something else. Remote.Web.checkKey used to wrongly
succeed in that situation, since youtube-dl said, sure, it could
download that something else.

This commit was supported by the NSF-funded DataLad project.
---
 Annex/YoutubeDl.hs                                 | 15 ++++++-
 Utility/HtmlDetect.hs                              | 25 ++++++++----
 Utility/Url.hs                                     | 47 ++++++++++++++++++----
 ...__58___prefix_for___34__regular__34___urls.mdwn |  2 +-
 4 files changed, 71 insertions(+), 18 deletions(-)

diff --git a/Annex/YoutubeDl.hs b/Annex/YoutubeDl.hs
index 4a820cede..071ab1e93 100644
--- a/Annex/YoutubeDl.hs
+++ b/Annex/YoutubeDl.hs
@@ -10,8 +10,10 @@ module Annex.YoutubeDl where
 import Annex.Common
 import qualified Annex
 import Annex.Content
+import Annex.Url
 import Utility.Url (URLString)
 import Utility.DiskFree
+import Utility.HtmlDetect
 import Logs.Transfer
 
 -- Runs youtube-dl in a work directory, to download a single media file
@@ -108,10 +110,19 @@ youtubeDlSupported :: URLString -> Annex Bool
 youtubeDlSupported url = either (const False) id <$> youtubeDlCheck url
 
 -- Check if youtube-dl can find media in an url.
+--
+-- youtube-dl supports downloading urls that are not html pages,
+-- but we don't want to use it for such urls, since they can be downloaded
+-- without it. So, this first downloads part of the content and checks
+-- if it's a html page; only then is youtube-dl used.
 youtubeDlCheck :: URLString -> Annex (Either String Bool)
 youtubeDlCheck url = catchMsgIO $ do
-	opts <- youtubeDlOpts [ Param url, Param "--simulate" ]
-	liftIO $ snd <$> processTranscript "youtube-dl" (toCommand opts) Nothing
+	uo <- getUrlOptions
+	liftIO (downloadPartial url uo htmlPrefixLength) >>= \case
+		Just bs | isHtmlBs bs -> do
+			opts <- youtubeDlOpts [ Param url, Param "--simulate" ]
+			liftIO $ snd <$> processTranscript "youtube-dl" (toCommand opts) Nothing
+		_ -> return False
 
 -- Ask youtube-dl for the filename of media in an url.
 --
diff --git a/Utility/HtmlDetect.hs b/Utility/HtmlDetect.hs
index 57a56c95f..bf0839e9e 100644
--- a/Utility/HtmlDetect.hs
+++ b/Utility/HtmlDetect.hs
@@ -9,20 +9,20 @@ module Utility.HtmlDetect where
 
 import Text.HTML.TagSoup
 import Data.Char
+import qualified Data.ByteString.Lazy as B
+import qualified Data.ByteString.Lazy.Char8 as B8
 
--- | Detect if a string is a html document.
+-- | Detect if a String is a html document.
 --
--- The document many not be valid, and will still be detected as html,
--- as long as it starts with a "<html>" or "<!DOCTYPE html>" tag.
+-- The document may not be valid, or may be truncated, and will
+-- still be detected as html, as long as it starts with a
+-- "<html>" or "<!DOCTYPE html>" tag.
 --
 -- Html fragments like "<p>this</p>" are not detected as being html,
 -- although some browsers may choose to render them as html.
 isHtml :: String -> Bool
-isHtml = evaluate . canonicalizeTags . parseTags . shorten
+isHtml = evaluate . canonicalizeTags . parseTags . take htmlPrefixLength
   where
-	-- We only care about the beginning of the file,
-	-- so although tagsoup parses lazily anyway, truncate it.
-	shorten = take 16384
 	evaluate (TagOpen "!DOCTYPE" ((t, _):_):_) = map toLower t == "html"
 	evaluate (TagOpen "html" _:_) = True
 	-- Allow some leading whitespace before the tag.
@@ -33,3 +33,14 @@ isHtml = evaluate . canonicalizeTags . parseTags . shorten
 	-- tag, but easy to allow for.
 	evaluate (TagComment _:rest) = evaluate rest
 	evaluate _ = False
+
+-- | Detect if a ByteString is a html document.
+isHtmlBs :: B.ByteString -> Bool
+-- The encoding of the ByteString is not known, but isHtml only
+-- looks for ascii strings.
+isHtmlBs = isHtml . B8.unpack
+
+-- | How much of the beginning of a html document is needed to detect it.
+-- (conservatively)
+htmlPrefixLength :: Int
+htmlPrefixLength = 8192
diff --git a/Utility/Url.hs b/Utility/Url.hs
index abd5f7ae5..5bf09ca4d 100644
--- a/Utility/Url.hs
+++ b/Utility/Url.hs
@@ -1,6 +1,6 @@
 {- Url downloading.
  -
- - Copyright 2011-2014 Joey Hess <id@joeyh.name>
+ - Copyright 2011-2017 Joey Hess <id@joeyh.name>
  -
  - License: BSD-2-clause
  -}
@@ -25,6 +25,7 @@ module Utility.Url (
 	assumeUrlExists,
 	download,
 	downloadQuiet,
+	downloadPartial,
 	parseURIRelaxed,
 	matchStatusCodeException,
 	matchHttpExceptionContent,
@@ -39,8 +40,10 @@ import Network.HTTP.Types
 import qualified Data.CaseInsensitive as CI
 import qualified Data.ByteString as B
 import qualified Data.ByteString.UTF8 as B8
+import qualified Data.ByteString.Lazy as L
 import Control.Monad.Trans.Resource
 import Network.HTTP.Conduit hiding (closeManager)
+import Network.HTTP.Client (brRead, withResponse)
 
 -- closeManager is needed with older versions of http-client,
 -- but not new versions, which warn about using it. Urgh.
@@ -140,7 +143,7 @@ assumeUrlExists = UrlInfo True Nothing Nothing
  - also returning its size and suggested filename if available. -}
 getUrlInfo :: URLString -> UrlOptions -> IO UrlInfo
 getUrlInfo url uo = case parseURIRelaxed url of
-	Just u -> case parseurlconduit (show u) of
+	Just u -> case parseUrlConduit (show u) of
 		Just req -> catchJust
 			-- When http redirects to a protocol which
 			-- conduit does not support, it will throw
@@ -220,12 +223,6 @@ getUrlInfo url uo = case parseURIRelaxed url of
 		_ | isftp && isJust len -> good
 		_ -> dne
 
-#if MIN_VERSION_http_client(0,4,30)
-	parseurlconduit = parseUrlThrow
-#else
-	parseurlconduit = parseUrl
-#endif
-
 -- Parse eg: attachment; filename="fname.ext"
 -- per RFC 2616
 contentDispositionFilename :: String -> Maybe FilePath
@@ -321,11 +318,45 @@ download' quiet url file uo = do
 		| quiet = [Param s]
 		| otherwise = []
 
+{- Downloads at least the specified number of bytes from an url. -}
+downloadPartial :: URLString -> UrlOptions -> Int -> IO (Maybe L.ByteString)
+downloadPartial url uo n = case parseURIRelaxed url of
+	Nothing -> return Nothing
+	Just u -> go u `catchNonAsync` const (return Nothing)
+  where
+	go u = case parseUrlConduit (show u) of
+		Nothing -> return Nothing
+		Just req -> do
+			mgr <- newManager managerSettings
+			let req' = applyRequest uo req
+			ret <- withResponse req' mgr $ \resp ->
+				if responseStatus resp == ok200
+					then Just <$> brread n [] (responseBody resp)
+					else return Nothing
+			liftIO $ closeManager mgr
+			return ret
+
+	-- could use brReadSome here, needs newer http-client dependency
+	brread n' l rb
+		| n' <= 0 = return (L.fromChunks (reverse l))
+		| otherwise = do
+			bs <- brRead rb
+			if B.null bs
+				then return (L.fromChunks (reverse l))
+				else brread (n' - B.length bs) (bs:l) rb
+
 {- Allows for spaces and other stuff in urls, properly escaping them. -}
 parseURIRelaxed :: URLString -> Maybe URI
 parseURIRelaxed s = maybe (parseURIRelaxed' s) Just $
 	parseURI $ escapeURIString isAllowedInURI s
 
+#if MIN_VERSION_http_client(0,4,30)
+parseUrlConduit :: URLString -> Maybe Request
+parseUrlConduit = parseUrlThrow
+#else
+parseUrlConduit = parseUrl
+#endif
+
 {- Some characters like '[' are allowed in eg, the address of
  - an uri, but cannot appear unescaped further along in the uri.
  - This handles that, expensively, by successively escaping each character
diff --git a/doc/bugs/regression_-_yt__58___prefix_for___34__regular__34___urls.mdwn b/doc/bugs/regression_-_yt__58___prefix_for___34__regular__34___urls.mdwn
index 0dd86d57f..9c74c6aeb 100644
--- a/doc/bugs/regression_-_yt__58___prefix_for___34__regular__34___urls.mdwn
+++ b/doc/bugs/regression_-_yt__58___prefix_for___34__regular__34___urls.mdwn
@@ -28,4 +28,4 @@ Some tests also failed related to our datalad archives git annex special remote
 
 [[!meta author=yoh]]
 
-
+> [[fixed|done]] --[[Joey]]
-- 
cgit v1.2.3
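
[Editor's note, not part of the commit: the sketch below restates the
control flow the patch adds, for readers who want it outside diff form.
It is a minimal sketch, assuming Utility.Url exports UrlOptions along
with the new downloadPartial, and that Utility.HtmlDetect exports
isHtmlBs and htmlPrefixLength as added above; urlLooksHtml is a
hypothetical name used only for illustration, not a function in
git-annex.

import Utility.Url (URLString, UrlOptions, downloadPartial)
import Utility.HtmlDetect (isHtmlBs, htmlPrefixLength)

-- Hypothetical helper, not in the patch: the gate youtubeDlCheck now
-- applies before ever running youtube-dl. It fetches only about the
-- first htmlPrefixLength (8192) bytes, and treats a failed download
-- or non-html content as "not a html page".
urlLooksHtml :: URLString -> UrlOptions -> IO Bool
urlLooksHtml url uo = do
	mcontent <- downloadPartial url uo htmlPrefixLength
	return (maybe False isHtmlBs mcontent)

Only when this gate passes does youtubeDlCheck go on to run
youtube-dl --simulate, so urls whose content is not html never pay the
cost of starting youtube-dl.]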