author	Joey Hess <joeyh@joeyh.name>	2017-12-06 13:16:06 -0400
committer	Joey Hess <joeyh@joeyh.name>	2017-12-06 13:22:31 -0400
commit	9ec6bdfb526fa6b75a264b6417b24aa7f01adc25 (patch)
tree	fc6d7acac70a1835117e3f3c5296b71f1ebf7970
parent	30671447e071cee943701c8e9d72571ce2d6699d (diff)
fix regression in addurl --file caused by youtube-dl support
Now youtubeDlCheck downloads the beginning of the url's content and checks whether it's html; only when it is does it pass the url to youtube-dl to check if it supports it.

This means more work is done for urls that youtube-dl does support, but it is probably more efficient for other urls, since only the first chunk of content is downloaded, while youtube-dl would probably download more.

As well as the reported bug, this also fixes behavior when an url was added with youtube-dl, but the url's content has since changed from a html page to something else. Remote.Web.checkKey used to wrongly succeed in that situation, since youtube-dl said sure, it can download that something else.

This commit was supported by the NSF-funded DataLad project.
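In outline, the new check is (a simplified sketch of the patch below, using the downloadPartial and isHtmlBs helpers this commit adds; runYoutubeDlSimulate stands in for the existing youtube-dl --simulate invocation):

	youtubeDlCheck :: URLString -> Annex (Either String Bool)
	youtubeDlCheck url = catchMsgIO $ do
		uo <- getUrlOptions
		-- Download just enough of the url's content to detect html.
		liftIO (downloadPartial url uo htmlPrefixLength) >>= \case
			-- Only a html page is worth handing to youtube-dl.
			Just bs | isHtmlBs bs -> runYoutubeDlSimulate url
			_ -> return False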
-rw-r--r--	Annex/YoutubeDl.hs	15
-rw-r--r--	Utility/HtmlDetect.hs	25
-rw-r--r--	Utility/Url.hs	47
-rw-r--r--	doc/bugs/regression_-_yt__58___prefix_for___34__regular__34___urls.mdwn	2
4 files changed, 71 insertions, 18 deletions
diff --git a/Annex/YoutubeDl.hs b/Annex/YoutubeDl.hs
index 4a820cede..071ab1e93 100644
--- a/Annex/YoutubeDl.hs
+++ b/Annex/YoutubeDl.hs
@@ -10,8 +10,10 @@ module Annex.YoutubeDl where
import Annex.Common
import qualified Annex
import Annex.Content
+import Annex.Url
import Utility.Url (URLString)
import Utility.DiskFree
+import Utility.HtmlDetect
import Logs.Transfer
-- Runs youtube-dl in a work directory, to download a single media file
@@ -108,10 +110,19 @@ youtubeDlSupported :: URLString -> Annex Bool
youtubeDlSupported url = either (const False) id <$> youtubeDlCheck url
-- Check if youtube-dl can find media in an url.
+--
+-- youtube-dl supports downloading urls that are not html pages,
+-- but we don't want to use it for such urls, since they can be downloaded
+-- without it. So, this first downloads part of the content and checks
+-- if it's a html page; only then is youtube-dl used.
youtubeDlCheck :: URLString -> Annex (Either String Bool)
youtubeDlCheck url = catchMsgIO $ do
-	opts <- youtubeDlOpts [ Param url, Param "--simulate" ]
-	liftIO $ snd <$> processTranscript "youtube-dl" (toCommand opts) Nothing
+	uo <- getUrlOptions
+	liftIO (downloadPartial url uo htmlPrefixLength) >>= \case
+		Just bs | isHtmlBs bs -> do
+			opts <- youtubeDlOpts [ Param url, Param "--simulate" ]
+			liftIO $ snd <$> processTranscript "youtube-dl" (toCommand opts) Nothing
+		_ -> return False
-- Ask youtube-dl for the filename of media in an url.
--
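For context, callers that only want a boolean use youtubeDlSupported, defined just above, which maps a failed check to False. A hypothetical call site (useYoutubeDl and addUrlPlain are illustrative names, not part of this patch):

	ifM (youtubeDlSupported url)
		( useYoutubeDl url  -- a html page with embedded media
		, addUrlPlain url   -- anything else downloads directly
		)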
diff --git a/Utility/HtmlDetect.hs b/Utility/HtmlDetect.hs
index 57a56c95f..bf0839e9e 100644
--- a/Utility/HtmlDetect.hs
+++ b/Utility/HtmlDetect.hs
@@ -9,20 +9,20 @@ module Utility.HtmlDetect where
import Text.HTML.TagSoup
import Data.Char
+import qualified Data.ByteString.Lazy as B
+import qualified Data.ByteString.Lazy.Char8 as B8
--- | Detect if a string is a html document.
+-- | Detect if a String is a html document.
--
--- The document many not be valid, and will still be detected as html,
--- as long as it starts with a "<html>" or "<!DOCTYPE html>" tag.
+-- The document may not be valid, or may be truncated, and will
+-- still be detected as html, as long as it starts with a
+-- "<html>" or "<!DOCTYPE html>" tag.
--
-- Html fragments like "<p>this</p>" are not detected as being html,
-- although some browsers may choose to render them as html.
isHtml :: String -> Bool
-isHtml = evaluate . canonicalizeTags . parseTags . shorten
+isHtml = evaluate . canonicalizeTags . parseTags . take htmlPrefixLength
  where
-	-- We only care about the beginning of the file,
-	-- so although tagsoup parses lazily anyway, truncate it.
-	shorten = take 16384
	evaluate (TagOpen "!DOCTYPE" ((t, _):_):_) = map toLower t == "html"
	evaluate (TagOpen "html" _:_) = True
	-- Allow some leading whitespace before the tag.
@@ -33,3 +33,14 @@ isHtml = evaluate . canonicalizeTags . parseTags . shorten
	-- tag, but easy to allow for.
	evaluate (TagComment _:rest) = evaluate rest
	evaluate _ = False
+
+-- | Detect if a ByteString is a html document.
+isHtmlBs :: B.ByteString -> Bool
+-- The encoding of the ByteString is not known, but isHtml only
+-- looks for ascii strings.
+isHtmlBs = isHtml . B8.unpack
+
+-- | How much of the beginning of a html document is needed to detect it.
+-- (conservatively)
+htmlPrefixLength :: Int
+htmlPrefixLength = 8192
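To illustrate the detection rules above (a ghci sketch following the comments in this module, not part of the patch):

	ghci> isHtml "<!DOCTYPE html><html><head></head>"
	True
	ghci> isHtml "  \n<html><body>truncated"
	True
	ghci> isHtml "<!-- comment --><html>"
	True
	ghci> isHtml "<p>this</p>"
	False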
diff --git a/Utility/Url.hs b/Utility/Url.hs
index abd5f7ae5..5bf09ca4d 100644
--- a/Utility/Url.hs
+++ b/Utility/Url.hs
@@ -1,6 +1,6 @@
{- Url downloading.
-
- - Copyright 2011-2014 Joey Hess <id@joeyh.name>
+ - Copyright 2011-2017 Joey Hess <id@joeyh.name>
-
- License: BSD-2-clause
-}
@@ -25,6 +25,7 @@ module Utility.Url (
	assumeUrlExists,
	download,
	downloadQuiet,
+	downloadPartial,
	parseURIRelaxed,
	matchStatusCodeException,
	matchHttpExceptionContent,
@@ -39,8 +40,10 @@ import Network.HTTP.Types
import qualified Data.CaseInsensitive as CI
import qualified Data.ByteString as B
import qualified Data.ByteString.UTF8 as B8
+import qualified Data.ByteString.Lazy as L
import Control.Monad.Trans.Resource
import Network.HTTP.Conduit hiding (closeManager)
+import Network.HTTP.Client (brRead, withResponse)
-- closeManager is needed with older versions of http-client,
-- but not new versions, which warn about using it. Urgh.
@@ -140,7 +143,7 @@ assumeUrlExists = UrlInfo True Nothing Nothing
- also returning its size and suggested filename if available. -}
getUrlInfo :: URLString -> UrlOptions -> IO UrlInfo
getUrlInfo url uo = case parseURIRelaxed url of
-	Just u -> case parseurlconduit (show u) of
+	Just u -> case parseUrlConduit (show u) of
		Just req -> catchJust
			-- When http redirects to a protocol which
			-- conduit does not support, it will throw
@@ -220,12 +223,6 @@ getUrlInfo url uo = case parseURIRelaxed url of
			_ | isftp && isJust len -> good
			_ -> dne
-#if MIN_VERSION_http_client(0,4,30)
-	parseurlconduit = parseUrlThrow
-#else
-	parseurlconduit = parseUrl
-#endif
-
-- Parse eg: attachment; filename="fname.ext"
-- per RFC 2616
contentDispositionFilename :: String -> Maybe FilePath
@@ -321,11 +318,45 @@ download' quiet url file uo = do
		| quiet = [Param s]
		| otherwise = []

+{- Downloads at least the specified number of bytes from an url. -}
+downloadPartial :: URLString -> UrlOptions -> Int -> IO (Maybe L.ByteString)
+downloadPartial url uo n = case parseURIRelaxed url of
+	Nothing -> return Nothing
+	Just u -> go u `catchNonAsync` const (return Nothing)
+  where
+	go u = case parseUrlConduit (show u) of
+		Nothing -> return Nothing
+		Just req -> do
+			mgr <- newManager managerSettings
+			let req' = applyRequest uo req
+			ret <- withResponse req' mgr $ \resp ->
+				if responseStatus resp == ok200
+					then Just <$> brread n [] (responseBody resp)
+					else return Nothing
+			liftIO $ closeManager mgr
+			return ret
+
+	-- could use brReadSome here, needs newer http-client dependency
+	brread n' l rb
+		| n' <= 0 = return (L.fromChunks (reverse l))
+		| otherwise = do
+			bs <- brRead rb
+			if B.null bs
+				then return (L.fromChunks (reverse l))
+				else brread (n' - B.length bs) (bs:l) rb
+
{- Allows for spaces and other stuff in urls, properly escaping them. -}
parseURIRelaxed :: URLString -> Maybe URI
parseURIRelaxed s = maybe (parseURIRelaxed' s) Just $
parseURI $ escapeURIString isAllowedInURI s
+parseUrlConduit :: URLString -> Maybe Request
+#if MIN_VERSION_http_client(0,4,30)
+parseUrlConduit = parseUrlThrow
+#else
+parseUrlConduit = parseUrl
+#endif
+
{- Some characters like '[' are allowed in eg, the address of
- an uri, but cannot appear unescaped further along in the uri.
- This handles that, expensively, by successively escaping each character
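A minimal sketch of how downloadPartial combines with the html detection, mirroring its caller in Annex/YoutubeDl.hs (isUrlHtml is an illustrative helper, not part of this patch):

	isUrlHtml :: URLString -> UrlOptions -> IO Bool
	isUrlHtml url uo = do
		-- Nothing when the url fails to parse or the download fails.
		mbs <- downloadPartial url uo htmlPrefixLength
		return $ maybe False isHtmlBs mbs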
diff --git a/doc/bugs/regression_-_yt__58___prefix_for___34__regular__34___urls.mdwn b/doc/bugs/regression_-_yt__58___prefix_for___34__regular__34___urls.mdwn
index 0dd86d57f..9c74c6aeb 100644
--- a/doc/bugs/regression_-_yt__58___prefix_for___34__regular__34___urls.mdwn
+++ b/doc/bugs/regression_-_yt__58___prefix_for___34__regular__34___urls.mdwn
@@ -28,4 +28,4 @@ Some tests also failed related to our datalad archives git annex special remote
[[!meta author=yoh]]
-
+> [[fixed|done]] --[[Joey]]