fix regression in addurl --file caused by youtube-dl support

Now youtubeDlCheck downloads the beginning of the url's content and checks if it's html, only when it is does it pass it off the youtube-dl to check if it supports it. This means more work is done for urls that youtube-dl does support, but is probably more efficient for other urls, since it only downloads the first chunk of content, while youtube-dl probably downloads more. As well as the reported bug, this also fixes behavior when an url was added with youtube-dl, but the url content has now changed from a html page to something else. Remote.Web.checkKey used to wrongly succeed in that situation, since youtube-dl said sure it can download that something else. This commit was supported by the NSF-funded DataLad project.
author: Joey Hess <joeyh@joeyh.name> 2017-12-06 13:16:06 -0400
committer: Joey Hess <joeyh@joeyh.name> 2017-12-06 13:22:31 -0400
commit: 9ec6bdfb526fa6b75a264b6417b24aa7f01adc25 (patch)
tree: fc6d7acac70a1835117e3f3c5296b71f1ebf7970 /Utility/HtmlDetect.hs
parent: 30671447e071cee943701c8e9d72571ce2d6699d (diff)
1 files changed, 18 insertions, 7 deletions
diff --git a/Utility/HtmlDetect.hs b/Utility/HtmlDetect.hs
index 57a56c95f..bf0839e9e 100644
--- a/Utility/HtmlDetect.hs
+++ b/Utility/HtmlDetect.hs
@@ -9,20 +9,20 @@ module Utility.HtmlDetect where
 
 import Text.HTML.TagSoup
 import Data.Char
+import qualified Data.ByteString.Lazy as B
+import qualified Data.ByteString.Lazy.Char8 as B8
 
--- | Detect if a string is a html document.
+-- | Detect if a String is a html document.
 --
--- The document many not be valid, and will still be detected as html,
--- as long as it starts with a "<html>" or "<!DOCTYPE html>" tag.
+-- The document many not be valid, or may be truncated, and will
+-- still be detected as html, as long as it starts with a
+-- "<html>" or "<!DOCTYPE html>" tag.
 --
 -- Html fragments like "<p>this</p>" are not detected as being html,
 -- although some browsers may chose to render them as html.
 isHtml :: String -> Bool
-isHtml = evaluate . canonicalizeTags . parseTags . shorten
+isHtml = evaluate . canonicalizeTags . parseTags . take htmlPrefixLength
   where
-	-- We only care about the beginning of the file,
-	-- so although tagsoup parses lazily anyway, truncate it.
-	shorten = take 16384
 	evaluate (TagOpen "!DOCTYPE" ((t, _):_):_) = map toLower t == "html"
 	evaluate (TagOpen "html" _:_) = True
 	-- Allow some leading whitespace before the tag.
@@ -33,3 +33,14 @@ isHtml = evaluate . canonicalizeTags . parseTags . shorten
 	-- tag, but easy to allow for.
 	evaluate (TagComment _:rest) = evaluate rest
 	evaluate _ = False
+
+-- | Detect if a ByteString is a html document.
+isHtmlBs :: B.ByteString -> Bool
+-- The encoding of the ByteString is not known, but isHtml only
+-- looks for ascii strings.
+isHtmlBs = isHtml . B8.unpack
+
+-- | How much of the beginning of a html document is needed to detect it.
+-- (conservatively)
+htmlPrefixLength :: Int
+htmlPrefixLength = 8192
author	Joey Hess <joeyh@joeyh.name>	2017-12-06 13:16:06 -0400
committer	Joey Hess <joeyh@joeyh.name>	2017-12-06 13:22:31 -0400
commit	9ec6bdfb526fa6b75a264b6417b24aa7f01adc25 (patch)
tree	fc6d7acac70a1835117e3f3c5296b71f1ebf7970 /Utility/HtmlDetect.hs
parent	30671447e071cee943701c8e9d72571ce2d6699d (diff)