diff options
-rw-r--r-- | Utility/HtmlDetect.hs | 35 | ||||
-rw-r--r-- | debian/control | 1 | ||||
-rw-r--r-- | doc/todo/switch_from_quvi_to_youtube-dl.mdwn | 8 | ||||
-rw-r--r-- | git-annex.cabal | 2 |
4 files changed, 46 insertions, 0 deletions
diff --git a/Utility/HtmlDetect.hs b/Utility/HtmlDetect.hs new file mode 100644 index 000000000..ca516e960 --- /dev/null +++ b/Utility/HtmlDetect.hs @@ -0,0 +1,35 @@ +{- html detection + - + - Copyright 2017 Joey Hess <id@joeyh.name> + - + - License: BSD-2-clause + -} + +module Utility.HtmlDetect where + +import Text.HTML.TagSoup +import Data.Char + +-- | Detect if a string is a html document. +-- +-- The document many not be valid, and will still be detected as html, +-- as long as it starts with a "<html>" or "<!DOCTYPE html>" tag. +-- +-- Html fragments like "<p>this</p>" are not detected as being html, +-- although some browsers may chose to render them as html. +isHtml :: String -> Bool +isHtml = evaluate . canonicalizeTags . parseTags . truncate + where + -- We only care about the beginning of the file, + -- so although tagsoup parses lazily anyway, truncate it. + truncate = take 16384 + evaluate (TagOpen "!DOCTYPE" ((t, _):_):_) = map toLower t == "html" + evaluate (TagOpen "html" _:_) = True + -- Allow some leading whitespace before the tag. + evaluate (TagText t:rest) + | all isSpace t = evaluate rest + | otherwise = False + -- It would be pretty weird to have a html comment before the html + -- tag, but easy to allow for. + evaluate (TagComment _:rest) = evaluate rest + evaluate _ = False diff --git a/debian/control b/debian/control index b34a79002..9bbe4bade 100644 --- a/debian/control +++ b/debian/control @@ -25,6 +25,7 @@ Build-Depends: libghc-dlist-dev, libghc-uuid-dev, libghc-aeson-dev, + libghc-tagsoup-dev, libghc-unordered-containers-dev, libghc-ifelse-dev, libghc-bloomfilter-dev, diff --git a/doc/todo/switch_from_quvi_to_youtube-dl.mdwn b/doc/todo/switch_from_quvi_to_youtube-dl.mdwn index cfdd8a8a6..82d61804a 100644 --- a/doc/todo/switch_from_quvi_to_youtube-dl.mdwn +++ b/doc/todo/switch_from_quvi_to_youtube-dl.mdwn @@ -23,6 +23,14 @@ Both of those changes would need changes to user's workflows and cron jobs. git-annex could keep supporting quvi for some time, and warn when it uses quvi, to help with the transition. +> Alternatively, git-annex addurl could download the url first, and then +> check the file to see if it looks like html. If so, run youtube-dl (which +> unfortunately has to download it again) and see if it manages to rip +> media from it. This way, addurl of non-html files does not have extra +> overhead, and the redundant download is fairly small compared to ripping +> the media. Only the unusual case where addurl is being used on html that +> does not contain media becomes more expensive. + Another gotcha is playlists. youtube-dl downloads playlists automatically. But, git-annex needs to record an url that downloads a single file so that `git annex get` works right. So, playlists will need to be disabled when diff --git a/git-annex.cabal b/git-annex.cabal index 5d46caed3..780961d88 100644 --- a/git-annex.cabal +++ b/git-annex.cabal @@ -347,6 +347,7 @@ Executable git-annex persistent, persistent-template, aeson, + tagsoup, unordered-containers, feed (>= 0.3.9), regex-tdfa, @@ -1001,6 +1002,7 @@ Executable git-annex Utility.Glob Utility.Gpg Utility.Hash + Utility.HtmlDetect Utility.HumanNumber Utility.HumanTime Utility.InodeCache |