diff options
Diffstat (limited to 'Utility/HtmlDetect.hs')
-rw-r--r-- | Utility/HtmlDetect.hs | 35 |
1 files changed, 35 insertions, 0 deletions
diff --git a/Utility/HtmlDetect.hs b/Utility/HtmlDetect.hs new file mode 100644 index 000000000..ca516e960 --- /dev/null +++ b/Utility/HtmlDetect.hs @@ -0,0 +1,35 @@ +{- html detection + - + - Copyright 2017 Joey Hess <id@joeyh.name> + - + - License: BSD-2-clause + -} + +module Utility.HtmlDetect where + +import Text.HTML.TagSoup +import Data.Char + +-- | Detect if a string is a html document. +-- +-- The document many not be valid, and will still be detected as html, +-- as long as it starts with a "<html>" or "<!DOCTYPE html>" tag. +-- +-- Html fragments like "<p>this</p>" are not detected as being html, +-- although some browsers may chose to render them as html. +isHtml :: String -> Bool +isHtml = evaluate . canonicalizeTags . parseTags . truncate + where + -- We only care about the beginning of the file, + -- so although tagsoup parses lazily anyway, truncate it. + truncate = take 16384 + evaluate (TagOpen "!DOCTYPE" ((t, _):_):_) = map toLower t == "html" + evaluate (TagOpen "html" _:_) = True + -- Allow some leading whitespace before the tag. + evaluate (TagText t:rest) + | all isSpace t = evaluate rest + | otherwise = False + -- It would be pretty weird to have a html comment before the html + -- tag, but easy to allow for. + evaluate (TagComment _:rest) = evaluate rest + evaluate _ = False |