summaryrefslogtreecommitdiff
path: root/Utility
diff options
context:
space:
mode:
Diffstat (limited to 'Utility')
-rw-r--r--Utility/HtmlDetect.hs35
1 files changed, 35 insertions, 0 deletions
diff --git a/Utility/HtmlDetect.hs b/Utility/HtmlDetect.hs
new file mode 100644
index 000000000..ca516e960
--- /dev/null
+++ b/Utility/HtmlDetect.hs
@@ -0,0 +1,35 @@
+{- html detection
+ -
+ - Copyright 2017 Joey Hess <id@joeyh.name>
+ -
+ - License: BSD-2-clause
+ -}
+
+module Utility.HtmlDetect where
+
+import Text.HTML.TagSoup
+import Data.Char
+
+-- | Detect if a string is a html document.
+--
+-- The document many not be valid, and will still be detected as html,
+-- as long as it starts with a "<html>" or "<!DOCTYPE html>" tag.
+--
+-- Html fragments like "<p>this</p>" are not detected as being html,
+-- although some browsers may chose to render them as html.
+isHtml :: String -> Bool
+isHtml = evaluate . canonicalizeTags . parseTags . truncate
+ where
+ -- We only care about the beginning of the file,
+ -- so although tagsoup parses lazily anyway, truncate it.
+ truncate = take 16384
+ evaluate (TagOpen "!DOCTYPE" ((t, _):_):_) = map toLower t == "html"
+ evaluate (TagOpen "html" _:_) = True
+ -- Allow some leading whitespace before the tag.
+ evaluate (TagText t:rest)
+ | all isSpace t = evaluate rest
+ | otherwise = False
+ -- It would be pretty weird to have a html comment before the html
+ -- tag, but easy to allow for.
+ evaluate (TagComment _:rest) = evaluate rest
+ evaluate _ = False