module Data.Microformats2.Parser.HtmlUtil (
HtmlContentMode (..)
, getInnerHtml
, getInnerHtmlSanitized
, getInnerTextRaw
, getInnerTextWithImgs
, getProcessedInnerHtml
, deduplicateElements
, unescapeHtml
) where
import Prelude.Compat
import Control.Applicative ((<|>))
import Control.Error.Util (hush)
import Data.Monoid.Compat
import qualified Data.Map as M
import qualified Data.Text as T
import Data.Text (Text)
import Data.Char (isSpace)
import Data.Foldable (asum)
import Data.Attoparsec.Text
import Data.Maybe
import Text.HTML.SanitizeXSS
import Text.HTML.TagSoup.Entity
import Network.URI
import Data.Microformats2.Parser.Util
resolveHrefSrc ∷ Maybe URI → Element → Element
resolveHrefSrc b e = e { elementAttributes = M.adjust (resolveURI b) "href" $ M.adjust (resolveURI b) "href" $ elementAttributes e }
processChildren ∷ (Node → Node) → Element → Element
processChildren f e = e { elementNodes = map f $ elementNodes e }
filterChildElements ∷ (Element → Bool) → Element → Element
filterChildElements f e = e { elementNodes = filter f' $ elementNodes e }
where f' (NodeContent _) = True
f' (NodeElement e') = f e'
f' _ = False
getInnerHtml ∷ Maybe URI → Element → Maybe Text
getInnerHtml b rootEl = Just $ renderInner processedRoot
where (NodeElement processedRoot) = processNode (NodeElement rootEl)
processNode (NodeContent c) = NodeContent c
processNode (NodeElement e) = NodeElement $ processChildren processNode $ resolveHrefSrc b e
processNode x = x
sanitizeAttrs ∷ Element → Element
sanitizeAttrs e = e { elementAttributes = M.fromList $ map wrapName $ mapMaybe modify $ M.toList $ elementAttributes e }
where modify (Name n _ _, val) = sanitizeAttribute (n, val)
wrapName (n, val) = (Name n Nothing Nothing, val)
getInnerHtmlSanitized ∷ Maybe URI → Element → Maybe Text
getInnerHtmlSanitized b rootEl = Just $ renderInner processedRoot
where (NodeElement processedRoot) = processNode (NodeElement rootEl)
processNode (NodeContent c) = NodeContent c
processNode (NodeElement e) = NodeElement $ processChildren processNode $ filterChildElements (safeTagName' . nameLocalName . elementName) $ resolveHrefSrc b $ sanitizeAttrs e
processNode x = x
getInnerTextRaw ∷ Element → Maybe Text
getInnerTextRaw rootEl = unless' (txt == Just "") txt
where txt = Just $ T.dropAround isSpace processedRoot
(NodeContent processedRoot) = processNode (NodeElement rootEl)
processNode (NodeContent c) = NodeContent $ escapeHtml c
processNode (NodeElement e) = NodeContent $ T.dropAround isSpace $ renderInner $ processChildren processNode $ filterChildElements (safeTagName' . nameLocalName . elementName) e
processNode x = x
getInnerTextWithImgs ∷ Element → Maybe Text
getInnerTextWithImgs rootEl = unless' (txt == Just "") txt
where txt = Just $ T.dropAround isSpace processedRoot
(NodeContent processedRoot) = processNode (NodeElement rootEl)
processNode (NodeContent c) = NodeContent $ escapeHtml c
processNode (NodeElement e) | nameLocalName (elementName e) == "img" = NodeContent $ fromMaybe "" $ asum [ e ^. attribute "alt", e ^. attribute "src" ]
processNode (NodeElement e) = NodeContent $ T.dropAround isSpace $ renderInner $ processChildren processNode $ filterChildElements (safeTagName' . nameLocalName . elementName) e
processNode x = x
data HtmlContentMode = Unsafe | Escape | Sanitize
deriving (Show, Eq)
getProcessedInnerHtml ∷ HtmlContentMode → Maybe URI → Element → Maybe Text
getProcessedInnerHtml Unsafe b e = getInnerHtml b e
getProcessedInnerHtml Escape b e = escapeHtml <$> getInnerHtml b e
getProcessedInnerHtml Sanitize b e = getInnerHtmlSanitized b e
deduplicateElements ∷ [Element] → [Element]
deduplicateElements es = filter (not . isNested) es
where isNested e = any (\e' → e `elem` filter (/= e') (e' ^.. entire)) es
escapeHtml ∷ Text → Text
escapeHtml = T.replace "<" "<" . T.replace ">" ">" . T.replace "&" "&"
unescapeHtml ∷ Text → Text
unescapeHtml x = fromMaybe x $ T.concat <$> hush (parseOnly p x)
where p = many1 $ takeWhile1 (/= '&') <|> pent
pent = do
_ ← char '&'
ent ← takeWhile1 (/= ';')
_ ← char ';'
return $ fromMaybe ("&" <> ent <> ";") $ T.pack <$> lookupEntity (T.unpack ent)
safeTagName' ∷ Text → Bool
safeTagName' x = safeTagName x || "-" `T.isInfixOf` x