{-# LANGUAGE Safe, NoImplicitPrelude, OverloadedStrings, UnicodeSyntax, RankNTypes #-} module Data.Microformats2.Parser.HtmlUtil ( HtmlContentMode (..) , getInnerHtml , getInnerHtmlSanitized , getInnerTextRaw , getInnerTextWithImgs , getProcessedInnerHtml , deduplicateElements , unescapeHtml ) where import Prelude.Compat import Control.Applicative ((<|>)) import Control.Error.Util (hush) import Data.Monoid.Compat import qualified Data.Map as M import qualified Data.Text as T import Data.Text (Text) import Data.Char (isSpace) import Data.Foldable (asum) import Data.Attoparsec.Text import Data.Maybe import Text.HTML.SanitizeXSS import Text.HTML.TagSoup.Entity import Network.URI import Data.Microformats2.Parser.Util resolveHrefSrc ∷ Maybe URI → Element → Element resolveHrefSrc b e = e { elementAttributes = M.adjust (resolveURI b) "href" $ M.adjust (resolveURI b) "href" $ elementAttributes e } processChildren ∷ (Node → Node) → Element → Element processChildren f e = e { elementNodes = map f $ elementNodes e } filterChildElements ∷ (Element → Bool) → Element → Element filterChildElements f e = e { elementNodes = filter f' $ elementNodes e } where f' (NodeContent _) = True f' (NodeElement e') = f e' f' _ = False getInnerHtml ∷ Maybe URI → Element → Maybe Text getInnerHtml b rootEl = Just $ renderInner processedRoot where (NodeElement processedRoot) = processNode (NodeElement rootEl) processNode (NodeContent c) = NodeContent c processNode (NodeElement e) = NodeElement $ processChildren processNode $ resolveHrefSrc b e processNode x = x sanitizeAttrs ∷ Element → Element sanitizeAttrs e = e { elementAttributes = M.fromList $ map wrapName $ mapMaybe modify $ M.toList $ elementAttributes e } where modify (Name n _ _, val) = sanitizeAttribute (n, val) wrapName (n, val) = (Name n Nothing Nothing, val) getInnerHtmlSanitized ∷ Maybe URI → Element → Maybe Text getInnerHtmlSanitized b rootEl = Just $ renderInner processedRoot where (NodeElement processedRoot) = processNode (NodeElement rootEl) processNode (NodeContent c) = NodeContent c processNode (NodeElement e) = NodeElement $ processChildren processNode $ filterChildElements (safeTagName' . nameLocalName . elementName) $ resolveHrefSrc b $ sanitizeAttrs e processNode x = x getInnerTextRaw ∷ Element → Maybe Text getInnerTextRaw rootEl = unless' (txt == Just "") txt where txt = Just $ T.dropAround isSpace processedRoot (NodeContent processedRoot) = processNode (NodeElement rootEl) processNode (NodeContent c) = NodeContent $ escapeHtml c processNode (NodeElement e) = NodeContent $ T.dropAround isSpace $ renderInner $ processChildren processNode $ filterChildElements (safeTagName' . nameLocalName . elementName) e processNode x = x getInnerTextWithImgs ∷ Element → Maybe Text getInnerTextWithImgs rootEl = unless' (txt == Just "") txt where txt = Just $ T.dropAround isSpace processedRoot (NodeContent processedRoot) = processNode (NodeElement rootEl) processNode (NodeContent c) = NodeContent $ escapeHtml c processNode (NodeElement e) | nameLocalName (elementName e) == "img" = NodeContent $ fromMaybe "" $ asum [ e ^. attribute "alt", e ^. attribute "src" ] processNode (NodeElement e) = NodeContent $ T.dropAround isSpace $ renderInner $ processChildren processNode $ filterChildElements (safeTagName' . nameLocalName . elementName) e processNode x = x data HtmlContentMode = Unsafe | Escape | Sanitize deriving (Show, Eq) getProcessedInnerHtml ∷ HtmlContentMode → Maybe URI → Element → Maybe Text getProcessedInnerHtml Unsafe b e = getInnerHtml b e getProcessedInnerHtml Escape b e = escapeHtml <$> getInnerHtml b e getProcessedInnerHtml Sanitize b e = getInnerHtmlSanitized b e deduplicateElements ∷ [Element] → [Element] deduplicateElements es = filter (not . isNested) es where isNested e = any (\e' → e `elem` filter (/= e') (e' ^.. entire)) es -- not the fastest function I guess... escapeHtml ∷ Text → Text escapeHtml = T.replace "<" "<" . T.replace ">" ">" . T.replace "&" "&" unescapeHtml ∷ Text → Text unescapeHtml x = fromMaybe x $ T.concat <$> hush (parseOnly p x) where p = many1 $ takeWhile1 (/= '&') <|> pent pent = do _ ← char '&' ent ← takeWhile1 (/= ';') _ ← char ';' return $ fromMaybe ("&" <> ent <> ";") $ T.pack <$> lookupEntity (T.unpack ent) safeTagName' ∷ Text → Bool safeTagName' x = safeTagName x || "-" `T.isInfixOf` x