module Data.Microformats2.Parser.HtmlUtil (
HtmlContentMode (..)
, getInnerHtml
, getInnerHtmlSanitized
, getInnerTextRaw
, getInnerTextWithImgs
, getProcessedInnerHtml
, deduplicateElements
) where
#if __GLASGOW_HASKELL__ < 709
import Control.Applicative
#endif
import qualified Data.Map as M
import qualified Data.Text as T
import qualified Data.Text.Lazy as TL
import Data.Text (Text)
import Data.Foldable (asum)
import Data.Maybe
import Text.Blaze
import Text.Blaze.Renderer.Text
import Text.HTML.SanitizeXSS
import Text.XML.Lens hiding (re)
import Data.Microformats2.Parser.Util
getPrism ∷ Prism' Node Text → Element → Maybe Text
getPrism t e = Just . T.strip <$> T.concat $ e ^.. nodes . traverse . t
_InnerHtml ∷ Prism' Node Text
_InnerHtml = prism' NodeContent $ \s → case s of
NodeContent c → Just $ collapseWhitespace c
NodeElement e → Just . TL.toStrict . renderMarkup . toMarkup $ e
_ → Nothing
getInnerHtml ∷ Element → Maybe Text
getInnerHtml = getPrism _InnerHtml
sanitizeAttrs ∷ Element → Element
sanitizeAttrs e = e { elementAttributes = M.fromList $ map wrapName $ mapMaybe modify $ M.toList $ elementAttributes e }
where modify (Name n _ _, val) = sanitizeAttribute (n, val)
wrapName (n, val) = (Name n Nothing Nothing, val)
_InnerHtmlSanitized ∷ Prism' Node Text
_InnerHtmlSanitized = prism' NodeContent $ \s → case s of
NodeContent c → Just $ collapseWhitespace c
NodeElement e → if' (safeTagName $ nameLocalName (elementName e)) $
Just . TL.toStrict . renderMarkup . toMarkup $ sanitizeAttrs e
_ → Nothing
getInnerHtmlSanitized ∷ Element → Maybe Text
getInnerHtmlSanitized = getPrism _InnerHtmlSanitized
_InnerTextRaw ∷ Prism' Node Text
_InnerTextRaw = prism' NodeContent $ \s → case s of
NodeContent c → Just . collapseWhitespace $ c
NodeElement e → Just . collapseWhitespace . TL.toStrict . renderMarkup . contents . toMarkup $ e
_ → Nothing
_InnerTextWithImgs ∷ Prism' Node Text
_InnerTextWithImgs = prism' NodeContent $ \s → case s of
NodeContent c → Just $ collapseWhitespace c
NodeElement e → if nameLocalName (elementName e) == "img"
then asum [ e ^. attribute "alt", e ^. attribute "src" ]
else Just . collapseWhitespace . TL.toStrict . renderMarkup . contents . toMarkup $ e
_ → Nothing
getInnerTextRaw ∷ Element → Maybe Text
getInnerTextRaw e = unless' (txt == Just "") txt
where txt = getPrism _InnerTextRaw e
getInnerTextWithImgs ∷ Element → Maybe Text
getInnerTextWithImgs e = unless' (txt == Just "") txt
where txt = getPrism _InnerTextWithImgs e
data HtmlContentMode = Unsafe | Escape | Sanitize
deriving (Show, Eq)
getProcessedInnerHtml ∷ HtmlContentMode → Element → Maybe Text
getProcessedInnerHtml Unsafe e = getInnerHtml e
getProcessedInnerHtml Escape e = (T.replace "<" "<" . T.replace ">" ">" . T.replace "&" "&") <$> getInnerHtml e
getProcessedInnerHtml Sanitize e = getInnerHtmlSanitized e
deduplicateElements ∷ [Element] → [Element]
deduplicateElements es = filter (not . isNested) es
where isNested e = any (\e' → e `elem` filter (/= e') (e' ^.. entire)) es