{-# LANGUAGE OverloadedStrings, UnicodeSyntax #-} {-# LANGUAGE CPP, RankNTypes #-} module Data.Microformats2.Parser.HtmlUtil ( HtmlContentMode (..) , getInnerHtml , getInnerHtmlSanitized , getInnerTextRaw , getInnerTextWithImgs , getProcessedInnerHtml , deduplicateElements ) where #if __GLASGOW_HASKELL__ < 709 import Control.Applicative #endif import qualified Data.Map as M import qualified Data.Text as T import qualified Data.Text.Lazy as TL import Data.Text (Text) import Data.Foldable (asum) import Data.Maybe import Text.Blaze import Text.Blaze.Renderer.Text import Text.HTML.SanitizeXSS import Text.XML.Lens hiding (re) import Data.Microformats2.Parser.Util getPrism ∷ Prism' Node Text → Element → Maybe Text getPrism t e = Just . T.strip <$> T.concat $ e ^.. nodes . traverse . t _InnerHtml ∷ Prism' Node Text _InnerHtml = prism' NodeContent $ \s → case s of NodeContent c → Just $ collapseWhitespace c NodeElement e → Just . TL.toStrict . renderMarkup . toMarkup $ e _ → Nothing getInnerHtml ∷ Element → Maybe Text getInnerHtml = getPrism _InnerHtml sanitizeAttrs ∷ Element → Element sanitizeAttrs e = e { elementAttributes = M.fromList $ map wrapName $ mapMaybe modify $ M.toList $ elementAttributes e } where modify (Name n _ _, val) = sanitizeAttribute (n, val) wrapName (n, val) = (Name n Nothing Nothing, val) _InnerHtmlSanitized ∷ Prism' Node Text _InnerHtmlSanitized = prism' NodeContent $ \s → case s of NodeContent c → Just $ collapseWhitespace c NodeElement e → if' (safeTagName $ nameLocalName (elementName e)) $ Just . TL.toStrict . renderMarkup . toMarkup $ sanitizeAttrs e _ → Nothing getInnerHtmlSanitized ∷ Element → Maybe Text getInnerHtmlSanitized = getPrism _InnerHtmlSanitized _InnerTextRaw ∷ Prism' Node Text _InnerTextRaw = prism' NodeContent $ \s → case s of NodeContent c → Just . collapseWhitespace $ c NodeElement e → Just . collapseWhitespace . TL.toStrict . renderMarkup . contents . toMarkup $ e _ → Nothing _InnerTextWithImgs ∷ Prism' Node Text _InnerTextWithImgs = prism' NodeContent $ \s → case s of NodeContent c → Just $ collapseWhitespace c NodeElement e → if nameLocalName (elementName e) == "img" then asum [ e ^. attribute "alt", e ^. attribute "src" ] else Just . collapseWhitespace . TL.toStrict . renderMarkup . contents . toMarkup $ e _ → Nothing getInnerTextRaw ∷ Element → Maybe Text getInnerTextRaw e = unless' (txt == Just "") txt where txt = getPrism _InnerTextRaw e getInnerTextWithImgs ∷ Element → Maybe Text getInnerTextWithImgs e = unless' (txt == Just "") txt where txt = getPrism _InnerTextWithImgs e data HtmlContentMode = Unsafe | Escape | Sanitize deriving (Show, Eq) getProcessedInnerHtml ∷ HtmlContentMode → Element → Maybe Text getProcessedInnerHtml Unsafe e = getInnerHtml e getProcessedInnerHtml Escape e = (T.replace "<" "<" . T.replace ">" ">" . T.replace "&" "&") <$> getInnerHtml e getProcessedInnerHtml Sanitize e = getInnerHtmlSanitized e deduplicateElements ∷ [Element] → [Element] deduplicateElements es = filter (not . isNested) es where isNested e = any (\e' → e `elem` filter (/= e') (e' ^.. entire)) es -- not the fastest function I guess...