{-# LANGUAGE OverloadedStrings #-} -- | Parse an HTML document into xml-conduit's Document. -- -- Assumes UTF-8 encoding. module Yesod.Test.HtmlParse ( parseHtml ) where import Text.HTML.TagStream import qualified Data.ByteString as S import qualified Data.ByteString.Lazy as L import Text.XML import Data.Conduit import qualified Data.Conduit.List as CL import Data.Functor.Identity (runIdentity) import Control.Monad.Trans.Resource (runExceptionT) import Data.XML.Types (Event (..), Content (ContentText)) import Control.Arrow ((***)) import Data.Text.Encoding (decodeUtf8With) import Data.Text.Encoding.Error (lenientDecode) import qualified Data.Set as Set parseHtml :: L.ByteString -> Either String Document parseHtml lbs = either (Left . show) Right $ runIdentity $ runExceptionT $ CL.sourceList (L.toChunks lbs) $$ tokenStream =$ (CL.concatMap toEvent =$ fromEvents) toEvent :: Token -> [Event] toEvent (TagOpen bsname bsattrs isClose') = EventBeginElement name attrs : if isClose then [EventEndElement name] else [] where name = toName bsname attrs = map (toName *** (return . ContentText . decodeUtf8With lenientDecode)) bsattrs isClose = isClose' || isVoid bsname toEvent (TagClose bsname) = [EventEndElement $ toName bsname] toEvent (Text bs) = [EventContent $ ContentText $ decodeUtf8With lenientDecode bs] toEvent (Comment bs) = [EventComment $ decodeUtf8With lenientDecode bs] toEvent Special{} = [] toEvent Incomplete{} = [] toName :: S.ByteString -> Name toName bs = Name (decodeUtf8With lenientDecode bs) Nothing Nothing isVoid :: S.ByteString -> Bool isVoid = flip Set.member $ Set.fromList [ "area" , "base" , "br" , "col" , "command" , "embed" , "hr" , "img" , "input" , "keygen" , "link" , "meta" , "param" , "source" , "track" , "wbr" ]