module ListT.HTMLParser
(
Parser,
Error,
ErrorDetails(..),
run,
eoi,
token,
rawToken,
space,
openingTag,
closingTag,
text,
comment,
html,
properHTML,
xmlNode,
many1,
manyTill,
skipTill,
total,
)
where
import BasePrelude
import MTLPrelude hiding (Error, shift)
import Control.Monad.Trans.Either hiding (left, right)
import ListT (ListT)
import Data.Text (Text)
import Conversion
import Conversion.Text
import qualified Data.Text as Text
import qualified Data.Text.Lazy.Builder as Text (Builder)
import qualified Data.Text.Lazy.Builder as Text.Builder
import qualified ListT as L
import qualified HTMLTokenizer as HT
import qualified HTMLEntities.Decoder
import qualified ListT.HTMLParser.Renderer as Renderer
import qualified ListT.HTMLParser.XML as XML
import qualified Data.XML.Types as XMLTypes
newtype Parser m a =
Parser { unwrap :: EitherT Error (StateT (ListT m HT.Token, [HT.Token]) m) a }
deriving (Functor, Applicative, MonadError Error)
type Error =
Maybe ErrorDetails
data ErrorDetails =
ErrorDetails_Message Text |
ErrorDetails_UnexpectedToken |
ErrorDetails_EOI
deriving (Show, Eq)
instance Monad m => Monad (Parser m) where
return =
Parser . return
(>>=) a b =
Parser $ unwrap a >>= unwrap . b
fail a =
throwError $ Just $ ErrorDetails_Message $ fromString a
instance Monad m => Alternative (Parser m) where
empty =
Parser $ EitherT $ return $ Left Nothing
(<|>) a b =
Parser $ EitherT $ StateT $ \(incoming, backtrack) -> do
(result', (incoming', backtrack')) <- flip runStateT (incoming, []) $ runEitherT $ unwrap $ a
(result'', (incoming'', backtrack'')) <-
case result' of
Left _ -> do
flip runStateT (foldl' (flip L.cons) incoming' backtrack', []) $ runEitherT $ unwrap $ b
Right result' -> do
return (Right result', (incoming', backtrack'))
return (result'', (incoming'', backtrack'' <> backtrack))
instance Monad m => MonadPlus (Parser m) where
mzero = empty
mplus = (<|>)
run :: Monad m => Parser m a -> ListT m HT.Token -> m (Either Error a)
run p l =
flip evalStateT (l, []) $ runEitherT $ unwrap $ p
rawToken :: Monad m => Parser m HT.Token
rawToken =
Parser $ EitherT $ StateT $ \(incoming, backtrack) ->
liftM (maybe (Left (Just ErrorDetails_EOI), (incoming, backtrack))
(\(a, incoming') -> (Right a, (incoming', a : backtrack)))) $
L.uncons incoming
token :: Monad m => Parser m HT.Token
token =
rawToken >>= \case
HT.Token_Text x -> Text.strip x & \x -> if Text.null x
then token
else return $ HT.Token_Text $ convert $ decode x
HT.Token_Comment x -> return $ HT.Token_Comment $ convert $ decode x
HT.Token_OpeningTag (name, attrs, closed) ->
return $ HT.Token_OpeningTag $ (name, ((fmap . fmap . fmap) (convert . decode) attrs), closed)
x -> return x
where
decode = HTMLEntities.Decoder.htmlEncodedText
space :: Monad m => Parser m Text
space =
rawToken >>= \case
HT.Token_Text x | Text.all isSpace x -> return x
_ -> throwError (Just ErrorDetails_UnexpectedToken)
eoi :: Monad m => Parser m ()
eoi =
rawToken $> () <|> pure ()
openingTag :: Monad m => Parser m HT.OpeningTag
openingTag =
token >>= \case
HT.Token_OpeningTag x -> return x
_ -> throwError (Just ErrorDetails_UnexpectedToken)
closingTag :: Monad m => Parser m HT.Identifier
closingTag =
token >>= \case
HT.Token_ClosingTag x -> return x
_ -> throwError (Just ErrorDetails_UnexpectedToken)
text :: Monad m => Parser m Text
text =
token >>= \case
HT.Token_Text x -> return x
_ -> throwError (Just ErrorDetails_UnexpectedToken)
comment :: Monad m => Parser m Text
comment =
token >>= \case
HT.Token_Comment x -> return x
_ -> throwError (Just ErrorDetails_UnexpectedToken)
many1 :: Monad m => Parser m a -> Parser m [a]
many1 a =
(:) <$> a <*> many a
manyTill :: Monad m => Parser m a -> Parser m b -> Parser m ([a], b)
manyTill a b =
fix $ \loop ->
([],) <$> b <|>
(\a (al, b) -> (a : al, b)) <$> a <*> loop
skipTill :: Monad m => Parser m a -> Parser m a
skipTill a =
fix $ \loop ->
a <|> (rawToken *> loop)
total :: Monad m => Parser m a -> Parser m a
total a =
a <* eoi
html :: Monad m => Parser m Text.Builder
html =
flip fmap cleanTokenSequence $
foldl' (flip mappend) mempty . map Renderer.token
properHTML :: Monad m => Parser m Text.Builder
properHTML =
cleanTokenSequence >>= \case
[] -> throwError $ Just $ ErrorDetails_Message "Improper HTML node"
l -> return $ foldl' (flip mappend) mempty $ map Renderer.token l
xmlNode :: Monad m => Parser m XMLTypes.Node
xmlNode =
cleanTokenSequence >>= \case
[] -> throwError $ Just $ ErrorDetails_Message "Improper HTML node"
tokens -> XML.run XML.node (reverse tokens) & \case
Just node -> return node
cleanTokenSequence :: Monad m => Parser m [HT.Token]
cleanTokenSequence =
fmap (fmap (either id id)) $
flip execStateT [] $ fix $ \loop -> lift rawToken >>= \case
HT.Token_Doctype _ -> return ()
HT.Token_ClosingTag ct -> do
ours <- state $ \list -> fromMaybe ([], list) $ do
(l, r) <- Just $ flip break list $ \case
Right (HT.Token_OpeningTag (n, _, False)) -> n == ct
_ -> False
(h, t) <- uncons r
return (fmap (fmap closeOpeningTag) l <> [h], t)
loop' <- bool loop (return ()) . null <$> get
if null ours
then loop'
else do
modify $ mappend $ (:) (Left (HT.Token_ClosingTag ct)) $ fmap (either Left Left) $ ours
loop'
t@(HT.Token_OpeningTag _) -> do
modify $ (:) $ Right t
loop
t -> do
context <- get
modify $ (:) $ Right t
if null context
then return ()
else loop
where
closeOpeningTag =
\case
HT.Token_OpeningTag (n, a, _) -> HT.Token_OpeningTag (n, a, True)
x -> x