module ListT.HTMLParser ( Parser, Error, ErrorDetails(..), run, -- * Parsers eoi, token, openingTag, closingTag, text, comment, html, properHTML, -- * Combinators many1, manyTill, skipTill, total, ) where import BasePrelude import MTLPrelude hiding (Error, shift) import Control.Monad.Trans.Either hiding (left, right) import ListT (ListT) import Data.Text (Text) import qualified Data.Text.Lazy.Builder as Text (Builder) import qualified Data.Text.Lazy.Builder as Text.Builder import qualified ListT as L import qualified HTMLTokenizer.Parser as HT import qualified ListT.HTMLParser.Renderer as Renderer -- | -- A backtracking HTML-tokens stream parser. newtype Parser m a = Parser { unwrap :: EitherT Error (StateT (ListT m HT.Token, [HT.Token]) m) a } deriving (Functor, Applicative, MonadError Error) -- | -- A possibly detailed parser error. -- When 'mzero' or 'empty' is used, an error value of 'Nothing' is produced. type Error = Maybe ErrorDetails data ErrorDetails = -- | A text message ErrorDetails_Message Text | -- | Unexpected token ErrorDetails_UnexpectedToken | -- | End of input ErrorDetails_EOI deriving (Show, Eq) instance Monad m => Monad (Parser m) where return = Parser . return (>>=) a b = Parser $ unwrap a >>= unwrap . b fail a = throwError $ Just $ ErrorDetails_Message $ fromString a instance Monad m => Alternative (Parser m) where empty = Parser $ EitherT $ return $ Left Nothing (<|>) a b = Parser $ EitherT $ StateT $ \(incoming, backtrack) -> do (result', (incoming', backtrack')) <- flip runStateT (incoming, []) $ runEitherT $ unwrap $ a (result'', (incoming'', backtrack'')) <- case result' of Left _ -> do flip runStateT (foldl' (flip L.cons) incoming' backtrack', []) $ runEitherT $ unwrap $ b Right result' -> do return (Right result', (incoming', backtrack')) return (result'', (incoming'', backtrack'' <> backtrack)) instance Monad m => MonadPlus (Parser m) where mzero = empty mplus = (<|>) -- | -- Run a parser on a stream of HTML tokens, -- consuming only as many as needed. run :: Monad m => Parser m a -> ListT m HT.Token -> m (Either Error a) run p l = flip evalStateT (l, []) $ runEitherT $ unwrap $ p -- | -- End of input. eoi :: Monad m => Parser m () eoi = token $> () <|> pure () -- | -- Any HTML token. token :: Monad m => Parser m HT.Token token = Parser $ EitherT $ StateT $ \(incoming, backtrack) -> liftM (maybe (Left (Just ErrorDetails_EOI), (incoming, backtrack)) (\(a, incoming') -> (Right a, (incoming', a : backtrack)))) $ L.uncons incoming -- | -- An opening tag. openingTag :: Monad m => Parser m HT.OpeningTag openingTag = token >>= \case HT.Token_OpeningTag x -> return x _ -> throwError (Just ErrorDetails_UnexpectedToken) -- | -- A closing tag. closingTag :: Monad m => Parser m HT.ClosingTag closingTag = token >>= \case HT.Token_ClosingTag x -> return x _ -> throwError (Just ErrorDetails_UnexpectedToken) -- | -- A text between tags with HTML-entities decoded. text :: Monad m => Parser m Text text = token >>= \case HT.Token_Text x -> return x _ -> throwError (Just ErrorDetails_UnexpectedToken) -- | -- Contents of a comment. comment :: Monad m => Parser m Text comment = token >>= \case HT.Token_Comment x -> return x _ -> throwError (Just ErrorDetails_UnexpectedToken) -- | -- Apply a parser at least one time. many1 :: Monad m => Parser m a -> Parser m [a] many1 a = (:) <$> a <*> many a -- | -- Apply a parser multiple times until another parser is satisfied. -- Returns results of both parsers. manyTill :: Monad m => Parser m a -> Parser m b -> Parser m ([a], b) manyTill a b = fix $ \loop -> ([],) <$> b <|> (\a (al, b) -> (a : al, b)) <$> a <*> loop -- | -- Skip any tokens until the provided parser is satisfied. skipTill :: Monad m => Parser m a -> Parser m a skipTill a = fix $ \loop -> a <|> (token *> loop) -- | -- Greedily consume all the input until the end, -- while running the provided parser. -- Same as: -- -- > theParser <* eoi total :: Monad m => Parser m a -> Parser m a total a = a <* eoi -- | -- The auto-repaired textual HTML representation of an HTML-tree node. -- -- Useful for consuming HTML-formatted snippets. -- -- E.g., when the following parser: -- -- > openingTag *> html -- -- is run against the following HTML snippet: -- -- > -- -- it'll produce the following text builder value: -- -- >
  • I'm not your friend, buddy!
  • -- -- If you want to consume all children of a node, -- it's recommended to use 'properHTML' in combination with 'many' or 'many1'. -- For details consult the docs on 'properHTML'. -- -- __This parser is smart and handles and repairs broken HTML__: -- -- * It repairs unclosed tags, -- interpreting them as closed singletons. -- E.g., @\@ will be consumed as @\@. -- -- * It handles orphan closing tags by ignoring them. -- E.g. it'll consume the input @\\<\/b\>\<\/a\>@ as @\\<\/a\>@. html :: Monad m => Parser m Text.Builder html = flip fmap cleanTokenSequence $ foldl' (flip mappend) mempty . map Renderer.token -- | -- Same as 'html', but fails if the input begins with an orphan closing tag. -- I.e., the input \"\<\/a\>\\<\/b\>\" will make this parser fail. -- -- This parser is particularly useful for consuming all children in the current context. -- E.g., running the following parser: -- -- > openingTag *> (mconcat <$> many properHTML) -- -- on the following input: -- -- >
      -- >
    • I'm not your friend, buddy!
    • -- >
    • I'm not your buddy, guy!
    • -- >
    • He's not your guy, friend!
    • -- >
    • I'm not your friend, buddy!
    • -- >
    -- -- will produce a merged text builder, which consists of the following nodes: -- -- >
  • I'm not your friend, buddy!
  • -- >
  • I'm not your buddy, guy!
  • -- >
  • He's not your guy, friend!
  • -- >
  • I'm not your friend, buddy!
  • -- -- Notice that unlike with 'html', it's safe to assume -- that it will not consume the following closing @\<\/ul\>@ tag, -- because it does not begin a valid HTML-tree node. -- -- Notice also that despite failing in case of the first broken token, -- this parser handles the broken tokens in other cases the same way as 'html'. properHTML :: Monad m => Parser m Text.Builder properHTML = cleanTokenSequence >>= \case [] -> throwError $ Just $ ErrorDetails_Message "Improper HTML node" l -> return $ foldl' (flip mappend) mempty $ map Renderer.token l cleanTokenSequence :: Monad m => Parser m [HT.Token] cleanTokenSequence = fmap (fmap (either id id)) $ flip execStateT [] $ fix $ \loop -> lift token >>= \case HT.Token_ClosingTag ct -> do ours <- state $ \list -> fromMaybe ([], list) $ do (l, r) <- Just $ flip break list $ \case Right (HT.Token_OpeningTag (n, _, False)) -> n == ct _ -> False (h, t) <- uncons r return (fmap (fmap closeOpeningTag) l <> [h], t) loop' <- bool loop (return ()) . null <$> get if null ours then loop' else do modify $ mappend $ (:) (Left (HT.Token_ClosingTag ct)) $ fmap (either Left Left) $ ours loop' t -> do modify $ (:) $ Right t loop where closeOpeningTag = \case HT.Token_OpeningTag (n, a, _) -> HT.Token_OpeningTag (n, a, True) x -> x