{-# LANGUAGE CPP #-} {-# LANGUAGE FlexibleInstances #-} {-# LANGUAGE FunctionalDependencies #-} {-# LANGUAGE KindSignatures #-} {-# LANGUAGE UndecidableInstances #-} {-| Description: Generic lazy parsers for transforming input. Copyright: (c) 2020 Sam May License: MPL-2.0 Maintainer: ag.eitilt@gmail.com Stability: experimental Portability: portable The existing parsing libraries are wonderful, but backtracking parsers have a bad habit of being strict in their output; sure, you might be able to operate over "Data.ByteString.Lazy", but they all expect to consume their entire input before handing you their result. "Data.Attoparsec"'s continuations fully lean into that---even though you don't have to provide all the input in one block, you can't get a value before closing it out. [Text.Megaparsec] (https://hackage.haskell.org/package/megaparsec/docs/Text-Megaparsec.html) does provide a reentrant form in [runParser'] (https://hackage.haskell.org/package/megaparsec/docs/Text-Megaparsec.html#v:runParser-39-), but it also comes with comparatively heavyweight error and pretty-printing features. For complicated formats, those all can indeed be desirable. However, the HTML algorithms have been optimized for minimal lookahead and certainly no output revocation---once something is shipped out, it's not going to be called back. Not taking advantage of that by using a lazy output type means that parsing would always be subject to the whims of slow or unreliable network connections. Moreover, the entire complexity of the parsing algorithm is built around never reaching a fatal failure condition, so error handling and especially recovery are unnecessary overhead. And so, a custom parsing framework must be defined. -} module Web.Willow.Common.Parser ( -- * Concrete types Parser , runParser , ParserT ( .. ) , StateParser -- * Parsing combinators , MonadParser ( .. ) , end , satisfying , token , chunk #if MIN_VERSION_base(4,11,0) #else , (<>) #endif -- * Supporting typeclasses , Stream ( .. ) ) where import qualified Control.Applicative as A import qualified Control.Monad as N import qualified Control.Monad.Cont.Class as N.C import qualified Control.Monad.Except as N.X import qualified Control.Monad.Fail as N.F import qualified Control.Monad.Fix as N.FX import qualified Control.Monad.IO.Class as N.IO import qualified Control.Monad.Reader as N.R import qualified Control.Monad.RWS.Lazy as N.T.L import qualified Control.Monad.RWS.Strict as N.T.S import qualified Control.Monad.State.Class as N.S import qualified Control.Monad.State.Lazy as N.S.L import qualified Control.Monad.State.Strict as N.S.S #if MIN_VERSION_transformers(0,5,3) import qualified Control.Monad.Trans.Accum as N.A #endif import qualified Control.Monad.Trans.Class as N.T import qualified Control.Monad.Trans.Identity as N.I import qualified Control.Monad.Trans.Maybe as N.M import qualified Control.Monad.Writer.Lazy as N.W.L import qualified Control.Monad.Writer.Strict as N.W.S import qualified Data.Bifunctor as F.B import qualified Data.ByteString as BS.S import qualified Data.ByteString.Lazy as BS.L import qualified Data.Maybe as Y import qualified Data.Text as T.S import qualified Data.Text.Lazy as T.L import qualified Data.Word as W import Control.Applicative ( (<|>) ) #if MIN_VERSION_base(4,11,0) #else import Data.Semigroup #endif -- | Unlike most monad transformers, a 'Parser' is built around the concept of -- success and failure, so its "default" form is better structured over 'Maybe' -- than over 'Data.Functor.Identity.Identity'. type Parser stream = ParserT stream Maybe -- | Set the constructed parser loose on a given input. Returns both the -- resulting value and the remaining contents of the 'Stream'. runParser :: Parser stream out -> stream -> Maybe (out, stream) runParser = runParserT -- | Purely a convenience of the package rather than the module, the state -- machines described by the HTML standard all involve some degree of -- persistence, and so are built over a deeper monad stack. This could easily -- one of the most common transformers to add, anyway, no matter what input is -- being parsed. type StateParser state stream = N.S.L.StateT state (Parser stream) -- | Encapsulation of an operation for transforming the head of a 'Stream' into -- some other value. Standard usage, with similar behaviour to other -- "Text.Parsec"-derived parsers, ("accept the first which matches") may be -- obtained by instantiating @gather@ with 'Maybe', or non-deterministic -- parsing ("accept any of these") through @[]@. -- -- Notably, this implementation is designed to allow laziness in both input and -- output. For the best usage, therefore, consume as little input at a time as -- possible, and so call 'runParser' often). -- -- As part of this simplification, all "Text.Parsec"-style integrated state -- (use 'Control.Monad.Trans.State.StateT') and "Text.Megaparsec"-style error -- pretty-printing (build your position tracking into the @stream@, and/or wrap -- the output in 'Either') has been stripped out. newtype ParserT stream gather out = ParserT { runParserT :: stream -> gather (out, stream) } instance ( Functor gather ) => Functor (ParserT stream gather) where fmap f (ParserT a') = ParserT $ fmap (F.B.first f) . a' -- | 'pure' is a 'ParserT' which succeeds without consuming any input. ('<*>') -- and the other sequencing functions run the right 'ParserT' over the -- remaining input after the left 'ParserT' returns. instance ( Monad gather ) => Applicative (ParserT stream gather) where pure out = ParserT $ \input -> pure (out, input) ParserT f' <*> ParserT a' = ParserT $ \input -> do (f, interm) <- f' input (a, output) <- a' interm return (f a, output) -- | 'A.empty' is a 'ParserT' which fails without consuming any input. ('<|>') -- applies both 'ParserT's to the same input ("automatically backtracks"). instance ( A.Alternative gather, Monad gather ) => A.Alternative (ParserT stream gather) where empty = ParserT $ const A.empty ParserT a' <|> ParserT b' = ParserT $ \input -> a' input <|> b' input -- | ('<>') runs the right 'ParserT' over the remaining input after the left -- 'ParserT' returns. instance ( Monad gather , Semigroup out ) => Semigroup (ParserT stream gather out) where ParserT a' <> ParserT b' = ParserT $ \input -> do (a, interm) <- a' input (b, output) <- b' interm return (a <> b, output) -- | 'mempty' is a 'ParserT' which /succeeds/ without consuming any input; it -- is therefore identical to @'pure' 'mempty'@ and /not/ 'A.empty' or 'N.mzero'. instance ( Monad gather , Monoid out #if MIN_VERSION_base(4,11,0) ) => Monoid (ParserT stream gather out) where #else , Semigroup out ) => Monoid (ParserT stream gather out) where mappend = (<>) #endif mempty = ParserT $ \input -> return (mempty, input) -- | ('>>=') runs the 'ParserT' resulting from the right (generation function) -- argument over the remaining input after the left (value) 'ParserT' returns. instance ( Monad gather ) => Monad (ParserT stream gather) where ParserT a' >>= f = ParserT $ \input -> do (a, output) <- a' input let ParserT b' = f a b' output -- | 'N.mzero' is a 'ParserT' which fails without consuming input, while -- 'N.mplus' applies both to the same input, modulo the semantics of the -- @'A.Alternative' gather@ instance. instance ( A.Alternative gather, Monad gather ) => N.MonadPlus (ParserT stream gather) instance ( N.F.MonadFail gather ) => N.F.MonadFail (ParserT stream gather) where fail = ParserT . const . N.F.fail -- | 'N.X.throwError' is a 'ParserT' which fails without consuming any input. -- 'N.X.catchError' runs the recovery parser over the /same/ input as was -- passed to the original (failing) parser. instance ( N.X.MonadError err gather ) => N.X.MonadError err (ParserT stream gather) where throwError = ParserT . const . N.X.throwError catchError (ParserT a') f = ParserT $ \input -> N.X.catchError (a' input) $ \err -> let ParserT b' = f err in b' input instance ( Monad gather ) => N.FX.MonadFix (ParserT stream gather) where mfix f = ParserT $ \input -> do let ParserT f' = N.FX.mfix f f' input instance N.T.MonadTrans (ParserT stream) where lift a' = ParserT $ \input -> a' >>= \a -> return (a, input) -- | Performs an action on the current input without consuming it; i.e. -- 'N.R.ask' is identical to 'N.S.get'. instance ( Monad gather ) => N.R.MonadReader stream (ParserT stream gather) where ask = N.S.get local f (ParserT a') = ParserT $ a' . f reader f = ParserT $ \input -> pure (f input, input) -- | Operates over the input that has not yet been processed. -- -- Note that this therefore provides the means for forcing an early -- end-of-stream: -- -- @ -- 'N.S.put' 'mempty' -- @ instance ( Monad gather ) => N.S.MonadState stream (ParserT stream gather) where state f = ParserT $ pure . f instance ( N.IO.MonadIO gather ) => N.IO.MonadIO (ParserT stream gather) where liftIO a' = ParserT $ \input -> do a <- N.IO.liftIO a' return (a, input) -- | The parser the inner function generates is run over the remaining input -- after the argument function runs (thus generating the inner function). instance N.C.MonadCont gather => N.C.MonadCont (ParserT stream gather) where callCC f = ParserT $ \input -> N.C.callCC $ \g -> runParserT (f $ \a -> ParserT $ \input' -> g (a, input')) input -- | A sequence of values which may be processed via a 'MonadParser'. This -- class is essentially just a unification of the various list-like interfaces -- (@'uncons' == 'head'@, etc.) as Haskell's abstractions are slightly lacking -- in that area. -- -- >>> Just (tok, str) == uncons (cons tok str) -- True class Monoid stream => Stream stream token | stream -> token where {-# MINIMAL cons, uncons #-} -- | Prepend a token to the stream for proximate processing, before -- everything already in it. cons :: token -> stream -> stream -- | As 'cons', but append multiple tokens at once. consChunk :: stream -> stream -> stream consChunk a b = case uncons a of Nothing -> b Just (t, ts) -> cons t $ consChunk ts b -- | Retrieve the next token from the stream. -- -- This should only return 'Nothing' if the stream is actually empty---if -- the next value is not available yet due to slow IO or other computation, -- 'uncons' waits until it is. uncons :: stream -> Maybe (token, stream) -- | Retrieve the next several tokens from the stream. -- -- If fewer tokens are in the input stream than asked for, the left side of -- the return value is the (shorter than requested) entire input stream and -- the right is 'mempty'. unconsChunk :: Word -> stream -> (stream, stream) unconsChunk 0 str = (mempty, str) unconsChunk n str = case uncons str of Nothing -> (mempty, str) Just (t, str') -> let (ts, str'') = unconsChunk (pred n) str' in (cons t ts, str'') -- | The number of tokens remaining in the stream. chunkLen :: stream -> Word chunkLen str = case uncons str of Just (_, str') -> 1 + chunkLen str' Nothing -> 0 instance Stream BS.L.ByteString W.Word8 where cons = BS.L.cons consChunk = (<>) uncons = BS.L.uncons unconsChunk = BS.L.splitAt . fromIntegral chunkLen = fromIntegral . BS.L.length instance Stream BS.S.ByteString W.Word8 where cons = BS.S.cons consChunk = (<>) uncons = BS.S.uncons unconsChunk = BS.S.splitAt . fromIntegral chunkLen = fromIntegral . BS.S.length instance Stream T.L.Text Char where cons = T.L.cons consChunk = (<>) uncons = T.L.uncons unconsChunk = T.L.splitAt . fromIntegral chunkLen = fromIntegral . T.L.length instance Stream T.S.Text Char where cons = T.S.cons consChunk = (<>) uncons = T.S.uncons unconsChunk = T.S.splitAt . fromIntegral chunkLen = fromIntegral . T.S.length instance Stream [token] token where cons = (:) consChunk = (<>) uncons [] = Nothing uncons (t:ts) = Just (t, ts) unconsChunk = splitAt . fromIntegral chunkLen = fromIntegral . length -- | Generalize the transformation of an input 'Stream' into a more meaningful -- value. This class provides the basic building blocks from which more -- expressive such parsers may be constructed. -- -- See also the description of 'ParserT' for some of the design decisions. class ( A.Alternative m, Monad m , Stream stream token, Monoid stream ) => MonadParser m stream token | m -> stream where -- | Runs the argument parser on the current input, without consuming any -- of it; these are identical semantics to saving and restoring the input -- after running the computation, assuming the 'N.S.MonadState' instance -- runs over the input stream (see 'ParserT'): -- -- @ -- input <- 'N.S.get' -- a <- parser -- 'N.S.put' input -- @ -- -- @ -- a <- 'lookAhead' parser -- @ lookAhead :: m out -> m out -- | Succeeds if and only if the argument parser fails (the input is not -- consumed). avoiding :: m out -> m () -- | Retrieve the next token in the stream, whatever it may be. Identical -- to @'uncons'@ in all but type. next :: m token -- | Retrieve the next several tokens in the stream. Identical to -- 'Control.Monad.Combinators.count' (with a safer index type) in the case -- that @gather@ is a list @[token]@. -- -- If fewer tokens are in the input stream than asked for, returns what -- does remain in the input stream. nextChunk :: Word -> m stream -- | Prepend a token to the input stream to be processed next. Identical -- to operating on the stream directly through 'N.S.MonadState', if that -- instance also exists. -- -- @ -- stream <- 'N.S.get' -- 'N.S.put' $ 'cons' tok stream -- @ -- -- @ -- 'push' tok -- @ push :: token -> m () -- | Concatenate the given sequence with the existing input, processing the -- argument before the older @stream@. pushChunk :: stream -> m () -- | Drop the remainder of the input, simulating an early end-of-stream. -- Can be emulated through appropriate 'N.S.MonadState' and 'Monoid' -- instances: -- -- @ -- stream <- 'N.S.get' -- 'N.S.put' 'mempty' -- 'return' stream -- @ -- -- @ -- 'abridge' -- @ abridge :: m stream instance ( A.Alternative gather, Monad gather , Stream stream token, Monoid stream ) => MonadParser (ParserT stream gather) stream token where lookAhead (ParserT a') = ParserT $ \input -> do (a, _) <- a' input return (a, input) avoiding (ParserT bad') = ParserT $ \input -> do bad <- A.optional $ bad' input if Y.isJust bad then A.empty else pure ((), input) next = ParserT $ maybe A.empty return . uncons nextChunk l = ParserT $ return . unconsChunk l push tok = ParserT $ \input -> return ((), cons tok input) pushChunk str = ParserT $ \input -> return ((), consChunk str input) abridge = ParserT $ \input -> return (input, mempty) #if MIN_VERSION_transformers(0,5,3) instance ( MonadParser trans stream token , Monoid accum , N.MonadPlus trans ) => MonadParser (N.A.AccumT accum trans) stream token where lookAhead (N.A.AccumT trans) = N.A.AccumT $ lookAhead . trans avoiding (N.A.AccumT trans) = N.A.AccumT $ \accum -> avoiding (trans accum) >> return ((), accum) next = N.T.lift next nextChunk = N.T.lift . nextChunk push = N.T.lift . push pushChunk = N.T.lift . pushChunk abridge = N.T.lift abridge #endif instance ( MonadParser trans stream token , Monoid except ) => MonadParser (N.X.ExceptT except trans) stream token where lookAhead (N.X.ExceptT trans) = N.X.ExceptT $ lookAhead trans avoiding (N.X.ExceptT trans) = N.X.ExceptT $ avoiding trans >> return (pure ()) next = N.T.lift next nextChunk = N.T.lift . nextChunk push = N.T.lift . push pushChunk = N.T.lift . pushChunk abridge = N.T.lift abridge instance ( MonadParser trans stream token ) => MonadParser (N.I.IdentityT trans) stream token where lookAhead (N.I.IdentityT trans) = N.I.IdentityT $ lookAhead trans avoiding (N.I.IdentityT trans) = N.I.IdentityT $ avoiding trans next = N.T.lift next nextChunk = N.T.lift . nextChunk push = N.T.lift . push pushChunk = N.T.lift . pushChunk abridge = N.T.lift abridge instance ( MonadParser trans stream token ) => MonadParser (N.M.MaybeT trans) stream token where lookAhead (N.M.MaybeT trans) = N.M.MaybeT $ lookAhead trans avoiding (N.M.MaybeT trans) = N.M.MaybeT $ avoiding trans >> return (pure ()) next = N.T.lift next nextChunk = N.T.lift . nextChunk push = N.T.lift . push pushChunk = N.T.lift . pushChunk abridge = N.T.lift abridge instance ( MonadParser trans stream token ) => MonadParser (N.R.ReaderT reader trans) stream token where lookAhead (N.R.ReaderT trans) = N.R.ReaderT $ lookAhead . trans avoiding (N.R.ReaderT trans) = N.R.ReaderT $ avoiding . trans next = N.T.lift next nextChunk = N.T.lift . nextChunk push = N.T.lift . push pushChunk = N.T.lift . pushChunk abridge = N.T.lift abridge instance ( MonadParser trans stream token , N.MonadPlus trans ) => MonadParser (N.S.L.StateT state trans) stream token where lookAhead (N.S.L.StateT trans) = N.S.L.StateT $ lookAhead . trans avoiding (N.S.L.StateT trans) = N.S.L.StateT $ \state -> avoiding (fst <$> trans state) >> return ((), state) next = N.T.lift next nextChunk = N.T.lift . nextChunk push = N.T.lift . push pushChunk = N.T.lift . pushChunk abridge = N.T.lift abridge instance ( MonadParser trans stream token , N.MonadPlus trans ) => MonadParser (N.S.S.StateT state trans) stream token where lookAhead (N.S.S.StateT trans) = N.S.S.StateT $ lookAhead . trans avoiding (N.S.S.StateT trans) = N.S.S.StateT $ \state -> avoiding (fst <$> trans state) >> return ((), state) next = N.T.lift next nextChunk = N.T.lift . nextChunk push = N.T.lift . push pushChunk = N.T.lift . pushChunk abridge = N.T.lift abridge instance ( MonadParser trans stream token , Monoid writer ) => MonadParser (N.W.L.WriterT writer trans) stream token where lookAhead (N.W.L.WriterT trans) = N.W.L.WriterT $ lookAhead trans avoiding (N.W.L.WriterT trans) = N.W.L.WriterT $ avoiding (fmap fst trans) >> return ((), mempty) next = N.T.lift next nextChunk = N.T.lift . nextChunk push = N.T.lift . push pushChunk = N.T.lift . pushChunk abridge = N.T.lift abridge instance ( MonadParser trans stream token , Monoid writer ) => MonadParser (N.W.S.WriterT writer trans) stream token where lookAhead (N.W.S.WriterT trans) = N.W.S.WriterT $ lookAhead trans avoiding (N.W.S.WriterT trans) = N.W.S.WriterT $ avoiding (fmap fst trans) >> return ((), mempty) next = N.T.lift next nextChunk = N.T.lift . nextChunk push = N.T.lift . push pushChunk = N.T.lift . pushChunk abridge = N.T.lift abridge instance ( MonadParser trans stream token , Monoid writer , N.MonadPlus trans ) => MonadParser (N.T.S.RWST reader writer state trans) stream token where lookAhead (N.T.S.RWST trans) = N.T.S.RWST $ \reader -> lookAhead . trans reader avoiding (N.T.S.RWST trans) = N.T.S.RWST $ \reader state -> avoiding (trans reader state) >> return ((), state, mempty) next = N.T.lift next nextChunk = N.T.lift . nextChunk push = N.T.lift . push pushChunk = N.T.lift . pushChunk abridge = N.T.lift abridge instance ( MonadParser trans stream token , Monoid writer , N.MonadPlus trans ) => MonadParser (N.T.L.RWST reader writer state trans) stream token where lookAhead (N.T.L.RWST trans) = N.T.L.RWST $ \reader -> lookAhead . trans reader avoiding (N.T.L.RWST trans) = N.T.L.RWST $ \reader state -> do avoiding $ trans reader state return ((), state, mempty) next = N.T.lift next nextChunk = N.T.lift . nextChunk push = N.T.lift . push pushChunk = N.T.lift . pushChunk abridge = N.T.lift abridge -- | Succeeds if and only if the input is empty. end :: MonadParser trans stream token => trans () end = avoiding next -- | Expect a specific token from the 'Stream', and fail if a different -- token is found instead. Identical to running 'satisfying' with equality -- in the (by far most likely) case that @gather@ is a 'Monad' in addition -- to an 'A.Alternative': -- -- @ -- tok \<- 'next' '>>=' 'satisfying' ('==' desired) -- @ -- -- @ -- tok <- 'token' desired -- @ token :: ( MonadParser trans stream token , Eq token ) => token -> trans token token desired = next >>= satisfying (== desired) -- | Expect a specific sequence of tokens from the 'Stream', and fail if -- anything else is found instead, or if the 'Stream' doesn't have enough -- characters before its end. Identical to running 'satisfying' with equality -- over 'nextChunk' in the case that @stream@ is an 'Eq' (which all provided -- instances are) and can easily provide a 'length' (which they do, unless the -- sequence to test against also needs to be lazy). -- -- @ -- stream \<- 'nextChunk' ('length' desired) '>>=' 'satisfying' ('==' desired) -- @ -- -- @ -- stream <- 'chunk' desired -- @ chunk :: ( MonadParser trans stream token , Eq stream ) => stream -> trans stream chunk desired = nextChunk (chunkLen desired) >>= satisfying (== desired) -- | Succeeds if and only if the value parsed by the argument parser satisfies -- the predicate. No further input is consumed. satisfying :: MonadParser trans stream token => (out -> Bool) -> out -> trans out satisfying test out | test out = return out | otherwise = A.empty