{-# LANGUAGE CPP #-}
{-# LANGUAGE FlexibleInstances #-}
{-# LANGUAGE FunctionalDependencies #-}
{-# LANGUAGE KindSignatures #-}
{-# LANGUAGE UndecidableInstances #-}

{-|
Description:    Generic lazy parsers for transforming input.

Copyright:      (c) 2020 Sam May
License:        MPL-2.0
Maintainer:     ag.eitilt@gmail.com

Stability:      experimental
Portability:    portable

The existing parsing libraries are wonderful, but backtracking parsers have a
bad habit of being strict in their output; sure, you might be able to operate
over "Data.ByteString.Lazy", but they all expect to consume their entire input
before handing you their result.  "Data.Attoparsec"'s continuations fully lean
into that---even though you don't have to provide all the input in one block,
you can't get a value before closing it out.  [Text.Megaparsec]
(https://hackage.haskell.org/package/megaparsec/docs/Text-Megaparsec.html)
does provide a reentrant form in [runParser']
(https://hackage.haskell.org/package/megaparsec/docs/Text-Megaparsec.html#v:runParser-39-),
but it also comes with comparatively heavyweight error and pretty-printing
features.

For complicated formats, those all can indeed be desirable.  However, the HTML
algorithms have been optimized for minimal lookahead and certainly no output
revocation---once something is shipped out, it's not going to be called back.
Not taking advantage of that by using a lazy output type means that parsing
would always be subject to the whims of slow or unreliable network connections.
Moreover, the entire complexity of the parsing algorithm is built around never
reaching a fatal failure condition, so error handling and especially recovery
are unnecessary overhead.

And so, a custom parsing framework must be defined.
-}
module Web.Willow.Common.Parser
    ( -- * Concrete types
      Parser
    , runParser
    , ParserT ( .. )
    , StateParser
      -- * Parsing combinators
    , MonadParser ( .. )
    , end
    , satisfying
    , token
    , chunk
#if MIN_VERSION_base(4,11,0)
#else
    , (<>)
#endif
      -- * Supporting typeclasses
    , Stream ( .. )
    ) where


import qualified Control.Applicative as A
import qualified Control.Monad as N
import qualified Control.Monad.Cont.Class as N.C
import qualified Control.Monad.Except as N.X
import qualified Control.Monad.Fail as N.F
import qualified Control.Monad.Fix as N.FX
import qualified Control.Monad.IO.Class as N.IO
import qualified Control.Monad.Reader as N.R
import qualified Control.Monad.RWS.Lazy as N.T.L
import qualified Control.Monad.RWS.Strict as N.T.S
import qualified Control.Monad.State.Class as N.S
import qualified Control.Monad.State.Lazy as N.S.L
import qualified Control.Monad.State.Strict as N.S.S
#if MIN_VERSION_transformers(0,5,3)
import qualified Control.Monad.Trans.Accum as N.A
#endif
import qualified Control.Monad.Trans.Class as N.T
import qualified Control.Monad.Trans.Identity as N.I
import qualified Control.Monad.Trans.Maybe as N.M
import qualified Control.Monad.Writer.Lazy as N.W.L
import qualified Control.Monad.Writer.Strict as N.W.S

import qualified Data.Bifunctor as F.B
import qualified Data.ByteString as BS.S
import qualified Data.ByteString.Lazy as BS.L
import qualified Data.Maybe as Y
import qualified Data.Text as T.S
import qualified Data.Text.Lazy as T.L
import qualified Data.Word as W

import Control.Applicative ( (<|>) )

#if MIN_VERSION_base(4,11,0)
#else
import Data.Semigroup
#endif


-- | Unlike most monad transformers, a 'Parser' is built around the concept of
-- success and failure, so its "default" form is better structured over 'Maybe'
-- than over 'Data.Functor.Identity.Identity'.
type Parser stream = ParserT stream Maybe

-- | Set the constructed parser loose on a given input.  Returns both the
-- resulting value and the remaining contents of the 'Stream'.
runParser :: Parser stream out -> stream -> Maybe (out, stream)
runParser = runParserT

-- | Purely a convenience of the package rather than the module, the state
-- machines described by the HTML standard all involve some degree of
-- persistence, and so are built over a deeper monad stack.  This could easily
-- one of the most common transformers to add, anyway, no matter what input is
-- being parsed.
type StateParser state stream = N.S.L.StateT state (Parser stream)

-- | Encapsulation of an operation for transforming the head of a 'Stream' into
-- some other value.  Standard usage, with similar behaviour to other
-- "Text.Parsec"-derived parsers, ("accept the first which matches") may be
-- obtained by instantiating @gather@ with 'Maybe', or non-deterministic
-- parsing ("accept any of these") through @[]@.
-- 
-- Notably, this implementation is designed to allow laziness in both input and
-- output.  For the best usage, therefore, consume as little input at a time as
-- possible, and so call 'runParser' often).
-- 
-- As part of this simplification, all "Text.Parsec"-style integrated state
-- (use 'Control.Monad.Trans.State.StateT') and "Text.Megaparsec"-style error
-- pretty-printing (build your position tracking into the @stream@, and/or wrap
-- the output in 'Either') has been stripped out.
newtype ParserT stream gather out = ParserT
    { runParserT :: stream -> gather (out, stream)
    }

instance
  ( Functor gather
  ) => Functor (ParserT stream gather) where
    fmap f (ParserT a') = ParserT $ fmap (F.B.first f) . a'

-- | 'pure' is a 'ParserT' which succeeds without consuming any input.  ('<*>')
-- and the other sequencing functions run the right 'ParserT' over the
-- remaining input after the left 'ParserT' returns.
instance
  ( Monad gather
  ) => Applicative (ParserT stream gather) where
    pure out = ParserT $ \input -> pure (out, input)
    ParserT f' <*> ParserT a' = ParserT $ \input -> do
        (f, interm) <- f' input
        (a, output) <- a' interm
        return (f a, output)

-- | 'A.empty' is a 'ParserT' which fails without consuming any input.  ('<|>')
-- applies both 'ParserT's to the same input ("automatically backtracks").
instance
  ( A.Alternative gather, Monad gather
  ) => A.Alternative (ParserT stream gather) where
    empty = ParserT $ const A.empty
    ParserT a' <|> ParserT b' = ParserT $ \input ->
        a' input <|> b' input

-- | ('<>') runs the right 'ParserT' over the remaining input after the left
-- 'ParserT' returns.
instance
  ( Monad gather
  , Semigroup out
  ) => Semigroup (ParserT stream gather out) where
    ParserT a' <> ParserT b' = ParserT $ \input -> do
        (a, interm) <- a' input
        (b, output) <- b' interm
        return (a <> b, output)

-- | 'mempty' is a 'ParserT' which /succeeds/ without consuming any input; it
-- is therefore identical to @'pure' 'mempty'@ and /not/ 'A.empty' or 'N.mzero'.
instance
  ( Monad gather
  , Monoid out
#if MIN_VERSION_base(4,11,0)
  ) => Monoid (ParserT stream gather out) where
#else
  , Semigroup out
  ) => Monoid (ParserT stream gather out) where
    mappend = (<>)
#endif
    mempty = ParserT $ \input -> return (mempty, input)

-- | ('>>=') runs the 'ParserT' resulting from the right (generation function)
-- argument over the remaining input after the left (value) 'ParserT' returns.
instance
  ( Monad gather
  ) => Monad (ParserT stream gather) where
    ParserT a' >>= f = ParserT $ \input -> do
        (a, output) <- a' input
        let ParserT b' = f a
        b' output

-- | 'N.mzero' is a 'ParserT' which fails without consuming input, while
-- 'N.mplus' applies both to the same input, modulo the semantics of the
-- @'A.Alternative' gather@ instance.
instance
  ( A.Alternative gather, Monad gather
  ) => N.MonadPlus (ParserT stream gather)

instance
  ( N.F.MonadFail gather
  ) => N.F.MonadFail (ParserT stream gather) where
    fail = ParserT . const . N.F.fail

-- | 'N.X.throwError' is a 'ParserT' which fails without consuming any input.
-- 'N.X.catchError' runs the recovery parser over the /same/ input as was
-- passed to the original (failing) parser.
instance
  ( N.X.MonadError err gather
  ) => N.X.MonadError err (ParserT stream gather) where
    throwError = ParserT . const . N.X.throwError
    catchError (ParserT a') f = ParserT $ \input -> N.X.catchError (a' input) $ \err ->
        let ParserT b' = f err
        in  b' input

instance
  ( Monad gather
  ) => N.FX.MonadFix (ParserT stream gather) where
    mfix f = ParserT $ \input -> do
        let ParserT f' = N.FX.mfix f
        f' input

instance N.T.MonadTrans (ParserT stream) where
    lift a' = ParserT $ \input ->
        a' >>= \a -> return (a, input)

-- | Performs an action on the current input without consuming it; i.e.
-- 'N.R.ask' is identical to 'N.S.get'.
instance
  ( Monad gather
  ) => N.R.MonadReader stream (ParserT stream gather) where
    ask = N.S.get
    local f (ParserT a') = ParserT $ a' . f
    reader f = ParserT $ \input -> pure (f input, input)

-- | Operates over the input that has not yet been processed.
-- 
-- Note that this therefore provides the means for forcing an early
-- end-of-stream:
-- 
-- @
-- 'N.S.put' 'mempty'
-- @
instance
  ( Monad gather
  ) => N.S.MonadState stream (ParserT stream gather) where
    state f = ParserT $ pure . f

instance
  ( N.IO.MonadIO gather
  ) => N.IO.MonadIO (ParserT stream gather) where
    liftIO a' = ParserT $ \input -> do
        a <- N.IO.liftIO a'
        return (a, input)

-- | The parser the inner function generates is run over the remaining input
-- after the argument function runs (thus generating the inner function).
instance N.C.MonadCont gather => N.C.MonadCont (ParserT stream gather) where
    callCC f = ParserT $ \input -> N.C.callCC $ \g ->
        runParserT (f $ \a -> ParserT $ \input' -> g (a, input')) input


-- | A sequence of values which may be processed via a 'MonadParser'.  This
-- class is essentially just a unification of the various list-like interfaces
-- (@'uncons' == 'head'@, etc.) as Haskell's abstractions are slightly lacking
-- in that area.
-- 
-- >>> Just (tok, str) == uncons (cons tok str)
-- True
class Monoid stream => Stream stream token | stream -> token where
    {-# MINIMAL cons, uncons #-}
    -- | Prepend a token to the stream for proximate processing, before
    -- everything already in it.
    cons :: token -> stream -> stream
    -- | As 'cons', but append multiple tokens at once.
    consChunk :: stream -> stream -> stream
    consChunk a b = case uncons a of
        Nothing -> b
        Just (t, ts) -> cons t $ consChunk ts b
    -- | Retrieve the next token from the stream.
    -- 
    -- This should only return 'Nothing' if the stream is actually empty---if
    -- the next value is not available yet due to slow IO or other computation,
    -- 'uncons' waits until it is.
    uncons :: stream -> Maybe (token, stream)
    -- | Retrieve the next several tokens from the stream.
    -- 
    -- If fewer tokens are in the input stream than asked for, the left side of
    -- the return value is the (shorter than requested) entire input stream and
    -- the right is 'mempty'.
    unconsChunk :: Word -> stream -> (stream, stream)
    unconsChunk 0 str = (mempty, str)
    unconsChunk n str = case uncons str of
        Nothing -> (mempty, str)
        Just (t, str') ->
            let (ts, str'') = unconsChunk (pred n) str'
            in  (cons t ts, str'')
    -- | The number of tokens remaining in the stream.
    chunkLen :: stream -> Word
    chunkLen str = case uncons str of
            Just (_, str') -> 1 + chunkLen str'
            Nothing -> 0
instance Stream BS.L.ByteString W.Word8 where
    cons = BS.L.cons
    consChunk = (<>)
    uncons = BS.L.uncons
    unconsChunk = BS.L.splitAt . fromIntegral
    chunkLen = fromIntegral . BS.L.length
instance Stream BS.S.ByteString W.Word8 where
    cons = BS.S.cons
    consChunk = (<>)
    uncons = BS.S.uncons
    unconsChunk = BS.S.splitAt . fromIntegral
    chunkLen = fromIntegral . BS.S.length
instance Stream T.L.Text Char where
    cons = T.L.cons
    consChunk = (<>)
    uncons = T.L.uncons
    unconsChunk = T.L.splitAt . fromIntegral
    chunkLen = fromIntegral . T.L.length
instance Stream T.S.Text Char where
    cons = T.S.cons
    consChunk = (<>)
    uncons = T.S.uncons
    unconsChunk = T.S.splitAt . fromIntegral
    chunkLen = fromIntegral . T.S.length
instance Stream [token] token where
    cons = (:)
    consChunk = (<>)
    uncons [] = Nothing
    uncons (t:ts) = Just (t, ts)
    unconsChunk = splitAt . fromIntegral
    chunkLen = fromIntegral . length


-- | Generalize the transformation of an input 'Stream' into a more meaningful
-- value.  This class provides the basic building blocks from which more
-- expressive such parsers may be constructed.
-- 
-- See also the description of 'ParserT' for some of the design decisions.
class
  ( A.Alternative m, Monad m
  , Stream stream token, Monoid stream
  ) => MonadParser m stream token | m -> stream where

    -- | Runs the argument parser on the current input, without consuming any
    -- of it; these are identical semantics to saving and restoring the input
    -- after running the computation, assuming the 'N.S.MonadState' instance
    -- runs over the input stream (see 'ParserT'):
    -- 
    -- @
    -- input <- 'N.S.get'
    -- a <- parser
    -- 'N.S.put' input
    -- @
    -- 
    -- @
    -- a <- 'lookAhead' parser
    -- @
    lookAhead :: m out -> m out

    -- | Succeeds if and only if the argument parser fails (the input is not
    -- consumed).
    avoiding :: m out -> m ()

    -- | Retrieve the next token in the stream, whatever it may be.  Identical
    -- to @'uncons'@ in all but type.
    next :: m token

    -- | Retrieve the next several tokens in the stream.  Identical to
    -- 'Control.Monad.Combinators.count' (with a safer index type) in the case
    -- that @gather@ is a list @[token]@.
    -- 
    -- If fewer tokens are in the input stream than asked for, returns what
    -- does remain in the input stream.
    nextChunk :: Word -> m stream

    -- | Prepend a token to the input stream to be processed next.  Identical
    -- to operating on the stream directly through 'N.S.MonadState', if that
    -- instance also exists.
    -- 
    -- @
    -- stream <- 'N.S.get'
    -- 'N.S.put' $ 'cons' tok stream
    -- @
    -- 
    -- @
    -- 'push' tok
    -- @
    push :: token -> m ()

    -- | Concatenate the given sequence with the existing input, processing the
    -- argument before the older @stream@.
    pushChunk :: stream -> m ()

    -- | Drop the remainder of the input, simulating an early end-of-stream.
    -- Can be emulated through appropriate 'N.S.MonadState' and 'Monoid'
    -- instances:
    -- 
    -- @
    -- stream <- 'N.S.get'
    -- 'N.S.put' 'mempty'
    -- 'return' stream
    -- @
    -- 
    -- @
    -- 'abridge'
    -- @
    abridge :: m stream

instance
  ( A.Alternative gather, Monad gather
  , Stream stream token, Monoid stream
  ) => MonadParser (ParserT stream gather) stream token where
    lookAhead (ParserT a') = ParserT $ \input -> do
        (a, _) <- a' input
        return (a, input)
    avoiding (ParserT bad') = ParserT $ \input -> do
        bad <- A.optional $ bad' input
        if Y.isJust bad
            then A.empty
            else pure ((), input)
    next = ParserT $ maybe A.empty return . uncons
    nextChunk l = ParserT $ return . unconsChunk l
    push tok = ParserT $ \input -> return ((), cons tok input)
    pushChunk str = ParserT $ \input -> return ((), consChunk str input)
    abridge = ParserT $ \input -> return (input, mempty)

#if MIN_VERSION_transformers(0,5,3)
instance
  ( MonadParser trans stream token
  , Monoid accum
  , N.MonadPlus trans
  ) => MonadParser (N.A.AccumT accum trans) stream token where
    lookAhead (N.A.AccumT trans) = N.A.AccumT $ lookAhead . trans
    avoiding (N.A.AccumT trans) = N.A.AccumT $ \accum ->
        avoiding (trans accum) >> return ((), accum)
    next = N.T.lift next
    nextChunk = N.T.lift . nextChunk
    push = N.T.lift . push
    pushChunk = N.T.lift . pushChunk
    abridge = N.T.lift abridge
#endif

instance
  ( MonadParser trans stream token
  , Monoid except
  ) => MonadParser (N.X.ExceptT except trans) stream token where
    lookAhead (N.X.ExceptT trans) = N.X.ExceptT $ lookAhead trans
    avoiding (N.X.ExceptT trans) = N.X.ExceptT $
        avoiding trans >> return (pure ())
    next = N.T.lift next
    nextChunk = N.T.lift . nextChunk
    push = N.T.lift . push
    pushChunk = N.T.lift . pushChunk
    abridge = N.T.lift abridge

instance
  ( MonadParser trans stream token
  ) => MonadParser (N.I.IdentityT trans) stream token where
    lookAhead (N.I.IdentityT trans) = N.I.IdentityT $ lookAhead trans
    avoiding (N.I.IdentityT trans) = N.I.IdentityT $ avoiding trans
    next = N.T.lift next
    nextChunk = N.T.lift . nextChunk
    push = N.T.lift . push
    pushChunk = N.T.lift . pushChunk
    abridge = N.T.lift abridge

instance
  ( MonadParser trans stream token
  ) => MonadParser (N.M.MaybeT trans) stream token where
    lookAhead (N.M.MaybeT trans) = N.M.MaybeT $ lookAhead trans
    avoiding (N.M.MaybeT trans) = N.M.MaybeT $
        avoiding trans >> return (pure ())
    next = N.T.lift next
    nextChunk = N.T.lift . nextChunk
    push = N.T.lift . push
    pushChunk = N.T.lift . pushChunk
    abridge = N.T.lift abridge

instance
  ( MonadParser trans stream token
  ) => MonadParser (N.R.ReaderT reader trans) stream token where
    lookAhead (N.R.ReaderT trans) = N.R.ReaderT $ lookAhead . trans
    avoiding (N.R.ReaderT trans) = N.R.ReaderT $ avoiding . trans
    next = N.T.lift next
    nextChunk = N.T.lift . nextChunk
    push = N.T.lift . push
    pushChunk = N.T.lift . pushChunk
    abridge = N.T.lift abridge

instance
  ( MonadParser trans stream token
  , N.MonadPlus trans
  ) => MonadParser (N.S.L.StateT state trans) stream token where
    lookAhead (N.S.L.StateT trans) = N.S.L.StateT $ lookAhead . trans
    avoiding (N.S.L.StateT trans) = N.S.L.StateT $ \state ->
        avoiding (fst <$> trans state) >> return ((), state)
    next = N.T.lift next
    nextChunk = N.T.lift . nextChunk
    push = N.T.lift . push
    pushChunk = N.T.lift . pushChunk
    abridge = N.T.lift abridge

instance
  ( MonadParser trans stream token
  , N.MonadPlus trans
  ) => MonadParser (N.S.S.StateT state trans) stream token where
    lookAhead (N.S.S.StateT trans) = N.S.S.StateT $ lookAhead . trans
    avoiding (N.S.S.StateT trans) = N.S.S.StateT $ \state ->
        avoiding (fst <$> trans state) >> return ((), state)
    next = N.T.lift next
    nextChunk = N.T.lift . nextChunk
    push = N.T.lift . push
    pushChunk = N.T.lift . pushChunk
    abridge = N.T.lift abridge

instance
  ( MonadParser trans stream token
  , Monoid writer
  ) => MonadParser (N.W.L.WriterT writer trans) stream token where
    lookAhead (N.W.L.WriterT trans) = N.W.L.WriterT $ lookAhead trans
    avoiding (N.W.L.WriterT trans) = N.W.L.WriterT $
        avoiding (fmap fst trans) >> return ((), mempty)
    next = N.T.lift next
    nextChunk = N.T.lift . nextChunk
    push = N.T.lift . push
    pushChunk = N.T.lift . pushChunk
    abridge = N.T.lift abridge

instance
  ( MonadParser trans stream token
  , Monoid writer
  ) => MonadParser (N.W.S.WriterT writer trans) stream token where
    lookAhead (N.W.S.WriterT trans) = N.W.S.WriterT $ lookAhead trans
    avoiding (N.W.S.WriterT trans) = N.W.S.WriterT $
        avoiding (fmap fst trans) >> return ((), mempty)
    next = N.T.lift next
    nextChunk = N.T.lift . nextChunk
    push = N.T.lift . push
    pushChunk = N.T.lift . pushChunk
    abridge = N.T.lift abridge

instance
  ( MonadParser trans stream token
  , Monoid writer
  , N.MonadPlus trans
  ) => MonadParser (N.T.S.RWST reader writer state trans) stream token where
    lookAhead (N.T.S.RWST trans) = N.T.S.RWST $ \reader -> lookAhead . trans reader
    avoiding (N.T.S.RWST trans) = N.T.S.RWST $ \reader state ->
        avoiding (trans reader state) >> return ((), state, mempty)
    next = N.T.lift next
    nextChunk = N.T.lift . nextChunk
    push = N.T.lift . push
    pushChunk = N.T.lift . pushChunk
    abridge = N.T.lift abridge

instance
  ( MonadParser trans stream token
  , Monoid writer
  , N.MonadPlus trans
  ) => MonadParser (N.T.L.RWST reader writer state trans) stream token where
    lookAhead (N.T.L.RWST trans) = N.T.L.RWST $ \reader -> lookAhead . trans reader
    avoiding (N.T.L.RWST trans) = N.T.L.RWST $ \reader state -> do
        avoiding $ trans reader state
        return ((), state, mempty)
    next = N.T.lift next
    nextChunk = N.T.lift . nextChunk
    push = N.T.lift . push
    pushChunk = N.T.lift . pushChunk
    abridge = N.T.lift abridge


-- | Succeeds if and only if the input is empty.
end :: MonadParser trans stream token => trans ()
end = avoiding next

-- | Expect a specific token from the 'Stream', and fail if a different
-- token is found instead.  Identical to running 'satisfying' with equality
-- in the (by far most likely) case that @gather@ is a 'Monad' in addition
-- to an 'A.Alternative':
-- 
-- @
-- tok \<- 'next' '>>=' 'satisfying' ('==' desired)
-- @
-- 
-- @
-- tok <- 'token' desired
-- @
token ::
  ( MonadParser trans stream token
  , Eq token
  ) => token -> trans token
token desired = next >>= satisfying (== desired)

-- | Expect a specific sequence of tokens from the 'Stream', and fail if
-- anything else is found instead, or if the 'Stream' doesn't have enough
-- characters before its end.  Identical to running 'satisfying' with equality
-- over 'nextChunk' in the case that @stream@ is an 'Eq' (which all provided
-- instances are) and can easily provide a 'length' (which they do, unless the
-- sequence to test against also needs to be lazy).
-- 
-- @
-- stream \<- 'nextChunk' ('length' desired) '>>=' 'satisfying' ('==' desired)
-- @
-- 
-- @
-- stream <- 'chunk' desired
-- @
chunk ::
  ( MonadParser trans stream token
  , Eq stream
  ) => stream -> trans stream
chunk desired = nextChunk (chunkLen desired) >>= satisfying (== desired)

-- | Succeeds if and only if the value parsed by the argument parser satisfies
-- the predicate.  No further input is consumed.
satisfying :: MonadParser trans stream token => (out -> Bool) -> out -> trans out
satisfying test out
    | test out = return out
    | otherwise = A.empty