{-# OPTIONS_GHC -Wall #-}
{-# LANGUAGE StandaloneDeriving #-}
{-|
Description: A simple state- and transition-based tokeniser with location tracking

* Tokeniser
* A data structure representing a sequence of tokens, for internal use in the parser
-}
module Parser.Tokeniser (
  Tokeniser',
  Tokens',
  add_token,
  current_line_and_char,
  delete_char,
  gather_token,
  get_char,
  get_line_and_char,
  get_token,
  take_token,
  tokenisation_error,
  tokenise,
  tokens_ended) where
  import Control.Monad.Except (MonadError (..))
  import Control.Monad.RWS.Strict (RWST, execRWST)
  import Control.Monad.Reader (MonadReader (..))
  import Control.Monad.State.Strict (MonadState (..))
  import Control.Monad.Writer.Strict (MonadWriter (..))
  import Parser.Errors (Error (..))
  import Parser.Line_and_char (L (..), Line_and_char, init_line_and_char)
  data State char = State {State char -> Line_and_char
state_line_and_char :: Line_and_char, State char -> [char]
state_text :: [char]}
  -- | A tokeniser that works with any kind of custom characters, tokens and errors. The custom character type is necessary

  -- if you want to classify characters according to their behavior before tokenisation - for example, wrap all operators,

  -- letters, delimiters or digits in the same constructor to simplify pattern matching.

  data Tokeniser' char token err t =
    Tokeniser {Tokeniser' char token err t
-> RWST
     (char -> Line_and_char -> Line_and_char)
     [L token]
     (State char)
     (Either err)
     t
run_tokeniser :: RWST (char -> Line_and_char -> Line_and_char) [L token] (State char) (Either err) t}
  -- | A sequence of tokens with locations. For internal use in the parser.

  data Tokens' token = Tokens [L token] Line_and_char
  instance Applicative (Tokeniser' char token err) where
    Tokeniser RWST
  (char -> Line_and_char -> Line_and_char)
  [L token]
  (State char)
  (Either err)
  (a -> b)
tokenise_0 <*> :: Tokeniser' char token err (a -> b)
-> Tokeniser' char token err a -> Tokeniser' char token err b
<*> Tokeniser RWST
  (char -> Line_and_char -> Line_and_char)
  [L token]
  (State char)
  (Either err)
  a
tokenise_1 = RWST
  (char -> Line_and_char -> Line_and_char)
  [L token]
  (State char)
  (Either err)
  b
-> Tokeniser' char token err b
forall char token err t.
RWST
  (char -> Line_and_char -> Line_and_char)
  [L token]
  (State char)
  (Either err)
  t
-> Tokeniser' char token err t
Tokeniser (RWST
  (char -> Line_and_char -> Line_and_char)
  [L token]
  (State char)
  (Either err)
  (a -> b)
tokenise_0 RWST
  (char -> Line_and_char -> Line_and_char)
  [L token]
  (State char)
  (Either err)
  (a -> b)
-> RWST
     (char -> Line_and_char -> Line_and_char)
     [L token]
     (State char)
     (Either err)
     a
-> RWST
     (char -> Line_and_char -> Line_and_char)
     [L token]
     (State char)
     (Either err)
     b
forall (f :: * -> *) a b. Applicative f => f (a -> b) -> f a -> f b
<*> RWST
  (char -> Line_and_char -> Line_and_char)
  [L token]
  (State char)
  (Either err)
  a
tokenise_1)
    pure :: a -> Tokeniser' char token err a
pure a
x = RWST
  (char -> Line_and_char -> Line_and_char)
  [L token]
  (State char)
  (Either err)
  a
-> Tokeniser' char token err a
forall char token err t.
RWST
  (char -> Line_and_char -> Line_and_char)
  [L token]
  (State char)
  (Either err)
  t
-> Tokeniser' char token err t
Tokeniser (a
-> RWST
     (char -> Line_and_char -> Line_and_char)
     [L token]
     (State char)
     (Either err)
     a
forall (m :: * -> *) a. Monad m => a -> m a
return a
x)
  instance Functor (Tokeniser' char token err) where
    fmap :: (a -> b)
-> Tokeniser' char token err a -> Tokeniser' char token err b
fmap a -> b
f (Tokeniser RWST
  (char -> Line_and_char -> Line_and_char)
  [L token]
  (State char)
  (Either err)
  a
tokenise') = RWST
  (char -> Line_and_char -> Line_and_char)
  [L token]
  (State char)
  (Either err)
  b
-> Tokeniser' char token err b
forall char token err t.
RWST
  (char -> Line_and_char -> Line_and_char)
  [L token]
  (State char)
  (Either err)
  t
-> Tokeniser' char token err t
Tokeniser (a -> b
f (a -> b)
-> RWST
     (char -> Line_and_char -> Line_and_char)
     [L token]
     (State char)
     (Either err)
     a
-> RWST
     (char -> Line_and_char -> Line_and_char)
     [L token]
     (State char)
     (Either err)
     b
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> RWST
  (char -> Line_and_char -> Line_and_char)
  [L token]
  (State char)
  (Either err)
  a
tokenise')
  instance Monad (Tokeniser' char token err) where
    Tokeniser RWST
  (char -> Line_and_char -> Line_and_char)
  [L token]
  (State char)
  (Either err)
  a
tokenise' >>= :: Tokeniser' char token err a
-> (a -> Tokeniser' char token err b)
-> Tokeniser' char token err b
>>= a -> Tokeniser' char token err b
f = RWST
  (char -> Line_and_char -> Line_and_char)
  [L token]
  (State char)
  (Either err)
  b
-> Tokeniser' char token err b
forall char token err t.
RWST
  (char -> Line_and_char -> Line_and_char)
  [L token]
  (State char)
  (Either err)
  t
-> Tokeniser' char token err t
Tokeniser (RWST
  (char -> Line_and_char -> Line_and_char)
  [L token]
  (State char)
  (Either err)
  a
tokenise' RWST
  (char -> Line_and_char -> Line_and_char)
  [L token]
  (State char)
  (Either err)
  a
-> (a
    -> RWST
         (char -> Line_and_char -> Line_and_char)
         [L token]
         (State char)
         (Either err)
         b)
-> RWST
     (char -> Line_and_char -> Line_and_char)
     [L token]
     (State char)
     (Either err)
     b
forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= Tokeniser' char token err b
-> RWST
     (char -> Line_and_char -> Line_and_char)
     [L token]
     (State char)
     (Either err)
     b
forall char token err t.
Tokeniser' char token err t
-> RWST
     (char -> Line_and_char -> Line_and_char)
     [L token]
     (State char)
     (Either err)
     t
run_tokeniser (Tokeniser' char token err b
 -> RWST
      (char -> Line_and_char -> Line_and_char)
      [L token]
      (State char)
      (Either err)
      b)
-> (a -> Tokeniser' char token err b)
-> a
-> RWST
     (char -> Line_and_char -> Line_and_char)
     [L token]
     (State char)
     (Either err)
     b
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> a -> Tokeniser' char token err b
f)
  deriving instance Show char => Show (State char)
  deriving instance Show token => Show (Tokens' token)
  -- | Add the token to the output. Note that the order of adding tokens is important.

  add_token :: Line_and_char -> token -> Tokeniser' char token err ()
  add_token :: Line_and_char -> token -> Tokeniser' char token err ()
add_token Line_and_char
line_and_char token
token = [L token] -> Tokeniser' char token err ()
forall token char err. [L token] -> Tokeniser' char token err ()
tell_Tokeniser [Line_and_char -> token -> L token
forall t. Line_and_char -> t -> L t
L Line_and_char
line_and_char token
token]
  ask_Tokeniser :: Tokeniser' char token err (char -> Line_and_char -> Line_and_char)
  ask_Tokeniser :: Tokeniser' char token err (char -> Line_and_char -> Line_and_char)
ask_Tokeniser = RWST
  (char -> Line_and_char -> Line_and_char)
  [L token]
  (State char)
  (Either err)
  (char -> Line_and_char -> Line_and_char)
-> Tokeniser'
     char token err (char -> Line_and_char -> Line_and_char)
forall char token err t.
RWST
  (char -> Line_and_char -> Line_and_char)
  [L token]
  (State char)
  (Either err)
  t
-> Tokeniser' char token err t
Tokeniser RWST
  (char -> Line_and_char -> Line_and_char)
  [L token]
  (State char)
  (Either err)
  (char -> Line_and_char -> Line_and_char)
forall r (m :: * -> *). MonadReader r m => m r
ask
  -- | Get the location of the first token or, if there are none, the end of file. For internal use in the parser.

  current_line_and_char :: Tokens' token -> Line_and_char
  current_line_and_char :: Tokens' token -> Line_and_char
current_line_and_char (Tokens [L token]
tokens Line_and_char
end_line_and_char) =
    case [L token]
tokens of
      [] -> Line_and_char
end_line_and_char
      L Line_and_char
line_and_char token
_ : [L token]
_ -> Line_and_char
line_and_char
  -- | Delete the first character from the remaining text. Automatically updates the location.

  delete_char :: Tokeniser' char token err ()
  delete_char :: Tokeniser' char token err ()
delete_char =
    do
      char -> Line_and_char -> Line_and_char
next_line_and_char <- Tokeniser' char token err (char -> Line_and_char -> Line_and_char)
forall char token err.
Tokeniser' char token err (char -> Line_and_char -> Line_and_char)
ask_Tokeniser
      State Line_and_char
line_and_char [char]
text <- Tokeniser' char token err (State char)
forall char token err. Tokeniser' char token err (State char)
get_Tokeniser
      case [char]
text of
        [] -> () -> Tokeniser' char token err ()
forall (m :: * -> *) a. Monad m => a -> m a
return ()
        char
char : [char]
text' -> State char -> Tokeniser' char token err ()
forall char token err. State char -> Tokeniser' char token err ()
put_Tokeniser (Line_and_char -> [char] -> State char
forall char. Line_and_char -> [char] -> State char
State (char -> Line_and_char -> Line_and_char
next_line_and_char char
char Line_and_char
line_and_char) [char]
text')
  -- | Add a token that consists of several characters - for example, an operator, a word or a natural number. You have to

  -- provide a function that recognises suitable characters and a function that transforms the resulting string into a token.

  gather_token :: (char -> Maybe Char) -> (String -> token) -> Tokeniser' char token err ()
  gather_token :: (char -> Maybe Char)
-> (String -> token) -> Tokeniser' char token err ()
gather_token char -> Maybe Char
recognise_char String -> token
string_to_token =
    do
      Line_and_char
line_and_char <- Tokeniser' char token err Line_and_char
forall char token err. Tokeniser' char token err Line_and_char
get_line_and_char
      String
token <- (char -> Maybe Char) -> Tokeniser' char token err String
forall char token err.
(char -> Maybe Char) -> Tokeniser' char token err String
gather_token' char -> Maybe Char
recognise_char
      Line_and_char -> token -> Tokeniser' char token err ()
forall token char err.
Line_and_char -> token -> Tokeniser' char token err ()
add_token Line_and_char
line_and_char (String -> token
string_to_token String
token)
  gather_token' :: (char -> Maybe Char) -> Tokeniser' char token err String
  gather_token' :: (char -> Maybe Char) -> Tokeniser' char token err String
gather_token' char -> Maybe Char
recognise_char =
    let
      f :: Tokeniser' char token err String
f = (char -> Maybe Char) -> Tokeniser' char token err String
forall char token err.
(char -> Maybe Char) -> Tokeniser' char token err String
gather_token' char -> Maybe Char
recognise_char
    in
      do
        Maybe char
maybe_char <- Int -> Tokeniser' char token err (Maybe char)
forall char token err.
Int -> Tokeniser' char token err (Maybe char)
get_char Int
0
        case Maybe char
maybe_char Maybe char -> (char -> Maybe Char) -> Maybe Char
forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= char -> Maybe Char
recognise_char of
          Maybe Char
Nothing -> String -> Tokeniser' char token err String
forall (m :: * -> *) a. Monad m => a -> m a
return String
""
          Just Char
char ->
            do
              Tokeniser' char token err ()
forall char token err. Tokeniser' char token err ()
delete_char
              String
token <- Tokeniser' char token err String
forall token err. Tokeniser' char token err String
f
              String -> Tokeniser' char token err String
forall (m :: * -> *) a. Monad m => a -> m a
return (Char
char Char -> ShowS
forall a. a -> [a] -> [a]
: String
token)
  get_Tokeniser :: Tokeniser' char token err (State char)
  get_Tokeniser :: Tokeniser' char token err (State char)
get_Tokeniser = RWST
  (char -> Line_and_char -> Line_and_char)
  [L token]
  (State char)
  (Either err)
  (State char)
-> Tokeniser' char token err (State char)
forall char token err t.
RWST
  (char -> Line_and_char -> Line_and_char)
  [L token]
  (State char)
  (Either err)
  t
-> Tokeniser' char token err t
Tokeniser RWST
  (char -> Line_and_char -> Line_and_char)
  [L token]
  (State char)
  (Either err)
  (State char)
forall s (m :: * -> *). MonadState s m => m s
get
  -- | Take a look at a character without deleting it. Returns @Nothing@ if the index is negative or if the remaining text is

  -- too short.

  get_char :: Int -> Tokeniser' char token err (Maybe char)
  get_char :: Int -> Tokeniser' char token err (Maybe char)
get_char Int
i =
    do
      [char]
text <- State char -> [char]
forall char. State char -> [char]
state_text (State char -> [char])
-> Tokeniser' char token err (State char)
-> Tokeniser' char token err [char]
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> Tokeniser' char token err (State char)
forall char token err. Tokeniser' char token err (State char)
get_Tokeniser
      Maybe char -> Tokeniser' char token err (Maybe char)
forall (m :: * -> *) a. Monad m => a -> m a
return
        (case Int
0 Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
<= Int
i Bool -> Bool -> Bool
&& Int
i Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
< [char] -> Int
forall (t :: * -> *) a. Foldable t => t a -> Int
length [char]
text of
          Bool
False -> Maybe char
forall a. Maybe a
Nothing
          Bool
True -> char -> Maybe char
forall a. a -> Maybe a
Just ([char]
text [char] -> Int -> char
forall a. [a] -> Int -> a
!! Int
i))
  -- | Get the current location of the tokeniser.

  get_line_and_char :: Tokeniser' char token err Line_and_char
  get_line_and_char :: Tokeniser' char token err Line_and_char
get_line_and_char = State char -> Line_and_char
forall char. State char -> Line_and_char
state_line_and_char (State char -> Line_and_char)
-> Tokeniser' char token err (State char)
-> Tokeniser' char token err Line_and_char
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> Tokeniser' char token err (State char)
forall char token err. Tokeniser' char token err (State char)
get_Tokeniser
  -- | Get the first token without deleting it. For internal use in the parser.

  get_token :: Tokens' token -> Maybe token
  get_token :: Tokens' token -> Maybe token
get_token (Tokens [L token]
tokens Line_and_char
_) =
    case [L token]
tokens of
      [] -> Maybe token
forall a. Maybe a
Nothing
      L Line_and_char
_ token
token : [L token]
_ -> token -> Maybe token
forall a. a -> Maybe a
Just token
token
  put_Tokeniser :: State char -> Tokeniser' char token err ()
  put_Tokeniser :: State char -> Tokeniser' char token err ()
put_Tokeniser State char
st = RWST
  (char -> Line_and_char -> Line_and_char)
  [L token]
  (State char)
  (Either err)
  ()
-> Tokeniser' char token err ()
forall char token err t.
RWST
  (char -> Line_and_char -> Line_and_char)
  [L token]
  (State char)
  (Either err)
  t
-> Tokeniser' char token err t
Tokeniser (State char
-> RWST
     (char -> Line_and_char -> Line_and_char)
     [L token]
     (State char)
     (Either err)
     ()
forall s (m :: * -> *). MonadState s m => s -> m ()
put State char
st)
  -- | Recognises tokens that fit a certain pattern and transforms them into something more useful - for example, a string or an

  -- integer. Returns @Nothing@ if the first token does not fit the pattern, and returns the transformed token and the rest of

  -- the sequence if it does fit. For internal use in the parser.

  take_token :: (token -> Maybe t) -> Tokens' token -> Maybe (t, Tokens' token)
  take_token :: (token -> Maybe t) -> Tokens' token -> Maybe (t, Tokens' token)
take_token token -> Maybe t
f (Tokens [L token]
tokens Line_and_char
end_line_and_char) =
    case [L token]
tokens of
      [] -> Maybe (t, Tokens' token)
forall a. Maybe a
Nothing
      L Line_and_char
_ token
token : [L token]
tokens' ->
        do
          t
x <- token -> Maybe t
f token
token
          (t, Tokens' token) -> Maybe (t, Tokens' token)
forall (m :: * -> *) a. Monad m => a -> m a
return (t
x, [L token] -> Line_and_char -> Tokens' token
forall token. [L token] -> Line_and_char -> Tokens' token
Tokens [L token]
tokens' Line_and_char
end_line_and_char)
  tell_Tokeniser :: [L token] -> Tokeniser' char token err ()
  tell_Tokeniser :: [L token] -> Tokeniser' char token err ()
tell_Tokeniser [L token]
tokens = RWST
  (char -> Line_and_char -> Line_and_char)
  [L token]
  (State char)
  (Either err)
  ()
-> Tokeniser' char token err ()
forall char token err t.
RWST
  (char -> Line_and_char -> Line_and_char)
  [L token]
  (State char)
  (Either err)
  t
-> Tokeniser' char token err t
Tokeniser ([L token]
-> RWST
     (char -> Line_and_char -> Line_and_char)
     [L token]
     (State char)
     (Either err)
     ()
forall w (m :: * -> *). MonadWriter w m => w -> m ()
tell [L token]
tokens)
  throwError_Tokeniser :: err -> Tokeniser' char token err t
  throwError_Tokeniser :: err -> Tokeniser' char token err t
throwError_Tokeniser err
err = RWST
  (char -> Line_and_char -> Line_and_char)
  [L token]
  (State char)
  (Either err)
  t
-> Tokeniser' char token err t
forall char token err t.
RWST
  (char -> Line_and_char -> Line_and_char)
  [L token]
  (State char)
  (Either err)
  t
-> Tokeniser' char token err t
Tokeniser (err
-> RWST
     (char -> Line_and_char -> Line_and_char)
     [L token]
     (State char)
     (Either err)
     t
forall e (m :: * -> *) a. MonadError e m => e -> m a
throwError err
err)
  -- | Throw a tokenisation error at the current location.

  tokenisation_error :: (Line_and_char -> err) -> Tokeniser' char token err t
  tokenisation_error :: (Line_and_char -> err) -> Tokeniser' char token err t
tokenisation_error Line_and_char -> err
err =
    do
      Line_and_char
line_and_char <- Tokeniser' char token err Line_and_char
forall char token err. Tokeniser' char token err Line_and_char
get_line_and_char
      err -> Tokeniser' char token err t
forall err char token t. err -> Tokeniser' char token err t
throwError_Tokeniser (Line_and_char -> err
err Line_and_char
line_and_char)
  -- | Tokenise the text. For internal use in the parser.

  tokenise ::
    (
      (Char -> char) ->
      (char -> Line_and_char -> Line_and_char) ->
      Tokeniser' char token err () ->
      String ->
      Either Error (Either err (Tokens' token)))
  tokenise :: (Char -> char)
-> (char -> Line_and_char -> Line_and_char)
-> Tokeniser' char token err ()
-> String
-> Either Error (Either err (Tokens' token))
tokenise Char -> char
classify_char char -> Line_and_char -> Line_and_char
next_line_and_char (Tokeniser RWST
  (char -> Line_and_char -> Line_and_char)
  [L token]
  (State char)
  (Either err)
  ()
tokenise') String
text =
    case RWST
  (char -> Line_and_char -> Line_and_char)
  [L token]
  (State char)
  (Either err)
  ()
-> (char -> Line_and_char -> Line_and_char)
-> State char
-> Either err (State char, [L token])
forall (m :: * -> *) r w s a.
Monad m =>
RWST r w s m a -> r -> s -> m (s, w)
execRWST RWST
  (char -> Line_and_char -> Line_and_char)
  [L token]
  (State char)
  (Either err)
  ()
tokenise' char -> Line_and_char -> Line_and_char
next_line_and_char (Line_and_char -> [char] -> State char
forall char. Line_and_char -> [char] -> State char
State Line_and_char
init_line_and_char (Char -> char
classify_char (Char -> char) -> String -> [char]
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> String
text)) of
      Left err
err -> Either err (Tokens' token)
-> Either Error (Either err (Tokens' token))
forall a b. b -> Either a b
Right (err -> Either err (Tokens' token)
forall a b. a -> Either a b
Left err
err)
      Right (State Line_and_char
line_and_char [char]
text', [L token]
tokens) ->
        case [char]
text' of
          [] -> Either err (Tokens' token)
-> Either Error (Either err (Tokens' token))
forall a b. b -> Either a b
Right (Tokens' token -> Either err (Tokens' token)
forall a b. b -> Either a b
Right ([L token] -> Line_and_char -> Tokens' token
forall token. [L token] -> Line_and_char -> Tokens' token
Tokens [L token]
tokens Line_and_char
line_and_char))
          [char]
_ -> Error -> Either Error (Either err (Tokens' token))
forall a b. a -> Either a b
Left Error
Incomplete_tokenisation
  -- | Check whether the sequence of tokens has ended. For internal use in the parser.

  tokens_ended :: Tokens' token -> Bool
  tokens_ended :: Tokens' token -> Bool
tokens_ended (Tokens [L token]
tokens Line_and_char
_) = [L token] -> Bool
forall (t :: * -> *) a. Foldable t => t a -> Bool
null [L token]
tokens