{-# OPTIONS_GHC -Wall #-}
{-# LANGUAGE StandaloneDeriving #-}
{-|
Description: A simple state- and transition-based tokeniser with location tracking.

* Tokeniser
-}
module Parser.Tokeniser (
  Tokeniser',
  Tokens',
  add_token,
  current_line_and_char,
  delete_char,
  gather_token,
  get_char,
  get_token,
  take_token,
  tokenisation_error,
  tokenise,
  tokens_ended) where
  import Control.Monad.Except (MonadError (..))
  import Control.Monad.RWS.Strict (RWST, execRWST)
  import Control.Monad.Reader (MonadReader (..))
  import Control.Monad.State.Strict (MonadState (..))
  import Control.Monad.Writer.Strict (MonadWriter (..))
  import Parser.Errors (Error (..))
  import Parser.Line_and_char (L (..), Line_and_char, init_line_and_char)
  data State char_class = State {State char_class -> Line_and_char
state_line_and_char :: Line_and_char, State char_class -> [char_class]
state_text :: [char_class]}
  -- | A tokeniser that works with any kind of custom characters, tokens and errors. The custom character type is useful if you

  -- need to classify characters according to their behavior before tokenisation - for example, wrap all operators, letters,

  -- delimiters or digits in the same constructor to simplify pattern matching.

  data Tokeniser' char_class token err t =
    Tokeniser {Tokeniser' char_class token err t
-> RWST
     (char_class -> Line_and_char -> Line_and_char)
     [L token]
     (State char_class)
     (Either err)
     t
run_tokeniser :: RWST (char_class -> Line_and_char -> Line_and_char) [L token] (State char_class) (Either err) t}
  -- | A sequence of tokens with locations. For internal use in the parser.

  data Tokens' token = Tokens [L token] Line_and_char
  instance Applicative (Tokeniser' char_class token err) where
    Tokeniser RWST
  (char_class -> Line_and_char -> Line_and_char)
  [L token]
  (State char_class)
  (Either err)
  (a -> b)
tokenise_0 <*> :: Tokeniser' char_class token err (a -> b)
-> Tokeniser' char_class token err a
-> Tokeniser' char_class token err b
<*> Tokeniser RWST
  (char_class -> Line_and_char -> Line_and_char)
  [L token]
  (State char_class)
  (Either err)
  a
tokenise_1 = RWST
  (char_class -> Line_and_char -> Line_and_char)
  [L token]
  (State char_class)
  (Either err)
  b
-> Tokeniser' char_class token err b
forall char_class token err t.
RWST
  (char_class -> Line_and_char -> Line_and_char)
  [L token]
  (State char_class)
  (Either err)
  t
-> Tokeniser' char_class token err t
Tokeniser (RWST
  (char_class -> Line_and_char -> Line_and_char)
  [L token]
  (State char_class)
  (Either err)
  (a -> b)
tokenise_0 RWST
  (char_class -> Line_and_char -> Line_and_char)
  [L token]
  (State char_class)
  (Either err)
  (a -> b)
-> RWST
     (char_class -> Line_and_char -> Line_and_char)
     [L token]
     (State char_class)
     (Either err)
     a
-> RWST
     (char_class -> Line_and_char -> Line_and_char)
     [L token]
     (State char_class)
     (Either err)
     b
forall (f :: * -> *) a b. Applicative f => f (a -> b) -> f a -> f b
<*> RWST
  (char_class -> Line_and_char -> Line_and_char)
  [L token]
  (State char_class)
  (Either err)
  a
tokenise_1)
    pure :: a -> Tokeniser' char_class token err a
pure a
x = RWST
  (char_class -> Line_and_char -> Line_and_char)
  [L token]
  (State char_class)
  (Either err)
  a
-> Tokeniser' char_class token err a
forall char_class token err t.
RWST
  (char_class -> Line_and_char -> Line_and_char)
  [L token]
  (State char_class)
  (Either err)
  t
-> Tokeniser' char_class token err t
Tokeniser (a
-> RWST
     (char_class -> Line_and_char -> Line_and_char)
     [L token]
     (State char_class)
     (Either err)
     a
forall (m :: * -> *) a. Monad m => a -> m a
return a
x)
  instance Functor (Tokeniser' char_class token err) where
    fmap :: (a -> b)
-> Tokeniser' char_class token err a
-> Tokeniser' char_class token err b
fmap a -> b
f (Tokeniser RWST
  (char_class -> Line_and_char -> Line_and_char)
  [L token]
  (State char_class)
  (Either err)
  a
tokenise') = RWST
  (char_class -> Line_and_char -> Line_and_char)
  [L token]
  (State char_class)
  (Either err)
  b
-> Tokeniser' char_class token err b
forall char_class token err t.
RWST
  (char_class -> Line_and_char -> Line_and_char)
  [L token]
  (State char_class)
  (Either err)
  t
-> Tokeniser' char_class token err t
Tokeniser (a -> b
f (a -> b)
-> RWST
     (char_class -> Line_and_char -> Line_and_char)
     [L token]
     (State char_class)
     (Either err)
     a
-> RWST
     (char_class -> Line_and_char -> Line_and_char)
     [L token]
     (State char_class)
     (Either err)
     b
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> RWST
  (char_class -> Line_and_char -> Line_and_char)
  [L token]
  (State char_class)
  (Either err)
  a
tokenise')
  instance Monad (Tokeniser' char_class token err) where
    Tokeniser RWST
  (char_class -> Line_and_char -> Line_and_char)
  [L token]
  (State char_class)
  (Either err)
  a
tokenise' >>= :: Tokeniser' char_class token err a
-> (a -> Tokeniser' char_class token err b)
-> Tokeniser' char_class token err b
>>= a -> Tokeniser' char_class token err b
f = RWST
  (char_class -> Line_and_char -> Line_and_char)
  [L token]
  (State char_class)
  (Either err)
  b
-> Tokeniser' char_class token err b
forall char_class token err t.
RWST
  (char_class -> Line_and_char -> Line_and_char)
  [L token]
  (State char_class)
  (Either err)
  t
-> Tokeniser' char_class token err t
Tokeniser (RWST
  (char_class -> Line_and_char -> Line_and_char)
  [L token]
  (State char_class)
  (Either err)
  a
tokenise' RWST
  (char_class -> Line_and_char -> Line_and_char)
  [L token]
  (State char_class)
  (Either err)
  a
-> (a
    -> RWST
         (char_class -> Line_and_char -> Line_and_char)
         [L token]
         (State char_class)
         (Either err)
         b)
-> RWST
     (char_class -> Line_and_char -> Line_and_char)
     [L token]
     (State char_class)
     (Either err)
     b
forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= Tokeniser' char_class token err b
-> RWST
     (char_class -> Line_and_char -> Line_and_char)
     [L token]
     (State char_class)
     (Either err)
     b
forall char_class token err t.
Tokeniser' char_class token err t
-> RWST
     (char_class -> Line_and_char -> Line_and_char)
     [L token]
     (State char_class)
     (Either err)
     t
run_tokeniser (Tokeniser' char_class token err b
 -> RWST
      (char_class -> Line_and_char -> Line_and_char)
      [L token]
      (State char_class)
      (Either err)
      b)
-> (a -> Tokeniser' char_class token err b)
-> a
-> RWST
     (char_class -> Line_and_char -> Line_and_char)
     [L token]
     (State char_class)
     (Either err)
     b
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> a -> Tokeniser' char_class token err b
f)
  deriving instance Show char_class => Show (State char_class)
  deriving instance Show token => Show (Tokens' token)
  -- | Add the token to the output. Note that the order of adding tokens is important and you have to add the token before

  -- deleting the respective characters to get the correct location.

  add_token :: token -> Tokeniser' char_class token err ()
  add_token :: token -> Tokeniser' char_class token err ()
add_token token
token =
    do
      Line_and_char
line_and_char <- State char_class -> Line_and_char
forall char_class. State char_class -> Line_and_char
state_line_and_char (State char_class -> Line_and_char)
-> Tokeniser' char_class token err (State char_class)
-> Tokeniser' char_class token err Line_and_char
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> RWST
  (char_class -> Line_and_char -> Line_and_char)
  [L token]
  (State char_class)
  (Either err)
  (State char_class)
-> Tokeniser' char_class token err (State char_class)
forall char_class token err t.
RWST
  (char_class -> Line_and_char -> Line_and_char)
  [L token]
  (State char_class)
  (Either err)
  t
-> Tokeniser' char_class token err t
Tokeniser RWST
  (char_class -> Line_and_char -> Line_and_char)
  [L token]
  (State char_class)
  (Either err)
  (State char_class)
forall s (m :: * -> *). MonadState s m => m s
get
      Line_and_char -> token -> Tokeniser' char_class token err ()
forall token char_class err.
Line_and_char -> token -> Tokeniser' char_class token err ()
add_token' Line_and_char
line_and_char token
token
  add_token' :: Line_and_char -> token -> Tokeniser' char_class token err ()
  add_token' :: Line_and_char -> token -> Tokeniser' char_class token err ()
add_token' Line_and_char
line_and_char token
token = RWST
  (char_class -> Line_and_char -> Line_and_char)
  [L token]
  (State char_class)
  (Either err)
  ()
-> Tokeniser' char_class token err ()
forall char_class token err t.
RWST
  (char_class -> Line_and_char -> Line_and_char)
  [L token]
  (State char_class)
  (Either err)
  t
-> Tokeniser' char_class token err t
Tokeniser ([L token]
-> RWST
     (char_class -> Line_and_char -> Line_and_char)
     [L token]
     (State char_class)
     (Either err)
     ()
forall w (m :: * -> *). MonadWriter w m => w -> m ()
tell [Line_and_char -> token -> L token
forall t. Line_and_char -> t -> L t
L Line_and_char
line_and_char token
token])
  -- | Get the location of the first token or, if there are none, the end of file. For internal use in the parser.

  current_line_and_char :: Tokens' token -> Line_and_char
  current_line_and_char :: Tokens' token -> Line_and_char
current_line_and_char (Tokens [L token]
tokens Line_and_char
end_line_and_char) =
    case [L token]
tokens of
      [] -> Line_and_char
end_line_and_char
      L Line_and_char
line_and_char token
_ : [L token]
_ -> Line_and_char
line_and_char
  -- | Delete the first character from the remaining text. Automatically updates the location.

  delete_char :: Tokeniser' char_class token err ()
  delete_char :: Tokeniser' char_class token err ()
delete_char =
    do
      char_class -> Line_and_char -> Line_and_char
next_line_and_char <- RWST
  (char_class -> Line_and_char -> Line_and_char)
  [L token]
  (State char_class)
  (Either err)
  (char_class -> Line_and_char -> Line_and_char)
-> Tokeniser'
     char_class token err (char_class -> Line_and_char -> Line_and_char)
forall char_class token err t.
RWST
  (char_class -> Line_and_char -> Line_and_char)
  [L token]
  (State char_class)
  (Either err)
  t
-> Tokeniser' char_class token err t
Tokeniser RWST
  (char_class -> Line_and_char -> Line_and_char)
  [L token]
  (State char_class)
  (Either err)
  (char_class -> Line_and_char -> Line_and_char)
forall r (m :: * -> *). MonadReader r m => m r
ask
      State Line_and_char
line_and_char [char_class]
text <- RWST
  (char_class -> Line_and_char -> Line_and_char)
  [L token]
  (State char_class)
  (Either err)
  (State char_class)
-> Tokeniser' char_class token err (State char_class)
forall char_class token err t.
RWST
  (char_class -> Line_and_char -> Line_and_char)
  [L token]
  (State char_class)
  (Either err)
  t
-> Tokeniser' char_class token err t
Tokeniser RWST
  (char_class -> Line_and_char -> Line_and_char)
  [L token]
  (State char_class)
  (Either err)
  (State char_class)
forall s (m :: * -> *). MonadState s m => m s
get
      case [char_class]
text of
        [] -> () -> Tokeniser' char_class token err ()
forall (m :: * -> *) a. Monad m => a -> m a
return ()
        char_class
char_class : [char_class]
text' -> RWST
  (char_class -> Line_and_char -> Line_and_char)
  [L token]
  (State char_class)
  (Either err)
  ()
-> Tokeniser' char_class token err ()
forall char_class token err t.
RWST
  (char_class -> Line_and_char -> Line_and_char)
  [L token]
  (State char_class)
  (Either err)
  t
-> Tokeniser' char_class token err t
Tokeniser (State char_class
-> RWST
     (char_class -> Line_and_char -> Line_and_char)
     [L token]
     (State char_class)
     (Either err)
     ()
forall s (m :: * -> *). MonadState s m => s -> m ()
put (Line_and_char -> [char_class] -> State char_class
forall char_class.
Line_and_char -> [char_class] -> State char_class
State (char_class -> Line_and_char -> Line_and_char
next_line_and_char char_class
char_class Line_and_char
line_and_char) [char_class]
text'))
  -- | Add a token that consists of several characters - for example, an operator, a word or a number. You have to provide a

  -- function that recognises suitable characters and a function that transforms the resulting string into a token.

  gather_token :: (char_class -> Maybe Char) -> (String -> token) -> Tokeniser' char_class token err ()
  gather_token :: (char_class -> Maybe Char)
-> (String -> token) -> Tokeniser' char_class token err ()
gather_token char_class -> Maybe Char
recognise_char String -> token
string_to_token =
    do
      Line_and_char
line_and_char <- State char_class -> Line_and_char
forall char_class. State char_class -> Line_and_char
state_line_and_char (State char_class -> Line_and_char)
-> Tokeniser' char_class token err (State char_class)
-> Tokeniser' char_class token err Line_and_char
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> RWST
  (char_class -> Line_and_char -> Line_and_char)
  [L token]
  (State char_class)
  (Either err)
  (State char_class)
-> Tokeniser' char_class token err (State char_class)
forall char_class token err t.
RWST
  (char_class -> Line_and_char -> Line_and_char)
  [L token]
  (State char_class)
  (Either err)
  t
-> Tokeniser' char_class token err t
Tokeniser RWST
  (char_class -> Line_and_char -> Line_and_char)
  [L token]
  (State char_class)
  (Either err)
  (State char_class)
forall s (m :: * -> *). MonadState s m => m s
get
      String
token <- (char_class -> Maybe Char)
-> Tokeniser' char_class token err String
forall char_class token err.
(char_class -> Maybe Char)
-> Tokeniser' char_class token err String
gather_token' char_class -> Maybe Char
recognise_char
      Line_and_char -> token -> Tokeniser' char_class token err ()
forall token char_class err.
Line_and_char -> token -> Tokeniser' char_class token err ()
add_token' Line_and_char
line_and_char (String -> token
string_to_token String
token)
  gather_token' :: (char_class -> Maybe Char) -> Tokeniser' char_class token err String
  gather_token' :: (char_class -> Maybe Char)
-> Tokeniser' char_class token err String
gather_token' char_class -> Maybe Char
recognise_char =
    let
      f :: Tokeniser' char_class token err String
f = (char_class -> Maybe Char)
-> Tokeniser' char_class token err String
forall char_class token err.
(char_class -> Maybe Char)
-> Tokeniser' char_class token err String
gather_token' char_class -> Maybe Char
recognise_char
    in
      do
        Maybe char_class
maybe_char_class <- Int -> Tokeniser' char_class token err (Maybe char_class)
forall char_class token err.
Int -> Tokeniser' char_class token err (Maybe char_class)
get_char Int
0
        case Maybe char_class
maybe_char_class Maybe char_class -> (char_class -> Maybe Char) -> Maybe Char
forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= char_class -> Maybe Char
recognise_char of
          Maybe Char
Nothing -> String -> Tokeniser' char_class token err String
forall (m :: * -> *) a. Monad m => a -> m a
return String
""
          Just Char
char ->
            do
              Tokeniser' char_class token err ()
forall char_class token err. Tokeniser' char_class token err ()
delete_char
              String
token <- Tokeniser' char_class token err String
forall token err. Tokeniser' char_class token err String
f
              String -> Tokeniser' char_class token err String
forall (m :: * -> *) a. Monad m => a -> m a
return (Char
char Char -> ShowS
forall a. a -> [a] -> [a]
: String
token)
  -- | Take a look at a character without deleting it. Returns @Nothing@ if the index is negative or if the remaining text is

  -- too short.

  get_char :: Int -> Tokeniser' char_class token err (Maybe char_class)
  get_char :: Int -> Tokeniser' char_class token err (Maybe char_class)
get_char Int
i =
    do
      [char_class]
text <- State char_class -> [char_class]
forall char_class. State char_class -> [char_class]
state_text (State char_class -> [char_class])
-> Tokeniser' char_class token err (State char_class)
-> Tokeniser' char_class token err [char_class]
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> RWST
  (char_class -> Line_and_char -> Line_and_char)
  [L token]
  (State char_class)
  (Either err)
  (State char_class)
-> Tokeniser' char_class token err (State char_class)
forall char_class token err t.
RWST
  (char_class -> Line_and_char -> Line_and_char)
  [L token]
  (State char_class)
  (Either err)
  t
-> Tokeniser' char_class token err t
Tokeniser RWST
  (char_class -> Line_and_char -> Line_and_char)
  [L token]
  (State char_class)
  (Either err)
  (State char_class)
forall s (m :: * -> *). MonadState s m => m s
get
      Maybe char_class
-> Tokeniser' char_class token err (Maybe char_class)
forall (m :: * -> *) a. Monad m => a -> m a
return
        (case Int
0 Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
<= Int
i Bool -> Bool -> Bool
&& Int
i Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
< [char_class] -> Int
forall (t :: * -> *) a. Foldable t => t a -> Int
length [char_class]
text of
          Bool
False -> Maybe char_class
forall a. Maybe a
Nothing
          Bool
True -> char_class -> Maybe char_class
forall a. a -> Maybe a
Just ([char_class]
text [char_class] -> Int -> char_class
forall a. [a] -> Int -> a
!! Int
i))
  -- | Get the first token without deleting it. For internal use in the parser.

  get_token :: Tokens' token -> Maybe token
  get_token :: Tokens' token -> Maybe token
get_token (Tokens [L token]
tokens Line_and_char
_) =
    case [L token]
tokens of
      [] -> Maybe token
forall a. Maybe a
Nothing
      L Line_and_char
_ token
token : [L token]
_ -> token -> Maybe token
forall a. a -> Maybe a
Just token
token
  -- | Recognises tokens that fit a certain pattern and transforms them into something more useful - for example, a string or an

  -- integer. Returns @Nothing@ if the first token does not fit the pattern, and returns the transformed token and the rest of

  -- the sequence if it does fit. For internal use in the parser.

  take_token :: (token -> Maybe t) -> Tokens' token -> Maybe (t, Tokens' token)
  take_token :: (token -> Maybe t) -> Tokens' token -> Maybe (t, Tokens' token)
take_token token -> Maybe t
f (Tokens [L token]
tokens Line_and_char
end_line_and_char) =
    case [L token]
tokens of
      [] -> Maybe (t, Tokens' token)
forall a. Maybe a
Nothing
      L Line_and_char
_ token
token : [L token]
tokens' ->
        do
          t
x <- token -> Maybe t
f token
token
          (t, Tokens' token) -> Maybe (t, Tokens' token)
forall (m :: * -> *) a. Monad m => a -> m a
return (t
x, [L token] -> Line_and_char -> Tokens' token
forall token. [L token] -> Line_and_char -> Tokens' token
Tokens [L token]
tokens' Line_and_char
end_line_and_char)
  -- | Throw a tokenisation error at the current location.

  tokenisation_error :: (Line_and_char -> err) -> Tokeniser' char_class token err t
  tokenisation_error :: (Line_and_char -> err) -> Tokeniser' char_class token err t
tokenisation_error Line_and_char -> err
err =
    do
      Line_and_char
line_and_char <- State char_class -> Line_and_char
forall char_class. State char_class -> Line_and_char
state_line_and_char (State char_class -> Line_and_char)
-> Tokeniser' char_class token err (State char_class)
-> Tokeniser' char_class token err Line_and_char
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> RWST
  (char_class -> Line_and_char -> Line_and_char)
  [L token]
  (State char_class)
  (Either err)
  (State char_class)
-> Tokeniser' char_class token err (State char_class)
forall char_class token err t.
RWST
  (char_class -> Line_and_char -> Line_and_char)
  [L token]
  (State char_class)
  (Either err)
  t
-> Tokeniser' char_class token err t
Tokeniser RWST
  (char_class -> Line_and_char -> Line_and_char)
  [L token]
  (State char_class)
  (Either err)
  (State char_class)
forall s (m :: * -> *). MonadState s m => m s
get
      RWST
  (char_class -> Line_and_char -> Line_and_char)
  [L token]
  (State char_class)
  (Either err)
  t
-> Tokeniser' char_class token err t
forall char_class token err t.
RWST
  (char_class -> Line_and_char -> Line_and_char)
  [L token]
  (State char_class)
  (Either err)
  t
-> Tokeniser' char_class token err t
Tokeniser (err
-> RWST
     (char_class -> Line_and_char -> Line_and_char)
     [L token]
     (State char_class)
     (Either err)
     t
forall e (m :: * -> *) a. MonadError e m => e -> m a
throwError (Line_and_char -> err
err Line_and_char
line_and_char))
  -- | Tokenise the text. For internal use in the parser.

  tokenise ::
    (
      (Char -> char_class) ->
      (char_class -> Line_and_char -> Line_and_char) ->
      Tokeniser' char_class token err () ->
      String ->
      Either Error (Either err (Tokens' token)))
  tokenise :: (Char -> char_class)
-> (char_class -> Line_and_char -> Line_and_char)
-> Tokeniser' char_class token err ()
-> String
-> Either Error (Either err (Tokens' token))
tokenise Char -> char_class
classify_char char_class -> Line_and_char -> Line_and_char
next_line_and_char (Tokeniser RWST
  (char_class -> Line_and_char -> Line_and_char)
  [L token]
  (State char_class)
  (Either err)
  ()
tokenise') String
text =
    case RWST
  (char_class -> Line_and_char -> Line_and_char)
  [L token]
  (State char_class)
  (Either err)
  ()
-> (char_class -> Line_and_char -> Line_and_char)
-> State char_class
-> Either err (State char_class, [L token])
forall (m :: * -> *) r w s a.
Monad m =>
RWST r w s m a -> r -> s -> m (s, w)
execRWST RWST
  (char_class -> Line_and_char -> Line_and_char)
  [L token]
  (State char_class)
  (Either err)
  ()
tokenise' char_class -> Line_and_char -> Line_and_char
next_line_and_char (Line_and_char -> [char_class] -> State char_class
forall char_class.
Line_and_char -> [char_class] -> State char_class
State Line_and_char
init_line_and_char (Char -> char_class
classify_char (Char -> char_class) -> String -> [char_class]
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> String
text)) of
      Left err
err -> Either err (Tokens' token)
-> Either Error (Either err (Tokens' token))
forall a b. b -> Either a b
Right (err -> Either err (Tokens' token)
forall a b. a -> Either a b
Left err
err)
      Right (State Line_and_char
line_and_char [char_class]
text', [L token]
tokens) ->
        case [char_class]
text' of
          [] -> Either err (Tokens' token)
-> Either Error (Either err (Tokens' token))
forall a b. b -> Either a b
Right (Tokens' token -> Either err (Tokens' token)
forall a b. b -> Either a b
Right ([L token] -> Line_and_char -> Tokens' token
forall token. [L token] -> Line_and_char -> Tokens' token
Tokens [L token]
tokens Line_and_char
line_and_char))
          [char_class]
_ -> Error -> Either Error (Either err (Tokens' token))
forall a b. a -> Either a b
Left Error
Incomplete_tokenisation
  -- | Check whether the sequence of tokens has ended. For internal use in the parser.

  tokens_ended :: Tokens' token -> Bool
  tokens_ended :: Tokens' token -> Bool
tokens_ended (Tokens [L token]
tokens Line_and_char
_) = [L token] -> Bool
forall (t :: * -> *) a. Foldable t => t a -> Bool
null [L token]
tokens