{-# LANGUAGE FlexibleInstances #-}

-- | A monad class for writing pure tokenizers in an imperative-looking way.
--
-- Main idea: You 'walk' through the input string like a turtle, and everytime
-- you find a token boundary, you call 'emit'. If some specific kinds of tokens
-- should be suppressed, you can 'discard' them instead (or filter afterwards).
--
-- An concrete instance of this class is provided in "Control.Monad.Tokenizer".

module Control.Monad.Tokenizer.Class (
  -- * Monad class
  MonadTokenizer(..),
  untilEOT,
  -- * Utilities
  pop,
  walkWhile,
  walkFold,
  -- * Text types
  Tokenizable(..)
  ) where

import Control.Monad
import Data.Char
import qualified Data.Text as T
import qualified Data.Text.Lazy as LT

-- | Text types that can be split by the Tokenizer monad. In this module,
-- instances are provided for String, strict Text, and lazy Text.
-- If you are dealing with ASCII ByteStrings, you can find instances in
-- the modules "Control.Monad.Tokenizer.Char8.Strict" and
-- "Control.Monad.Tokenizer.Char8.Lazy"
class Tokenizable t where
  tnull :: t -> Bool
  thead :: t -> Char
  ttail :: t -> t
  ttake :: Int -> t -> t
  tdrop :: Int -> t -> t
  tlower :: t -> t

instance Tokenizable T.Text where
  tnull = T.null
  thead = T.head
  ttail = T.tail
  ttake = T.take
  tdrop = T.drop
  tlower = T.toLower

instance Tokenizable LT.Text where
  tnull = LT.null
  thead = LT.head
  ttail = LT.tail
  ttake = LT.take . fromIntegral
  tdrop = LT.drop . fromIntegral
  tlower = LT.toLower

instance Tokenizable [Char] where
  tnull = null
  thead = head
  ttail = tail
  ttake = take
  tdrop = drop
  tlower = map toLower

-- | A monad for turtle tokenization.
class Monad m => MonadTokenizer m where
  -- | Proceed to the next character
  walk :: m ()
  -- | Walk back to the previous character, unless it was discarded/emitted.
  walkBack :: m ()
  -- | Peek the current character
  peek :: m Char
  -- | Restore the state after the last emit/discard.
  restore :: m ()
  -- | Break at the current position and emit the scanned token
  emit :: m ()
  -- | Break at the current position and discard the scanned token
  discard :: m ()
  -- | Have I reached the end of the input text?
  isEOT :: m Bool
  -- | Check if the next input chars agree with the given string
  lookAhead :: [Char] -> m Bool
  -- | Peek the current character and proceed
  pop :: m Char
  pop = peek <* walk

-- | Proceed as long as a given function succeeds
walkWhile :: MonadTokenizer m => (Char -> Bool) -> m ()
walkWhile f = do
  c <- peek
  when (c /= '\0' && f c) $
       walk >> walkWhile f

-- | Proceed as long as a given fold returns Just (generalization of walkWhile)
walkFold :: MonadTokenizer m => a -> (Char -> a -> Maybe a) -> m ()
walkFold s0 f = do
  c <- peek
  unless (c == '\0') $ case f c s0 of
      Nothing -> return ()
      Just s -> walk >> walkFold s f

-- | Repeat a given tokenizer as long as the end of text is not reached
untilEOT :: MonadTokenizer m => m () -> m ()
untilEOT f = do
  eot <- isEOT
  unless eot $ f >> untilEOT f