{-# LANGUAGE FlexibleInstances #-} -- | A monad class for writing pure tokenizers in an imperative-looking way. -- -- Main idea: You 'walk' through the input string like a turtle, and everytime -- you find a token boundary, you call 'emit'. If some specific kinds of tokens -- should be suppressed, you can 'discard' them instead (or filter afterwards). -- -- An concrete instance of this class is provided in "Control.Monad.Tokenizer". module Control.Monad.Tokenizer.Class ( -- * Monad class MonadTokenizer(..), untilEOT, -- * Utilities pop, walkWhile, walkFold, -- * Text types Tokenizable(..) ) where import Control.Monad import Data.Char import qualified Data.Text as T import qualified Data.Text.Lazy as LT -- | Text types that can be split by the Tokenizer monad. In this module, -- instances are provided for String, strict Text, and lazy Text. -- If you are dealing with ASCII ByteStrings, you can find instances in -- the modules "Control.Monad.Tokenizer.Char8.Strict" and -- "Control.Monad.Tokenizer.Char8.Lazy" class Tokenizable t where tnull :: t -> Bool thead :: t -> Char ttail :: t -> t ttake :: Int -> t -> t tdrop :: Int -> t -> t tlower :: t -> t instance Tokenizable T.Text where tnull = T.null thead = T.head ttail = T.tail ttake = T.take tdrop = T.drop tlower = T.toLower instance Tokenizable LT.Text where tnull = LT.null thead = LT.head ttail = LT.tail ttake = LT.take . fromIntegral tdrop = LT.drop . fromIntegral tlower = LT.toLower instance Tokenizable [Char] where tnull = null thead = head ttail = tail ttake = take tdrop = drop tlower = map toLower -- | A monad for turtle tokenization. class Monad m => MonadTokenizer m where -- | Proceed to the next character walk :: m () -- | Walk back to the previous character, unless it was discarded/emitted. walkBack :: m () -- | Peek the current character peek :: m Char -- | Restore the state after the last emit/discard. restore :: m () -- | Break at the current position and emit the scanned token emit :: m () -- | Break at the current position and discard the scanned token discard :: m () -- | Have I reached the end of the input text? isEOT :: m Bool -- | Check if the next input chars agree with the given string lookAhead :: [Char] -> m Bool -- | Peek the current character and proceed pop :: MonadTokenizer m => m Char pop = peek <* walk -- | Proceed as long as a given function succeeds walkWhile :: MonadTokenizer m => (Char -> Bool) -> m () walkWhile f = do c <- peek when (c /= '\0' && f c) $ walk >> walkWhile f -- | Proceed as long as a given fold returns Just (generalization of walkWhile) walkFold :: MonadTokenizer m => a -> (Char -> a -> Maybe a) -> m () walkFold s0 f = do c <- peek unless (c == '\0') $ case f c s0 of Nothing -> return () Just s -> walk >> walkFold s f -- | Repeat a given tokenizer as long as the end of text is not reached untilEOT :: MonadTokenizer m => m () -> m () untilEOT f = do eot <- isEOT unless eot $ f >> untilEOT f