{-# LANGUAGE OverloadedStrings, BangPatterns #-}

module Control.Monad.Tokenizer where

import Control.Monad
import Data.Char
import Data.Monoid
import qualified Data.Text as T

-- | Tokenizer monad. Use runTokenizer or runTokenizerCS to run it.
--
-- The internal state is a triple of: the text starting at the first
-- character of the current token, the number of characters scanned for
-- that token, and the remaining input. Emitted tokens are accumulated
-- in a difference list (@[T.Text] -> [T.Text]@) so appending stays cheap.
newtype Tokenizer a = Tokenizer
  { runTokenizer' :: (T.Text, Int, T.Text)
                  -> (a, [T.Text] -> [T.Text], T.Text, Int, T.Text)
  }

instance Functor Tokenizer where
  fmap = liftM

instance Applicative Tokenizer where
  pure = return
  (<*>) = ap

instance Monad Tokenizer where
  return a = Tokenizer $ \(whole,count,tail) -> (a,id,whole,count,tail)
  m >>= f = Tokenizer $ \(whole,count,tail) ->
    let (a1,o1,w1,!c1,t1) = runTokenizer' m (whole,count,tail)
        (a2,o2,w2,!c2,t2) = runTokenizer' (f a1) (w1,c1,t1)
    in (a2, o1 . o2, w2, c2, t2)

-- | Check whether the next input characters match the given string
lookAhead :: [Char] -> Tokenizer Bool
lookAhead chars = Tokenizer $ \(whole,count,tail) ->
  let h = T.unpack $ T.take (length chars) tail
  in (h == chars, id, whole, count, tail)

-- | Proceed to the next character
walk :: Tokenizer ()
walk = Tokenizer $ \(whole,count,tail) ->
  if T.null tail
    then ((), id, whole, count, tail)
    else ((), id, whole, count+1, T.tail tail)

-- | Peek the current character (returns '\0' at the end of the text)
peek :: Tokenizer Char
peek = Tokenizer $ \(whole,count,tail) -> (th tail, id, whole, count, tail)
  where th t | T.null t = '\0'
             | otherwise = T.head t

-- | Peek the current character and proceed
pop :: Tokenizer Char
pop = peek <* walk

-- | Break at the current position and emit the scanned token
emit :: Tokenizer ()
emit = Tokenizer $ \(whole,count,tail) -> ((), (T.take count whole:), tail, 0, tail)

-- | Break at the current position and discard the scanned token
discard :: Tokenizer ()
discard = Tokenizer $ \(whole,count,tail) -> ((), id, tail, 0, tail)

-- | Proceed as long as the given predicate holds for the current character
walkWhile :: (Char -> Bool) -> Tokenizer ()
walkWhile f = do
  c <- peek
  when (c /= '\0' && f c) $ walk >> walkWhile f

-- | Proceed as long as a given fold returns Just (generalization of walkWhile)
walkFold :: a -> (Char -> a -> Maybe a) -> Tokenizer ()
walkFold s0 f = do
  c <- peek
  unless (c == '\0') $
    case f c s0 of
      Nothing -> return ()
      Just s  -> walk >> walkFold s f

-- | Repeat a given tokenizer as long as the end of the text is not reached
untilEOT :: Tokenizer () -> Tokenizer ()
untilEOT f = do
  eot <- Tokenizer $ \(whole,count,tail) -> (T.null tail, id, whole, count, tail)
  unless eot $ f >> untilEOT f

-- | Split a string into tokens using the given tokenizer. The input is
-- converted to lower case first; use runTokenizerCS to preserve case.
runTokenizer :: Tokenizer () -> T.Text -> [T.Text]
runTokenizer m input =
  let input' = T.toLower input
  in case runTokenizer' m (input', 0, input') of
       (_, tokens, _, _, _) -> tokens []

-- | Split a string into tokens using the given tokenizer, case-sensitive version
runTokenizerCS :: Tokenizer () -> T.Text -> [T.Text]
runTokenizerCS m input =
  case runTokenizer' m (input, 0, input) of
    (_, tokens, _, _, _) -> tokens []
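
-- The following is a minimal usage sketch, not part of the library's API.
-- The name exampleTokenizer and its splitting rules are assumptions made
-- here purely to show how peek, walk, walkWhile, emit, discard and untilEOT
-- compose. It groups alphanumeric runs into single tokens, emits every
-- other non-space character on its own, and silently drops whitespace.
exampleTokenizer :: Tokenizer ()
exampleTokenizer = untilEOT $ do
  c <- peek
  if isSpace c
    then walk >> discard                  -- consume whitespace, emit nothing
    else if isAlphaNum c
      then walkWhile isAlphaNum >> emit   -- a run of letters/digits is one token
      else walk >> emit                   -- any other character is its own token

-- >>> runTokenizer exampleTokenizer "Hello, world!"
-- ["hello",",","world","!"]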