{-# LANGUAGE OverloadedStrings, BangPatterns, FlexibleInstances #-}

-- | A monad for writing pure tokenizers in an imperative-looking way.
--
-- Main idea: You 'walk' through the input string like a turtle, and every time
-- you find a token boundary, you call 'emit'. If some specific kinds of tokens
-- should be suppressed, you can 'discard' them instead (or filter afterwards).
--
-- This module supports strict text, lazy text, and strings, though the package
-- also provides support for ASCII bytestrings in separate modules.
--
-- Example of a simple tokenizer that splits words by whitespace and discards
-- stop symbols:
--
-- > tokenizeWords :: T.Text -> [T.Text]
-- > tokenizeWords = runTokenizer $ untilEOT $ do
-- >   c <- pop
-- >   if isStopSym c
-- >     then discard
-- >     else if c `elem` (" \t\r\n" :: [Char])
-- >       then discard
-- >       else do
-- >         walkWhile (\c -> (c=='_') || not (isSpace c || isPunctuation' c))
-- >         emit
--
-- Here @isStopSym@ and @isPunctuation'@ are predicates supplied by the user.

module Control.Monad.Tokenizer (
  -- * Monad
  Tokenizer,
  runTokenizer,
  runTokenizerCS,
  untilEOT,
  -- * Tests
  peek,
  isEOT,
  lookAhead,
  -- * Movement
  walk,
  walkBack,
  pop,
  walkWhile,
  walkFold,
  -- * Transactions
  emit,
  discard,
  restore,
  -- * Embedding
  embed,
  embed_,
  discardAndEmbed,
  -- * Conversion
  convert,
  convertWith,
  -- * Text types
  Tokenizable(..)
  ) where

import Control.Monad
import Data.Char
import Data.Monoid
import Data.String
import qualified Data.Text as T
import qualified Data.Text.Lazy as LT
import Control.Monad.Tokenizer.Class

-- | Tokenizer monad. Use 'runTokenizer' or 'runTokenizerCS' to run it.
newtype Tokenizer t a = Tokenizer { runTokenizer' :: (t, Int, t) -> (a,[t] -> [t],t,Int,t) }
--- type explanation: (whole text since last emission, chars passed, remaining text)
--- -> (result, difference list of tokens, whole text, chars passed, remaining text)

instance Functor (Tokenizer t) where
  fmap = liftM

instance Applicative (Tokenizer t) where
  pure a = Tokenizer $ \(whole,count,tail) -> (a,id,whole,count,tail)
  (<*>) = ap

instance Monad (Tokenizer t) where
  return = pure
  m >>= f = Tokenizer $ \(whole,count,tail) ->
    let (a1,o1,w1,!c1,t1) = runTokenizer' m (whole,count,tail)
        (a2,o2,w2,!c2,t2) = runTokenizer' (f a1) (w1,c1,t1)
    in (a2,o1.o2,w2,c2,t2)

instance Tokenizable t => MonadTokenizer (Tokenizer t) where
  lookAhead chars = Tokenizer $ \(whole,count,tail) ->
    let h = unpack $ ttake (length chars) tail
    in (h == chars, id, whole, count, tail)
    where unpack t | tnull t = []
                   | otherwise = thead t : unpack (ttail t)

  walk = Tokenizer $ \(whole,count,tail) ->
    if tnull tail
      then ((),id,whole,count,tail)
      else ((),id,whole,count+1,ttail tail)

  walkBack = Tokenizer $ \(whole,count,_) ->
    if count > 0
      then ((),id,whole,count-1,tdrop (count-1) whole)
      else ((),id,whole,0,whole)

  restore = Tokenizer $ \(whole,_,_) ->
    ((),id,whole,0,whole)

  peek = Tokenizer $ \(whole,count,tail) ->
    (th tail,id,whole,count,tail)
    where th t | tnull t = '\0'
               | otherwise = thead t

  emit = Tokenizer $ \(whole,count,tail) ->
    ((),(ttake count whole:),tail,0,tail)

  discard = Tokenizer $ \(whole,count,tail) ->
    ((),id,tail,0,tail)

  isEOT = Tokenizer $ \(whole,count,tail) ->
    (tnull tail, id, whole, count, tail)
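-- A small illustrative sketch (not part of the package API): the primitives
-- above already suffice for complete tokenizers. The hypothetical 'digits'
-- below emits every run of decimal digits and discards all other characters,
-- using only 'peek', 'walk', 'walkWhile', 'emit', 'discard' and 'untilEOT':
--
-- > digits :: Tokenizer T.Text ()
-- > digits = untilEOT $ do
-- >   c <- peek
-- >   if isDigit c
-- >     then walkWhile isDigit >> emit   -- token boundary reached: emit the digits
-- >     else walk >> discard             -- skip a single non-digit character
-- >
-- > -- runTokenizer digits "ab12cd345" == ["12","345"]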
-- | Embed a pure tokenizer into the monad. The argument to the function is
-- (visited string, remaining string), and the return value is expected to be
-- (result, emitted tokens, remaining string).
embed :: Tokenizable t => ((t,t) -> (a,[t],t)) -> Tokenizer t a
embed f = Tokenizer $ \(whole,count,tail) ->
  let (a, rs, rem) = f (ttake count whole, tail)
  in (a, (rs++), rem, 0, rem)

-- | Embed a pure tokenizer into the monad without a result.
embed_ :: Tokenizable t => ((t,t) -> ([t],t)) -> Tokenizer t ()
embed_ f = embed f'
  where f' (v,u) = let (rs,rem) = f (v,u)
                   in ((),rs,rem)

-- | Embed a pure tokenizer into the monad. The visited string is discarded, and
-- the given function is run on the unvisited part. The return value is expected
-- to be (result, emitted tokens, remaining string).
discardAndEmbed :: Tokenizable t => (t -> (a,[t],t)) -> Tokenizer t a
discardAndEmbed f = discard >> embed (f . snd)

-- | Natural transformation to convert between tokenizers of different text
-- types. Note that this operation does not perform encoding\/decoding (i.e.
-- converting from ByteString to Text does not decode Unicode characters). To
-- do so, use 'convertWith' and provide the correct encoding\/decoding
-- functions.
convert :: (Tokenizable t, IsString t, Tokenizable s, IsString s) => Tokenizer s a -> Tokenizer t a
convert = convertWith fromString unpack . convertWith unpack fromString
  where unpack t | tnull t = []
                 | otherwise = thead t : unpack (ttail t)

-- | Natural transformation to convert between tokenizers of different text
-- types, using the given conversion functions.
convertWith :: (s -> t) -> (t -> s) -> Tokenizer s a -> Tokenizer t a
convertWith fw bw (Tokenizer runTok) = Tokenizer $ \(whole,count,tail) ->
  let (a, o, w, c, t) = runTok (bw whole, count, bw tail)
      o' = map fw $ o []
  in (a, (o'++), fw w, c, fw t)

-- | Split a string into tokens using the given tokenizer. The input is
-- converted to lower case first.
runTokenizer :: Tokenizable t => Tokenizer t () -> t -> [t]
runTokenizer m input =
  let input' = tlower input
  in case runTokenizer' m (input',0,input') of
       (_, tokens, _, _, _) -> tokens []

-- | Split a string into tokens using the given tokenizer, case sensitive
-- version.
runTokenizerCS :: Tokenizable t => Tokenizer t () -> t -> [t]
runTokenizerCS m input =
  case runTokenizer' m (input,0,input) of
    (_, tokens, _, _, _) -> tokens []
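-- An illustrative sketch (not part of the package API) contrasting the two
-- runners: 'runTokenizer' lower-cases the input via 'tlower' before running
-- the tokenizer, while 'runTokenizerCS' preserves the original case. The
-- hypothetical 'words'' splits on whitespace:
--
-- > words' :: Tokenizer T.Text ()
-- > words' = untilEOT $ do
-- >   walkWhile (not . isSpace)   -- take one word
-- >   emit
-- >   walkWhile isSpace           -- skip the whitespace that follows it
-- >   discard
-- >
-- > -- runTokenizer   words' "Hello World" == ["hello","world"]
-- > -- runTokenizerCS words' "Hello World" == ["Hello","World"]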