{-# LANGUAGE FlexibleInstances #-} -- | A monad transformer for tokenizing streams of text. -- -- Main idea: You 'walk' through the input string like a turtle, and everytime -- you find a token boundary, you call 'emit'. If some specific kinds of tokens -- should be suppressed, you can 'discard' them instead (or filter afterwards). -- -- The package tokenizer-monad provides a monad (and class) for tokenizing -- pure text/strings in memory. This package supplements it with a transformer -- to work on impure Streams of text/strings. Your existing tokenizers can -- be ported without code changes. -- -- This module supports strict text, lazy text, strings, lazy ASCII bytestrings and strict bytestrings. For working with Unicode encodings, have a look at "Control.Monad.Tokenizer.Streaming.Decode". -- -- For examples on how to write tokenizers, have a look at the package -- tokenizer-monad. Here's an example on how to use it with streams: -- -- Example for a simple tokenizer, that splits words by whitespace and discards stop symbols: -- -- > tokenizeWords :: Monad m => Stream (Of T.Text) m () -> Stream (Of T.Text) m () -- > tokenizeWords = runTokenizerT $ untilEOT $ do -- > c <- pop -- > if isStopSym c -- > then discard -- > else if c `elem` ("  \t\r\n" :: [Char]) -- > then discard -- > else do -- > walkWhile (\c -> (c=='_') || not (isSpace c || isPunctuation' c)) -- > emit module Control.Monad.Tokenizer.Streaming ( -- * Monad transformer TokenizerT, runTokenizerT, runTokenizerCST, C.untilEOT, -- * Tests C.peek, C.isEOT, C.lookAhead, -- * Movement C.walk, C.walkBack, C.pop, C.walkWhile, C.walkFold, -- * Transactions C.emit, C.discard, C.restore, -- * Text types Tokenizable(..) ) where import Control.Monad import Control.Monad.Trans import Data.Monoid import qualified Streaming.Prelude as S import Streaming import qualified Control.Monad.Tokenizer.Class as C import Control.Monad.Tokenizer.Char8.Lazy () import Control.Monad.Tokenizer.Char8.Strict () import qualified Data.Text as T import qualified Data.Text.Lazy as LT import qualified Data.ByteString.Char8 as BS import qualified Data.ByteString.Lazy.Char8 as LBS -- | Tokenizer transformer. Use 'runTokenizerT' or 'runTokenizerCST' to run it newtype TokenizerT t m a = TokenizerT { runTokenizerT' :: (t,Stream (Of t) m ()) -> Stream (Of t) m (a,t,Stream (Of t) m ()) } -- (visited, remaining) -> Stream (of emissions) m (result, visited, remaining) -- | Split a text stream into tokens using the given tokenizer, case sensitive version runTokenizerCST :: (Tokenizable t,Monad m) => TokenizerT t m a -> Stream (Of t) m () -> Stream (Of t) m a runTokenizerCST tok ins = do (a,_,_) <- runTokenizerT' tok (mempty,ins) return a -- | Split a text stream into tokens using the given tokenizer runTokenizerT :: (Tokenizable t,Monad m) => TokenizerT t m a -> Stream (Of t) m () -> Stream (Of t) m a runTokenizerT tok = runTokenizerCST tok . S.map C.tlower -- | Text types that can be split by the TokenizerT transformer. In this module, -- instances are provided for String, strict Text, and lazy Text. -- There are also instances for strict and lazy ByteStrings, but keep in mind -- that they assume ASCII encoding. If you want to apply reasonable decoding, -- try Control.Monad.Tokenizer.Streaming.Decode. class (C.Tokenizable t, Monoid t) => Tokenizable t where tsingleton :: Char -> t tinit :: t -> t tlast :: t -> Char instance Tokenizable [Char] where tsingleton = pure tinit = init tlast = last instance Tokenizable T.Text where tsingleton = T.singleton tinit = T.init tlast = T.last instance Tokenizable LT.Text where tsingleton = LT.singleton tinit = LT.init tlast = LT.last instance Tokenizable BS.ByteString where tsingleton = BS.singleton tinit = BS.init tlast = BS.last instance Tokenizable LBS.ByteString where tsingleton = LBS.singleton tinit = LBS.init tlast = LBS.last instance Monad m => Functor (TokenizerT t m) where fmap = liftM instance Monad m => Applicative (TokenizerT t m) where (<*>) = ap pure = return instance Monad m => Monad (TokenizerT t m) where return a = TokenizerT $ \(v,rem) -> return (a,v,rem) m >>= f = TokenizerT $ \(v,rem) -> do (a1,v1,rem1) <- runTokenizerT' m (v,rem) (a2,v2,rem2) <- runTokenizerT' (f a1) (v1,rem1) return (a2,v2,rem2) instance MonadTrans (TokenizerT t) where lift m = TokenizerT $ \(v,rem) -> do a <- lift m return (a,v,rem) uncons1 :: (Monad m,Tokenizable t) => Stream (Of t) m a -> m (Maybe (Char,Stream (Of t) m a)) uncons1 stream = do muc <- S.uncons stream case muc of Nothing -> return Nothing Just (one,more) | C.tnull one -> uncons1 more Just (one,more) -> let (th, tt) = (C.thead one, C.ttail one) more' | C.tnull tt = more | otherwise = S.cons tt more in return $ Just (th, more') unconsn :: (Monad m,Tokenizable t) => Int -> Stream (Of t) m a -> m ([Char],Stream (Of t) m a) unconsn 0 stream = return ([],stream) unconsn n stream = do muc <- uncons1 stream case muc of Nothing -> return ([],stream) Just (h,stream') -> do (t,stream'') <- unconsn (n-1) stream' return (h:t,stream'') instance (Monad m, Tokenizable t) => C.MonadTokenizer (TokenizerT t m) where walk = void C.pop pop = TokenizerT $ \(v,rem) -> do muc <- lift $ uncons1 rem case muc of Nothing -> return ('\0',v,rem) Just (h,rem') -> return (h,v<>tsingleton h,rem') peek = TokenizerT $ \(v,rem) -> do muc <- lift $ uncons1 rem case muc of Nothing -> return ('\0',v,rem) Just (h,_) -> return (h,v,rem) restore = TokenizerT $ \(v,rem) -> return ((),mempty,S.cons v rem) emit = TokenizerT $ \(v,rem) -> S.yield v >> return ((),mempty,rem) discard = TokenizerT $ \(v,rem) -> return ((),mempty,rem) isEOT = TokenizerT $ \(v,rem) -> do muc <- lift $ S.uncons rem case muc of Nothing -> return (True,v,rem) Just _ -> return (False,v,rem) lookAhead cs = TokenizerT $ \(v,rem) -> do (cs',_) <- lift $ unconsn (length cs) rem return (cs == cs', v, rem) walkBack = TokenizerT $ \(v,rem) -> if C.tnull v then return ((),v,rem) else return ((),tinit v,S.cons (tsingleton $ tlast v) rem) words' :: C.MonadTokenizer m => m () words' = C.untilEOT $ do c <- C.pop if c `elem` " \t\n\r" then C.discard else do C.walkWhile (\c -> not (c `elem` " \t\n\r")) C.emit