Safe Haskell	Safe
Language	Haskell2010

Control.Monad.Tokenizer.Streaming

Contents

Monad transformer
Tests
Movement
Transactions
Text types

Description

A monad transformer for tokenizing streams of text.

Main idea: You walk through the input string like a turtle, and everytime you find a token boundary, you call emit. If some specific kinds of tokens should be suppressed, you can discard them instead (or filter afterwards).

The package tokenizer-monad provides a monad (and class) for tokenizing pure text/strings in memory. This package supplements it with a transformer to work on impure Streams of text/strings. Your existing tokenizers can be ported without code changes.

This module supports strict text, lazy text, strings, lazy ASCII bytestrings and strict bytestrings. For working with Unicode encodings, have a look at Control.Monad.Tokenizer.Streaming.Decode.

For examples on how to write tokenizers, have a look at the package tokenizer-monad. Here's an example on how to use it with streams:

Example for a simple tokenizer, that splits words by whitespace and discards stop symbols:

tokenizeWords :: Monad m => Stream (Of T.Text) m () -> Stream (Of T.Text) m ()
tokenizeWords = runTokenizerT $ untilEOT $ do
  c <- pop
  if isStopSym c
    then discard
    else if c `elem` ("  \t\r\n" :: [Char])
         then discard
         else do
           walkWhile (\c -> (c=='_') || not (isSpace c || isPunctuation' c))
           emit

Synopsis

data TokenizerT t m a
runTokenizerT :: (Tokenizable t, Monad m) => TokenizerT t m a -> Stream (Of t) m () -> Stream (Of t) m a
runTokenizerCST :: (Tokenizable t, Monad m) => TokenizerT t m a -> Stream (Of t) m () -> Stream (Of t) m a
untilEOT :: MonadTokenizer m => m () -> m ()
peek :: MonadTokenizer m => m Char
isEOT :: MonadTokenizer m => m Bool
lookAhead :: MonadTokenizer m => [Char] -> m Bool
walk :: MonadTokenizer m => m ()
walkBack :: MonadTokenizer m => m ()
pop :: MonadTokenizer m => m Char
walkWhile :: MonadTokenizer m => (Char -> Bool) -> m ()
walkFold :: MonadTokenizer m => a -> (Char -> a -> Maybe a) -> m ()
emit :: MonadTokenizer m => m ()
discard :: MonadTokenizer m => m ()
restore :: MonadTokenizer m => m ()
class (Tokenizable t, Monoid t) => Tokenizable t where
- tsingleton :: Char -> t
- tinit :: t -> t
- tlast :: t -> Char

Monad transformer

data TokenizerT t m a Source #

Tokenizer transformer. Use runTokenizerT or runTokenizerCST to run it

Instances

MonadTrans (TokenizerT t) Source #
Instance details Defined in Control.Monad.Tokenizer.Streaming Methods lift :: Monad m => m a -> TokenizerT t m a #
Monad m => Monad (TokenizerT t m) Source #
Instance details Defined in Control.Monad.Tokenizer.Streaming Methods (>>=) :: TokenizerT t m a -> (a -> TokenizerT t m b) -> TokenizerT t m b # (>>) :: TokenizerT t m a -> TokenizerT t m b -> TokenizerT t m b # return :: a -> TokenizerT t m a # fail :: String -> TokenizerT t m a #
Monad m => Functor (TokenizerT t m) Source #
Instance details Defined in Control.Monad.Tokenizer.Streaming Methods fmap :: (a -> b) -> TokenizerT t m a -> TokenizerT t m b # (<$) :: a -> TokenizerT t m b -> TokenizerT t m a #
Monad m => Applicative (TokenizerT t m) Source #
Instance details Defined in Control.Monad.Tokenizer.Streaming Methods pure :: a -> TokenizerT t m a # (<>) :: TokenizerT t m (a -> b) -> TokenizerT t m a -> TokenizerT t m b # liftA2 :: (a -> b -> c) -> TokenizerT t m a -> TokenizerT t m b -> TokenizerT t m c # (>) :: TokenizerT t m a -> TokenizerT t m b -> TokenizerT t m b # (<*) :: TokenizerT t m a -> TokenizerT t m b -> TokenizerT t m a #
(Monad m, Tokenizable t) => MonadTokenizer (TokenizerT t m) Source #
Instance details Defined in Control.Monad.Tokenizer.Streaming Methods walk :: TokenizerT t m () # walkBack :: TokenizerT t m () # peek :: TokenizerT t m Char # restore :: TokenizerT t m () # emit :: TokenizerT t m () # discard :: TokenizerT t m () # isEOT :: TokenizerT t m Bool # lookAhead :: [Char] -> TokenizerT t m Bool # pop :: TokenizerT t m Char #

runTokenizerT :: (Tokenizable t, Monad m) => TokenizerT t m a -> Stream (Of t) m () -> Stream (Of t) m a Source #

Split a text stream into tokens using the given tokenizer

runTokenizerCST :: (Tokenizable t, Monad m) => TokenizerT t m a -> Stream (Of t) m () -> Stream (Of t) m a Source #

Split a text stream into tokens using the given tokenizer, case sensitive version

untilEOT :: MonadTokenizer m => m () -> m () #

Repeat a given tokenizer as long as the end of text is not reached

Tests

peek :: MonadTokenizer m => m Char #

Peek the current character

isEOT :: MonadTokenizer m => m Bool #

Have I reached the end of the input text?

lookAhead :: MonadTokenizer m => [Char] -> m Bool #

Check if the next input chars agree with the given string

Movement

walk :: MonadTokenizer m => m () #

Proceed to the next character

walkBack :: MonadTokenizer m => m () #

Walk back to the previous character, unless it was discarded/emitted.

pop :: MonadTokenizer m => m Char #

Peek the current character and proceed

walkWhile :: MonadTokenizer m => (Char -> Bool) -> m () #

Proceed as long as a given function succeeds

walkFold :: MonadTokenizer m => a -> (Char -> a -> Maybe a) -> m () #

Proceed as long as a given fold returns Just (generalization of walkWhile)

Transactions

emit :: MonadTokenizer m => m () #

Break at the current position and emit the scanned token

discard :: MonadTokenizer m => m () #

Break at the current position and discard the scanned token

restore :: MonadTokenizer m => m () #

Restore the state after the last emit/discard.

Text types

class (Tokenizable t, Monoid t) => Tokenizable t where Source #

Text types that can be split by the TokenizerT transformer. In this module, instances are provided for String, strict Text, and lazy Text. There are also instances for strict and lazy ByteStrings, but keep in mind that they assume ASCII encoding. If you want to apply reasonable decoding, try Control.Monad.Tokenizer.Streaming.Decode.

Methods

tsingleton :: Char -> t Source #

tinit :: t -> t Source #

tlast :: t -> Char Source #

Instances

Tokenizable ByteString Source #
Instance details Defined in Control.Monad.Tokenizer.Streaming Methods tsingleton :: Char -> ByteString Source # tinit :: ByteString -> ByteString Source # tlast :: ByteString -> Char Source #
Tokenizable ByteString Source #
Instance details Defined in Control.Monad.Tokenizer.Streaming Methods tsingleton :: Char -> ByteString Source # tinit :: ByteString -> ByteString Source # tlast :: ByteString -> Char Source #
Tokenizable Text Source #
Instance details Defined in Control.Monad.Tokenizer.Streaming Methods tsingleton :: Char -> Text Source # tinit :: Text -> Text Source # tlast :: Text -> Char Source #
Tokenizable Text Source #
Instance details Defined in Control.Monad.Tokenizer.Streaming Methods tsingleton :: Char -> Text Source # tinit :: Text -> Text Source # tlast :: Text -> Char Source #
Tokenizable [Char] Source #
Instance details Defined in Control.Monad.Tokenizer.Streaming Methods tsingleton :: Char -> [Char] Source # tinit :: [Char] -> [Char] Source # tlast :: [Char] -> Char Source #