-- | A monad for writing pure tokenizers in an imperative-looking way.
--
-- Main idea: You 'walk' through the input string like a turtle, and every time
-- you find a token boundary, you call 'emit'. If some specific kinds of tokens
-- should be suppressed, you can 'discard' them instead (or filter afterwards).
--
-- This module is specialized for strict text. The module
-- "Control.Monad.Tokenizer" provides more general types.
--
-- Example of a simple tokenizer that splits words by whitespace and discards
-- stop symbols:
--
-- > tokenizeWords :: T.Text -> [T.Text]
-- > tokenizeWords = runTokenizer $ untilEOT $ do
-- >   c <- pop
-- >   if isStopSym c
-- >     then discard
-- >     else if c `elem` (" \t\r\n" :: [Char])
-- >       then discard
-- >       else do
-- >         walkWhile (\c -> (c=='_') || not (isSpace c || isPunctuation' c))
-- >         emit

module Control.Monad.Tokenizer.Text.Strict (
  -- * Monad
  Tokenizer,
  runTokenizer,
  runTokenizerCS,
  untilEOT,
  -- * Tests
  peek,
  isEOT,
  lookAhead,
  -- * Movement
  walk,
  walkBack,
  pop,
  walkWhile,
  walkFold,
  -- * Transactions
  emit,
  discard,
  restore
  ) where

import qualified Control.Monad.Tokenizer as G
import Data.Text

-- | Tokenizer monad. Use 'runTokenizer' or 'runTokenizerCS' to run it.
type Tokenizer = G.Tokenizer Text

-- | Check if the next input chars agree with the given string
lookAhead = G.lookAhead :: [Char] -> Tokenizer Bool

-- | Proceed to the next character
walk = G.walk :: Tokenizer ()

-- | Walk back to the previous character, unless it was discarded/emitted.
walkBack = G.walkBack :: Tokenizer ()

-- | Restore the state after the last emit/discard.
restore = G.restore :: Tokenizer ()

-- | Peek at the current character
peek = G.peek :: Tokenizer Char

-- | Peek at the current character and proceed
pop = G.pop :: Tokenizer Char

-- | Break at the current position and emit the scanned token
emit = G.emit :: Tokenizer ()

-- | Break at the current position and discard the scanned token
discard = G.discard :: Tokenizer ()

-- | Have I reached the end of the input text?
isEOT = G.isEOT :: Tokenizer Bool

-- | Proceed as long as a given function succeeds
walkWhile = G.walkWhile :: (Char -> Bool) -> Tokenizer ()

-- | Proceed as long as a given fold returns Just (generalization of 'walkWhile')
walkFold = G.walkFold :: a -> (Char -> a -> Maybe a) -> Tokenizer ()

-- | Repeat a given tokenizer as long as the end of the text is not reached
untilEOT = G.untilEOT :: Tokenizer () -> Tokenizer ()

-- | Split a string into tokens using the given tokenizer
runTokenizer = G.runTokenizer :: Tokenizer () -> Text -> [Text]

-- | Split a string into tokens using the given tokenizer, case-sensitive version
runTokenizerCS = G.runTokenizerCS :: Tokenizer () -> Text -> [Text]
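-- A minimal usage sketch of the exported API, assuming the caller imports
-- "Data.Text" qualified as @T@. It splits comma-separated fields, emitting
-- each field and discarding the separators. The name @tokenizeCSV@ is
-- illustrative and not part of this library.
--
-- > tokenizeCSV :: T.Text -> [T.Text]
-- > tokenizeCSV = runTokenizer $ untilEOT $ do
-- >   c <- peek
-- >   if c == ','
-- >     then walk >> discard        -- consume the separator and drop it
-- >     else do walkWhile (/= ',')  -- scan up to the next separator
-- >             emit                -- emit the scanned field as a token
-- >
-- > -- e.g. tokenizeCSV (T.pack "a,bb,c") should yield map T.pack ["a","bb","c"]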