-- | A monad for writing pure tokenizers in an imperative-looking way. -- -- Main idea: You 'walk' through the input string like a turtle, and everytime -- you find a token boundary, you call 'emit'. If some specific kinds of tokens -- should be suppressed, you can 'discard' them instead (or filter afterwards). -- -- This module supports is specialized for strict 'ByteString's. The module -- "Control.Monad.Tokenizer" provides more general types, but does not export a -- 'Tokenizable' instance for 'ByteString's, as its implementation depends on the -- encoding. This module assumes ASCII encoding (you have been warned!). -- -- Example for a simple tokenizer, that splits words by whitespace and discards stop symbols: -- -- > tokenizeWords :: ByteString -> [ByteString] -- > tokenizeWords = runTokenizer $ untilEOT $ do -- > c <- pop -- > if isStopSym c -- > then discard -- > else if c `elem` (" \t\r\n" :: [Char]) -- > then discard -- > else do -- > walkWhile (\c -> (c=='_') || not (isSpace c || isPunctuation' c)) -- > emit module Control.Monad.Tokenizer.Char8.Strict ( -- * Monad Tokenizer, runTokenizer, runTokenizerCS, untilEOT, -- * Tests peek, isEOT, lookAhead, -- * Movement walk, walkBack, pop, walkWhile, walkFold, -- * Transactions emit, discard, restore ) where import qualified Control.Monad.Tokenizer as G import Data.ByteString.Char8 as C8 import Data.Char -- | Tokenizer monad. Use runTokenizer or runTokenizerCS to run it type Tokenizer = G.Tokenizer ByteString -- | Check if the next input chars agree with the given string lookAhead = G.lookAhead :: [Char] -> Tokenizer Bool -- | Proceed to the next character walk = G.walk :: Tokenizer () -- | Walk back to the previous character, unless it was discarded/emitted. walkBack = G.walkBack :: Tokenizer () -- | Restore the state after the last emit/discard. restore = G.restore :: Tokenizer () -- | Peek the current character peek = G.peek :: Tokenizer Char -- | Peek the current character and proceed pop = G.pop :: Tokenizer Char -- | Break at the current position and emit the scanned token emit = G.emit :: Tokenizer () -- | Break at the current position and discard the scanned token discard = G.discard :: Tokenizer () -- | Have I reached the end of the input text? isEOT = G.isEOT :: Tokenizer Bool -- | Proceed as long as a given function succeeds walkWhile = G.walkWhile :: (Char -> Bool) -> Tokenizer () -- | Proceed as long as a given fold returns Just (generalization of walkWhile) walkFold = G.walkFold :: a -> (Char -> a -> Maybe a) -> Tokenizer () -- | Repeat a given tokenizer as long as the end of text is not reached untilEOT = G.untilEOT :: Tokenizer () -> Tokenizer () -- | Split a string into tokens using the given tokenizer runTokenizer = G.runTokenizer :: Tokenizer () -> ByteString -> [ByteString] -- | Split a string into tokens using the given tokenizer, case sensitive version runTokenizerCS = G.runTokenizerCS :: Tokenizer () -> ByteString -> [ByteString] -- | Assuming ASCII encoding instance G.Tokenizable ByteString where tnull = C8.null thead = C8.head ttail = C8.tail ttake = C8.take tdrop = C8.drop tlower = C8.map toLower