-- | Functions for running 'M.TokenizerT' on Unicode bytestring streams. -- -- For more information on how to work with 'M.TokenizerT', have a look at the -- module "Control.Monad.Tokenizer.Streaming". For more information on writing -- tokenizers, have a look at the module "Control.Monad.Tokenizer" from the -- package tokenizer-monad. -- -- Example for a simple tokenizer, that splits words by whitespace and discards stop symbols: -- -- > tokenizeWords :: Monad m => Q.ByteString m () -> Stream (Of T.Text) m () -- > tokenizeWords = runUtf8TokenizerT $ untilEOT $ do -- > c <- pop -- > if isStopSym c -- > then discard -- > else if c `elem` ("  \t\r\n" :: [Char]) -- > then discard -- > else do -- > walkWhile (\c -> (c=='_') || not (isSpace c || isPunctuation' c)) -- > emit module Control.Monad.Tokenizer.Streaming.Decode ( -- * UTF-8 runUtf8TokenizerT, runUtf8TokenizerCST, -- * UTF-16 runUtf16LETokenizerT, runUtf16LETokenizerCST, runUtf16BETokenizerT, runUtf16BETokenizerCST, -- * UTF-32 runUtf32LETokenizerT, runUtf32LETokenizerCST, runUtf32BETokenizerT, runUtf32BETokenizerCST, -- * Helpers module Control.Monad.Tokenizer.Streaming, runDecodingTokenizerT, runDecodingTokenizerCST, decodeStream ) where import qualified Control.Monad.Tokenizer.Streaming as M import qualified Control.Monad.Tokenizer.Streaming import Streaming import qualified Streaming.Prelude as S import qualified Data.Text as T import qualified Data.ByteString.Streaming as Q import qualified Data.ByteString as BS import Data.Streaming.Text -- | Decode a Unicode bytestring stream into a stream of Text chunks. 
--
-- The input is consumed chunk by chunk. The decoder is stateful: every
-- successful step hands back a continuation, and that continuation (not the
-- initial decoder) is applied to the next chunk, so that whatever state the
-- decoder keeps between chunks is carried forward. Each step yields the text
-- decoded so far, which may be empty.
--
-- On a 'DecodeResultFailure' the text decoded before the failure is yielded
-- and the stream is then aborted via 'fail'.
decodeStream :: Monad m => (BS.ByteString -> DecodeResult) -> Q.ByteString m () -> Stream (Of T.Text) m ()
decodeStream decoder = step decoder . Q.toChunks
  where
    -- @decode@ is the decoder continuation for the current position in the
    -- input; @stream@ holds the chunks that have not been consumed yet.
    step decode stream = do
      muc <- lift $ S.uncons stream
      case muc of
        -- End of input: feed the empty string to the *current* continuation
        -- so that the decoder gets the chance to report input it is still
        -- holding on to.
        Nothing -> case decode BS.empty of
          DecodeResultFailure decoded _ -> do
            S.yield decoded
            fail "Decoding ended ungracefully"
          DecodeResultSuccess decoded _ -> S.yield decoded
        Just (one, more)
          -- Empty chunks are skipped and never reach the decoder, so they
          -- cannot be mistaken for the end-of-input probe above.
          | BS.null one -> step decode more
          | otherwise -> case decode one of
              DecodeResultFailure decoded _ -> do
                S.yield decoded
                fail "Decoding error"
              DecodeResultSuccess decoded cont -> do
                S.yield decoded
                -- Continue with the continuation returned by this step.
                step cont more

-- | Decode a bytestring stream with the given decoder (see 'decodeStream')
-- and run the tokenizer on the resulting text stream using
-- 'M.runTokenizerCST'.
--
-- NOTE(review): the @CS@ infix presumably stands for "case sensitive" -
-- confirm against "Control.Monad.Tokenizer.Streaming".
runDecodingTokenizerCST :: Monad m => (BS.ByteString -> DecodeResult) -> M.TokenizerT T.Text m a -> Q.ByteString m () -> Stream (Of T.Text) m a
runDecodingTokenizerCST decoder tok ins =
  M.runTokenizerCST tok $ decodeStream decoder ins

-- | Decode a bytestring stream with the given decoder (see 'decodeStream')
-- and run the tokenizer on the resulting text stream using
-- 'M.runTokenizerT'.
runDecodingTokenizerT :: Monad m => (BS.ByteString -> DecodeResult) -> M.TokenizerT T.Text m a -> Q.ByteString m () -> Stream (Of T.Text) m a
runDecodingTokenizerT decoder tok ins =
  M.runTokenizerT tok $ decodeStream decoder ins

-- | Encoding-specific runners. Each one is 'runDecodingTokenizerT' or
-- 'runDecodingTokenizerCST' with the decoder fixed to the matching
-- @decodeUtf*@ function.
runUtf8TokenizerCST, runUtf8TokenizerT,
  runUtf16LETokenizerCST, runUtf16LETokenizerT,
  runUtf16BETokenizerCST, runUtf16BETokenizerT,
  runUtf32LETokenizerCST, runUtf32LETokenizerT,
  runUtf32BETokenizerCST, runUtf32BETokenizerT
    :: Monad m => M.TokenizerT T.Text m a -> Q.ByteString m () -> Stream (Of T.Text) m a
runUtf8TokenizerCST = runDecodingTokenizerCST decodeUtf8
runUtf8TokenizerT = runDecodingTokenizerT decodeUtf8
runUtf16LETokenizerCST = runDecodingTokenizerCST decodeUtf16LE
runUtf16LETokenizerT = runDecodingTokenizerT decodeUtf16LE
runUtf16BETokenizerCST = runDecodingTokenizerCST decodeUtf16BE
runUtf16BETokenizerT = runDecodingTokenizerT decodeUtf16BE
runUtf32LETokenizerCST = runDecodingTokenizerCST decodeUtf32LE
runUtf32LETokenizerT = runDecodingTokenizerT decodeUtf32LE
runUtf32BETokenizerCST = runDecodingTokenizerCST decodeUtf32BE
runUtf32BETokenizerT = runDecodingTokenizerT decodeUtf32BE