\section{Text} :f Data/Enumerator/Text.hs |Data.Enumerator.Text module header| module Data.Enumerator.Text ( |Data.Enumerator.Text exports| ) where import qualified Prelude import Prelude hiding (head, drop, takeWhile) import Data.Enumerator hiding (head, drop) import qualified Data.Text as T |Data.Enumerator.Text imports| : \subsection{IO} Reading text is similar to reading bytes, but the enumerators have slightly different behavior -- instead of reading in fixed-size chunks of data, the text enumerators read in lines. This matches similar text-based {\sc api}s, such as Python's {\tt xreadlines()}. :d Data.Enumerator.Text imports import Data.Enumerator.Util (tryStep) import qualified Data.Text.IO as TIO import qualified Control.Exception as Exc import Control.Monad.IO.Class (MonadIO) import qualified System.IO as IO import System.IO.Error (isEOFError) : :f Data/Enumerator/Text.hs |apidoc Data.Enumerator.Text.enumHandle| enumHandle :: MonadIO m => IO.Handle -> Enumerator T.Text m b enumHandle h = loop where loop (Continue k) = withText $ \maybeText -> case maybeText of Nothing -> continue k Just text -> k (Chunks [text]) >>== loop loop step = returnI step withText = tryStep $ Exc.catch (Just `fmap` TIO.hGetLine h) (\err -> if isEOFError err then return Nothing else Exc.throwIO err) : :f Data/Enumerator/Text.hs |apidoc Data.Enumerator.Text.enumFile| enumFile :: FilePath -> Enumerator T.Text IO b enumFile path = enum where withHandle = tryStep (IO.openFile path IO.ReadMode) enum step = withHandle $ \h -> Iteratee $ Exc.finally (runIteratee (enumHandle h step)) (IO.hClose h) : :f Data/Enumerator/Text.hs |apidoc Data.Enumerator.Text.iterHandle| iterHandle :: MonadIO m => IO.Handle -> Iteratee T.Text m () iterHandle h = continue step where step EOF = yield () EOF step (Chunks []) = continue step step (Chunks chunks) = let put = mapM_ (TIO.hPutStr h) chunks in tryStep put (\_ -> continue step) : :d Data.Enumerator.Text exports -- * Text IO enumHandle , enumFile , iterHandle : \subsection{List analogues} :d Data.Enumerator.Text imports import qualified Data.Text.Lazy as TL : :f Data/Enumerator/Text.hs toChunks :: TL.Text -> Stream T.Text toChunks = Chunks . TL.toChunks : :f Data/Enumerator/Text.hs |apidoc Data.Enumerator.Text.head| head :: Monad m => Iteratee T.Text m (Maybe Char) head = continue loop where loop (Chunks xs) = case TL.uncons (TL.fromChunks xs) of Just (char, extra) -> yield (Just char) (toChunks extra) Nothing -> head loop EOF = yield Nothing EOF : :f Data/Enumerator/Text.hs |apidoc Data.Enumerator.Text.drop| drop :: Monad m => Integer -> Iteratee T.Text m () drop n | n <= 0 = return () drop n = continue (loop n) where loop n' (Chunks xs) = iter where lazy = TL.fromChunks xs len = toInteger (TL.length lazy) iter = if len < n' then drop (n' - len) else yield () (toChunks (TL.drop (fromInteger n') lazy)) loop _ EOF = yield () EOF : :f Data/Enumerator/Text.hs |apidoc Data.Enumerator.Text.dropWhile| dropWhile :: Monad m => (Char -> Bool) -> Iteratee T.Text m () dropWhile p = continue loop where loop (Chunks xs) = iter where lazy = TL.dropWhile p (TL.fromChunks xs) iter = if TL.null lazy then continue loop else yield () (toChunks lazy) loop EOF = yield () EOF : :f Data/Enumerator/Text.hs |apidoc Data.Enumerator.Text.take| take :: Monad m => Integer -> Iteratee T.Text m TL.Text take n | n <= 0 = return TL.empty take n = continue (loop id n) where loop acc n' (Chunks xs) = iter where lazy = TL.fromChunks xs len = toInteger (TL.length lazy) iter = if len < n' then continue (loop (acc . (TL.append lazy)) (n' - len)) else let (xs', extra) = TL.splitAt (fromInteger n') lazy in yield (acc xs') (toChunks extra) loop acc _ EOF = yield (acc TL.empty) EOF : :f Data/Enumerator/Text.hs |apidoc Data.Enumerator.Text.takeWhile| takeWhile :: Monad m => (Char -> Bool) -> Iteratee T.Text m TL.Text takeWhile p = continue (loop id) where loop acc (Chunks []) = continue (loop acc) loop acc (Chunks xs) = iter where lazy = TL.fromChunks xs (xs', extra) = tlSpanBy p lazy iter = if TL.null extra then continue (loop (acc . (TL.append lazy))) else yield (acc xs') (toChunks extra) loop acc EOF = yield (acc TL.empty) EOF : :f Data/Enumerator/Text.hs |apidoc Data.Enumerator.Text.consume| consume :: Monad m => Iteratee T.Text m TL.Text consume = continue (loop id) where loop acc (Chunks []) = continue (loop acc) loop acc (Chunks xs) = iter where lazy = TL.fromChunks xs iter = continue (loop (acc . (TL.append lazy))) loop acc EOF = yield (acc TL.empty) EOF : :f Data/Enumerator/Text.hs |apidoc Data.Enumerator.Text.require| require :: Monad m => Integer -> Iteratee T.Text m () require n | n <= 0 = return () require n = continue (loop id n) where loop acc n' (Chunks xs) = iter where lazy = TL.fromChunks xs len = toInteger (TL.length lazy) iter = if len < n' then continue (loop (acc . (TL.append lazy)) (n' - len)) else yield () (toChunks (acc lazy)) loop _ _ EOF = throwError (Exc.ErrorCall "require: Unexpected EOF") : Same caveats as {\tt Data.Enumerator.List.isolate} :f Data/Enumerator/Text.hs |apidoc Data.Enumerator.Text.isolate| isolate :: Monad m => Integer -> Enumeratee T.Text T.Text m b isolate n step | n <= 0 = return step isolate n (Continue k) = continue loop where loop (Chunks []) = continue loop loop (Chunks xs) = iter where lazy = TL.fromChunks xs len = toInteger (TL.length lazy) iter = if len <= n then k (Chunks xs) >>== isolate (n - len) else let (s1, s2) = TL.splitAt (fromInteger n) lazy in k (toChunks s1) >>== (\step -> yield step (toChunks s2)) loop EOF = k EOF >>== (\step -> yield step EOF) isolate n step = drop n >> return step : :d Data.Enumerator.Text exports -- * List analogues , Data.Enumerator.Text.head , Data.Enumerator.Text.drop , Data.Enumerator.Text.dropWhile , Data.Enumerator.Text.take , Data.Enumerator.Text.takeWhile , Data.Enumerator.Text.consume , require , isolate : \subsection{Codecs} Many protocols need the non-blocking input behavior of binary \io{}, but are defined in terms of unicode characters. The {\tt encode} and {\tt decode} enumeratees allow text-based protocols to be easily parsed from a binary input source. Most common codecs ({\sc utf-8}, {\sc iso-8859-1}, {\sc ascii}) are supported; more complex codecs can be implemented by bindings to libraries such as libicu. All of the codecs here are incremental; that is, they try to read as much data as possible, but no more. This allows iteratees to read partial data if the input stream contains invalid data. :d Data.Enumerator.Text imports import qualified Data.ByteString as B import Data.Enumerator.Util (tSpanBy, tlSpanBy, reprWord, reprChar) : :d Data.Enumerator.Text exports -- * Codecs , Codec , encode , decode |text codec exports| : :f Data/Enumerator/Text.hs data Codec = Codec { codecName :: T.Text , codecEncode :: T.Text -> (B.ByteString, Maybe (Exc.SomeException, T.Text)) , codecDecode :: B.ByteString -> (T.Text, Either (Exc.SomeException, B.ByteString) B.ByteString) } instance Show Codec where showsPrec d c = showParen (d > 10) $ showString "Codec " . shows (codecName c) : :f Data/Enumerator/Text.hs |apidoc Data.Enumerator.Text.encode| encode :: Monad m => Codec -> Enumeratee T.Text B.ByteString m b encode codec = checkDone (continue . step) where step k EOF = yield (Continue k) EOF step k (Chunks xs) = loop k xs loop k [] = continue (step k) loop k (x:xs) = let (bytes, extra) = codecEncode codec x extraChunks = Chunks $ case extra of Nothing -> xs Just (_, text) -> text:xs checkError k' = case extra of Nothing -> loop k' xs Just (exc, _) -> throwError exc in if B.null bytes then checkError k else k (Chunks [bytes]) >>== checkDoneEx extraChunks checkError : :f Data/Enumerator/Text.hs |apidoc Data.Enumerator.Text.decode| decode :: Monad m => Codec -> Enumeratee B.ByteString T.Text m b decode codec = checkDone (continue . step B.empty) where step _ k EOF = yield (Continue k) EOF step acc k (Chunks xs) = loop acc k xs loop acc k [] = continue (step acc k) loop acc k (x:xs) = let (text, extra) = codecDecode codec (B.append acc x) extraChunks = Chunks (either snd id extra : xs) checkError k' = case extra of Left (exc, _) -> throwError exc Right bytes -> loop bytes k' xs in if T.null text then checkError k else k (Chunks [text]) >>== checkDoneEx extraChunks checkError : Most of the codecs here need to perform at least basic bitbashing, to calculate how many input bytes will be needed for the next character. :d Data.Enumerator.Text imports import Control.Arrow (first) import Data.Bits ((.&.), (.|.), shiftL) import Data.Char (ord) import Data.Word (Word8, Word16) import qualified Data.ByteString.Char8 as B8 import qualified Data.Text.Encoding as TE : The variable-width decoders all follow the same basic pattern. First, they examine their input to calculate how many bytes the decoder function should accept. Next they try to decode it -- if the input is valid, decoding is finished. If the input is invalid, trying to decode the full input will throw an exception. When an exception is caught, decoding is passed off to {\tt splitSlowly} for a more careful parse. The input is reduced until the decoder can parse something, and the rest of the bytes are stored for later. An error will only be thrown if the iteratee requires input, but there are no valid bytes remaining. :f Data/Enumerator/Text.hs byteSplits :: B.ByteString -> [(B.ByteString, B.ByteString)] byteSplits bytes = loop (B.length bytes) where loop 0 = [(B.empty, bytes)] loop n = B.splitAt n bytes : loop (n - 1) : :d Data.Enumerator.Text imports import Data.Maybe (catMaybes) : :f Data/Enumerator/Text.hs splitSlowly :: (B.ByteString -> T.Text) -> B.ByteString -> (T.Text, Either (Exc.SomeException, B.ByteString) B.ByteString) splitSlowly dec bytes = valid where valid = firstValid (Prelude.map decFirst splits) splits = byteSplits bytes firstValid = Prelude.head . catMaybes tryDec = tryEvaluate . dec decFirst (a, b) = case tryDec a of Left _ -> Nothing Right text -> Just (text, case tryDec b of Left exc -> Left (exc, b) -- this case shouldn't occur, since splitSlowly -- is only called when parsing failed somewhere Right _ -> Right B.empty) : \subsubsection{UTF-8} :d text codec exports , utf8 : :f Data/Enumerator/Text.hs utf8 :: Codec utf8 = Codec name enc dec where name = T.pack "UTF-8" enc text = (TE.encodeUtf8 text, Nothing) dec bytes = case splitQuickly bytes of Just (text, extra) -> (text, Right extra) Nothing -> splitSlowly TE.decodeUtf8 bytes |utf8 split bytes| : :d utf8 split bytes splitQuickly bytes = loop 0 >>= maybeDecode where |utf8 required bytes count| maxN = B.length bytes loop n | n == maxN = Just (TE.decodeUtf8 bytes, B.empty) loop n = let req = required (B.index bytes n) tooLong = first TE.decodeUtf8 (B.splitAt n bytes) decodeMore = loop $! n + req in if req == 0 then Nothing else if n + req > maxN then Just tooLong else decodeMore : :d utf8 required bytes count required x0 | x0 .&. 0x80 == 0x00 = 1 | x0 .&. 0xE0 == 0xC0 = 2 | x0 .&. 0xF0 == 0xE0 = 3 | x0 .&. 0xF8 == 0xF0 = 4 -- Invalid input; let Text figure it out | otherwise = 0 : \subsubsection{UTF-16} :d text codec exports , utf16_le , utf16_be : :f Data/Enumerator/Text.hs utf16_le :: Codec utf16_le = Codec name enc dec where name = T.pack "UTF-16-LE" enc text = (TE.encodeUtf16LE text, Nothing) dec bytes = case splitQuickly bytes of Just (text, extra) -> (text, Right extra) Nothing -> splitSlowly TE.decodeUtf16LE bytes |utf16-le split bytes| : :f Data/Enumerator/Text.hs utf16_be :: Codec utf16_be = Codec name enc dec where name = T.pack "UTF-16-BE" enc text = (TE.encodeUtf16BE text, Nothing) dec bytes = case splitQuickly bytes of Just (text, extra) -> (text, Right extra) Nothing -> splitSlowly TE.decodeUtf16BE bytes |utf16-be split bytes| : :d utf16-le split bytes splitQuickly bytes = maybeDecode (loop 0) where maxN = B.length bytes loop n | n == maxN = decodeAll | (n + 1) == maxN = decodeTo n loop n = let req = utf16Required (B.index bytes 0) (B.index bytes 1) decodeMore = loop $! n + req in if n + req > maxN then decodeTo n else decodeMore decodeTo n = first TE.decodeUtf16LE (B.splitAt n bytes) decodeAll = (TE.decodeUtf16LE bytes, B.empty) : :d utf16-be split bytes splitQuickly bytes = maybeDecode (loop 0) where maxN = B.length bytes loop n | n == maxN = decodeAll | (n + 1) == maxN = decodeTo n loop n = let req = utf16Required (B.index bytes 1) (B.index bytes 0) decodeMore = loop $! n + req in if n + req > maxN then decodeTo n else decodeMore decodeTo n = first TE.decodeUtf16BE (B.splitAt n bytes) decodeAll = (TE.decodeUtf16BE bytes, B.empty) : :f Data/Enumerator/Text.hs utf16Required :: Word8 -> Word8 -> Int utf16Required x0 x1 = required where required = if x >= 0xD800 && x <= 0xDBFF then 4 else 2 x :: Word16 x = (fromIntegral x1 `shiftL` 8) .|. fromIntegral x0 : \subsubsection{UTF-32} :d text codec exports , utf32_le , utf32_be : :f Data/Enumerator/Text.hs utf32_le :: Codec utf32_le = Codec name enc dec where name = T.pack "UTF-32-LE" enc text = (TE.encodeUtf32LE text, Nothing) dec bs = case utf32SplitBytes TE.decodeUtf32LE bs of Just (text, extra) -> (text, Right extra) Nothing -> splitSlowly TE.decodeUtf32LE bs utf32_be :: Codec utf32_be = Codec name enc dec where name = T.pack "UTF-32-BE" enc text = (TE.encodeUtf32BE text, Nothing) dec bs = case utf32SplitBytes TE.decodeUtf32BE bs of Just (text, extra) -> (text, Right extra) Nothing -> splitSlowly TE.decodeUtf32BE bs : :f Data/Enumerator/Text.hs utf32SplitBytes :: (B.ByteString -> T.Text) -> B.ByteString -> Maybe (T.Text, B.ByteString) utf32SplitBytes dec bytes = split where split = maybeDecode (dec toDecode, extra) len = B.length bytes lenExtra = mod len 4 lenToDecode = len - lenExtra (toDecode, extra) = if lenExtra == 0 then (bytes, B.empty) else B.splitAt lenToDecode bytes : \subsubsection{ASCII} :d text codec exports , ascii : :f Data/Enumerator/Text.hs ascii :: Codec ascii = Codec name enc dec where name = T.pack "ASCII" enc text = (bytes, extra) where (safe, unsafe) = tSpanBy (\c -> ord c <= 0x7F) text bytes = B8.pack (T.unpack safe) extra = if T.null unsafe then Nothing else Just (illegalEnc name (T.head unsafe), unsafe) dec bytes = (text, extra) where (safe, unsafe) = B.span (<= 0x7F) bytes text = T.pack (B8.unpack safe) extra = if B.null unsafe then Right B.empty else Left (illegalDec name (B.head unsafe), unsafe) : \subsubsection{ISO 8859-1} :d text codec exports , iso8859_1 : :f Data/Enumerator/Text.hs iso8859_1 :: Codec iso8859_1 = Codec name enc dec where name = T.pack "ISO-8859-1" enc text = (bytes, extra) where (safe, unsafe) = tSpanBy (\c -> ord c <= 0xFF) text bytes = B8.pack (T.unpack safe) extra = if T.null unsafe then Nothing else Just (illegalEnc name (T.head unsafe), unsafe) dec bytes = (T.pack (B8.unpack bytes), Right B.empty) : \subsection{Encoding Utilities} :f Data/Enumerator/Text.hs illegalEnc :: T.Text -> Char -> Exc.SomeException illegalEnc name c = Exc.toException . Exc.ErrorCall $ concat [ "Codec " , show name , " can't encode character " , reprChar c ] : :f Data/Enumerator/Text.hs illegalDec :: T.Text -> Word8 -> Exc.SomeException illegalDec name w = Exc.toException . Exc.ErrorCall $ concat [ "Codec " , show name , " can't decode byte " , reprWord w ] : :d Data.Enumerator.Text imports import System.IO.Unsafe (unsafePerformIO) : :f Data/Enumerator/Text.hs tryEvaluate :: a -> Either Exc.SomeException a tryEvaluate = unsafePerformIO . Exc.try . Exc.evaluate maybeDecode:: (a, b) -> Maybe (a, b) maybeDecode (a, b) = case tryEvaluate a of Left _ -> Nothing Right _ -> Just (a, b) :