-----------------------------------------------------------------------------
-- |
-- Module: Data.Enumerator.Text
-- Copyright: 2010 John Millikin
-- License: MIT
--
-- Maintainer: jmillikin@gmail.com
-- Portability: portable
--
-- This module is intended to be imported qualified:
--
-- @
-- import qualified Data.Enumerator.Text as ET
-- @
--
-- Since: 0.2
--
-----------------------------------------------------------------------------
module Data.Enumerator.Text
    (
    -- * Text IO
      enumHandle
    , enumFile
    , iterHandle

    -- * List analogues
    , Data.Enumerator.Text.head
    , Data.Enumerator.Text.drop
    , Data.Enumerator.Text.dropWhile
    , Data.Enumerator.Text.take
    , Data.Enumerator.Text.takeWhile
    , Data.Enumerator.Text.consume
    , require
    , isolate

    -- * Codecs
    , Codec
    , encode
    , decode
    , utf8
    , utf16_le
    , utf16_be
    , utf32_le
    , utf32_be
    , ascii
    , iso8859_1
    ) where

import qualified Prelude
import Prelude hiding (head, drop, takeWhile)
import Data.Enumerator hiding (head, drop)
import qualified Data.Text as T
import Data.Enumerator.Util (tryStep)
import qualified Data.Text.IO as TIO
import qualified Control.Exception as Exc
import Control.Monad.IO.Class (MonadIO)
import qualified System.IO as IO
import System.IO.Error (isEOFError)
import qualified Data.Text.Lazy as TL
import qualified Data.ByteString as B
import Data.Enumerator.Util (tSpanBy, tlSpanBy, reprWord, reprChar)
import Control.Arrow (first)
import Data.Bits ((.&.), (.|.), shiftL)
import Data.Char (ord)
import Data.Word (Word8, Word16)
import qualified Data.ByteString.Char8 as B8
import qualified Data.Text.Encoding as TE
import Data.Maybe (catMaybes)
import System.IO.Unsafe (unsafePerformIO)

-- | Read lines of text from the handle, and stream them to an 'Iteratee'.
-- If an exception occurs during file IO, enumeration will stop and 'Error'
-- will be returned. Exceptions from the iteratee are not caught.
--
-- The handle should be opened with an appropriate text encoding, and
-- in 'IO.ReadMode' or 'IO.ReadWriteMode'.
--
-- Each chunk passed to the iteratee is one result of 'TIO.hGetLine'. When
-- the handle reports end-of-file, the iteratee is returned still in its
-- 'Continue' state; no 'EOF' is sent to it.
--
-- Since: 0.2
enumHandle :: MonadIO m => IO.Handle -> Enumerator T.Text m b
enumHandle h = go where
    go (Continue consumer) = nextLine $ \fetched -> case fetched of
        Just line -> consumer (Chunks [line]) >>== go
        Nothing -> continue consumer
    go finished = returnI finished

    -- 'Nothing' signals end-of-file; any other IO error is re-thrown inside
    -- the action handed to 'tryStep'.
    nextLine = tryStep (Exc.catch (fmap Just (TIO.hGetLine h)) atEOF)

    atEOF ioErr
        | isEOFError ioErr = return Nothing
        | otherwise = Exc.throwIO ioErr

-- | Opens a file path in text mode, and passes the handle to 'enumHandle'.
-- The file will be closed when the 'Iteratee' finishes.
--
-- The handle is closed with 'Exc.finally', so it is also released when
-- running the iteratee throws an exception.
--
-- Since: 0.2
enumFile :: FilePath -> Enumerator T.Text IO b
enumFile path = \step -> withOpened (run step) where
    withOpened = tryStep (IO.openFile path IO.ReadMode)
    run step handle = Iteratee $
        runIteratee (enumHandle handle step) `Exc.finally` IO.hClose handle

-- | Read text from a stream and write it to a handle. If an exception
-- occurs during file IO, enumeration will stop and 'Error' will be
-- returned.
--
-- The handle should be opened with an appropriate text encoding, and
-- in 'IO.WriteMode' or 'IO.ReadWriteMode'.
--
-- Chunks are written with 'TIO.hPutStr' in the order they arrive; the
-- iteratee yields @()@ once the stream reaches 'EOF'.
--
-- Since: 0.2
iterHandle :: MonadIO m => IO.Handle -> Iteratee T.Text m ()
iterHandle h = continue sink where
    sink EOF = yield () EOF
    sink (Chunks []) = continue sink
    sink (Chunks texts) =
        tryStep (mapM_ (TIO.hPutStr h) texts) (const (continue sink))

-- Re-package a lazy 'TL.Text' as a stream of its strict chunks.
toChunks :: TL.Text -> Stream T.Text
toChunks lazy = Chunks (TL.toChunks lazy)

-- | Get the next character from the stream, or 'Nothing' if the stream has
-- ended.
--
-- Everything after the returned character is handed back to the stream as
-- left-over input. Empty chunks are skipped.
--
-- Since: 0.4.5
head :: Monad m => Iteratee T.Text m (Maybe Char)
head = continue go where
    go EOF = yield Nothing EOF
    go (Chunks chunks) = maybe head found (TL.uncons (TL.fromChunks chunks))

    found (c, rest) = yield (Just c) (toChunks rest)

-- | @drop n@ ignores /n/ characters of input from the stream.
--
-- A non-positive /n/ consumes nothing. If the stream ends before /n/
-- characters were seen, the iteratee simply finishes.
--
-- Since: 0.4.5
drop :: Monad m => Integer -> Iteratee T.Text m ()
drop n
    | n <= 0 = return ()
    | otherwise = continue step
  where
    step EOF = yield () EOF
    step (Chunks chunks)
        -- Not enough yet: discard everything and keep dropping the rest.
        | available < n = drop (n - available)
        | otherwise = yield () (toChunks (TL.drop (fromInteger n) buffered))
      where
        buffered = TL.fromChunks chunks
        available = toInteger (TL.length buffered)

-- | @dropWhile p@ ignores input from the stream until the first character
-- which does not match the predicate.
--
-- The first non-matching character, and everything after it, is handed
-- back to the stream as left-over input.
--
-- Since: 0.4.5
dropWhile :: Monad m => (Char -> Bool) -> Iteratee T.Text m ()
dropWhile p = continue step where
    step EOF = yield () EOF
    step (Chunks chunks)
        | TL.null remaining = continue step
        | otherwise = yield () (toChunks remaining)
      where
        remaining = TL.dropWhile p (TL.fromChunks chunks)

-- | @take n@ extracts the next /n/ characters from the stream, as a lazy
-- Text.
--
-- A non-positive /n/ yields an empty Text without consuming input. If the
-- stream ends early, whatever was collected so far is returned.
--
-- Since: 0.4.5
take :: Monad m => Integer -> Iteratee T.Text m TL.Text
take n
    | n <= 0 = return TL.empty
    | otherwise = continue (step id n)
  where
    -- @prefix@ is a difference list of the text collected so far;
    -- @wanted@ is the number of characters still to be collected.
    step prefix _ EOF = yield (prefix TL.empty) EOF
    step prefix wanted (Chunks chunks)
        | available < wanted =
            continue (step (prefix . TL.append buffered) (wanted - available))
        | otherwise = yield (prefix taken) (toChunks leftover)
      where
        buffered = TL.fromChunks chunks
        available = toInteger (TL.length buffered)
        (taken, leftover) = TL.splitAt (fromInteger wanted) buffered

-- | @takeWhile p@ extracts input from the stream until the first character
-- which does not match the predicate.
--
-- The first non-matching character, and everything after it, is handed
-- back to the stream as left-over input.
--
-- Since: 0.4.5
takeWhile :: Monad m => (Char -> Bool) -> Iteratee T.Text m TL.Text
takeWhile p = continue (step id) where
    step prefix EOF = yield (prefix TL.empty) EOF
    step prefix (Chunks []) = continue (step prefix)
    step prefix (Chunks chunks)
        | TL.null rest = continue (step (prefix . TL.append buffered))
        | otherwise = yield (prefix matched) (toChunks rest)
      where
        buffered = TL.fromChunks chunks
        (matched, rest) = tlSpanBy p buffered

-- | Read all remaining input from the stream, and return as a lazy
-- Text.
--
-- Since: 0.4.5
consume :: Monad m => Iteratee T.Text m TL.Text
consume = continue (step id) where
    -- @prefix@ is a difference list of everything received so far.
    step prefix EOF = yield (prefix TL.empty) EOF
    step prefix (Chunks []) = continue (step prefix)
    step prefix (Chunks chunks) =
        continue (step (prefix . TL.append (TL.fromChunks chunks)))

-- | @require n@ buffers input until at least /n/ characters are available,
-- or throws an error if the stream ends early.
--
-- No input is consumed: once enough characters have arrived, everything
-- that was buffered is handed back to the stream as left-over input.
--
-- Since: 0.4.5
require :: Monad m => Integer -> Iteratee T.Text m ()
require n
    | n <= 0 = return ()
    | otherwise = continue (step id n)
  where
    step _ _ EOF = throwError (Exc.ErrorCall "require: Unexpected EOF")
    step prefix missing (Chunks chunks)
        | available < missing =
            continue (step (prefix . TL.append buffered) (missing - available))
        | otherwise = yield () (toChunks (prefix buffered))
      where
        buffered = TL.fromChunks chunks
        available = toInteger (TL.length buffered)

-- | @isolate n@ reads at most /n/ characters from the stream, and passes
-- them to its iteratee. If the iteratee finishes early, characters continue
-- to be consumed from the outer stream until /n/ have been consumed.
--
-- Since: 0.4.5
isolate :: Monad m => Integer -> Enumeratee T.Text T.Text m b
isolate n step | n <= 0 = return step
isolate n (Continue k) = continue feed where
    feed EOF = k EOF >>== (\inner -> yield inner EOF)
    feed (Chunks []) = continue feed
    feed (Chunks chunks)
        -- The whole chunk fits in the remaining budget.
        | available <= n = k (Chunks chunks) >>== isolate (n - available)
        -- Budget exhausted inside this chunk: pass on the allowed part and
        -- give the surplus back to the outer stream.
        | otherwise =
            k (toChunks allowed) >>== (\inner -> yield inner (toChunks surplus))
      where
        buffered = TL.fromChunks chunks
        available = toInteger (TL.length buffered)
        (allowed, surplus) = TL.splitAt (fromInteger n) buffered
-- The inner iteratee has already finished: discard the rest of the budget.
isolate n step = drop n >> return step

-- | A named pair of conversion functions between text and bytes.
data Codec = Codec
    { codecName :: T.Text
    -- ^ Name of the codec; used by the 'Show' instance.
    , codecEncode
        :: T.Text
        -> (B.ByteString, Maybe (Exc.SomeException, T.Text))
    -- ^ Encode a chunk of text. The result holds the encoded bytes and,
    -- when encoding stopped early, an exception paired with the text that
    -- was not encoded.
    , codecDecode
        :: B.ByteString
        -> (T.Text, Either (Exc.SomeException, B.ByteString) B.ByteString)
    -- ^ Decode a chunk of bytes. The result holds the decoded text and
    -- either an exception paired with the bytes that were not decoded
    -- ('Left'), or left-over bytes to be retried together with later
    -- input ('Right').
    }

instance Show Codec where
    showsPrec prec codec =
        showParen (prec > 10) (showString "Codec " . shows (codecName codec))

-- | Convert text into bytes, using the provided codec.
-- If the codec is not capable of representing an input character, an error
-- will be thrown.
--
-- Each input chunk is encoded on its own. When the codec stops part-way
-- through a chunk, the bytes encoded up to that point are still passed to
-- the inner iteratee; the codec's exception is thrown afterwards if the
-- inner iteratee is still accepting input.
--
-- Since: 0.2
encode :: Monad m => Codec -> Enumeratee T.Text B.ByteString m b
encode codec = checkDone (continue . step) where
    step k EOF = yield (Continue k) EOF
    step k (Chunks xs) = loop k xs

    loop k [] = continue (step k)
    loop k (x:xs)
        | B.null bytes = checkError k
        | otherwise = k (Chunks [bytes]) >>== checkDoneEx extraChunks checkError
      where
        (bytes, extra) = codecEncode codec x

        -- Input handed back to the outer stream if the inner iteratee
        -- finishes after receiving @bytes@.
        extraChunks = Chunks $ case extra of
            Nothing -> xs
            Just (_, text) -> text : xs

        checkError k' = case extra of
            Nothing -> loop k' xs
            Just (exc, _) -> throwError exc

-- | Convert bytes into text, using the provided codec. If the codec is
-- not capable of decoding an input byte sequence, an error will be thrown.
--
-- Bytes that form an incomplete trailing sequence are buffered and retried
-- together with the next chunk. If the stream ends while such bytes are
-- still buffered, the input was truncated and an error is thrown rather
-- than silently discarding them.
--
-- Since: 0.2
decode :: Monad m => Codec -> Enumeratee B.ByteString T.Text m b
decode codec = checkDone (continue . step B.empty) where
    step acc k EOF
        | B.null acc = yield (Continue k) EOF
        | otherwise = throwError (Exc.ErrorCall "Unexpected EOF while decoding")
    step acc k (Chunks xs) = loop acc k xs

    -- @acc@ holds the undecoded bytes left over from earlier chunks.
    loop acc k [] = continue (step acc k)
    loop acc k (x:xs)
        | T.null text = checkError k
        | otherwise = k (Chunks [text]) >>== checkDoneEx extraChunks checkError
      where
        (text, extra) = codecDecode codec (B.append acc x)

        -- Input handed back to the outer stream if the inner iteratee
        -- finishes after receiving @text@.
        extraChunks = Chunks (either snd id extra : xs)

        checkError k' = case extra of
            Left (exc, _) -> throwError exc
            Right bytes -> loop bytes k' xs

-- Every way of splitting the input in two, from the longest prefix
-- (the whole input) down to the empty prefix.
byteSplits :: B.ByteString -> [(B.ByteString, B.ByteString)]
byteSplits bytes = [B.splitAt n bytes | n <- [len, len - 1 .. 0]] where
    len = B.length bytes

-- Fallback used when the fast splitter of a codec rejects its input: find
-- the longest prefix that the decoder accepts, and report why the rest
-- could not be decoded.
splitSlowly :: (B.ByteString -> T.Text) -> B.ByteString -> (T.Text, Either (Exc.SomeException, B.ByteString) B.ByteString)
splitSlowly dec bytes = valid where
    valid = firstValid (Prelude.map decFirst splits)
    splits = byteSplits bytes

    -- The last split has an empty prefix, so as long as the decoder accepts
    -- empty input there is always at least one candidate.
    firstValid = Prelude.head . catMaybes
    tryDec = tryEvaluate . dec

    decFirst (a, b) = case tryDec a of
        Left _ -> Nothing
        Right text -> Just (text, case tryDec b of
            Left exc -> Left (exc, b)

            -- this case shouldn't occur, since splitSlowly
            -- is only called when parsing failed somewhere
            Right _ -> Right B.empty)

-- | UTF-8.
utf8 :: Codec
utf8 = Codec name enc dec where
    name = T.pack "UTF-8"
    enc text = (TE.encodeUtf8 text, Nothing)
    dec bytes = case splitQuickly bytes of
        Just (text, extra) -> (text, Right extra)
        Nothing -> splitSlowly TE.decodeUtf8 bytes

    -- Walk the input one encoded character at a time, using only the lead
    -- byte of each character, and cut before a character whose encoding
    -- runs past the end of the input.
    splitQuickly bytes = loop 0 >>= maybeDecode where
        required x0
            | x0 .&. 0x80 == 0x00 = 1
            | x0 .&. 0xE0 == 0xC0 = 2
            | x0 .&. 0xF0 == 0xE0 = 3
            | x0 .&. 0xF8 == 0xF0 = 4

            -- Invalid input; let Text figure it out
            | otherwise = 0

        maxN = B.length bytes

        loop n
            | n == maxN = Just (TE.decodeUtf8 bytes, B.empty)
            | req == 0 = Nothing
            | n + req > maxN = Just (first TE.decodeUtf8 (B.splitAt n bytes))
            | otherwise = loop $! n + req
          where
            req = required (B.index bytes n)

-- | UTF-16, little-endian.
utf16_le :: Codec
utf16_le = Codec name enc dec where
    name = T.pack "UTF-16-LE"
    enc text = (TE.encodeUtf16LE text, Nothing)
    dec bytes = case splitQuickly bytes of
        Just (text, extra) -> (text, Right extra)
        Nothing -> splitSlowly TE.decodeUtf16LE bytes

    -- The first byte of each code unit is the low byte.
    splitQuickly = utf16SplitBytes TE.decodeUtf16LE utf16Required

-- | UTF-16, big-endian.
utf16_be :: Codec
utf16_be = Codec name enc dec where
    name = T.pack "UTF-16-BE"
    enc text = (TE.encodeUtf16BE text, Nothing)
    dec bytes = case splitQuickly bytes of
        Just (text, extra) -> (text, Right extra)
        Nothing -> splitSlowly TE.decodeUtf16BE bytes

    -- The first byte of each code unit is the high byte.
    splitQuickly = utf16SplitBytes TE.decodeUtf16BE (flip utf16Required)

-- Split UTF-16 input into a decodable prefix and an incomplete tail.
--
-- The second argument receives the two bytes of the code unit at the
-- current offset, in stream order, and returns how many bytes the character
-- starting there occupies. The code unit examined must be the one at the
-- current offset: looking only at the start of the buffer misses surrogate
-- pairs that are cut by the end of the input.
utf16SplitBytes
    :: (B.ByteString -> T.Text)
    -> (Word8 -> Word8 -> Int)
    -> B.ByteString
    -> Maybe (T.Text, B.ByteString)
utf16SplitBytes dec unitSize bytes = maybeDecode (loop 0) where
    maxN = B.length bytes

    loop n
        | n == maxN = (dec bytes, B.empty)
        -- A single dangling byte: half of a code unit.
        | n + 1 == maxN = decodeTo n
        -- A high surrogate whose low surrogate has not arrived yet.
        | n + req > maxN = decodeTo n
        | otherwise = loop $! n + req
      where
        -- Only forced after the guards above have established that both
        -- offsets are inside the buffer.
        req = unitSize (B.index bytes n) (B.index bytes (n + 1))

    decodeTo n = first dec (B.splitAt n bytes)

-- Number of bytes occupied by the character whose first UTF-16 code unit
-- has low byte @x0@ and high byte @x1@: 4 for a high surrogate
-- (0xD800-0xDBFF), otherwise 2.
utf16Required :: Word8 -> Word8 -> Int
utf16Required x0 x1 = required where
    required = if x >= 0xD800 && x <= 0xDBFF
        then 4
        else 2
    x :: Word16
    x = (fromIntegral x1 `shiftL` 8) .|. fromIntegral x0

-- | UTF-32, little-endian.
utf32_le :: Codec
utf32_le = Codec name enc dec where
    name = T.pack "UTF-32-LE"
    enc text = (TE.encodeUtf32LE text, Nothing)
    dec bs = case utf32SplitBytes TE.decodeUtf32LE bs of
        Just (text, extra) -> (text, Right extra)
        Nothing -> splitSlowly TE.decodeUtf32LE bs

-- | UTF-32, big-endian.
utf32_be :: Codec
utf32_be = Codec name enc dec where
    name = T.pack "UTF-32-BE"
    enc text = (TE.encodeUtf32BE text, Nothing)
    dec bs = case utf32SplitBytes TE.decodeUtf32BE bs of
        Just (text, extra) -> (text, Right extra)
        Nothing -> splitSlowly TE.decodeUtf32BE bs

-- Split UTF-32 input at the last multiple of four bytes; the remainder is
-- an incomplete code unit.
utf32SplitBytes :: (B.ByteString -> T.Text) -> B.ByteString -> Maybe (T.Text, B.ByteString)
utf32SplitBytes dec bytes = split where
    split = maybeDecode (dec toDecode, extra)
    len = B.length bytes
    lenExtra = mod len 4
    lenToDecode = len - lenExtra
    (toDecode, extra) = if lenExtra == 0
        then (bytes, B.empty)
        else B.splitAt lenToDecode bytes

-- | ASCII: characters and bytes up to 0x7F only.
ascii :: Codec
ascii = Codec name enc dec where
    name = T.pack "ASCII"
    enc text = (bytes, extra) where
        (safe, bad) = tSpanBy (\c -> ord c <= 0x7F) text
        bytes = B8.pack (T.unpack safe)
        extra = if T.null bad
            then Nothing
            else Just (illegalEnc name (T.head bad), bad)
    dec bytes = (text, extra) where
        (safe, bad) = B.span (<= 0x7F) bytes
        text = T.pack (B8.unpack safe)
        extra = if B.null bad
            then Right B.empty
            else Left (illegalDec name (B.head bad), bad)

-- | ISO-8859-1: characters up to 0xFF; every byte decodes.
iso8859_1 :: Codec
iso8859_1 = Codec name enc dec where
    name = T.pack "ISO-8859-1"
    enc text = (bytes, extra) where
        (safe, bad) = tSpanBy (\c -> ord c <= 0xFF) text
        bytes = B8.pack (T.unpack safe)
        extra = if T.null bad
            then Nothing
            else Just (illegalEnc name (T.head bad), bad)
    dec bytes = (T.pack (B8.unpack bytes), Right B.empty)

-- Exception describing a character the named codec cannot encode.
illegalEnc :: T.Text -> Char -> Exc.SomeException
illegalEnc name c = Exc.toException . Exc.ErrorCall $
    concat [ "Codec ", show name, " can't encode character ", reprChar c ]

-- Exception describing a byte the named codec cannot decode.
illegalDec :: T.Text -> Word8 -> Exc.SomeException
illegalDec name w = Exc.toException . Exc.ErrorCall $
    concat [ "Codec ", show name, " can't decode byte ", reprWord w ]

-- Evaluate a pure value to weak head normal form, capturing any exception
-- thrown while doing so. 'unsafePerformIO' is used only to run
-- 'Exc.evaluate' on the pure argument.
tryEvaluate :: a -> Either Exc.SomeException a
tryEvaluate = unsafePerformIO . Exc.try . Exc.evaluate

-- Keep the pair only if evaluating its first component does not throw.
maybeDecode :: (a, b) -> Maybe (a, b)
maybeDecode (a, b) = case tryEvaluate a of
    Left _ -> Nothing
    Right _ -> Just (a, b)