\section{Text}

:f Data/Enumerator/Text.hs
|Data.Enumerator.Text module header|
module Data.Enumerator.Text (
	|Data.Enumerator.Text exports|
	) where
import qualified Prelude
import Prelude hiding (head, drop, takeWhile)
import Data.Enumerator hiding (head, drop)
import qualified Data.Text as T
|Data.Enumerator.Text imports|
:

\subsection{IO}

Reading text is similar to reading bytes, but the enumerators have slightly
different behavior -- instead of reading in fixed-size chunks of data, the
text enumerators read in lines. This matches similar text-based {\sc api}s,
such as Python's {\tt xreadlines()}.

:d Data.Enumerator.Text imports
import Data.Enumerator.Util (tryStep)
import qualified Data.Text.IO as TIO

import qualified Control.Exception as Exc
import Control.Monad.IO.Class (MonadIO)
import qualified System.IO as IO
import System.IO.Error (isEOFError)
:

:f Data/Enumerator/Text.hs
|apidoc Data.Enumerator.Text.enumHandle|
enumHandle :: MonadIO m => IO.Handle
           -> Enumerator T.Text m b
enumHandle h = loop where
	loop (Continue k) = withText $ \maybeText ->
		case maybeText of
			Nothing -> continue k
			Just text -> k (Chunks [text]) >>== loop
	
	loop step = returnI step
	withText = tryStep $ Exc.catch
		(Just `fmap` TIO.hGetLine h)
		(\err -> if isEOFError err
			then return Nothing
			else Exc.throwIO err)
:

:f Data/Enumerator/Text.hs
|apidoc Data.Enumerator.Text.enumFile|
enumFile :: FilePath -> Enumerator T.Text IO b
enumFile path = enum where
	withHandle = tryStep (IO.openFile path IO.ReadMode)
	enum step = withHandle $ \h -> Iteratee $ Exc.finally
		(runIteratee (enumHandle h step))
		(IO.hClose h)
:

:f Data/Enumerator/Text.hs
|apidoc Data.Enumerator.Text.iterHandle|
iterHandle :: MonadIO m => IO.Handle
           -> Iteratee T.Text m ()
iterHandle h = continue step where
	step EOF = yield () EOF
	step (Chunks []) = continue step
	step (Chunks chunks) = let
		put = mapM_ (TIO.hPutStr h) chunks
		in tryStep put (\_ -> continue step)
:

:d Data.Enumerator.Text exports
  -- * Text IO
  enumHandle
, enumFile
, iterHandle
:

\subsection{List analogues}

:d Data.Enumerator.Text imports
import qualified Data.Text.Lazy as TL
:

:f Data/Enumerator/Text.hs
toChunks :: TL.Text -> Stream T.Text
toChunks = Chunks . TL.toChunks
:

:f Data/Enumerator/Text.hs
|apidoc Data.Enumerator.Text.head|
head :: Monad m => Iteratee T.Text m (Maybe Char)
head = continue loop where
	loop (Chunks xs) = case TL.uncons (TL.fromChunks xs) of
		Just (char, extra) -> yield (Just char) (toChunks extra)
		Nothing -> head
	loop EOF = yield Nothing EOF
:

:f Data/Enumerator/Text.hs
|apidoc Data.Enumerator.Text.drop|
drop :: Monad m => Integer -> Iteratee T.Text m ()
drop n | n <= 0 = return ()
drop n = continue (loop n) where
	loop n' (Chunks xs) = iter where
		lazy = TL.fromChunks xs
		len = toInteger (TL.length lazy)
		iter = if len < n'
			then drop (n' - len)
			else yield () (toChunks (TL.drop (fromInteger n') lazy))
	loop _ EOF = yield () EOF
:

:f Data/Enumerator/Text.hs
|apidoc Data.Enumerator.Text.dropWhile|
dropWhile :: Monad m => (Char -> Bool) -> Iteratee T.Text m ()
dropWhile p = continue loop where
	loop (Chunks xs) = iter where
		lazy = TL.dropWhile p (TL.fromChunks xs)
		iter = if TL.null lazy
			then continue loop
			else yield () (toChunks lazy)
	loop EOF = yield () EOF
:

:f Data/Enumerator/Text.hs
|apidoc Data.Enumerator.Text.take|
take :: Monad m => Integer -> Iteratee T.Text m TL.Text
take n | n <= 0 = return TL.empty
take n = continue (loop id n) where
	loop acc n' (Chunks xs) = iter where
		lazy = TL.fromChunks xs
		len = toInteger (TL.length lazy)
		
		iter = if len < n'
			then continue (loop (acc . (TL.append lazy)) (n' - len))
			else let
				(xs', extra) = TL.splitAt (fromInteger n') lazy
				in yield (acc xs') (toChunks extra)
	loop acc _ EOF = yield (acc TL.empty) EOF
:

:f Data/Enumerator/Text.hs
|apidoc Data.Enumerator.Text.takeWhile|
takeWhile :: Monad m => (Char -> Bool) -> Iteratee T.Text m TL.Text
takeWhile p = continue (loop id) where
	loop acc (Chunks []) = continue (loop acc)
	loop acc (Chunks xs) = iter where
		lazy = TL.fromChunks xs
		(xs', extra) = tlSpanBy p lazy
		iter = if TL.null extra
			then continue (loop (acc . (TL.append lazy)))
			else yield (acc xs') (toChunks extra)
	loop acc EOF = yield (acc TL.empty) EOF
:

:f Data/Enumerator/Text.hs
|apidoc Data.Enumerator.Text.consume|
consume :: Monad m => Iteratee T.Text m TL.Text
consume = continue (loop id) where
	loop acc (Chunks []) = continue (loop acc)
	loop acc (Chunks xs) = iter where
		lazy = TL.fromChunks xs
		iter = continue (loop (acc . (TL.append lazy)))
	loop acc EOF = yield (acc TL.empty) EOF
:

:f Data/Enumerator/Text.hs
|apidoc Data.Enumerator.Text.require|
require :: Monad m => Integer -> Iteratee T.Text m ()
require n | n <= 0 = return ()
require n = continue (loop id n) where
	loop acc n' (Chunks xs) = iter where
		lazy = TL.fromChunks xs
		len = toInteger (TL.length lazy)
		iter = if len < n'
			then continue (loop (acc . (TL.append lazy)) (n' - len))
			else yield () (toChunks (acc lazy))
	loop _ _ EOF = throwError (Exc.ErrorCall "require: Unexpected EOF")
:

Same caveats as {\tt Data.Enumerator.List.isolate}

:f Data/Enumerator/Text.hs
|apidoc Data.Enumerator.Text.isolate|
isolate :: Monad m => Integer -> Enumeratee T.Text T.Text m b
isolate n step | n <= 0 = return step
isolate n (Continue k) = continue loop where
	loop (Chunks []) = continue loop
	loop (Chunks xs) = iter where
		lazy = TL.fromChunks xs
		len = toInteger (TL.length lazy)
		
		iter = if len <= n
			then k (Chunks xs) >>== isolate (n - len)
			else let
				(s1, s2) = TL.splitAt (fromInteger n) lazy
				in k (toChunks s1) >>== (\step -> yield step (toChunks s2))
	loop EOF = k EOF >>== (\step -> yield step EOF)
isolate n step = drop n >> return step
:

:d Data.Enumerator.Text exports
-- * List analogues
, Data.Enumerator.Text.head
, Data.Enumerator.Text.drop
, Data.Enumerator.Text.dropWhile
, Data.Enumerator.Text.take
, Data.Enumerator.Text.takeWhile
, Data.Enumerator.Text.consume
, require
, isolate
:

\subsection{Codecs}

Many protocols need the non-blocking input behavior of binary \io{}, but
are defined in terms of unicode characters. The {\tt encode} and
{\tt decode} enumeratees allow text-based protocols to be easily parsed
from a binary input source.

Most common codecs ({\sc utf-8}, {\sc iso-8859-1}, {\sc ascii}) are
supported; more complex codecs can be implemented by bindings to libraries
such as libicu.

All of the codecs here are incremental; that is, they try to read as much
data as possible, but no more. This allows iteratees to read partial data
if the input stream contains invalid data.

:d Data.Enumerator.Text imports
import qualified Data.ByteString as B
import Data.Enumerator.Util (tSpanBy, tlSpanBy, reprWord, reprChar)
:

:d Data.Enumerator.Text exports
  -- * Codecs
, Codec
, encode
, decode
|text codec exports|
:

:f Data/Enumerator/Text.hs
data Codec = Codec
	{ codecName :: T.Text
	, codecEncode
		:: T.Text
		-> (B.ByteString, Maybe (Exc.SomeException, T.Text))
	, codecDecode
		:: B.ByteString
		-> (T.Text, Either
			(Exc.SomeException, B.ByteString)
			B.ByteString)
	}

instance Show Codec where
	showsPrec d c = showParen (d > 10) $
		showString "Codec " . shows (codecName c)
:

:f Data/Enumerator/Text.hs
|apidoc Data.Enumerator.Text.encode|
encode :: Monad m => Codec
       -> Enumeratee T.Text B.ByteString m b
encode codec = checkDone (continue . step) where
	step k EOF = yield (Continue k) EOF
	step k (Chunks xs) = loop k xs
	
	loop k [] = continue (step k)
	loop k (x:xs) = let
		(bytes, extra) = codecEncode codec x
		extraChunks = Chunks $ case extra of
			Nothing -> xs
			Just (_, text) -> text:xs
		
		checkError k' = case extra of
			Nothing -> loop k' xs
			Just (exc, _) -> throwError exc
		
		in if B.null bytes
			then checkError k
			else k (Chunks [bytes]) >>==
				checkDoneEx extraChunks checkError
:

:f Data/Enumerator/Text.hs
|apidoc Data.Enumerator.Text.decode|
decode :: Monad m => Codec
       -> Enumeratee B.ByteString T.Text m b
decode codec = checkDone (continue . step B.empty) where
	step _   k EOF = yield (Continue k) EOF
	step acc k (Chunks xs) = loop acc k xs
	
	loop acc k [] = continue (step acc k)
	loop acc k (x:xs) = let
		(text, extra) = codecDecode codec (B.append acc x)
		extraChunks = Chunks (either snd id extra : xs)
		
		checkError k' = case extra of
			Left (exc, _) -> throwError exc
			Right bytes -> loop bytes k' xs
		
		in if T.null text
			then checkError k
			else k (Chunks [text]) >>==
				checkDoneEx extraChunks checkError
:

Most of the codecs here need to perform at least basic bitbashing, to
calculate how many input bytes will be needed for the next character.

:d Data.Enumerator.Text imports
import Control.Arrow (first)
import Data.Bits ((.&.), (.|.), shiftL)
import Data.Char (ord)
import Data.Word (Word8, Word16)
import qualified Data.ByteString.Char8 as B8
import qualified Data.Text.Encoding as TE
:

The variable-width decoders all follow the same basic pattern. First,
they examine their input to calculate how many bytes the decoder
function should accept. Next they try to decode it -- if the input
is valid, decoding is finished.

If the input is invalid, trying to decode the full input will throw
an exception. When an exception is caught, decoding is passed off to
{\tt splitSlowly} for a more careful parse. The input is reduced until
the decoder can parse something, and the rest of the bytes are stored
for later. An error will only be thrown if the iteratee requires input,
but there are no valid bytes remaining.

:f Data/Enumerator/Text.hs
byteSplits :: B.ByteString
           -> [(B.ByteString, B.ByteString)]
byteSplits bytes = loop (B.length bytes) where
	loop 0 = [(B.empty, bytes)]
	loop n = B.splitAt n bytes : loop (n - 1)
:

:d Data.Enumerator.Text imports
import Data.Maybe (catMaybes)
:

:f Data/Enumerator/Text.hs
splitSlowly :: (B.ByteString -> T.Text)
            -> B.ByteString
            -> (T.Text, Either
            	(Exc.SomeException, B.ByteString)
            	B.ByteString)
splitSlowly dec bytes = valid where
	valid = firstValid (Prelude.map decFirst splits)
	splits = byteSplits bytes
	firstValid = Prelude.head . catMaybes
	tryDec = tryEvaluate . dec
	
	decFirst (a, b) = case tryDec a of
		Left _ -> Nothing
		Right text -> Just (text, case tryDec b of
			Left exc -> Left (exc, b)
			
			-- this case shouldn't occur, since splitSlowly
			-- is only called when parsing failed somewhere
			Right _ -> Right B.empty)
:

\subsubsection{UTF-8}

:d text codec exports
, utf8
:

:f Data/Enumerator/Text.hs
utf8 :: Codec
utf8 = Codec name enc dec where
	name = T.pack "UTF-8"
	enc text = (TE.encodeUtf8 text, Nothing)
	dec bytes = case splitQuickly bytes of
		Just (text, extra) -> (text, Right extra)
		Nothing -> splitSlowly TE.decodeUtf8 bytes
	|utf8 split bytes|
:

:d utf8 split bytes
splitQuickly bytes = loop 0 >>= maybeDecode where
	|utf8 required bytes count|
	maxN = B.length bytes
	
	loop n | n == maxN = Just (TE.decodeUtf8 bytes, B.empty)
	loop n = let
		req = required (B.index bytes n)
		tooLong = first TE.decodeUtf8 (B.splitAt n bytes)
		decodeMore = loop $! n + req
		in if req == 0
			then Nothing
			else if n + req > maxN
				then Just tooLong
				else decodeMore
:

:d utf8 required bytes count
required x0
	| x0 .&. 0x80 == 0x00 = 1
	| x0 .&. 0xE0 == 0xC0 = 2
	| x0 .&. 0xF0 == 0xE0 = 3
	| x0 .&. 0xF8 == 0xF0 = 4
	
	-- Invalid input; let Text figure it out
	| otherwise           = 0
:

\subsubsection{UTF-16}

:d text codec exports
, utf16_le
, utf16_be
:

:f Data/Enumerator/Text.hs
utf16_le :: Codec
utf16_le = Codec name enc dec where
	name = T.pack "UTF-16-LE"
	enc text = (TE.encodeUtf16LE text, Nothing)
	dec bytes = case splitQuickly bytes of
		Just (text, extra) -> (text, Right extra)
		Nothing -> splitSlowly TE.decodeUtf16LE bytes
	|utf16-le split bytes|
:

:f Data/Enumerator/Text.hs
utf16_be :: Codec
utf16_be = Codec name enc dec where
	name = T.pack "UTF-16-BE"
	enc text = (TE.encodeUtf16BE text, Nothing)
	dec bytes = case splitQuickly bytes of
		Just (text, extra) -> (text, Right extra)
		Nothing -> splitSlowly TE.decodeUtf16BE bytes
	|utf16-be split bytes|
:

:d utf16-le split bytes
splitQuickly bytes = maybeDecode (loop 0) where
	maxN = B.length bytes
	
	loop n |  n      == maxN = decodeAll
	       | (n + 1) == maxN = decodeTo n
	loop n = let
		req = utf16Required
			(B.index bytes 0)
			(B.index bytes 1)
		decodeMore = loop $! n + req
		in if n + req > maxN
			then decodeTo n
			else decodeMore
	
	decodeTo n = first TE.decodeUtf16LE (B.splitAt n bytes)
	decodeAll = (TE.decodeUtf16LE bytes, B.empty)
:

:d utf16-be split bytes
splitQuickly bytes = maybeDecode (loop 0) where
	maxN = B.length bytes
	
	loop n |  n      == maxN = decodeAll
	       | (n + 1) == maxN = decodeTo n
	loop n = let
		req = utf16Required
			(B.index bytes 1)
			(B.index bytes 0)
		decodeMore = loop $! n + req
		in if n + req > maxN
			then decodeTo n
			else decodeMore
	
	decodeTo n = first TE.decodeUtf16BE (B.splitAt n bytes)
	decodeAll = (TE.decodeUtf16BE bytes, B.empty)
:

:f Data/Enumerator/Text.hs
utf16Required :: Word8 -> Word8 -> Int
utf16Required x0 x1 = required where
	required = if x >= 0xD800 && x <= 0xDBFF
		then 4
		else 2
	x :: Word16
	x = (fromIntegral x1 `shiftL` 8) .|. fromIntegral x0
:

\subsubsection{UTF-32}

:d text codec exports
, utf32_le
, utf32_be
:

:f Data/Enumerator/Text.hs
utf32_le :: Codec
utf32_le = Codec name enc dec where
	name = T.pack "UTF-32-LE"
	enc text = (TE.encodeUtf32LE text, Nothing)
	dec bs = case utf32SplitBytes TE.decodeUtf32LE bs of
		Just (text, extra) -> (text, Right extra)
		Nothing -> splitSlowly TE.decodeUtf32LE bs

utf32_be :: Codec
utf32_be = Codec name enc dec where
	name = T.pack "UTF-32-BE"
	enc text = (TE.encodeUtf32BE text, Nothing)
	dec bs = case utf32SplitBytes TE.decodeUtf32BE bs of
		Just (text, extra) -> (text, Right extra)
		Nothing -> splitSlowly TE.decodeUtf32BE bs
:

:f Data/Enumerator/Text.hs
utf32SplitBytes :: (B.ByteString -> T.Text)
                -> B.ByteString
                -> Maybe (T.Text, B.ByteString)
utf32SplitBytes dec bytes = split where
	split = maybeDecode (dec toDecode, extra)
	len = B.length bytes
	lenExtra = mod len 4
	
	lenToDecode = len - lenExtra
	(toDecode, extra) = if lenExtra == 0
		then (bytes, B.empty)
		else B.splitAt lenToDecode bytes
:

\subsubsection{ASCII}

:d text codec exports
, ascii
:

:f Data/Enumerator/Text.hs
ascii :: Codec
ascii = Codec name enc dec where
	name = T.pack "ASCII"
	enc text = (bytes, extra) where
		(safe, unsafe) = tSpanBy (\c -> ord c <= 0x7F) text
		bytes = B8.pack (T.unpack safe)
		extra = if T.null unsafe
			then Nothing
			else Just (illegalEnc name (T.head unsafe), unsafe)
	
	dec bytes = (text, extra) where
		(safe, unsafe) = B.span (<= 0x7F) bytes
		text = T.pack (B8.unpack safe)
		extra = if B.null unsafe
			then Right B.empty
			else Left (illegalDec name (B.head unsafe), unsafe)
:

\subsubsection{ISO 8859-1}

:d text codec exports
, iso8859_1
:

:f Data/Enumerator/Text.hs
iso8859_1 :: Codec
iso8859_1 = Codec name enc dec where
	name = T.pack "ISO-8859-1"
	enc text = (bytes, extra) where
		(safe, unsafe) = tSpanBy (\c -> ord c <= 0xFF) text
		bytes = B8.pack (T.unpack safe)
		extra = if T.null unsafe
			then Nothing
			else Just (illegalEnc name (T.head unsafe), unsafe)
	
	dec bytes = (T.pack (B8.unpack bytes), Right B.empty)
:

\subsection{Encoding Utilities}

:f Data/Enumerator/Text.hs
illegalEnc :: T.Text -> Char -> Exc.SomeException
illegalEnc name c = Exc.toException . Exc.ErrorCall $
	concat [ "Codec "
	       , show name
	       , " can't encode character "
	       , reprChar c
	       ]
:

:f Data/Enumerator/Text.hs
illegalDec :: T.Text -> Word8 -> Exc.SomeException
illegalDec name w = Exc.toException . Exc.ErrorCall $
	concat [ "Codec "
	       , show name
	       , " can't decode byte "
	       , reprWord w
	       ]
:

:d Data.Enumerator.Text imports
import System.IO.Unsafe (unsafePerformIO)
:

:f Data/Enumerator/Text.hs
tryEvaluate :: a -> Either Exc.SomeException a
tryEvaluate = unsafePerformIO . Exc.try . Exc.evaluate

maybeDecode:: (a, b) -> Maybe (a, b)
maybeDecode (a, b) = case tryEvaluate a of
	Left _ -> Nothing
	Right _ -> Just (a, b)
: