{-# LANGUAGE BangPatterns, CPP, MagicHash #-} -- | -- Module : Data.Text.Internal.Fusion -- Copyright : (c) Tom Harper 2008-2009, -- (c) Bryan O'Sullivan 2009-2010, -- (c) Duncan Coutts 2009 -- -- License : BSD-style -- Maintainer : bos@serpentine.com -- Stability : experimental -- Portability : GHC -- -- /Warning/: this is an internal module, and does not have a stable -- API or name. Functions in this module may not check or enforce -- preconditions expected by public modules. Use at your own risk! -- -- Text manipulation functions represented as fusible operations over -- streams. module Data.Text.Internal.Fusion ( -- * Types Stream(..) , Step(..) -- * Creation and elimination , stream , unstream , reverseStream , length -- * Transformations , reverse -- * Construction -- ** Scans , reverseScanr -- ** Accumulating maps , mapAccumL -- ** Generation and unfolding , unfoldrN -- * Indexing , index , findIndex , countChar ) where import Prelude (Bool(..), Char, Maybe(..), Monad(..), Int, Num(..), Ord(..), ($), otherwise) import Data.Bits (shiftL, shiftR) import Data.Text.Internal (Text(..)) import Data.Text.Internal.Private (runText) import Data.Text.Internal.Unsafe.Char (unsafeChr8, unsafeWrite) import qualified Data.Text.Array as A import qualified Data.Text.Internal.Fusion.Common as S import Data.Text.Internal.Fusion.Types import Data.Text.Internal.Fusion.Size import qualified Data.Text.Internal as I import qualified Data.Text.Internal.Encoding.Utf8 as U8 import GHC.Stack (HasCallStack) default(Int) -- | /O(n)/ Convert 'Text' into a 'Stream' 'Char'. -- -- __Properties__ -- -- @'unstream' . 'stream' = 'Data.Function.id'@ -- -- @'stream' . 'unstream' = 'Data.Function.id' @ stream :: #if defined(ASSERTS) HasCallStack => #endif Text -> Stream Char stream (Text arr off len) = Stream next off (betweenSize (len `shiftR` 2) len) where !end = off+len next !i | i >= end = Done | otherwise = Yield chr (i + l) where n0 = A.unsafeIndex arr i n1 = A.unsafeIndex arr (i + 1) n2 = A.unsafeIndex arr (i + 2) n3 = A.unsafeIndex arr (i + 3) l = U8.utf8LengthByLeader n0 chr = case l of 1 -> unsafeChr8 n0 2 -> U8.chr2 n0 n1 3 -> U8.chr3 n0 n1 n2 _ -> U8.chr4 n0 n1 n2 n3 {-# INLINE [0] stream #-} -- | /O(n)/ Converts 'Text' into a 'Stream' 'Char', but iterates -- backwards through the text. -- -- __Properties__ -- -- @'unstream' . 'reverseStream' = 'Data.Text.reverse' @ reverseStream :: Text -> Stream Char reverseStream (Text arr off len) = Stream next (off+len-1) (betweenSize (len `shiftR` 2) len) where {-# INLINE next #-} next !i | i < off = Done | n0 < 0x80 = Yield (unsafeChr8 n0) (i - 1) | n1 >= 0xC0 = Yield (U8.chr2 n1 n0) (i - 2) | n2 >= 0xC0 = Yield (U8.chr3 n2 n1 n0) (i - 3) | otherwise = Yield (U8.chr4 n3 n2 n1 n0) (i - 4) where n0 = A.unsafeIndex arr i n1 = A.unsafeIndex arr (i - 1) n2 = A.unsafeIndex arr (i - 2) n3 = A.unsafeIndex arr (i - 3) {-# INLINE [0] reverseStream #-} -- | /O(n)/ Convert 'Stream' 'Char' into a 'Text'. -- -- __Properties__ -- -- @'unstream' . 'stream' = 'Data.Function.id'@ -- -- @'stream' . 'unstream' = 'Data.Function.id' @ unstream :: Stream Char -> Text unstream (Stream next0 s0 len) = runText $ \done -> do -- Before encoding each char we perform a buffer realloc check assuming -- worst case encoding size of four 8-bit units for the char. Just add an -- extra space to the buffer so that we do not end up reallocating even when -- all the chars are encoded as single unit. let mlen = upperBound 4 len + 3 arr0 <- A.new mlen let outer !arr !maxi = encode where -- keep the common case loop as small as possible encode !si !di = case next0 si of Done -> done arr di Skip si' -> encode si' di Yield c si' -- simply check for the worst case | maxi < di + 3 -> realloc si di | otherwise -> do n <- unsafeWrite arr di c encode si' (di + n) -- keep uncommon case separate from the common case code {-# NOINLINE realloc #-} realloc !si !di = do let newlen = (maxi + 1) * 2 arr' <- A.resizeM arr newlen outer arr' (newlen - 1) si di outer arr0 (mlen - 1) s0 0 {-# INLINE [0] unstream #-} {-# RULES "STREAM stream/unstream fusion" forall s. stream (unstream s) = s #-} -- ---------------------------------------------------------------------------- -- * Basic stream functions -- | /O(n)/ Returns the number of characters in a 'Stream'. -- -- __Properties__ -- -- @'length' . 'stream' = 'Data.Text.length' @ length :: Stream Char -> Int length = S.lengthI {-# INLINE[0] length #-} -- | /O(n)/ Reverse the characters of a 'Stream' returning 'Text'. -- -- __Properties__ -- -- @'reverse' . 'stream' = 'Data.Text.reverse' @ reverse :: #if defined(ASSERTS) HasCallStack => #endif Stream Char -> Text reverse (Stream next s len0) | isEmpty len0 = I.empty | otherwise = I.text arr off' len' where len0' = upperBound 4 (larger len0 4) (arr, (off', len')) = A.run2 (A.new len0' >>= loop s (len0'-1) len0') loop !s0 !i !len marr = case next s0 of Done -> return (marr, (j, len-j)) where j = i + 1 Skip s1 -> loop s1 i len marr Yield x s1 | i < least -> {-# SCC "reverse/resize" #-} do let newLen = len `shiftL` 1 marr' <- A.new newLen A.copyM marr' (newLen-len) marr 0 len _ <- unsafeWrite marr' (len + i - least) x loop s1 (len + i - least - 1) newLen marr' | otherwise -> do _ <- unsafeWrite marr (i - least) x loop s1 (i - least - 1) len marr where least = U8.utf8Length x - 1 {-# INLINE [0] reverse #-} -- | /O(n)/ Perform the equivalent of 'scanr' over a list, only with -- the input and result reversed. -- -- __Properties__ -- -- @'reverse' . 'reverseScanr' f c . 'reverseStream' = 'Data.Text.scanr' f c @ reverseScanr :: (Char -> Char -> Char) -> Char -> Stream Char -> Stream Char reverseScanr f z0 (Stream next0 s0 len) = Stream next (Scan1 z0 s0) (len+1) -- HINT maybe too low where {-# INLINE next #-} next (Scan1 z s) = Yield z (Scan2 z s) next (Scan2 z s) = case next0 s of Yield x s' -> let !x' = f x z in Yield x' (Scan2 x' s') Skip s' -> Skip (Scan2 z s') Done -> Done {-# INLINE reverseScanr #-} -- | /O(n)/ Like 'unfoldr', 'unfoldrN' builds a stream from a seed -- value. However, the length of the result is limited by the -- first argument to 'unfoldrN'. This function is more efficient than -- 'unfoldr' when the length of the result is known. -- -- __Properties__ -- -- @'unstream' ('unfoldrN' n f a) = 'Data.Text.unfoldrN' n f a @ unfoldrN :: Int -> (a -> Maybe (Char,a)) -> a -> Stream Char unfoldrN n = S.unfoldrNI n {-# INLINE [0] unfoldrN #-} ------------------------------------------------------------------------------- -- ** Indexing streams -- | /O(n)/ stream index (subscript) operator, starting from 0. -- -- __Properties__ -- -- @'index' ('stream' t) n = 'Data.Text.index' t n @ index :: HasCallStack => Stream Char -> Int -> Char index = S.indexI {-# INLINE [0] index #-} -- | The 'findIndex' function takes a predicate and a stream and -- returns the index of the first element in the stream -- satisfying the predicate. -- -- __Properties__ -- -- @'findIndex' p . 'stream' = 'Data.Text.findIndex' p @ findIndex :: (Char -> Bool) -> Stream Char -> Maybe Int findIndex = S.findIndexI {-# INLINE [0] findIndex #-} -- | /O(n)/ The 'count' function returns the number of times the query -- element appears in the given stream. -- -- __Properties__ -- -- @'countChar' c . 'stream' = 'Data.Text.countChar' c @ countChar :: Char -> Stream Char -> Int countChar = S.countCharI {-# INLINE [0] countChar #-} -- | /O(n)/ Like a combination of 'map' and 'foldl''. Applies a -- function to each element of a 'Text', passing an accumulating -- parameter from left to right, and returns a final 'Text'. -- -- __Properties__ -- -- @'mapAccumL' g z0 . 'stream' = 'Data.Text.mapAccumL' g z0@ mapAccumL :: #if defined(ASSERTS) HasCallStack => #endif (a -> Char -> (a,Char)) -> a -> Stream Char -> (a, Text) mapAccumL f z0 (Stream next0 s0 len) = (nz, I.text na 0 nl) where (na,(nz,nl)) = A.run2 (A.new mlen >>= \arr -> outer arr mlen z0 s0 0) where mlen = upperBound 4 len outer arr top = loop where loop !z !s !i = case next0 s of Done -> return (arr, (z,i)) Skip s' -> loop z s' i Yield x s' | j >= top -> {-# SCC "mapAccumL/resize" #-} do let top' = (top + 1) `shiftL` 1 arr' <- A.resizeM arr top' outer arr' top' z s i | otherwise -> do d <- unsafeWrite arr i c loop z' s' (i+d) where (z',c) = f z x j = i + U8.utf8Length c - 1 {-# INLINE [0] mapAccumL #-}