-- | -- Module : Foundation.String.UTF8 -- License : BSD-style -- Maintainer : Vincent Hanquez -- Stability : experimental -- Portability : portable -- -- A String type backed by a UTF8 encoded byte array and all the necessary -- functions to manipulate the string. -- -- You can think of String as a specialization of a byte array that -- have element of type Char. -- -- The String data must contain UTF8 valid data. -- {-# LANGUAGE BangPatterns #-} {-# LANGUAGE GeneralizedNewtypeDeriving #-} {-# LANGUAGE MagicHash #-} {-# LANGUAGE NoImplicitPrelude #-} {-# LANGUAGE TypeFamilies #-} {-# LANGUAGE UnboxedTuples #-} {-# LANGUAGE FlexibleContexts #-} {-# LANGUAGE CPP #-} module Foundation.String.UTF8 ( String(..) --, Buffer , create , replicate -- * Binary conversion , Encoding(..) , fromBytes , fromChunkBytes , fromBytesUnsafe , fromBytesLenient , toBytes , mutableValidate , copy , ValidationFailure(..) -- * Legacy utility , lines , words ) where import Foundation.Array.Unboxed (UArray) import qualified Foundation.Array.Unboxed as Vec import Foundation.Array.Unboxed.ByteArray (MutableByteArray) import qualified Foundation.Array.Unboxed.Mutable as MVec import qualified Foundation.Collection as C import Foundation.Collection.Buildable import Foundation.Internal.Base import Foundation.Internal.MonadTrans import Foundation.Internal.Primitive import Foundation.Internal.Types import Foundation.Numerical import Foundation.Primitive.Monad import Foundation.Primitive.Types import Foundation.String.UTF8Table import GHC.Prim import GHC.ST import GHC.Types import GHC.Word import GHC.Char -- temporary import qualified Data.List import Data.Data import qualified Prelude import Foundation.String.ModifiedUTF8 (fromModified) import GHC.CString (unpackCString#, unpackCStringUtf8#) import qualified Foundation.String.Encoding.Encoding as Encoder import qualified Foundation.String.Encoding.ASCII7 as Encoder import qualified Foundation.String.Encoding.UTF16 as Encoder import qualified Foundation.String.Encoding.UTF32 as Encoder import qualified Foundation.String.Encoding.ISO_8859_1 as Encoder -- | Opaque packed array of characters in the UTF8 encoding newtype String = String (UArray Word8) deriving (Typeable, Monoid, Eq, Ord) instance Data String where toConstr s = mkConstr stringType (show s) [] Prefix dataTypeOf _ = stringType gunfold _ _ = error "gunfold" stringType :: DataType stringType = mkNoRepType "Foundation.String" newtype MutableString st = MutableString (MutableByteArray st) deriving (Typeable) instance Show String where show = show . sToList instance IsString String where fromString = sFromList instance IsList String where type Item String = Char fromList = sFromList toList = sToList type instance C.Element String = Char instance C.InnerFunctor String where imap = charMap instance C.Collection String where null = null length = length elem = elem minimum = Data.List.minimum . toList . C.getNonEmpty -- TODO faster implementation maximum = Data.List.maximum . toList . C.getNonEmpty -- TODO faster implementation instance C.Sequential String where take = take drop = drop splitAt = splitAt revTake = revTake revDrop = revDrop revSplitAt = revSplitAt splitOn = splitOn break = break breakElem = breakElem intersperse = intersperse span = span filter = filter reverse = reverse unsnoc = unsnoc uncons = uncons snoc = snoc cons = cons find = find sortBy = sortBy singleton = fromList . (:[]) instance C.Zippable String where zipWith f as bs = runST $ build 64 $ go f (toList as) (toList bs) where go _ [] _ = return () go _ _ [] = return () go f' (a':as') (b':bs') = append (f' a' b') >> go f' as' bs' instance Buildable String where type Mutable String = MutableString type Step String = Word8 append c = Builder $ State $ \(i, st) -> if offsetAsSize i + nbBytes >= chunkSize st then do cur <- unsafeFreezeShrink (curChunk st) (offsetAsSize i) newChunk <- new (chunkSize st) writeUTF8Char newChunk (Offset 0) utf8Char return ((), (sizeAsOffset nbBytes, st { prevChunks = cur : prevChunks st , prevChunksSize = offsetAsSize i + prevChunksSize st , curChunk = newChunk })) else do writeUTF8Char (curChunk st) i utf8Char return ((), (i + sizeAsOffset nbBytes, st)) where utf8Char = asUTF8Char c nbBytes = numBytes utf8Char {-# INLINE append #-} build sizeChunksI sb | sizeChunksI <= 3 = build 64 sb | otherwise = do first <- new sizeChunks ((), (i, st)) <- runState (runBuilder sb) (Offset 0, BuildingState [] (Size 0) first sizeChunks) cur <- unsafeFreezeShrink (curChunk st) (offsetAsSize i) -- Build final array let totalSize = prevChunksSize st + offsetAsSize i final <- Vec.new totalSize >>= fillFromEnd totalSize (cur : prevChunks st) >>= Vec.unsafeFreeze return $ String final where sizeChunks = Size sizeChunksI fillFromEnd _ [] mba = return mba fillFromEnd !end (String x:xs) mba = do let sz = Vec.lengthSize x Vec.unsafeCopyAtRO mba (sizeAsOffset (end - sz)) x (Offset 0) sz fillFromEnd (end - sz) xs mba {-# INLINE build #-} data ValidationFailure = InvalidHeader | InvalidContinuation | MissingByte deriving (Show,Eq,Typeable) instance Exception ValidationFailure data EncoderUTF8 = EncoderUTF8 instance Encoder.Encoding EncoderUTF8 where type Unit EncoderUTF8 = Word8 type Error EncoderUTF8 = ValidationFailure encodingNext _ = \ofs -> Right . nextWithIndexer ofs encodingWrite _ = writeWithBuilder -- | Validate a bytearray for UTF8'ness -- -- On success Nothing is returned -- On Failure the position along with the failure reason validate :: UArray Word8 -> Offset8 -> Size Word8 -> (Offset8, Maybe ValidationFailure) validate ba ofsStart sz = runST (Vec.unsafeIndexer ba go) where end = ofsStart `offsetPlusE` sz go :: (Offset Word8 -> Word8) -> ST s (Offset Word8, Maybe ValidationFailure) go getIdx = return $ loop ofsStart where loop ofs | ofs > end = error "validate: internal error: went pass offset" | ofs == end = (end, Nothing) | otherwise = case {-# SCC "validate.one" #-} one ofs of (nextOfs, Nothing) -> loop nextOfs (pos, Just failure) -> (pos, Just failure) one pos = case nbConts of 0 -> (pos + 1, Nothing) 0xff -> (pos, Just InvalidHeader) _ | (pos + 1) `offsetPlusE` nbContsE > end -> (pos, Just MissingByte) 1 -> let c1 = getIdx (pos + 1) in if isContinuation c1 then (pos + 2, Nothing) else (pos, Just InvalidContinuation) 2 -> let c1 = getIdx (pos + 1) c2 = getIdx (pos + 2) in if isContinuation c1 && isContinuation c2 then (pos + 3, Nothing) else (pos, Just InvalidContinuation) 3 -> let c1 = getIdx (pos + 1) c2 = getIdx (pos + 2) c3 = getIdx (pos + 3) in if isContinuation c1 && isContinuation c2 && isContinuation c3 then (pos + 4, Nothing) else (pos, Just InvalidContinuation) _ -> error "internal error" where !h = getIdx pos !nbContsE@(Size nbConts) = Size $ getNbBytes h {-# INLINE go #-} mutableValidate :: PrimMonad prim => MutableByteArray (PrimState prim) -> Int -> Int -> prim (Int, Maybe ValidationFailure) mutableValidate mba ofsStart sz = do loop ofsStart where end = ofsStart + sz loop ofs | ofs > end = error "mutableValidate: internal error: went pass offset" | ofs == end = return (end, Nothing) | otherwise = do r <- one ofs case r of (nextOfs, Nothing) -> loop nextOfs (pos, Just failure) -> return (pos, Just failure) one pos = do h <- C.mutUnsafeRead mba pos let nbConts = getNbBytes h if nbConts == 0xff then return (pos, Just InvalidHeader) else if pos + 1 + nbConts > end then return (pos, Just MissingByte) else do case nbConts of 0 -> return (pos + 1, Nothing) 1 -> do c1 <- C.mutUnsafeRead mba (pos + 1) if isContinuation c1 then return (pos + 2, Nothing) else return (pos, Just InvalidContinuation) 2 -> do c1 <- C.mutUnsafeRead mba (pos + 1) c2 <- C.mutUnsafeRead mba (pos + 2) if isContinuation c1 && isContinuation c2 then return (pos + 3, Nothing) else return (pos, Just InvalidContinuation) 3 -> do c1 <- C.mutUnsafeRead mba (pos + 1) c2 <- C.mutUnsafeRead mba (pos + 2) c3 <- C.mutUnsafeRead mba (pos + 3) if isContinuation c1 && isContinuation c2 && isContinuation c3 then return (pos + 4, Nothing) else return (pos, Just InvalidContinuation) _ -> error "internal error" skipNextHeaderValue :: Word8 -> Size Word8 skipNextHeaderValue !x | x < 0xC0 = Size 1 -- 0b11000000 | x < 0xE0 = Size 2 -- 0b11100000 | x < 0xF0 = Size 3 -- 0b11110000 | otherwise = Size 4 {-# INLINE skipNextHeaderValue #-} nextWithIndexer :: (Offset Word8 -> Word8) -> Offset Word8 -> (Char, Offset Word8) nextWithIndexer getter off = case getNbBytes# h of 0# -> (toChar h, off + aone) 1# -> (toChar (decode2 (getter $ off + aone)), off + atwo) 2# -> (toChar (decode3 (getter $ off + aone) (getter $ off + atwo)), off + athree) 3# -> (toChar (decode4 (getter $ off + aone) (getter $ off + atwo) (getter $ off + athree)) , off + afour) r -> error ("next: internal error: invalid input: " <> show (I# r) <> " " <> show (W# h)) where aone = Offset 1 atwo = Offset 2 athree = Offset 3 afour = Offset 4 !(W8# h) = getter off toChar :: Word# -> Char toChar w = C# (chr# (word2Int# w)) decode2 :: Word8 -> Word# decode2 (W8# c1) = or# (uncheckedShiftL# (and# h 0x1f##) 6#) (and# c1 0x3f##) decode3 :: Word8 -> Word8 -> Word# decode3 (W8# c1) (W8# c2) = or# (uncheckedShiftL# (and# h 0xf##) 12#) (or# (uncheckedShiftL# (and# c1 0x3f##) 6#) (and# c2 0x3f##)) decode4 :: Word8 -> Word8 -> Word8 -> Word# decode4 (W8# c1) (W8# c2) (W8# c3) = or# (uncheckedShiftL# (and# h 0x7##) 18#) (or# (uncheckedShiftL# (and# c1 0x3f##) 12#) (or# (uncheckedShiftL# (and# c2 0x3f##) 6#) (and# c3 0x3f##)) ) writeWithBuilder :: (PrimMonad st, Monad st) => Char -> Builder (UArray Word8) st () writeWithBuilder c = if bool# (ltWord# x 0x80## ) then encode1 else if bool# (ltWord# x 0x800## ) then encode2 else if bool# (ltWord# x 0x10000##) then encode3 else encode4 where !(I# xi) = fromEnum c !x = int2Word# xi encode1 = append (W8# x) encode2 = do let x1 = or# (uncheckedShiftRL# x 6#) 0xc0## x2 = toContinuation x append (W8# x1) >> append (W8# x2) encode3 = do let x1 = or# (uncheckedShiftRL# x 12#) 0xe0## x2 = toContinuation (uncheckedShiftRL# x 6#) x3 = toContinuation x append (W8# x1) >> append (W8# x2) >> append (W8# x3) encode4 = do let x1 = or# (uncheckedShiftRL# x 18#) 0xf0## x2 = toContinuation (uncheckedShiftRL# x 12#) x3 = toContinuation (uncheckedShiftRL# x 6#) x4 = toContinuation x append (W8# x1) >> append (W8# x2) >> append (W8# x3) >> append (W8# x4) toContinuation :: Word# -> Word# toContinuation w = or# (and# w 0x3f##) 0x80## next :: String -> Offset8 -> (# Char, Offset8 #) next (String ba) (Offset n) = case getNbBytes# h of 0# -> (# toChar h, Offset $ n + 1 #) 1# -> (# toChar (decode2 (Vec.unsafeIndex ba (n + 1))) , Offset $ n + 2 #) 2# -> (# toChar (decode3 (Vec.unsafeIndex ba (n + 1)) (Vec.unsafeIndex ba (n + 2))) , Offset $ n + 3 #) 3# -> (# toChar (decode4 (Vec.unsafeIndex ba (n + 1)) (Vec.unsafeIndex ba (n + 2)) (Vec.unsafeIndex ba (n + 3))) , Offset $ n + 4 #) r -> error ("next: internal error: invalid input: offset=" <> show n <> " table=" <> show (I# r) <> " h=" <> show (W# h)) where !(W8# h) = Vec.unsafeIndex ba n toChar :: Word# -> Char toChar w = C# (chr# (word2Int# w)) decode2 :: Word8 -> Word# decode2 (W8# c1) = or# (uncheckedShiftL# (and# h 0x1f##) 6#) (and# c1 0x3f##) decode3 :: Word8 -> Word8 -> Word# decode3 (W8# c1) (W8# c2) = or# (uncheckedShiftL# (and# h 0xf##) 12#) (or# (uncheckedShiftL# (and# c1 0x3f##) 6#) (and# c2 0x3f##)) decode4 :: Word8 -> Word8 -> Word8 -> Word# decode4 (W8# c1) (W8# c2) (W8# c3) = or# (uncheckedShiftL# (and# h 0x7##) 18#) (or# (uncheckedShiftL# (and# c1 0x3f##) 12#) (or# (uncheckedShiftL# (and# c2 0x3f##) 6#) (and# c3 0x3f##)) ) -- | Different way to encode a Character in UTF8 represented as an ADT data UTF8Char = UTF8_1 {-# UNPACK #-} !Word8 | UTF8_2 {-# UNPACK #-} !Word8 {-# UNPACK #-} !Word8 | UTF8_3 {-# UNPACK #-} !Word8 {-# UNPACK #-} !Word8 {-# UNPACK #-} !Word8 | UTF8_4 {-# UNPACK #-} !Word8 {-# UNPACK #-} !Word8 {-# UNPACK #-} !Word8 {-# UNPACK #-} !Word8 asUTF8Char :: Char -> UTF8Char asUTF8Char !c | bool# (ltWord# x 0x80## ) = encode1 | bool# (ltWord# x 0x800## ) = encode2 | bool# (ltWord# x 0x10000##) = encode3 | otherwise = encode4 where !(I# xi) = fromEnum c !x = int2Word# xi encode1 = UTF8_1 (W8# x) encode2 = let !x1 = W8# (or# (uncheckedShiftRL# x 6#) 0xc0##) !x2 = toContinuation x in UTF8_2 x1 x2 encode3 = let !x1 = W8# (or# (uncheckedShiftRL# x 12#) 0xe0##) !x2 = toContinuation (uncheckedShiftRL# x 6#) !x3 = toContinuation x in UTF8_3 x1 x2 x3 encode4 = let !x1 = W8# (or# (uncheckedShiftRL# x 18#) 0xf0##) !x2 = toContinuation (uncheckedShiftRL# x 12#) !x3 = toContinuation (uncheckedShiftRL# x 6#) !x4 = toContinuation x in UTF8_4 x1 x2 x3 x4 toContinuation :: Word# -> Word8 toContinuation w = W8# (or# (and# w 0x3f##) 0x80##) {-# INLINE toContinuation #-} numBytes :: UTF8Char -> Size8 numBytes UTF8_1{} = Size 1 numBytes UTF8_2{} = Size 2 numBytes UTF8_3{} = Size 3 numBytes UTF8_4{} = Size 4 writeUTF8Char :: PrimMonad prim => MutableString (PrimState prim) -> Offset8 -> UTF8Char -> prim () writeUTF8Char (MutableString mba) (Offset i) (UTF8_1 x1) = C.mutUnsafeWrite mba i x1 writeUTF8Char (MutableString mba) (Offset i) (UTF8_2 x1 x2) = do C.mutUnsafeWrite mba i x1 C.mutUnsafeWrite mba (i+1) x2 writeUTF8Char (MutableString mba) (Offset i) (UTF8_3 x1 x2 x3) = do C.mutUnsafeWrite mba i x1 C.mutUnsafeWrite mba (i+1) x2 C.mutUnsafeWrite mba (i+2) x3 writeUTF8Char (MutableString mba) (Offset i) (UTF8_4 x1 x2 x3 x4) = do C.mutUnsafeWrite mba i x1 C.mutUnsafeWrite mba (i+1) x2 C.mutUnsafeWrite mba (i+2) x3 C.mutUnsafeWrite mba (i+3) x4 {-# INLINE writeUTF8Char #-} write :: PrimMonad prim => MutableString (PrimState prim) -> Offset8 -> Char -> prim Offset8 write (MutableString mba) (Offset i) c = if bool# (ltWord# x 0x80## ) then encode1 else if bool# (ltWord# x 0x800## ) then encode2 else if bool# (ltWord# x 0x10000##) then encode3 else encode4 where !(I# xi) = fromEnum c !x = int2Word# xi encode1 = C.mutUnsafeWrite mba i (W8# x) >> return (Offset $ i + 1) encode2 = do let x1 = or# (uncheckedShiftRL# x 6#) 0xc0## x2 = toContinuation x C.mutUnsafeWrite mba i (W8# x1) C.mutUnsafeWrite mba (i+1) (W8# x2) return $ Offset (i + 2) encode3 = do let x1 = or# (uncheckedShiftRL# x 12#) 0xe0## x2 = toContinuation (uncheckedShiftRL# x 6#) x3 = toContinuation x C.mutUnsafeWrite mba i (W8# x1) C.mutUnsafeWrite mba (i+1) (W8# x2) C.mutUnsafeWrite mba (i+2) (W8# x3) return $ Offset (i + 3) encode4 = do let x1 = or# (uncheckedShiftRL# x 18#) 0xf0## x2 = toContinuation (uncheckedShiftRL# x 12#) x3 = toContinuation (uncheckedShiftRL# x 6#) x4 = toContinuation x C.mutUnsafeWrite mba i (W8# x1) C.mutUnsafeWrite mba (i+1) (W8# x2) C.mutUnsafeWrite mba (i+2) (W8# x3) C.mutUnsafeWrite mba (i+3) (W8# x4) return $ Offset (i + 4) toContinuation :: Word# -> Word# toContinuation w = or# (and# w 0x3f##) 0x80## {-# INLINE write #-} freeze :: PrimMonad prim => MutableString (PrimState prim) -> prim String freeze (MutableString mba) = String `fmap` C.unsafeFreeze mba {-# INLINE freeze #-} unsafeFreezeShrink :: PrimMonad prim => MutableString (PrimState prim) -> Size Word8 -> prim String unsafeFreezeShrink (MutableString mba) s = String <$> Vec.unsafeFreezeShrink mba s {-# INLINE unsafeFreezeShrink #-} ------------------------------------------------------------------------ -- real functions sToList :: String -> [Char] sToList s = loop azero where !nbBytes = size s end = azero `offsetPlusE` nbBytes loop idx | idx == end = [] | otherwise = let (# c , idx' #) = next s idx in c : loop idx' {-# RULES "String sFromList" forall s . sFromList (unpackCString# s) = String $ fromModified s #-} {-# RULES "String sFromList" forall s . sFromList (unpackCStringUtf8# s) = String $ fromModified s #-} sFromList :: [Char] -> String sFromList l = runST (new bytes >>= startCopy) where -- count how many bytes !bytes = C.foldl' (+) (Size 0) $ fmap (charToBytes . fromEnum) l startCopy :: MutableString (PrimState (ST st)) -> ST st String startCopy ms = loop azero l where loop _ [] = freeze ms loop idx (c:xs) = write ms idx c >>= \idx' -> loop idx' xs {-# INLINE [0] sFromList #-} null :: String -> Bool null (String ba) = C.length ba == 0 -- | Create a string composed of a number @n of Chars (Unicode code points). -- -- if the input @s contains less characters than required, then take :: Int -> String -> String take n s@(String ba) | n <= 0 = mempty | n >= C.length ba = s | otherwise = let (Offset o) = indexN n s in String $ Vec.take o ba -- | Create a string with the remaining Chars after dropping @n Chars from the beginning drop :: Int -> String -> String drop n s@(String ba) | n <= 0 = s | n >= C.length ba = mempty | otherwise = let (Offset o) = indexN n s in String $ Vec.drop o ba splitAt :: Int -> String -> (String, String) splitAt nI s@(String ba) | nI <= 0 = (mempty, s) | nI >= C.length ba = (s, mempty) | otherwise = let (Offset k) = indexN nI s (v1,v2) = C.splitAt k ba in (String v1, String v2) -- | Return the offset (in bytes) of the N'th sequence in an UTF8 String indexN :: Int -> String -> Offset Word8 indexN nI (String ba) = Vec.unsafeDewrap goVec goAddr ba where !n = Size nI end :: Offset Char !end = Offset 0 `offsetPlusE` n goVec :: ByteArray# -> Offset Word8 -> Offset Word8 goVec !ma !start = loop start (Offset 0) where !len = start `offsetPlusE` Vec.lengthSize ba loop :: Offset Word8 -> Offset Char -> Offset Word8 loop !idx !i | idx >= len || i >= end = sizeAsOffset (idx - start) | otherwise = loop (idx `offsetPlusE` d) (i + Offset 1) where d = skipNextHeaderValue (primBaIndex ma idx) {-# INLINE goVec #-} goAddr :: Ptr Word8 -> Offset Word8 -> ST s (Offset Word8) goAddr !(Ptr ptr) !start = return $ loop start (Offset 0) where !len = start `offsetPlusE` Vec.lengthSize ba loop :: Offset Word8 -> Offset Char -> Offset Word8 loop !idx !i | idx >= len || i >= end = sizeAsOffset (idx - start) | otherwise = loop (idx `offsetPlusE` d) (i + Offset 1) where d = skipNextHeaderValue (primAddrIndex ptr idx) {-# INLINE goAddr #-} {-# INLINE indexN #-} -- rev{Take,Drop,SplitAt} TODO optimise: -- we can process the string from the end using a skipPrev instead of getting the length revTake :: Int -> String -> String revTake nbElems v = drop (length v - nbElems) v revDrop :: Int -> String -> String revDrop nbElems v = take (length v - nbElems) v revSplitAt :: Int -> String -> (String, String) revSplitAt n v = (drop idx v, take idx v) where idx = length v - n -- | Split on the input string using the predicate as separator -- -- e.g. -- -- > splitOn (== ',') "," == ["",""] -- > splitOn (== ',') ",abc," == ["","abc",""] -- > splitOn (== ':') "abc" == ["abc"] -- > splitOn (== ':') "abc::def" == ["abc","","def"] -- > splitOn (== ':') "::abc::def" == ["","","abc","","def"] -- splitOn :: (Char -> Bool) -> String -> [String] splitOn predicate s | sz == Size 0 = [] | otherwise = loop azero azero where !sz = size s end = azero `offsetPlusE` sz loop prevIdx idx | idx == end = [sub s prevIdx idx] | otherwise = let (# c, idx' #) = next s idx in if predicate c then sub s prevIdx idx : loop idx' idx' else loop prevIdx idx' sub :: String -> Offset8 -> Offset8 -> String sub (String ba) (Offset start) (Offset end) = String $ Vec.sub ba start end -- | Split at a given index. splitIndex :: Offset8 -> String -> (String, String) splitIndex (Offset idx) (String ba) = (String v1, String v2) where (v1,v2) = C.splitAt idx ba break :: (Char -> Bool) -> String -> (String, String) break predicate s@(String ba) = runST $ Vec.unsafeIndexer ba go where !sz = size s end = azero `offsetPlusE` sz go :: (Offset Word8 -> Word8) -> ST st (String, String) go getIdx = loop (Offset 0) where !nextI = nextWithIndexer getIdx loop idx | idx == end = return (s, mempty) | otherwise = do let (c, idx') = nextI idx case predicate c of True -> return $ splitIndex idx s False -> loop idx' {-# INLINE loop #-} {-# INLINE [2] break #-} #if MIN_VERSION_base(4,9,0) {-# RULES "break (== 'c')" [3] forall c . break (eqChar c) = breakElem c #-} #else {-# RULES "break (== 'c')" [3] forall c . break (== c) = breakElem c #-} #endif breakElem :: Char -> String -> (String, String) breakElem !el s@(String ba) = case asUTF8Char el of UTF8_1 w -> let (# v1,v2 #) = Vec.splitElem w ba in (String v1, String v2) _ -> runST $ Vec.unsafeIndexer ba go where sz = size s end = azero `offsetPlusE` sz go :: (Offset Word8 -> Word8) -> ST st (String, String) go getIdx = loop (Offset 0) where !nextI = nextWithIndexer getIdx loop idx | idx == end = return (s, mempty) | otherwise = do let (c, idx') = nextI idx case el == c of True -> return $ splitIndex idx s False -> loop idx' elem :: Char -> String -> Bool elem !el s@(String ba) = case asUTF8Char el of UTF8_1 w -> Vec.elem w ba _ -> runST $ Vec.unsafeIndexer ba go where sz = size s end = azero `offsetPlusE` sz go :: (Offset Word8 -> Word8) -> ST st Bool go getIdx = loop (Offset 0) where !nextI = nextWithIndexer getIdx loop !idx | idx == end = return False | otherwise = do let (c, idx') = nextI idx case el == c of True -> return True False -> loop idx' intersperse :: Char -> String -> String intersperse sep src | srcLen <= 1 = src | otherwise = runST $ unsafeCopyFrom src dstBytes (go sep) where !srcBytes = size src !srcLen = lengthSize src dstBytes = (srcBytes :: Size8) + ((srcLen - 1) `scale` charToBytes (fromEnum sep)) lastSrcI :: Offset Char lastSrcI = 0 `offsetPlusE` (srcLen - 1) go :: Char -> String -> Offset Char -> Offset8 -> MutableString s -> Offset8 -> ST s (Offset8, Offset8) go sep' src' srcI srcIdx dst dstIdx | srcI == lastSrcI = do nextDstIdx <- write dst dstIdx c return (nextSrcIdx, nextDstIdx) | otherwise = do nextDstIdx <- write dst dstIdx c nextDstIdx' <- write dst nextDstIdx sep' return (nextSrcIdx, nextDstIdx') where (# c, nextSrcIdx #) = next src' srcIdx -- | Allocate a new @String@ with a fill function that has access to the characters of -- the source @String@. unsafeCopyFrom :: String -- ^ Source string -> Size8 -- ^ Length of the destination string in bytes -> (String -> Offset Char -> Offset8 -> MutableString s -> Offset8 -> ST s (Offset8, Offset8)) -- ^ Function called for each character in the source String -> ST s String -- ^ Returns the filled new string unsafeCopyFrom src dstBytes f = new dstBytes >>= fill (Offset 0) (Offset 0) (Offset 0) f >>= freeze where srcLen = length src end = Offset 0 `offsetPlusE` Size srcLen fill srcI srcIdx dstIdx f' dst' | srcI == end = return dst' | otherwise = do (nextSrcIdx, nextDstIdx) <- f' src srcI srcIdx dst' dstIdx fill (srcI + Offset 1) nextSrcIdx nextDstIdx f' dst' span :: (Char -> Bool) -> String -> (String, String) span predicate s = break (not . predicate) s -- | size in bytes size :: String -> Size8 size (String ba) = Size $ C.length ba lengthSize :: String -> Size Char lengthSize (String ba) | C.null ba = Size 0 | otherwise = Vec.unsafeDewrap goVec goAddr ba where goVec ma start = loop start (Size 0) where !end = start `offsetPlusE` Vec.lengthSize ba loop !idx !i | idx >= end = i | otherwise = loop (idx `offsetPlusE` d) (i + Size 1) where d = skipNextHeaderValue (primBaIndex ma idx) goAddr (Ptr ptr) start = return $ loop start (Size 0) where !end = start `offsetPlusE` Vec.lengthSize ba loop !idx !i | idx >= end = i | otherwise = loop (idx `offsetPlusE` d) (i + Size 1) where d = skipNextHeaderValue (primAddrIndex ptr idx) length :: String -> Int length s = let (Size sz) = lengthSize s in sz replicate :: Int -> Char -> String replicate n c = runST (new nbBytes >>= fill) where end = azero `offsetPlusE` nbBytes nbBytes = Size $ sz * n (Size sz) = charToBytes (fromEnum c) fill :: PrimMonad prim => MutableString (PrimState prim) -> prim String fill ms = loop (Offset 0) where loop idx | idx == end = freeze ms | otherwise = write ms idx c >>= loop -- | Copy the String copy :: String -> String copy (String s) = String (Vec.copy s) -- | Allocate a MutableString of a specific size in bytes. new :: PrimMonad prim => Size8 -- ^ in number of bytes, not of elements. -> prim (MutableString (PrimState prim)) new n = MutableString `fmap` MVec.new n create :: PrimMonad prim => Int -> (MutableString (PrimState prim) -> prim Int) -> prim String create sz f = do ms <- new (Size sz) filled <- f ms if filled == sz then freeze ms else C.take filled `fmap` freeze ms charToBytes :: Int -> Size8 charToBytes c | c < 0x80 = Size 1 | c < 0x800 = Size 2 | c < 0x10000 = Size 3 | c < 0x110000 = Size 4 | otherwise = error ("invalid code point: " `mappend` show c) charMap :: (Char -> Char) -> String -> String charMap f src = let !(elems, nbBytes) = allocateAndFill [] (Offset 0) (Size 0) in runST $ do dest <- new nbBytes copyLoop dest elems (Offset 0 `offsetPlusE` nbBytes) freeze dest where !srcSz = size src srcEnd = azero `offsetPlusE` srcSz allocateAndFill :: [(String, Size8)] -> Offset8 -> Size8 -> ([(String,Size8)], Size8) allocateAndFill acc idx bytesWritten | idx == srcEnd = (acc, bytesWritten) | otherwise = let (el@(_,addBytes), idx') = runST $ do -- make sure we allocate at least 4 bytes for the destination for the last few bytes -- otherwise allocating less would bring the danger of spinning endlessly -- and never succeeding. let !diffBytes = srcEnd - idx !allocatedBytes = if diffBytes <= Size 4 then Size 4 else diffBytes ms <- new allocatedBytes (dstIdx, srcIdx) <- fill ms allocatedBytes idx s <- freeze ms return ((s, dstIdx), srcIdx) in allocateAndFill (el : acc) idx' (bytesWritten + addBytes) fill :: PrimMonad prim => MutableString (PrimState prim) -> Size8 -> Offset8 -> prim (Size8, Offset8) fill mba dsz srcIdxOrig = loop (Offset 0) srcIdxOrig where endDst = (Offset 0) `offsetPlusE` dsz loop dstIdx srcIdx | srcIdx == srcEnd = return (offsetAsSize dstIdx, srcIdx) | dstIdx == endDst = return (offsetAsSize dstIdx, srcIdx) | otherwise = let (# c, srcIdx' #) = next src srcIdx c' = f c -- the mapped char !nbBytes = charToBytes (fromEnum c') in -- check if we have room in the destination buffer if dstIdx `offsetPlusE` nbBytes <= sizeAsOffset dsz then do dstIdx' <- write mba dstIdx c' loop dstIdx' srcIdx' else return (offsetAsSize dstIdx, srcIdx) copyLoop _ [] (Offset 0) = return () copyLoop _ [] n = error ("charMap invalid: " <> show n) copyLoop ms@(MutableString mba) ((String ba, sz):xs) end = do let start = end `offsetMinusE` sz Vec.unsafeCopyAtRO mba start ba (Offset 0) sz copyLoop ms xs start snoc :: String -> Char -> String snoc s@(String ba) c | len == Size 0 = C.singleton c | otherwise = runST $ do ms@(MutableString mba) <- new (len + nbBytes) Vec.unsafeCopyAtRO mba (Offset 0) ba (Offset 0) len _ <- write ms (azero `offsetPlusE` len) c freeze ms where !len = size s !nbBytes = charToBytes (fromEnum c) cons :: Char -> String -> String cons c s@(String ba) | len == Size 0 = C.singleton c | otherwise = runST $ do ms@(MutableString mba) <- new (len + nbBytes) idx <- write ms (Offset 0) c Vec.unsafeCopyAtRO mba idx ba (Offset 0) len freeze ms where !len = size s !nbBytes = charToBytes (fromEnum c) unsnoc :: String -> Maybe (String, Char) unsnoc s | null s = Nothing | otherwise = let (s1,s2) = revSplitAt 1 s in case toList s1 of -- TODO use index instead of toList [c] -> Just (s2, c) _ -> internalError "unsnoc" uncons :: String -> Maybe (Char, String) uncons s | null s = Nothing | otherwise = let (s1,s2) = splitAt 1 s in case toList s1 of -- TODO use index instead of ToList [c] -> Just (c, s2) _ -> internalError "uncons" find :: (Char -> Bool) -> String -> Maybe Char find predicate s = loop (Offset 0) where !sz = size s end = Offset 0 `offsetPlusE` sz loop idx | idx == end = Nothing | otherwise = let (# c, idx' #) = next s idx in case predicate c of True -> Just c False -> loop idx' sortBy :: (Char -> Char -> Ordering) -> String -> String sortBy sortF s = fromList $ Data.List.sortBy sortF $ toList s -- FIXME for tests filter :: (Char -> Bool) -> String -> String filter p s = fromList $ Data.List.filter p $ toList s reverse :: String -> String reverse s@(String ba) = runST $ do ms <- new len loop ms (Offset 0) (Offset 0 `offsetPlusE` len) where !len = size s -- write those bytes loop :: PrimMonad prim => MutableString (PrimState prim) -> Offset8 -> Offset8 -> prim String loop ms@(MutableString mba) sidx@(Offset si) didx | didx == Offset 0 = freeze ms | otherwise = do let !h = Vec.unsafeIndex ba si !nb = Size (getNbBytes h + 1) didx'@(Offset d) = didx `offsetMinusE` nb case nb of Size 1 -> C.mutUnsafeWrite mba d h Size 2 -> do C.mutUnsafeWrite mba d h C.mutUnsafeWrite mba (d + 1) (Vec.unsafeIndex ba (si + 1)) Size 3 -> do C.mutUnsafeWrite mba d h C.mutUnsafeWrite mba (d + 1) (Vec.unsafeIndex ba (si + 1)) C.mutUnsafeWrite mba (d + 2) (Vec.unsafeIndex ba (si + 2)) Size 4 -> do C.mutUnsafeWrite mba d h C.mutUnsafeWrite mba (d + 1) (Vec.unsafeIndex ba (si + 1)) C.mutUnsafeWrite mba (d + 2) (Vec.unsafeIndex ba (si + 2)) C.mutUnsafeWrite mba (d + 3) (Vec.unsafeIndex ba (si + 3)) _ -> return () -- impossible loop ms (sidx `offsetPlusE` nb) didx' data Encoding = ASCII7 | UTF8 | UTF16 | UTF32 | ISO_8859_1 deriving (Typeable, Data, Eq, Ord, Show, Enum, Bounded) fromEncoderBytes :: ( Encoder.Encoding encoding , Exception (Encoder.Error encoding) , PrimType (Encoder.Unit encoding) ) => encoding -> UArray Word8 -> (String, Maybe ValidationFailure, UArray Word8) fromEncoderBytes enc bytes = ( String $ runST $ Encoder.convertFromTo enc EncoderUTF8 (Vec.recast bytes) , Nothing , mempty ) -- | Convert a ByteArray to a string assuming a specific encoding. -- -- It returns a 3-tuple of: -- -- * The string that has been succesfully converted without any error -- * An optional validation error -- * The remaining buffer that hasn't been processed (either as a result of an error, or because the encoded sequence is not fully available) -- -- Considering a stream of data that is fetched chunk by chunk, it's valid to assume -- that some sequence might fall in a chunk boundary. When converting chunks, -- if the error is Nothing and the remaining buffer is not empty, then this buffer -- need to be prepended to the next chunk fromBytes :: Encoding -> UArray Word8 -> (String, Maybe ValidationFailure, UArray Word8) fromBytes ASCII7 bytes = fromEncoderBytes Encoder.ASCII7 bytes fromBytes ISO_8859_1 bytes = fromEncoderBytes Encoder.ISO_8859_1 bytes fromBytes UTF16 bytes = fromEncoderBytes Encoder.UTF16 bytes fromBytes UTF32 bytes = fromEncoderBytes Encoder.UTF32 bytes fromBytes UTF8 bytes | C.null bytes = (mempty, Nothing, mempty) | otherwise = case validate bytes (Offset 0) (Size $ C.length bytes) of (_, Nothing) -> (fromBytesUnsafe bytes, Nothing, mempty) (Offset pos, Just vf) -> let (b1, b2) = C.splitAt pos bytes in (fromBytesUnsafe b1, toErr vf, b2) where toErr MissingByte = Nothing toErr InvalidHeader = Just InvalidHeader toErr InvalidContinuation = Just InvalidContinuation -- | Convert a UTF8 array of bytes to a String. -- -- If there's any error in the stream, it will automatically -- insert replacement bytes to replace invalid sequences. -- -- In the case of sequence that fall in the middle of 2 chunks, -- the remaining buffer is supposed to be preprended to the -- next chunk, and resume the parsing. fromBytesLenient :: UArray Word8 -> (String, UArray Word8) fromBytesLenient bytes | C.null bytes = (mempty, mempty) | otherwise = case validate bytes (Offset 0) (Size $ C.length bytes) of (_, Nothing) -> (fromBytesUnsafe bytes, mempty) (Offset pos, Just MissingByte) -> let (b1,b2) = C.splitAt pos bytes in (fromBytesUnsafe b1, b2) (Offset pos, Just InvalidHeader) -> let (b1,b2) = C.splitAt pos bytes (_,b3) = C.splitAt 1 b2 (s3, r) = fromBytesLenient b3 in (mconcat [fromBytesUnsafe b1,replacement, s3], r) (Offset pos, Just InvalidContinuation) -> let (b1,b2) = C.splitAt pos bytes (_,b3) = C.splitAt 1 b2 (s3, r) = fromBytesLenient b3 in (mconcat [fromBytesUnsafe b1,replacement, s3], r) where -- This is the replacement character U+FFFD used for any invalid header or continuation replacement :: String !replacement = fromBytesUnsafe $ fromList [0xef,0xbf,0xbd] fromChunkBytes :: [UArray Word8] -> [String] fromChunkBytes l = loop l where loop [] = [] loop (bytes:[]) = case validate bytes (Offset 0) (Size $ C.length bytes) of (_, Nothing) -> [fromBytesUnsafe bytes] (_, Just err) -> doErr err loop (bytes:cs@(c1:c2)) = case validate bytes (Offset 0) (Size $ C.length bytes) of (_, Nothing) -> fromBytesUnsafe bytes : loop cs (Offset pos, Just MissingByte) -> let (b1,b2) = C.splitAt pos bytes in fromBytesUnsafe b1 : loop ((b2 `mappend` c1) : c2) (_, Just err) -> doErr err doErr err = error ("fromChunkBytes: " <> show err) -- | Convert a Byte Array representing UTF8 data directly to a string without checking for UTF8 validity -- -- If the input contains invalid sequences, it will trigger runtime async errors when processing data. -- -- In doubt, use 'fromBytes' fromBytesUnsafe :: UArray Word8 -> String fromBytesUnsafe = String toEncoderBytes :: ( Encoder.Encoding encoding , PrimType (Encoder.Unit encoding) , Exception (Encoder.Error encoding) ) => encoding -> UArray Word8 -> UArray Word8 toEncoderBytes enc bytes = Vec.recast (runST $ Encoder.convertFromTo EncoderUTF8 enc bytes) -- | Convert a String to a bytearray in a specific encoding -- -- if the encoding is UTF8, the underlying buffer is returned without extra allocation or any processing -- -- In any other encoding, some allocation and processing are done to convert. toBytes :: Encoding -> String -> UArray Word8 toBytes UTF8 (String bytes) = bytes toBytes ASCII7 (String bytes) = toEncoderBytes Encoder.ASCII7 bytes toBytes ISO_8859_1 (String bytes) = toEncoderBytes Encoder.ISO_8859_1 bytes toBytes UTF16 (String bytes) = toEncoderBytes Encoder.UTF16 bytes toBytes UTF32 (String bytes) = toEncoderBytes Encoder.UTF32 bytes lines :: String -> [String] lines = fmap fromList . Prelude.lines . toList words :: String -> [String] words = fmap fromList . Prelude.words . toList