module Foundation.String.UTF8
( String(..)
, create
, replicate
, Encoding(..)
, fromBytes
, fromChunkBytes
, fromBytesUnsafe
, fromBytesLenient
, toBytes
, mutableValidate
, copy
, ValidationFailure(..)
, lines
, words
) where
import Foundation.Array.Unboxed (UArray)
import qualified Foundation.Array.Unboxed as Vec
import Foundation.Array.Unboxed.ByteArray (MutableByteArray)
import qualified Foundation.Array.Unboxed.Mutable as MVec
import qualified Foundation.Collection as C
import Foundation.Collection.Buildable
import Foundation.Internal.Base
import Foundation.Internal.MonadTrans
import Foundation.Internal.Primitive
import Foundation.Internal.Types
import Foundation.Numerical
import Foundation.Primitive.Monad
import Foundation.Primitive.Types
import Foundation.String.UTF8Table
import GHC.Prim
import GHC.ST
import GHC.Types
import GHC.Word
import GHC.Char
import qualified Data.List
import Data.Data
import qualified Prelude
import Foundation.String.ModifiedUTF8 (fromModified)
import GHC.CString (unpackCString#,
unpackCStringUtf8#)
import qualified Foundation.String.Encoding.Encoding as Encoder
import qualified Foundation.String.Encoding.ASCII7 as Encoder
import qualified Foundation.String.Encoding.UTF16 as Encoder
import qualified Foundation.String.Encoding.UTF32 as Encoder
import qualified Foundation.String.Encoding.ISO_8859_1 as Encoder
newtype String = String (UArray Word8)
deriving (Typeable, Monoid, Eq, Ord)
instance Data String where
toConstr s = mkConstr stringType (show s) [] Prefix
dataTypeOf _ = stringType
gunfold _ _ = error "gunfold"
stringType :: DataType
stringType = mkNoRepType "Foundation.String"
newtype MutableString st = MutableString (MutableByteArray st)
deriving (Typeable)
instance Show String where
show = show . sToList
instance IsString String where
fromString = sFromList
instance IsList String where
type Item String = Char
fromList = sFromList
toList = sToList
type instance C.Element String = Char
instance C.InnerFunctor String where
imap = charMap
instance C.Collection String where
null = null
length = length
elem = elem
minimum = Data.List.minimum . toList . C.getNonEmpty
maximum = Data.List.maximum . toList . C.getNonEmpty
instance C.Sequential String where
take = take
drop = drop
splitAt = splitAt
revTake = revTake
revDrop = revDrop
revSplitAt = revSplitAt
splitOn = splitOn
break = break
breakElem = breakElem
intersperse = intersperse
span = span
filter = filter
reverse = reverse
unsnoc = unsnoc
uncons = uncons
snoc = snoc
cons = cons
find = find
sortBy = sortBy
singleton = fromList . (:[])
instance C.Zippable String where
zipWith f as bs = runST $ build 64 $ go f (toList as) (toList bs)
where
go _ [] _ = return ()
go _ _ [] = return ()
go f' (a':as') (b':bs') = append (f' a' b') >> go f' as' bs'
instance Buildable String where
type Mutable String = MutableString
type Step String = Word8
append c = Builder $ State $ \(i, st) ->
if offsetAsSize i + nbBytes >= chunkSize st
then do
cur <- unsafeFreezeShrink (curChunk st) (offsetAsSize i)
newChunk <- new (chunkSize st)
writeUTF8Char newChunk (Offset 0) utf8Char
return ((), (sizeAsOffset nbBytes, st { prevChunks = cur : prevChunks st
, prevChunksSize = offsetAsSize i + prevChunksSize st
, curChunk = newChunk
}))
else do
writeUTF8Char (curChunk st) i utf8Char
return ((), (i + sizeAsOffset nbBytes, st))
where
utf8Char = asUTF8Char c
nbBytes = numBytes utf8Char
build sizeChunksI sb
| sizeChunksI <= 3 = build 64 sb
| otherwise = do
first <- new sizeChunks
((), (i, st)) <- runState (runBuilder sb) (Offset 0, BuildingState [] (Size 0) first sizeChunks)
cur <- unsafeFreezeShrink (curChunk st) (offsetAsSize i)
let totalSize = prevChunksSize st + offsetAsSize i
final <- Vec.new totalSize >>= fillFromEnd totalSize (cur : prevChunks st) >>= Vec.unsafeFreeze
return $ String final
where
sizeChunks = Size sizeChunksI
fillFromEnd _ [] mba = return mba
fillFromEnd !end (String x:xs) mba = do
let sz = Vec.lengthSize x
Vec.unsafeCopyAtRO mba (sizeAsOffset (end sz)) x (Offset 0) sz
fillFromEnd (end sz) xs mba
data ValidationFailure = InvalidHeader
| InvalidContinuation
| MissingByte
deriving (Show,Eq,Typeable)
instance Exception ValidationFailure
data EncoderUTF8 = EncoderUTF8
instance Encoder.Encoding EncoderUTF8 where
type Unit EncoderUTF8 = Word8
type Error EncoderUTF8 = ValidationFailure
encodingNext _ = \ofs -> Right . nextWithIndexer ofs
encodingWrite _ = writeWithBuilder
validate :: UArray Word8
-> Offset8
-> Size Word8
-> (Offset8, Maybe ValidationFailure)
validate ba ofsStart sz = runST (Vec.unsafeIndexer ba go)
where
end = ofsStart `offsetPlusE` sz
go :: (Offset Word8 -> Word8) -> ST s (Offset Word8, Maybe ValidationFailure)
go getIdx = return $ loop ofsStart
where
loop ofs
| ofs > end = error "validate: internal error: went pass offset"
| ofs == end = (end, Nothing)
| otherwise =
case one ofs of
(nextOfs, Nothing) -> loop nextOfs
(pos, Just failure) -> (pos, Just failure)
one pos =
case nbConts of
0 -> (pos + 1, Nothing)
0xff -> (pos, Just InvalidHeader)
_ | (pos + 1) `offsetPlusE` nbContsE > end -> (pos, Just MissingByte)
1 ->
let c1 = getIdx (pos + 1)
in if isContinuation c1
then (pos + 2, Nothing)
else (pos, Just InvalidContinuation)
2 ->
let c1 = getIdx (pos + 1)
c2 = getIdx (pos + 2)
in if isContinuation c1 && isContinuation c2
then (pos + 3, Nothing)
else (pos, Just InvalidContinuation)
3 ->
let c1 = getIdx (pos + 1)
c2 = getIdx (pos + 2)
c3 = getIdx (pos + 3)
in if isContinuation c1 && isContinuation c2 && isContinuation c3
then (pos + 4, Nothing)
else (pos, Just InvalidContinuation)
_ -> error "internal error"
where
!h = getIdx pos
!nbContsE@(Size nbConts) = Size $ getNbBytes h
mutableValidate :: PrimMonad prim
=> MutableByteArray (PrimState prim)
-> Int
-> Int
-> prim (Int, Maybe ValidationFailure)
mutableValidate mba ofsStart sz = do
loop ofsStart
where
end = ofsStart + sz
loop ofs
| ofs > end = error "mutableValidate: internal error: went pass offset"
| ofs == end = return (end, Nothing)
| otherwise = do
r <- one ofs
case r of
(nextOfs, Nothing) -> loop nextOfs
(pos, Just failure) -> return (pos, Just failure)
one pos = do
h <- C.mutUnsafeRead mba pos
let nbConts = getNbBytes h
if nbConts == 0xff
then return (pos, Just InvalidHeader)
else if pos + 1 + nbConts > end
then return (pos, Just MissingByte)
else do
case nbConts of
0 -> return (pos + 1, Nothing)
1 -> do
c1 <- C.mutUnsafeRead mba (pos + 1)
if isContinuation c1
then return (pos + 2, Nothing)
else return (pos, Just InvalidContinuation)
2 -> do
c1 <- C.mutUnsafeRead mba (pos + 1)
c2 <- C.mutUnsafeRead mba (pos + 2)
if isContinuation c1 && isContinuation c2
then return (pos + 3, Nothing)
else return (pos, Just InvalidContinuation)
3 -> do
c1 <- C.mutUnsafeRead mba (pos + 1)
c2 <- C.mutUnsafeRead mba (pos + 2)
c3 <- C.mutUnsafeRead mba (pos + 3)
if isContinuation c1 && isContinuation c2 && isContinuation c3
then return (pos + 4, Nothing)
else return (pos, Just InvalidContinuation)
_ -> error "internal error"
skipNextHeaderValue :: Word8 -> Size Word8
skipNextHeaderValue !x
| x < 0xC0 = Size 1
| x < 0xE0 = Size 2
| x < 0xF0 = Size 3
| otherwise = Size 4
nextWithIndexer :: (Offset Word8 -> Word8)
-> Offset Word8
-> (Char, Offset Word8)
nextWithIndexer getter off =
case getNbBytes# h of
0# -> (toChar h, off + aone)
1# -> (toChar (decode2 (getter $ off + aone)), off + atwo)
2# -> (toChar (decode3 (getter $ off + aone) (getter $ off + atwo)), off + athree)
3# -> (toChar (decode4 (getter $ off + aone) (getter $ off + atwo) (getter $ off + athree))
, off + afour)
r -> error ("next: internal error: invalid input: " <> show (I# r) <> " " <> show (W# h))
where
aone = Offset 1
atwo = Offset 2
athree = Offset 3
afour = Offset 4
!(W8# h) = getter off
toChar :: Word# -> Char
toChar w = C# (chr# (word2Int# w))
decode2 :: Word8 -> Word#
decode2 (W8# c1) =
or# (uncheckedShiftL# (and# h 0x1f##) 6#)
(and# c1 0x3f##)
decode3 :: Word8 -> Word8 -> Word#
decode3 (W8# c1) (W8# c2) =
or# (uncheckedShiftL# (and# h 0xf##) 12#)
(or# (uncheckedShiftL# (and# c1 0x3f##) 6#)
(and# c2 0x3f##))
decode4 :: Word8 -> Word8 -> Word8 -> Word#
decode4 (W8# c1) (W8# c2) (W8# c3) =
or# (uncheckedShiftL# (and# h 0x7##) 18#)
(or# (uncheckedShiftL# (and# c1 0x3f##) 12#)
(or# (uncheckedShiftL# (and# c2 0x3f##) 6#)
(and# c3 0x3f##))
)
writeWithBuilder :: (PrimMonad st, Monad st)
=> Char
-> Builder (UArray Word8) st ()
writeWithBuilder c =
if bool# (ltWord# x 0x80## ) then encode1
else if bool# (ltWord# x 0x800## ) then encode2
else if bool# (ltWord# x 0x10000##) then encode3
else encode4
where
!(I# xi) = fromEnum c
!x = int2Word# xi
encode1 = append (W8# x)
encode2 = do
let x1 = or# (uncheckedShiftRL# x 6#) 0xc0##
x2 = toContinuation x
append (W8# x1) >> append (W8# x2)
encode3 = do
let x1 = or# (uncheckedShiftRL# x 12#) 0xe0##
x2 = toContinuation (uncheckedShiftRL# x 6#)
x3 = toContinuation x
append (W8# x1) >> append (W8# x2) >> append (W8# x3)
encode4 = do
let x1 = or# (uncheckedShiftRL# x 18#) 0xf0##
x2 = toContinuation (uncheckedShiftRL# x 12#)
x3 = toContinuation (uncheckedShiftRL# x 6#)
x4 = toContinuation x
append (W8# x1) >> append (W8# x2) >> append (W8# x3) >> append (W8# x4)
toContinuation :: Word# -> Word#
toContinuation w = or# (and# w 0x3f##) 0x80##
next :: String -> Offset8 -> (# Char, Offset8 #)
next (String ba) (Offset n) =
case getNbBytes# h of
0# -> (# toChar h, Offset $ n + 1 #)
1# -> (# toChar (decode2 (Vec.unsafeIndex ba (n + 1))) , Offset $ n + 2 #)
2# -> (# toChar (decode3 (Vec.unsafeIndex ba (n + 1))
(Vec.unsafeIndex ba (n + 2))) , Offset $ n + 3 #)
3# -> (# toChar (decode4 (Vec.unsafeIndex ba (n + 1))
(Vec.unsafeIndex ba (n + 2))
(Vec.unsafeIndex ba (n + 3))) , Offset $ n + 4 #)
r -> error ("next: internal error: invalid input: offset=" <> show n <> " table=" <> show (I# r) <> " h=" <> show (W# h))
where
!(W8# h) = Vec.unsafeIndex ba n
toChar :: Word# -> Char
toChar w = C# (chr# (word2Int# w))
decode2 :: Word8 -> Word#
decode2 (W8# c1) =
or# (uncheckedShiftL# (and# h 0x1f##) 6#)
(and# c1 0x3f##)
decode3 :: Word8 -> Word8 -> Word#
decode3 (W8# c1) (W8# c2) =
or# (uncheckedShiftL# (and# h 0xf##) 12#)
(or# (uncheckedShiftL# (and# c1 0x3f##) 6#)
(and# c2 0x3f##))
decode4 :: Word8 -> Word8 -> Word8 -> Word#
decode4 (W8# c1) (W8# c2) (W8# c3) =
or# (uncheckedShiftL# (and# h 0x7##) 18#)
(or# (uncheckedShiftL# (and# c1 0x3f##) 12#)
(or# (uncheckedShiftL# (and# c2 0x3f##) 6#)
(and# c3 0x3f##))
)
data UTF8Char =
UTF8_1 !Word8
| UTF8_2 !Word8 !Word8
| UTF8_3 !Word8 !Word8 !Word8
| UTF8_4 !Word8 !Word8 !Word8 !Word8
asUTF8Char :: Char -> UTF8Char
asUTF8Char !c
| bool# (ltWord# x 0x80## ) = encode1
| bool# (ltWord# x 0x800## ) = encode2
| bool# (ltWord# x 0x10000##) = encode3
| otherwise = encode4
where
!(I# xi) = fromEnum c
!x = int2Word# xi
encode1 = UTF8_1 (W8# x)
encode2 =
let !x1 = W8# (or# (uncheckedShiftRL# x 6#) 0xc0##)
!x2 = toContinuation x
in UTF8_2 x1 x2
encode3 =
let !x1 = W8# (or# (uncheckedShiftRL# x 12#) 0xe0##)
!x2 = toContinuation (uncheckedShiftRL# x 6#)
!x3 = toContinuation x
in UTF8_3 x1 x2 x3
encode4 =
let !x1 = W8# (or# (uncheckedShiftRL# x 18#) 0xf0##)
!x2 = toContinuation (uncheckedShiftRL# x 12#)
!x3 = toContinuation (uncheckedShiftRL# x 6#)
!x4 = toContinuation x
in UTF8_4 x1 x2 x3 x4
toContinuation :: Word# -> Word8
toContinuation w = W8# (or# (and# w 0x3f##) 0x80##)
numBytes :: UTF8Char -> Size8
numBytes UTF8_1{} = Size 1
numBytes UTF8_2{} = Size 2
numBytes UTF8_3{} = Size 3
numBytes UTF8_4{} = Size 4
writeUTF8Char :: PrimMonad prim => MutableString (PrimState prim) -> Offset8 -> UTF8Char -> prim ()
writeUTF8Char (MutableString mba) (Offset i) (UTF8_1 x1) =
C.mutUnsafeWrite mba i x1
writeUTF8Char (MutableString mba) (Offset i) (UTF8_2 x1 x2) = do
C.mutUnsafeWrite mba i x1
C.mutUnsafeWrite mba (i+1) x2
writeUTF8Char (MutableString mba) (Offset i) (UTF8_3 x1 x2 x3) = do
C.mutUnsafeWrite mba i x1
C.mutUnsafeWrite mba (i+1) x2
C.mutUnsafeWrite mba (i+2) x3
writeUTF8Char (MutableString mba) (Offset i) (UTF8_4 x1 x2 x3 x4) = do
C.mutUnsafeWrite mba i x1
C.mutUnsafeWrite mba (i+1) x2
C.mutUnsafeWrite mba (i+2) x3
C.mutUnsafeWrite mba (i+3) x4
write :: PrimMonad prim => MutableString (PrimState prim) -> Offset8 -> Char -> prim Offset8
write (MutableString mba) (Offset i) c =
if bool# (ltWord# x 0x80## ) then encode1
else if bool# (ltWord# x 0x800## ) then encode2
else if bool# (ltWord# x 0x10000##) then encode3
else encode4
where
!(I# xi) = fromEnum c
!x = int2Word# xi
encode1 = C.mutUnsafeWrite mba i (W8# x) >> return (Offset $ i + 1)
encode2 = do
let x1 = or# (uncheckedShiftRL# x 6#) 0xc0##
x2 = toContinuation x
C.mutUnsafeWrite mba i (W8# x1)
C.mutUnsafeWrite mba (i+1) (W8# x2)
return $ Offset (i + 2)
encode3 = do
let x1 = or# (uncheckedShiftRL# x 12#) 0xe0##
x2 = toContinuation (uncheckedShiftRL# x 6#)
x3 = toContinuation x
C.mutUnsafeWrite mba i (W8# x1)
C.mutUnsafeWrite mba (i+1) (W8# x2)
C.mutUnsafeWrite mba (i+2) (W8# x3)
return $ Offset (i + 3)
encode4 = do
let x1 = or# (uncheckedShiftRL# x 18#) 0xf0##
x2 = toContinuation (uncheckedShiftRL# x 12#)
x3 = toContinuation (uncheckedShiftRL# x 6#)
x4 = toContinuation x
C.mutUnsafeWrite mba i (W8# x1)
C.mutUnsafeWrite mba (i+1) (W8# x2)
C.mutUnsafeWrite mba (i+2) (W8# x3)
C.mutUnsafeWrite mba (i+3) (W8# x4)
return $ Offset (i + 4)
toContinuation :: Word# -> Word#
toContinuation w = or# (and# w 0x3f##) 0x80##
freeze :: PrimMonad prim => MutableString (PrimState prim) -> prim String
freeze (MutableString mba) = String `fmap` C.unsafeFreeze mba
unsafeFreezeShrink :: PrimMonad prim => MutableString (PrimState prim) -> Size Word8 -> prim String
unsafeFreezeShrink (MutableString mba) s = String <$> Vec.unsafeFreezeShrink mba s
sToList :: String -> [Char]
sToList s = loop azero
where
!nbBytes = size s
end = azero `offsetPlusE` nbBytes
loop idx
| idx == end = []
| otherwise =
let (# c , idx' #) = next s idx in c : loop idx'
sFromList :: [Char] -> String
sFromList l = runST (new bytes >>= startCopy)
where
!bytes = C.foldl' (+) (Size 0) $ fmap (charToBytes . fromEnum) l
startCopy :: MutableString (PrimState (ST st)) -> ST st String
startCopy ms = loop azero l
where
loop _ [] = freeze ms
loop idx (c:xs) = write ms idx c >>= \idx' -> loop idx' xs
null :: String -> Bool
null (String ba) = C.length ba == 0
take :: Int -> String -> String
take n s@(String ba)
| n <= 0 = mempty
| n >= C.length ba = s
| otherwise = let (Offset o) = indexN n s in String $ Vec.take o ba
drop :: Int -> String -> String
drop n s@(String ba)
| n <= 0 = s
| n >= C.length ba = mempty
| otherwise = let (Offset o) = indexN n s in String $ Vec.drop o ba
splitAt :: Int -> String -> (String, String)
splitAt nI s@(String ba)
| nI <= 0 = (mempty, s)
| nI >= C.length ba = (s, mempty)
| otherwise =
let (Offset k) = indexN nI s
(v1,v2) = C.splitAt k ba
in (String v1, String v2)
indexN :: Int -> String -> Offset Word8
indexN nI (String ba) = Vec.unsafeDewrap goVec goAddr ba
where
!n = Size nI
end :: Offset Char
!end = Offset 0 `offsetPlusE` n
goVec :: ByteArray# -> Offset Word8 -> Offset Word8
goVec !ma !start = loop start (Offset 0)
where
!len = start `offsetPlusE` Vec.lengthSize ba
loop :: Offset Word8 -> Offset Char -> Offset Word8
loop !idx !i
| idx >= len || i >= end = sizeAsOffset (idx start)
| otherwise = loop (idx `offsetPlusE` d) (i + Offset 1)
where d = skipNextHeaderValue (primBaIndex ma idx)
goAddr :: Ptr Word8 -> Offset Word8 -> ST s (Offset Word8)
goAddr !(Ptr ptr) !start = return $ loop start (Offset 0)
where
!len = start `offsetPlusE` Vec.lengthSize ba
loop :: Offset Word8 -> Offset Char -> Offset Word8
loop !idx !i
| idx >= len || i >= end = sizeAsOffset (idx start)
| otherwise = loop (idx `offsetPlusE` d) (i + Offset 1)
where d = skipNextHeaderValue (primAddrIndex ptr idx)
revTake :: Int -> String -> String
revTake nbElems v = drop (length v nbElems) v
revDrop :: Int -> String -> String
revDrop nbElems v = take (length v nbElems) v
revSplitAt :: Int -> String -> (String, String)
revSplitAt n v = (drop idx v, take idx v)
where idx = length v n
splitOn :: (Char -> Bool) -> String -> [String]
splitOn predicate s
| sz == Size 0 = []
| otherwise = loop azero azero
where
!sz = size s
end = azero `offsetPlusE` sz
loop prevIdx idx
| idx == end = [sub s prevIdx idx]
| otherwise =
let (# c, idx' #) = next s idx
in if predicate c
then sub s prevIdx idx : loop idx' idx'
else loop prevIdx idx'
sub :: String -> Offset8 -> Offset8 -> String
sub (String ba) (Offset start) (Offset end) = String $ Vec.sub ba start end
splitIndex :: Offset8 -> String -> (String, String)
splitIndex (Offset idx) (String ba) = (String v1, String v2)
where (v1,v2) = C.splitAt idx ba
break :: (Char -> Bool) -> String -> (String, String)
break predicate s@(String ba) = runST $ Vec.unsafeIndexer ba go
where
!sz = size s
end = azero `offsetPlusE` sz
go :: (Offset Word8 -> Word8) -> ST st (String, String)
go getIdx = loop (Offset 0)
where
!nextI = nextWithIndexer getIdx
loop idx
| idx == end = return (s, mempty)
| otherwise = do
let (c, idx') = nextI idx
case predicate c of
True -> return $ splitIndex idx s
False -> loop idx'
#if MIN_VERSION_base(4,9,0)
#else
#endif
breakElem :: Char -> String -> (String, String)
breakElem !el s@(String ba) =
case asUTF8Char el of
UTF8_1 w -> let (# v1,v2 #) = Vec.splitElem w ba in (String v1, String v2)
_ -> runST $ Vec.unsafeIndexer ba go
where
sz = size s
end = azero `offsetPlusE` sz
go :: (Offset Word8 -> Word8) -> ST st (String, String)
go getIdx = loop (Offset 0)
where
!nextI = nextWithIndexer getIdx
loop idx
| idx == end = return (s, mempty)
| otherwise = do
let (c, idx') = nextI idx
case el == c of
True -> return $ splitIndex idx s
False -> loop idx'
elem :: Char -> String -> Bool
elem !el s@(String ba) =
case asUTF8Char el of
UTF8_1 w -> Vec.elem w ba
_ -> runST $ Vec.unsafeIndexer ba go
where
sz = size s
end = azero `offsetPlusE` sz
go :: (Offset Word8 -> Word8) -> ST st Bool
go getIdx = loop (Offset 0)
where
!nextI = nextWithIndexer getIdx
loop !idx
| idx == end = return False
| otherwise = do
let (c, idx') = nextI idx
case el == c of
True -> return True
False -> loop idx'
intersperse :: Char -> String -> String
intersperse sep src
| srcLen <= 1 = src
| otherwise = runST $ unsafeCopyFrom src dstBytes (go sep)
where
!srcBytes = size src
!srcLen = lengthSize src
dstBytes = (srcBytes :: Size8)
+ ((srcLen 1) `scale` charToBytes (fromEnum sep))
lastSrcI :: Offset Char
lastSrcI = 0 `offsetPlusE` (srcLen 1)
go :: Char -> String -> Offset Char -> Offset8 -> MutableString s -> Offset8 -> ST s (Offset8, Offset8)
go sep' src' srcI srcIdx dst dstIdx
| srcI == lastSrcI = do
nextDstIdx <- write dst dstIdx c
return (nextSrcIdx, nextDstIdx)
| otherwise = do
nextDstIdx <- write dst dstIdx c
nextDstIdx' <- write dst nextDstIdx sep'
return (nextSrcIdx, nextDstIdx')
where
(# c, nextSrcIdx #) = next src' srcIdx
unsafeCopyFrom :: String
-> Size8
-> (String -> Offset Char -> Offset8 -> MutableString s -> Offset8 -> ST s (Offset8, Offset8))
-> ST s String
unsafeCopyFrom src dstBytes f = new dstBytes >>= fill (Offset 0) (Offset 0) (Offset 0) f >>= freeze
where
srcLen = length src
end = Offset 0 `offsetPlusE` Size srcLen
fill srcI srcIdx dstIdx f' dst'
| srcI == end = return dst'
| otherwise = do (nextSrcIdx, nextDstIdx) <- f' src srcI srcIdx dst' dstIdx
fill (srcI + Offset 1) nextSrcIdx nextDstIdx f' dst'
span :: (Char -> Bool) -> String -> (String, String)
span predicate s = break (not . predicate) s
size :: String -> Size8
size (String ba) = Size $ C.length ba
lengthSize :: String -> Size Char
lengthSize (String ba)
| C.null ba = Size 0
| otherwise = Vec.unsafeDewrap goVec goAddr ba
where
goVec ma start = loop start (Size 0)
where
!end = start `offsetPlusE` Vec.lengthSize ba
loop !idx !i
| idx >= end = i
| otherwise = loop (idx `offsetPlusE` d) (i + Size 1)
where d = skipNextHeaderValue (primBaIndex ma idx)
goAddr (Ptr ptr) start = return $ loop start (Size 0)
where
!end = start `offsetPlusE` Vec.lengthSize ba
loop !idx !i
| idx >= end = i
| otherwise = loop (idx `offsetPlusE` d) (i + Size 1)
where d = skipNextHeaderValue (primAddrIndex ptr idx)
length :: String -> Int
length s = let (Size sz) = lengthSize s in sz
replicate :: Int -> Char -> String
replicate n c = runST (new nbBytes >>= fill)
where
end = azero `offsetPlusE` nbBytes
nbBytes = Size $ sz * n
(Size sz) = charToBytes (fromEnum c)
fill :: PrimMonad prim => MutableString (PrimState prim) -> prim String
fill ms = loop (Offset 0)
where
loop idx
| idx == end = freeze ms
| otherwise = write ms idx c >>= loop
copy :: String -> String
copy (String s) = String (Vec.copy s)
new :: PrimMonad prim
=> Size8
-> prim (MutableString (PrimState prim))
new n = MutableString `fmap` MVec.new n
create :: PrimMonad prim => Int -> (MutableString (PrimState prim) -> prim Int) -> prim String
create sz f = do
ms <- new (Size sz)
filled <- f ms
if filled == sz
then freeze ms
else C.take filled `fmap` freeze ms
charToBytes :: Int -> Size8
charToBytes c
| c < 0x80 = Size 1
| c < 0x800 = Size 2
| c < 0x10000 = Size 3
| c < 0x110000 = Size 4
| otherwise = error ("invalid code point: " `mappend` show c)
charMap :: (Char -> Char) -> String -> String
charMap f src =
let !(elems, nbBytes) = allocateAndFill [] (Offset 0) (Size 0)
in runST $ do
dest <- new nbBytes
copyLoop dest elems (Offset 0 `offsetPlusE` nbBytes)
freeze dest
where
!srcSz = size src
srcEnd = azero `offsetPlusE` srcSz
allocateAndFill :: [(String, Size8)]
-> Offset8
-> Size8
-> ([(String,Size8)], Size8)
allocateAndFill acc idx bytesWritten
| idx == srcEnd = (acc, bytesWritten)
| otherwise =
let (el@(_,addBytes), idx') = runST $ do
let !diffBytes = srcEnd idx
!allocatedBytes = if diffBytes <= Size 4 then Size 4 else diffBytes
ms <- new allocatedBytes
(dstIdx, srcIdx) <- fill ms allocatedBytes idx
s <- freeze ms
return ((s, dstIdx), srcIdx)
in allocateAndFill (el : acc) idx' (bytesWritten + addBytes)
fill :: PrimMonad prim
=> MutableString (PrimState prim)
-> Size8
-> Offset8
-> prim (Size8, Offset8)
fill mba dsz srcIdxOrig =
loop (Offset 0) srcIdxOrig
where
endDst = (Offset 0) `offsetPlusE` dsz
loop dstIdx srcIdx
| srcIdx == srcEnd = return (offsetAsSize dstIdx, srcIdx)
| dstIdx == endDst = return (offsetAsSize dstIdx, srcIdx)
| otherwise =
let (# c, srcIdx' #) = next src srcIdx
c' = f c
!nbBytes = charToBytes (fromEnum c')
in
if dstIdx `offsetPlusE` nbBytes <= sizeAsOffset dsz
then do dstIdx' <- write mba dstIdx c'
loop dstIdx' srcIdx'
else return (offsetAsSize dstIdx, srcIdx)
copyLoop _ [] (Offset 0) = return ()
copyLoop _ [] n = error ("charMap invalid: " <> show n)
copyLoop ms@(MutableString mba) ((String ba, sz):xs) end = do
let start = end `offsetMinusE` sz
Vec.unsafeCopyAtRO mba start ba (Offset 0) sz
copyLoop ms xs start
snoc :: String -> Char -> String
snoc s@(String ba) c
| len == Size 0 = C.singleton c
| otherwise = runST $ do
ms@(MutableString mba) <- new (len + nbBytes)
Vec.unsafeCopyAtRO mba (Offset 0) ba (Offset 0) len
_ <- write ms (azero `offsetPlusE` len) c
freeze ms
where
!len = size s
!nbBytes = charToBytes (fromEnum c)
cons :: Char -> String -> String
cons c s@(String ba)
| len == Size 0 = C.singleton c
| otherwise = runST $ do
ms@(MutableString mba) <- new (len + nbBytes)
idx <- write ms (Offset 0) c
Vec.unsafeCopyAtRO mba idx ba (Offset 0) len
freeze ms
where
!len = size s
!nbBytes = charToBytes (fromEnum c)
unsnoc :: String -> Maybe (String, Char)
unsnoc s
| null s = Nothing
| otherwise =
let (s1,s2) = revSplitAt 1 s
in case toList s1 of
[c] -> Just (s2, c)
_ -> internalError "unsnoc"
uncons :: String -> Maybe (Char, String)
uncons s
| null s = Nothing
| otherwise =
let (s1,s2) = splitAt 1 s
in case toList s1 of
[c] -> Just (c, s2)
_ -> internalError "uncons"
find :: (Char -> Bool) -> String -> Maybe Char
find predicate s = loop (Offset 0)
where
!sz = size s
end = Offset 0 `offsetPlusE` sz
loop idx
| idx == end = Nothing
| otherwise =
let (# c, idx' #) = next s idx
in case predicate c of
True -> Just c
False -> loop idx'
sortBy :: (Char -> Char -> Ordering) -> String -> String
sortBy sortF s = fromList $ Data.List.sortBy sortF $ toList s
filter :: (Char -> Bool) -> String -> String
filter p s = fromList $ Data.List.filter p $ toList s
reverse :: String -> String
reverse s@(String ba) = runST $ do
ms <- new len
loop ms (Offset 0) (Offset 0 `offsetPlusE` len)
where
!len = size s
loop :: PrimMonad prim => MutableString (PrimState prim) -> Offset8 -> Offset8 -> prim String
loop ms@(MutableString mba) sidx@(Offset si) didx
| didx == Offset 0 = freeze ms
| otherwise = do
let !h = Vec.unsafeIndex ba si
!nb = Size (getNbBytes h + 1)
didx'@(Offset d) = didx `offsetMinusE` nb
case nb of
Size 1 -> C.mutUnsafeWrite mba d h
Size 2 -> do
C.mutUnsafeWrite mba d h
C.mutUnsafeWrite mba (d + 1) (Vec.unsafeIndex ba (si + 1))
Size 3 -> do
C.mutUnsafeWrite mba d h
C.mutUnsafeWrite mba (d + 1) (Vec.unsafeIndex ba (si + 1))
C.mutUnsafeWrite mba (d + 2) (Vec.unsafeIndex ba (si + 2))
Size 4 -> do
C.mutUnsafeWrite mba d h
C.mutUnsafeWrite mba (d + 1) (Vec.unsafeIndex ba (si + 1))
C.mutUnsafeWrite mba (d + 2) (Vec.unsafeIndex ba (si + 2))
C.mutUnsafeWrite mba (d + 3) (Vec.unsafeIndex ba (si + 3))
_ -> return ()
loop ms (sidx `offsetPlusE` nb) didx'
data Encoding
= ASCII7
| UTF8
| UTF16
| UTF32
| ISO_8859_1
deriving (Typeable, Data, Eq, Ord, Show, Enum, Bounded)
fromEncoderBytes :: ( Encoder.Encoding encoding
, Exception (Encoder.Error encoding)
, PrimType (Encoder.Unit encoding)
)
=> encoding
-> UArray Word8
-> (String, Maybe ValidationFailure, UArray Word8)
fromEncoderBytes enc bytes =
( String $ runST $ Encoder.convertFromTo enc EncoderUTF8 (Vec.recast bytes)
, Nothing
, mempty
)
fromBytes :: Encoding -> UArray Word8 -> (String, Maybe ValidationFailure, UArray Word8)
fromBytes ASCII7 bytes = fromEncoderBytes Encoder.ASCII7 bytes
fromBytes ISO_8859_1 bytes = fromEncoderBytes Encoder.ISO_8859_1 bytes
fromBytes UTF16 bytes = fromEncoderBytes Encoder.UTF16 bytes
fromBytes UTF32 bytes = fromEncoderBytes Encoder.UTF32 bytes
fromBytes UTF8 bytes
| C.null bytes = (mempty, Nothing, mempty)
| otherwise =
case validate bytes (Offset 0) (Size $ C.length bytes) of
(_, Nothing) -> (fromBytesUnsafe bytes, Nothing, mempty)
(Offset pos, Just vf) ->
let (b1, b2) = C.splitAt pos bytes
in (fromBytesUnsafe b1, toErr vf, b2)
where
toErr MissingByte = Nothing
toErr InvalidHeader = Just InvalidHeader
toErr InvalidContinuation = Just InvalidContinuation
fromBytesLenient :: UArray Word8 -> (String, UArray Word8)
fromBytesLenient bytes
| C.null bytes = (mempty, mempty)
| otherwise =
case validate bytes (Offset 0) (Size $ C.length bytes) of
(_, Nothing) -> (fromBytesUnsafe bytes, mempty)
(Offset pos, Just MissingByte) ->
let (b1,b2) = C.splitAt pos bytes
in (fromBytesUnsafe b1, b2)
(Offset pos, Just InvalidHeader) ->
let (b1,b2) = C.splitAt pos bytes
(_,b3) = C.splitAt 1 b2
(s3, r) = fromBytesLenient b3
in (mconcat [fromBytesUnsafe b1,replacement, s3], r)
(Offset pos, Just InvalidContinuation) ->
let (b1,b2) = C.splitAt pos bytes
(_,b3) = C.splitAt 1 b2
(s3, r) = fromBytesLenient b3
in (mconcat [fromBytesUnsafe b1,replacement, s3], r)
where
replacement :: String
!replacement = fromBytesUnsafe $ fromList [0xef,0xbf,0xbd]
fromChunkBytes :: [UArray Word8] -> [String]
fromChunkBytes l = loop l
where
loop [] = []
loop (bytes:[]) =
case validate bytes (Offset 0) (Size $ C.length bytes) of
(_, Nothing) -> [fromBytesUnsafe bytes]
(_, Just err) -> doErr err
loop (bytes:cs@(c1:c2)) =
case validate bytes (Offset 0) (Size $ C.length bytes) of
(_, Nothing) -> fromBytesUnsafe bytes : loop cs
(Offset pos, Just MissingByte) ->
let (b1,b2) = C.splitAt pos bytes
in fromBytesUnsafe b1 : loop ((b2 `mappend` c1) : c2)
(_, Just err) -> doErr err
doErr err = error ("fromChunkBytes: " <> show err)
fromBytesUnsafe :: UArray Word8 -> String
fromBytesUnsafe = String
toEncoderBytes :: ( Encoder.Encoding encoding
, PrimType (Encoder.Unit encoding)
, Exception (Encoder.Error encoding)
)
=> encoding
-> UArray Word8
-> UArray Word8
toEncoderBytes enc bytes = Vec.recast (runST $ Encoder.convertFromTo EncoderUTF8 enc bytes)
toBytes :: Encoding -> String -> UArray Word8
toBytes UTF8 (String bytes) = bytes
toBytes ASCII7 (String bytes) = toEncoderBytes Encoder.ASCII7 bytes
toBytes ISO_8859_1 (String bytes) = toEncoderBytes Encoder.ISO_8859_1 bytes
toBytes UTF16 (String bytes) = toEncoderBytes Encoder.UTF16 bytes
toBytes UTF32 (String bytes) = toEncoderBytes Encoder.UTF32 bytes
lines :: String -> [String]
lines = fmap fromList . Prelude.lines . toList
words :: String -> [String]
words = fmap fromList . Prelude.words . toList