{-# OPTIONS_GHC -funbox-strict-fields #-}
{-# LANGUAGE BangPatterns #-}
module Data.Unicode.Internal.NormalizeStream
(
D.DecomposeMode(..)
, stream
, unstream
, unstreamC
)
where
import Control.Monad (ap)
import Data.Char (chr, ord)
import Data.List (sortBy)
import Data.Ord (comparing)
import qualified Data.Text.Array as A
import Data.Text.Internal (Text (..))
import qualified Data.Text.Internal.Encoding.Utf16 as U16
import Data.Text.Internal.Fusion.Size (betweenSize,
upperBound)
import Data.Text.Internal.Fusion.Types (Step (..), Stream (..))
import Data.Text.Internal.Private (runText)
import Data.Text.Internal.Unsafe.Char (unsafeWrite)
import Data.Text.Internal.Unsafe.Char (unsafeChr)
import Data.Text.Internal.Unsafe.Shift (shiftR)
import GHC.ST (ST (..))
import qualified Data.Unicode.Properties.CombiningClass as CC
import qualified Data.Unicode.Properties.Compositions as C
import qualified Data.Unicode.Properties.Decompose as D
import qualified Data.Unicode.Properties.DecomposeHangul as H
data ReBuf = Empty | One {-# UNPACK #-} !Char | Many [Char]
writeStr :: A.MArray s -> Int -> [Char] -> ST s Int
writeStr marr di str = go di str
where
go i [] = return i
go i (c : cs) = do
n <- unsafeWrite marr i c
go (i + n) cs
{-# INLINE writeReorderBuffer #-}
writeReorderBuffer :: A.MArray s -> Int -> ReBuf -> ST s Int
writeReorderBuffer _ di Empty = return di
writeReorderBuffer marr di (One c) = do
n <- unsafeWrite marr di c
return (di + n)
writeReorderBuffer marr di (Many str) = writeStr marr di str
decomposeCharHangul :: A.MArray s -> Int -> Char -> ST s (Int, ReBuf)
decomposeCharHangul marr j c = do
case D.decomposeCharHangul c of
Left (l, v) -> do
n1 <- unsafeWrite marr j l
n2 <- unsafeWrite marr (j + n1) v
return ((j + n1 + n2), Empty)
Right (l, v, t) -> do
n1 <- unsafeWrite marr j l
n2 <- unsafeWrite marr (j + n1) v
n3 <- unsafeWrite marr (j + n1 + n2) t
return (j + n1 + n2 + n3, Empty)
{-# INLINE decomposeChar #-}
decomposeChar
:: D.DecomposeMode
-> A.MArray s
-> Int
-> ReBuf
-> Char
-> ST s (Int, ReBuf)
decomposeChar _ marr i reBuf c | D.isHangul c = do
j <- writeReorderBuffer marr i reBuf
decomposeCharHangul marr j c
decomposeChar mode marr index reBuf ch = do
case D.isDecomposable mode ch of
D.FalseA -> reorder marr index reBuf ch
D.TrueA -> decomposeAll marr index reBuf (D.decomposeChar mode ch)
_ -> reorder marr index reBuf ch
where
{-# INLINE decomposeAll #-}
decomposeAll _ i rbuf [] = return (i, rbuf)
decomposeAll arr i rbuf (x : xs) =
case D.isDecomposable mode x of
D.TrueA -> do
(i', rbuf') <- decomposeAll arr i rbuf
(D.decomposeChar mode x)
decomposeAll arr i' rbuf' xs
_ -> do
(i', rbuf') <- reorder arr i rbuf x
decomposeAll arr i' rbuf' xs
{-# INLINE reorder #-}
reorder _ i Empty c = return (i, One c)
reorder arr i (One c0) c | not (CC.isCombining c) = do
n1 <- unsafeWrite arr i c0
n2 <- unsafeWrite arr (i + n1) c
return ((i + n1 + n2), Empty)
reorder arr i (One c0) c | not (CC.isCombining c0) = do
n <- unsafeWrite arr i c0
return (i + n, One c)
reorder _ i (One c0) c = return (i, Many orderedPair)
where
orderedPair =
case inOrder c0 c of
True -> [c0, c]
False -> [c, c0]
inOrder c1 c2 =
CC.getCombiningClass c1 <= CC.getCombiningClass c2
reorder arr i rbuf c | not (CC.isCombining c) = do
j <- writeReorderBuffer arr i rbuf
n <- unsafeWrite arr j c
return (j + n, Empty)
reorder _ i (Many str) c = return (i, Many (sortCluster (str ++ [c])))
where
{-# INLINE sortCluster #-}
sortCluster = map fst
. sortBy (comparing snd)
. map (ap (,) CC.getCombiningClass)
stream :: Text -> Stream Char
stream (Text arr off len) = Stream next off (betweenSize (len `shiftR` 1) len)
where
!end = off+len
{-# INLINE next #-}
next !i
| i >= end = Done
| (n `shiftR` 10) == 0x36 = Yield (U16.chr2 n n2) (i + 2)
| otherwise = Yield (unsafeChr n) (i + 1)
where
n = A.unsafeIndex arr i
n2 = A.unsafeIndex arr (i + 1)
{-# INLINE [0] stream #-}
unstream :: D.DecomposeMode -> Stream Char -> Text
unstream mode (Stream next0 s0 len) = runText $ \done -> do
let margin = 1 + maxDecomposeLen
mlen = (upperBound 4 len + margin)
arr0 <- A.new mlen
let outer !arr !maxi = encode
where
encode !si !di rbuf =
if maxi < di + margin
then realloc si di rbuf
else
case next0 si of
Done -> do
di' <- writeReorderBuffer arr di rbuf
done arr di'
Skip si' -> encode si' di rbuf
Yield c si' -> do
(di', rbuf') <- decomposeChar mode arr di rbuf c
encode si' di' rbuf'
{-# NOINLINE realloc #-}
realloc !si !di rbuf = do
let newlen = maxi * 2
arr' <- A.new newlen
A.copyM arr' 0 arr 0 di
outer arr' (newlen - 1) si di rbuf
outer arr0 (mlen - 1) s0 0 Empty
{-# INLINE [0] unstream #-}
maxDecomposeLen :: Int
maxDecomposeLen = 32
composeAndWrite
:: A.MArray s
-> Int
-> Char
-> ReBuf
-> Char
-> ST s (Int, Char)
composeAndWrite arr di st1 Empty st2 = do
n <- unsafeWrite arr di st1
return (di + n, st2)
composeAndWrite arr di st1 (One c) st2 =
composeAndWrite' arr di st1 [c] st2
composeAndWrite arr di st1 (Many str) st2 =
composeAndWrite' arr di st1 str st2
composeAndWrite'
:: A.MArray s
-> Int
-> Char
-> [Char]
-> Char
-> ST s (Int, Char)
composeAndWrite' arr di st1 str st2 = go di st1 [] 0 str
where
go i st [] _ [] =
case C.composePair st st2 of
Just x -> return (i, x)
Nothing -> do
n <- unsafeWrite arr i st
return (i + n, st2)
go i st uncs _ [] = do
j <- writeStr arr i (st : uncs)
return (j, st2)
go i st [] _ (c : cs) = do
case C.composePair st c of
Just x -> go i x [] 0 cs
Nothing -> do
go i st [c] (CC.getCombiningClass c) cs
go i st uncs cc (c : cs) = do
let ccc = CC.getCombiningClass c
if ccc > cc then
case C.composePair st c of
Just x -> go i x uncs cc cs
Nothing -> do
go i st (uncs ++ [c]) ccc cs
else go i st (uncs ++ [c]) ccc cs
writeStarterRbuf :: A.MArray s
-> Int
-> Maybe Char
-> ReBuf
-> ST s Int
writeStarterRbuf marr di st rbuf =
case st of
Nothing -> writeReorderBuffer marr di rbuf
Just starter ->
composeAndWrite marr di starter rbuf '\0' >>= (return . fst)
data JamoBuf
= JamoEmpty
| JamoLIndex {-# UNPACK #-} !Int
| JamoLV {-# UNPACK #-} !Char
{-# INLINE writeJamoBuf #-}
writeJamoBuf :: A.MArray s -> Int -> JamoBuf -> ST s Int
writeJamoBuf _ di JamoEmpty = return di
writeJamoBuf marr di (JamoLIndex i) = do
n <- unsafeWrite marr di (chr (D.jamoLFirst + i))
return (di + n)
writeJamoBuf marr di (JamoLV c) = do
n <- unsafeWrite marr di c
return (di + n)
{-# INLINE composeChar #-}
composeChar
:: D.DecomposeMode
-> A.MArray s
-> Int
-> Maybe Char
-> ReBuf
-> JamoBuf
-> Char
-> ST s (Int, Maybe Char, ReBuf, JamoBuf)
composeChar _ marr index st rbuf jbuf ch | H.isHangul ch || H.isJamo ch = do
j <- writeStarterRbuf marr index st rbuf
(k, jbuf') <- if H.isJamo ch then
composeCharJamo marr j jbuf ch
else
composeCharHangul marr j jbuf ch
return (k, Nothing, Empty, jbuf')
where
composeCharJamo arr i JamoEmpty c =
case H.jamoLIndex c of
Just li -> return (i, JamoLIndex li)
Nothing -> do
n <- unsafeWrite arr i c
return (i + n, JamoEmpty)
composeCharJamo arr i jb@(JamoLIndex li) c =
case H.jamoVIndex c of
Just vi -> do
let lvi = li * H.jamoNCount + vi * H.jamoTCount
return (i, JamoLV (chr (H.hangulFirst + lvi)))
Nothing -> do
ix <- writeJamoBuf arr i jb
composeCharJamo arr ix JamoEmpty c
composeCharJamo arr i jb@(JamoLV lv) c =
case H.jamoTIndex c of
Just ti -> do
n <- unsafeWrite arr i (chr ((ord lv) + ti))
return (i + n, JamoEmpty)
Nothing -> do
ix <- writeJamoBuf arr i jb
composeCharJamo arr ix JamoEmpty c
composeCharHangul arr i jb c = do
ix <- writeJamoBuf arr i jb
case H.isHangulLV c of
True -> return (ix, JamoLV c)
False -> do
n <- unsafeWrite arr ix c
return (ix + n, JamoEmpty)
composeChar mode marr index starter reBuf jbuf ch = do
index' <- writeJamoBuf marr index jbuf
case D.isDecomposable mode ch of
D.FalseA -> do
(i, st, rbuf) <- reorder marr index' starter reBuf ch
return (i, st, rbuf, JamoEmpty)
D.TrueA -> do
decomposeAll marr index' starter reBuf jbuf (D.decomposeChar mode ch)
_ -> do
(i, st, rbuf) <- reorder marr index' starter reBuf ch
return (i, st, rbuf, JamoEmpty)
where
{-# INLINE decomposeAll #-}
decomposeAll _ i st rbuf jb [] = return (i, st, rbuf, jb)
decomposeAll arr i st rbuf jb (x : xs) =
case D.isDecomposable mode x of
D.TrueA -> do
(i', st', rbuf', jb') <- decomposeAll arr i st rbuf jb
(D.decomposeChar mode x)
decomposeAll arr i' st' rbuf' jb' xs
_ -> do
(i', st', rbuf', jb') <- composeChar mode arr i st rbuf jb x
decomposeAll arr i' st' rbuf' jb' xs
{-# INLINE reorder #-}
reorder _ i st Empty c = return (i, st, One c)
reorder arr i (Just st) (One c0) c | not (CC.isCombining c) = do
case C.composePair st c0 of
Just x -> case C.composePair x c of
Just y -> return (i, Just y, Empty)
Nothing -> do
n <- unsafeWrite arr i x
return (i + n, Just c, Empty)
Nothing -> case CC.isCombining c0 of
True -> do
n1 <- unsafeWrite arr i st
n2 <- unsafeWrite arr (i + n1) c0
return (i + n1 + n2, Just c, Empty)
False -> do
n1 <- unsafeWrite arr i st
case C.composePair c0 c of
Just y -> return (i + n1, Just y, Empty)
Nothing -> do
n2 <- unsafeWrite arr (i + n1) c0
return (i + n1 + n2, Just c, Empty)
reorder arr i Nothing (One c0) c | not (CC.isCombining c) =
case C.composePair c0 c of
Just x -> return (i, Just x, Empty)
Nothing -> do
n <- unsafeWrite arr i c0
return (i + n, Just c, Empty)
reorder arr i (Just st) (One c0) c | not (CC.isCombining c0) = do
case C.composePair st c0 of
Just x -> return (i, Just x, One c)
Nothing -> do
n <- unsafeWrite arr i st
return (i + n, Just c0, One c)
reorder _arr i Nothing (One c0) c | not (CC.isCombining c0) = do
return (i, Just c0, One c)
reorder _ i st (One c0) c = return (i, st, Many orderedPair)
where
orderedPair =
case inOrder c0 c of
True -> [c0, c]
False -> [c, c0]
inOrder c1 c2 =
CC.getCombiningClass c1 <= CC.getCombiningClass c2
reorder arr i (Just st) rbuf c | not (CC.isCombining c) = do
(j, st2) <- composeAndWrite arr i st rbuf c
return (j, Just st2, Empty)
reorder arr i Nothing rbuf c | not (CC.isCombining c) = do
j <- writeReorderBuffer arr i rbuf
return (j, Just c, Empty)
reorder _ i st (Many str) c =
return (i, st, Many (sortCluster (str ++ [c])))
where
{-# INLINE sortCluster #-}
sortCluster = map fst
. sortBy (comparing snd)
. map (ap (,) CC.getCombiningClass)
unstreamC :: D.DecomposeMode -> Stream Char -> Text
unstreamC mode (Stream next0 s0 len) = runText $ \done -> do
let margin = 1 + maxDecomposeLen
mlen = (upperBound 4 len + margin)
arr0 <- A.new mlen
let outer !arr !maxi = encode
where
encode !si !di st rbuf jbuf =
if maxi < di + margin
then realloc si di st rbuf jbuf
else
case next0 si of
Done -> do
di' <- writeStarterRbuf arr di st rbuf
di'' <- writeJamoBuf arr di' jbuf
done arr di''
Skip si' -> encode si' di st rbuf jbuf
Yield c si' -> do
(di', st', rbuf', jbuf') <- composeChar mode arr di st rbuf jbuf c
encode si' di' st' rbuf' jbuf'
{-# NOINLINE realloc #-}
realloc !si !di st rbuf jbuf = do
let newlen = maxi * 2
arr' <- A.new newlen
A.copyM arr' 0 arr 0 di
outer arr' (newlen - 1) si di st rbuf jbuf
outer arr0 (mlen - 1) s0 0 Nothing Empty JamoEmpty
{-# INLINE [0] unstreamC #-}