{-# OPTIONS_GHC -funbox-strict-fields #-} {-# LANGUAGE BangPatterns #-} -- | -- Module : Data.Unicode.Internal.NormalizeStream -- Copyright : (c) 2016 Harendra Kumar -- -- License : BSD-style -- Maintainer : harendra.kumar@gmail.com -- Stability : experimental -- -- Stream based normalization. -- module Data.Unicode.Internal.NormalizeStream ( D.DecomposeMode(..) , stream , unstream , unstreamC ) where import Control.Monad (ap) import Data.Char (chr, ord) import Data.List (sortBy) import Data.Ord (comparing) import qualified Data.Text.Array as A import Data.Text.Internal (Text (..)) import qualified Data.Text.Internal.Encoding.Utf16 as U16 import Data.Text.Internal.Fusion.Size (betweenSize, upperBound) import Data.Text.Internal.Fusion.Types (Step (..), Stream (..)) import Data.Text.Internal.Private (runText) import Data.Text.Internal.Unsafe.Char (unsafeWrite) import Data.Text.Internal.Unsafe.Char (unsafeChr) import Data.Text.Internal.Unsafe.Shift (shiftR) import GHC.ST (ST (..)) import qualified Data.Unicode.Properties.CombiningClass as CC import qualified Data.Unicode.Properties.Compositions as C import qualified Data.Unicode.Properties.Decompose as D import qualified Data.Unicode.Properties.DecomposeHangul as H ------------------------------------------------------------------------------- -- Reorder buffer to hold characters till the next starter boundary ------------------------------------------------------------------------------- data ReBuf = Empty | One {-# UNPACK #-} !Char | Many [Char] writeStr :: A.MArray s -> Int -> [Char] -> ST s Int writeStr marr di str = go di str where go i [] = return i go i (c : cs) = do n <- unsafeWrite marr i c go (i + n) cs {-# INLINE writeReorderBuffer #-} writeReorderBuffer :: A.MArray s -> Int -> ReBuf -> ST s Int writeReorderBuffer _ di Empty = return di writeReorderBuffer marr di (One c) = do n <- unsafeWrite marr di c return (di + n) writeReorderBuffer marr di (Many str) = writeStr marr di str ------------------------------------------------------------------------------- -- Decomposition of Hangul characters is done algorithmically ------------------------------------------------------------------------------- -- {-# INLINE decomposeCharHangul #-} decomposeCharHangul :: A.MArray s -> Int -> Char -> ST s (Int, ReBuf) decomposeCharHangul marr j c = do case D.decomposeCharHangul c of Left (l, v) -> do n1 <- unsafeWrite marr j l n2 <- unsafeWrite marr (j + n1) v return ((j + n1 + n2), Empty) Right (l, v, t) -> do n1 <- unsafeWrite marr j l n2 <- unsafeWrite marr (j + n1) v n3 <- unsafeWrite marr (j + n1 + n2) t return (j + n1 + n2 + n3, Empty) {-# INLINE decomposeChar #-} decomposeChar :: D.DecomposeMode -> A.MArray s -- destination array for decomposition -> Int -- array index -> ReBuf -- reorder buffer -> Char -- char to be decomposed -> ST s (Int, ReBuf) decomposeChar _ marr i reBuf c | D.isHangul c = do j <- writeReorderBuffer marr i reBuf decomposeCharHangul marr j c ------------------------------------------------------------------------------- -- Decomposition of characters other than Hangul ------------------------------------------------------------------------------- decomposeChar mode marr index reBuf ch = do -- TODO: return fully decomposed form case D.isDecomposable mode ch of D.FalseA -> reorder marr index reBuf ch D.TrueA -> decomposeAll marr index reBuf (D.decomposeChar mode ch) _ -> reorder marr index reBuf ch where {-# INLINE decomposeAll #-} decomposeAll _ i rbuf [] = return (i, rbuf) decomposeAll arr i rbuf (x : xs) = case D.isDecomposable mode x of D.TrueA -> do (i', rbuf') <- decomposeAll arr i rbuf (D.decomposeChar mode x) decomposeAll arr i' rbuf' xs _ -> do -- XXX calling reorder is wrong if decomposition results in -- a further decomposable Hangul char. In that case we will -- not go through the Hangul decompose for that char. -- To be strictly correct we have to call decomposeChar -- recursively here. (i', rbuf') <- reorder arr i rbuf x decomposeAll arr i' rbuf' xs -- Unicode 9.0.0: 3.11 -- D108 Reorderable pair: Two adjacent characters A and B in a coded -- character sequence are a Reorderable Pair if and only if -- ccc(A) > ccc(B) > 0. -- -- (array) (array index) (reorder buffer) (input char) {-# INLINE reorder #-} reorder _ i Empty c = return (i, One c) -- input char is a starter, flush the reorder buffer reorder arr i (One c0) c | not (CC.isCombining c) = do n1 <- unsafeWrite arr i c0 n2 <- unsafeWrite arr (i + n1) c return ((i + n1 + n2), Empty) -- input char is combining and there is a starter char in the buffer -- flush the starter char and add the combining char to the buffer reorder arr i (One c0) c | not (CC.isCombining c0) = do n <- unsafeWrite arr i c0 return (i + n, One c) -- optimized ordering for common case of two combining chars -- XXX replace many with Two here reorder _ i (One c0) c = return (i, Many orderedPair) where -- {-# INLINE orderedPair #-} orderedPair = case inOrder c0 c of True -> [c0, c] False -> [c, c0] inOrder c1 c2 = CC.getCombiningClass c1 <= CC.getCombiningClass c2 -- input char is a starter, flush the reorder buffer reorder arr i rbuf c | not (CC.isCombining c) = do j <- writeReorderBuffer arr i rbuf n <- unsafeWrite arr j c return (j + n, Empty) -- unoptimized generic sort for more than two combining chars reorder _ i (Many str) c = return (i, Many (sortCluster (str ++ [c]))) where {-# INLINE sortCluster #-} sortCluster = map fst . sortBy (comparing snd) . map (ap (,) CC.getCombiningClass) -- | /O(n)/ Convert a 'Text' into a 'Stream Char'. stream :: Text -> Stream Char stream (Text arr off len) = Stream next off (betweenSize (len `shiftR` 1) len) where !end = off+len {-# INLINE next #-} next !i | i >= end = Done -- shift generates only two branches instead of three in case of -- range check, works quite a bit faster with llvm backend. | (n `shiftR` 10) == 0x36 = Yield (U16.chr2 n n2) (i + 2) | otherwise = Yield (unsafeChr n) (i + 1) where n = A.unsafeIndex arr i n2 = A.unsafeIndex arr (i + 1) {-# INLINE [0] stream #-} -- | /O(n)/ Convert a 'Stream Char' into a decompose-normalized 'Text'. unstream :: D.DecomposeMode -> Stream Char -> Text unstream mode (Stream next0 s0 len) = runText $ \done -> do -- Before encoding each char we perform a buffer realloc check assuming -- worst case encoding size of two 16-bit units for the char. Just add an -- extra space to the buffer so that we do not end up reallocating even when -- all the chars are encoded as single unit. let margin = 1 + maxDecomposeLen mlen = (upperBound 4 len + margin) arr0 <- A.new mlen let outer !arr !maxi = encode where -- keep the common case loop as small as possible encode !si !di rbuf = -- simply check for the worst case if maxi < di + margin then realloc si di rbuf else case next0 si of Done -> do di' <- writeReorderBuffer arr di rbuf done arr di' Skip si' -> encode si' di rbuf Yield c si' -> do (di', rbuf') <- decomposeChar mode arr di rbuf c encode si' di' rbuf' -- n <- unsafeWrite arr di c -- encode si' (di + n) rbuf -- keep uncommon case separate from the common case code {-# NOINLINE realloc #-} realloc !si !di rbuf = do let newlen = maxi * 2 arr' <- A.new newlen A.copyM arr' 0 arr 0 di outer arr' (newlen - 1) si di rbuf outer arr0 (mlen - 1) s0 0 Empty {-# INLINE [0] unstream #-} -- we can generate this from UCD maxDecomposeLen :: Int maxDecomposeLen = 32 ------------------------------------------------------------------------------- -- Composition ------------------------------------------------------------------------------- composeAndWrite :: A.MArray s -> Int -> Char -> ReBuf -> Char -> ST s (Int, Char) -- return new index, new starter composeAndWrite arr di st1 Empty st2 = do n <- unsafeWrite arr di st1 return (di + n, st2) composeAndWrite arr di st1 (One c) st2 = composeAndWrite' arr di st1 [c] st2 composeAndWrite arr di st1 (Many str) st2 = composeAndWrite' arr di st1 str st2 composeAndWrite' :: A.MArray s -> Int -> Char -> [Char] -> Char -> ST s (Int, Char) composeAndWrite' arr di st1 str st2 = go di st1 [] 0 str where -- arguments: index, starter, uncombined chars, -- cc of prev uncombined char, unprocessed str go i st [] _ [] = case C.composePair st st2 of Just x -> return (i, x) Nothing -> do n <- unsafeWrite arr i st return (i + n, st2) go i st uncs _ [] = do j <- writeStr arr i (st : uncs) return (j, st2) go i st [] _ (c : cs) = do case C.composePair st c of Just x -> go i x [] 0 cs Nothing -> do go i st [c] (CC.getCombiningClass c) cs go i st uncs cc (c : cs) = do let ccc = CC.getCombiningClass c if ccc > cc then case C.composePair st c of Just x -> go i x uncs cc cs Nothing -> do go i st (uncs ++ [c]) ccc cs else go i st (uncs ++ [c]) ccc cs writeStarterRbuf :: A.MArray s -> Int -> Maybe Char -> ReBuf -> ST s Int writeStarterRbuf marr di st rbuf = case st of Nothing -> writeReorderBuffer marr di rbuf Just starter -> -- XXX null char hack composeAndWrite marr di starter rbuf '\0' >>= (return . fst) ------------------------------------------------------------------------------- -- Composition of Hangul Jamo characters, done algorithmically ------------------------------------------------------------------------------- -- Hold an L to wait for V, hold an LV to wait for T. data JamoBuf = JamoEmpty | JamoLIndex {-# UNPACK #-} !Int | JamoLV {-# UNPACK #-} !Char {-# INLINE writeJamoBuf #-} writeJamoBuf :: A.MArray s -> Int -> JamoBuf -> ST s Int writeJamoBuf _ di JamoEmpty = return di writeJamoBuf marr di (JamoLIndex i) = do n <- unsafeWrite marr di (chr (D.jamoLFirst + i)) return (di + n) writeJamoBuf marr di (JamoLV c) = do n <- unsafeWrite marr di c return (di + n) -- TODO Unify compose and decompose if possible with good perf -- TODO try unifying st, rbuf -- TODO try using Either for (st, rbuf)/jbuf -- or we can use different functions for hangul and non-hangul composition with -- diff signatures. In an outer function we check if the char is hangul and -- flush and switch the buffer before calling the appropriate function. -- If we are composing we do not need to first decompose Hangul. We can just -- compose assuming there could be some partially composed syllables e.g. LV -- syllable followed by a jamo T. We need to compose this case as well. -- -- XXX The unicode normalization test suite does not seem to have tests for a -- LV composed hangul syllable followed by a jamo T. {-# INLINE composeChar #-} composeChar :: D.DecomposeMode -> A.MArray s -- destination array for decomposition -> Int -- array index -> Maybe Char -- last starter -> ReBuf -- reorder buffer -> JamoBuf -- jamo buffer -> Char -- char to be decomposed -> ST s (Int, Maybe Char, ReBuf, JamoBuf) -- ^ index, starter, reorder buf, jamobuf composeChar _ marr index st rbuf jbuf ch | H.isHangul ch || H.isJamo ch = do j <- writeStarterRbuf marr index st rbuf (k, jbuf') <- if H.isJamo ch then composeCharJamo marr j jbuf ch else composeCharHangul marr j jbuf ch return (k, Nothing, Empty, jbuf') where composeCharJamo arr i JamoEmpty c = case H.jamoLIndex c of Just li -> return (i, JamoLIndex li) Nothing -> do n <- unsafeWrite arr i c return (i + n, JamoEmpty) composeCharJamo arr i jb@(JamoLIndex li) c = case H.jamoVIndex c of Just vi -> do let lvi = li * H.jamoNCount + vi * H.jamoTCount return (i, JamoLV (chr (H.hangulFirst + lvi))) Nothing -> do ix <- writeJamoBuf arr i jb composeCharJamo arr ix JamoEmpty c composeCharJamo arr i jb@(JamoLV lv) c = case H.jamoTIndex c of Just ti -> do n <- unsafeWrite arr i (chr ((ord lv) + ti)) return (i + n, JamoEmpty) Nothing -> do ix <- writeJamoBuf arr i jb composeCharJamo arr ix JamoEmpty c composeCharHangul arr i jb c = do ix <- writeJamoBuf arr i jb case H.isHangulLV c of True -> return (ix, JamoLV c) False -> do n <- unsafeWrite arr ix c return (ix + n, JamoEmpty) ------------------------------------------------------------------------------- -- Composition of characters other than Hangul ------------------------------------------------------------------------------- composeChar mode marr index starter reBuf jbuf ch = do index' <- writeJamoBuf marr index jbuf case D.isDecomposable mode ch of D.FalseA -> do (i, st, rbuf) <- reorder marr index' starter reBuf ch return (i, st, rbuf, JamoEmpty) D.TrueA -> do decomposeAll marr index' starter reBuf jbuf (D.decomposeChar mode ch) _ -> do (i, st, rbuf) <- reorder marr index' starter reBuf ch return (i, st, rbuf, JamoEmpty) where {-# INLINE decomposeAll #-} decomposeAll _ i st rbuf jb [] = return (i, st, rbuf, jb) decomposeAll arr i st rbuf jb (x : xs) = case D.isDecomposable mode x of D.TrueA -> do (i', st', rbuf', jb') <- decomposeAll arr i st rbuf jb (D.decomposeChar mode x) decomposeAll arr i' st' rbuf' jb' xs _ -> do -- XXX this recursive call here hurts performance -- We can make the hangul composition a separate function -- and call that or reorder here based on the type fo char (i', st', rbuf', jb') <- composeChar mode arr i st rbuf jb x decomposeAll arr i' st' rbuf' jb' xs -- Unicode 9.0.0: 3.11 -- D108 Reorderable pair: Two adjacent characters A and B in a coded -- character sequence are a Reorderable Pair if and only if -- ccc(A) > ccc(B) > 0. -- -- (array) (array index) (reorder buffer) (input char) {-# INLINE reorder #-} reorder _ i st Empty c = return (i, st, One c) -- Unicode 9.0.0: 3.11 -- D111: a starter can never become a non-starter after -- combining. If that happens we will potentially have to remember all -- previous starters so that the new non-starter can be combined with -- the previous starter. -- -- To compose, try to combine an unblocked char with the last starter -- and remove if combined. A char with combining class equal or lower -- than the previous char is blocked. It implies that only adjacent -- starters can be combined. -- -- input char is a starter -- does it combine with the previous starter? -- if no then flush and replace the last starter reorder arr i (Just st) (One c0) c | not (CC.isCombining c) = do case C.composePair st c0 of Just x -> case C.composePair x c of Just y -> return (i, Just y, Empty) Nothing -> do n <- unsafeWrite arr i x return (i + n, Just c, Empty) Nothing -> case CC.isCombining c0 of -- starter1 combining starter2 True -> do n1 <- unsafeWrite arr i st n2 <- unsafeWrite arr (i + n1) c0 return (i + n1 + n2, Just c, Empty) -- starter1 starter2 starter3 False -> do n1 <- unsafeWrite arr i st case C.composePair c0 c of Just y -> return (i + n1, Just y, Empty) Nothing -> do n2 <- unsafeWrite arr (i + n1) c0 return (i + n1 + n2, Just c, Empty) reorder arr i Nothing (One c0) c | not (CC.isCombining c) = case C.composePair c0 c of Just x -> return (i, Just x, Empty) Nothing -> do n <- unsafeWrite arr i c0 return (i + n, Just c, Empty) reorder arr i (Just st) (One c0) c | not (CC.isCombining c0) = do case C.composePair st c0 of Just x -> return (i, Just x, One c) Nothing -> do n <- unsafeWrite arr i st return (i + n, Just c0, One c) -- input char is combining and there is a starter char in the buffer -- flush the starter char and add the combining char to the buffer reorder _arr i Nothing (One c0) c | not (CC.isCombining c0) = do return (i, Just c0, One c) -- optimized ordering for common case of two combining chars -- XXX replace many with Two here reorder _ i st (One c0) c = return (i, st, Many orderedPair) where -- {-# INLINE orderedPair #-} orderedPair = case inOrder c0 c of True -> [c0, c] False -> [c, c0] inOrder c1 c2 = CC.getCombiningClass c1 <= CC.getCombiningClass c2 -- input char is a starter, flush the reorder buffer reorder arr i (Just st) rbuf c | not (CC.isCombining c) = do (j, st2) <- composeAndWrite arr i st rbuf c return (j, Just st2, Empty) reorder arr i Nothing rbuf c | not (CC.isCombining c) = do j <- writeReorderBuffer arr i rbuf return (j, Just c, Empty) -- unoptimized generic sort for more than two combining chars reorder _ i st (Many str) c = return (i, st, Many (sortCluster (str ++ [c]))) where {-# INLINE sortCluster #-} sortCluster = map fst . sortBy (comparing snd) . map (ap (,) CC.getCombiningClass) -- | /O(n)/ Convert a 'Stream Char' into a composed normalized 'Text'. unstreamC :: D.DecomposeMode -> Stream Char -> Text unstreamC mode (Stream next0 s0 len) = runText $ \done -> do -- Before encoding each char we perform a buffer realloc check assuming -- worst case encoding size of two 16-bit units for the char. Just add an -- extra space to the buffer so that we do not end up reallocating even when -- all the chars are encoded as single unit. let margin = 1 + maxDecomposeLen mlen = (upperBound 4 len + margin) arr0 <- A.new mlen let outer !arr !maxi = encode where -- keep the common case loop as small as possible encode !si !di st rbuf jbuf = -- simply check for the worst case if maxi < di + margin then realloc si di st rbuf jbuf else case next0 si of Done -> do -- Flush any leftover buffers, only one of rbuf/jbuf -- will have contents di' <- writeStarterRbuf arr di st rbuf di'' <- writeJamoBuf arr di' jbuf done arr di'' Skip si' -> encode si' di st rbuf jbuf Yield c si' -> do (di', st', rbuf', jbuf') <- composeChar mode arr di st rbuf jbuf c encode si' di' st' rbuf' jbuf' -- keep uncommon case separate from the common case code {-# NOINLINE realloc #-} realloc !si !di st rbuf jbuf = do let newlen = maxi * 2 arr' <- A.new newlen A.copyM arr' 0 arr 0 di outer arr' (newlen - 1) si di st rbuf jbuf outer arr0 (mlen - 1) s0 0 Nothing Empty JamoEmpty {-# INLINE [0] unstreamC #-}