{-# OPTIONS_GHC -funbox-strict-fields #-} {-# LANGUAGE BangPatterns #-} {-# LANGUAGE LambdaCase #-} {-# LANGUAGE TupleSections #-} -- | -- Module : Data.Unicode.Internal.NormalizeStream -- Copyright : (c) 2016 Harendra Kumar -- (c) 2020 Andrew Lelechenko -- -- License : BSD-3-Clause -- Maintainer : harendra.kumar@gmail.com -- Stability : experimental -- -- Stream based normalization. -- module Data.Unicode.Internal.NormalizeStream ( D.DecomposeMode(..) , stream , unstream , unstreamC ) where import Data.Char (chr, ord) import qualified Data.Text.Array as A import Data.Text.Internal (Text (..)) import qualified Data.Text.Internal.Encoding.Utf16 as U16 import Data.Text.Internal.Fusion.Size (betweenSize, upperBound) import Data.Text.Internal.Fusion.Types (Step (..), Stream (..)) import Data.Text.Internal.Private (runText) import Data.Text.Internal.Unsafe.Char (unsafeWrite) import Data.Text.Internal.Unsafe.Char (unsafeChr) import Data.Text.Internal.Unsafe.Shift (shiftR) import GHC.ST (ST (..)) import GHC.Types (SPEC(..)) import qualified Data.Unicode.Properties.CombiningClass as CC import qualified Data.Unicode.Properties.Compositions as C import qualified Data.Unicode.Properties.Decompose as D import qualified Data.Unicode.Properties.DecomposeHangul as H ------------------------------------------------------------------------------- -- Reorder buffer to hold characters till the next starter boundary ------------------------------------------------------------------------------- -- | A list of combining characters, ordered by 'CC.getCombiningClass'. -- Couple of top levels are unrolled and unpacked for efficiency. data ReBuf = Empty | One !Char | Many !Char !Char ![Char] {-# INLINE insertIntoReBuf #-} insertIntoReBuf :: Char -> ReBuf -> ReBuf insertIntoReBuf c Empty = One c insertIntoReBuf c (One c0) | CC.getCombiningClass c < CC.getCombiningClass c0 = Many c c0 [] | otherwise = Many c0 c [] insertIntoReBuf c (Many c0 c1 cs) | cc < CC.getCombiningClass c0 = Many c c0 (c1 : cs) | cc < CC.getCombiningClass c1 = Many c0 c (c1 : cs) | otherwise = Many c0 c1 (cs' ++ (c : cs'')) where cc = CC.getCombiningClass c (cs', cs'') = span ((<= cc) . CC.getCombiningClass) cs writeStr :: A.MArray s -> Int -> [Char] -> ST s Int writeStr marr di str = go di str where go i [] = return i go i (c : cs) = do n <- unsafeWrite marr i c go (i + n) cs {-# INLINE writeReorderBuffer #-} writeReorderBuffer :: A.MArray s -> Int -> ReBuf -> ST s Int writeReorderBuffer _ di Empty = return di writeReorderBuffer marr di (One c) = do n <- unsafeWrite marr di c return (di + n) writeReorderBuffer marr di (Many c1 c2 str) = do n1 <- unsafeWrite marr di c1 n2 <- unsafeWrite marr (di + n1) c2 writeStr marr (di + n1 + n2) str ------------------------------------------------------------------------------- -- Decomposition of Hangul characters is done algorithmically ------------------------------------------------------------------------------- -- {-# INLINE decomposeCharHangul #-} decomposeCharHangul :: A.MArray s -> Int -> Char -> ST s Int decomposeCharHangul marr j c = if t == chr H.jamoTFirst then do n1 <- unsafeWrite marr j l n2 <- unsafeWrite marr (j + n1) v return (j + n1 + n2) else do n1 <- unsafeWrite marr j l n2 <- unsafeWrite marr (j + n1) v n3 <- unsafeWrite marr (j + n1 + n2) t return (j + n1 + n2 + n3) where (l, v, t) = D.decomposeCharHangul c {-# INLINE decomposeChar #-} decomposeChar :: D.DecomposeMode -> A.MArray s -- destination array for decomposition -> Int -- array index -> ReBuf -- reorder buffer -> Char -- char to be decomposed -> ST s (Int, ReBuf) decomposeChar mode marr index reBuf ch | D.isHangul ch = do j <- writeReorderBuffer marr index reBuf (, Empty) <$> decomposeCharHangul marr j ch | D.isDecomposable mode ch = decomposeAll marr index reBuf (D.decomposeChar mode ch) | otherwise = reorder marr index reBuf ch where {-# INLINE decomposeAll #-} decomposeAll _ i rbuf [] = return (i, rbuf) decomposeAll arr i rbuf (x : xs) | D.isDecomposable mode x = do (i', rbuf') <- decomposeAll arr i rbuf (D.decomposeChar mode x) decomposeAll arr i' rbuf' xs | otherwise = do (i', rbuf') <- reorder arr i rbuf x decomposeAll arr i' rbuf' xs {-# INLINE reorder #-} reorder arr i rbuf c | CC.isCombining c = return (i, insertIntoReBuf c rbuf) | otherwise = do j <- writeReorderBuffer arr i rbuf n <- unsafeWrite arr j c return (j + n, Empty) -- | /O(n)/ Convert a 'Text' into a 'Stream Char'. stream :: Text -> Stream Char stream (Text arr off len) = Stream next off (betweenSize (len `shiftR` 1) len) where !end = off+len {-# INLINE next #-} next !i | i >= end = Done -- shift generates only two branches instead of three in case of -- range check, works quite a bit faster with llvm backend. | (n `shiftR` 10) == 0x36 = Yield (U16.chr2 n n2) (i + 2) | otherwise = Yield (unsafeChr n) (i + 1) where n = A.unsafeIndex arr i n2 = A.unsafeIndex arr (i + 1) {-# INLINE [0] stream #-} -- | /O(n)/ Convert a 'Stream Char' into a decompose-normalized 'Text'. unstream :: D.DecomposeMode -> Stream Char -> Text unstream mode (Stream next0 s0 len) = runText $ \done -> do -- Before encoding each char we perform a buffer realloc check assuming -- worst case encoding size of two 16-bit units for the char. Just add an -- extra space to the buffer so that we do not end up reallocating even when -- all the chars are encoded as single unit. let margin = 1 + maxDecomposeLen mlen = (upperBound 4 len + margin) arr0 <- A.new mlen let outer !arr !maxi = encode where -- keep the common case loop as small as possible encode !si !di rbuf = -- simply check for the worst case if maxi < di + margin then realloc si di rbuf else case next0 si of Done -> do di' <- writeReorderBuffer arr di rbuf done arr di' Skip si' -> encode si' di rbuf Yield c si' -> do (di', rbuf') <- decomposeChar mode arr di rbuf c encode si' di' rbuf' -- n <- unsafeWrite arr di c -- encode si' (di + n) rbuf -- keep uncommon case separate from the common case code {-# NOINLINE realloc #-} realloc !si !di rbuf = do let newlen = maxi * 2 arr' <- A.new newlen A.copyM arr' 0 arr 0 di outer arr' (newlen - 1) si di rbuf outer arr0 (mlen - 1) s0 0 Empty {-# INLINE [0] unstream #-} -- we can generate this from UCD maxDecomposeLen :: Int maxDecomposeLen = 32 ------------------------------------------------------------------------------- -- Composition ------------------------------------------------------------------------------- -- If we are composing we do not need to first decompose Hangul. We can just -- compose assuming there could be some partially composed syllables e.g. LV -- syllable followed by a jamo T. We need to compose this case as well. -- Hold an L to wait for V, hold an LV to wait for T. data JamoBuf = Jamo !Char -- Jamo L, V or T | Hangul !Char -- Hangul Syllable LV or LVT | HangulLV !Char data RegBuf = RegOne !Char | RegMany !Char !Char ![Char] data ComposeState = ComposeNone | ComposeReg !RegBuf | ComposeJamo !JamoBuf ------------------------------------------------------------------------------- -- Composition of Jamo into Hangul syllables, done algorithmically ------------------------------------------------------------------------------- {-# INLINE writeJamoBuf #-} writeJamoBuf :: A.MArray s -> Int -> JamoBuf -> ST s Int writeJamoBuf arr i jbuf = do n <- unsafeWrite arr i (getCh jbuf) return (i + n) where getCh (Jamo ch) = ch getCh (Hangul ch) = ch getCh (HangulLV ch) = ch {-# INLINE initHangul #-} initHangul :: Char -> Int -> ST s (Int, ComposeState) initHangul c i = return (i, ComposeJamo (Hangul c)) {-# INLINE initJamo #-} initJamo :: Char -> Int -> ST s (Int, ComposeState) initJamo c i = return (i, ComposeJamo (Jamo c)) {-# INLINE insertJamo #-} insertJamo :: A.MArray s -> Int -> JamoBuf -> Char -> ST s (Int, ComposeState) insertJamo arr i jbuf ch | ich <= H.jamoLLast = do j <- writeJamoBuf arr i jbuf return (j, ComposeJamo (Jamo ch)) | ich < H.jamoVFirst = flushAndWrite arr i jbuf ch | ich <= H.jamoVLast = do case jbuf of Jamo c -> case H.jamoLIndex c of Just li -> let vi = ich - H.jamoVFirst lvi = li * H.jamoNCount + vi * H.jamoTCount lv = chr (H.hangulFirst + lvi) in return (i, ComposeJamo (HangulLV lv)) Nothing -> writeTwo arr i c ch Hangul c -> writeTwo arr i c ch HangulLV c -> writeTwo arr i c ch | ich <= H.jamoTFirst = do flushAndWrite arr i jbuf ch | otherwise = do let ti = ich - H.jamoTFirst case jbuf of Jamo c -> writeTwo arr i c ch Hangul c | H.isHangulLV c -> do writeLVT arr i c ti | otherwise -> writeTwo arr i c ch HangulLV c -> writeLVT arr i c ti where ich = ord ch {-# INLINE flushAndWrite #-} flushAndWrite marr ix jb c = do j <- writeJamoBuf marr ix jb n <- unsafeWrite marr j c return (j + n, ComposeNone) {-# INLINE writeLVT #-} writeLVT marr ix lv ti = do n <- unsafeWrite marr ix (chr ((ord lv) + ti)) return (ix + n, ComposeNone) {-# INLINE writeTwo #-} writeTwo marr ix c1 c2 = do n <- unsafeWrite marr ix c1 m <- unsafeWrite marr (ix + n) c2 return ((ix + n + m), ComposeNone) {-# INLINE insertHangul #-} insertHangul :: A.MArray s -> Int -> JamoBuf -> Char -> ST s (Int, ComposeState) insertHangul arr i jbuf ch = do j <- writeJamoBuf arr i jbuf return (j, ComposeJamo (Hangul ch)) {-# INLINE insertIntoRegBuf #-} insertIntoRegBuf :: Char -> RegBuf -> RegBuf insertIntoRegBuf c (RegOne c0) | CC.getCombiningClass c < CC.getCombiningClass c0 = RegMany c c0 [] | otherwise = RegMany c0 c [] insertIntoRegBuf c (RegMany c0 c1 cs) | cc < CC.getCombiningClass c0 = RegMany c c0 (c1 : cs) | cc < CC.getCombiningClass c1 = RegMany c0 c (c1 : cs) | otherwise = RegMany c0 c1 (cs' ++ (c : cs'')) where cc = CC.getCombiningClass c (cs', cs'') = span ((<= cc) . CC.getCombiningClass) cs {-# INLINE writeRegBuf #-} writeRegBuf :: A.MArray s -> Int -> RegBuf -> ST s Int writeRegBuf arr i = \case RegOne c -> do n <- unsafeWrite arr i c return (i + n) RegMany st c [] -> case C.composePair st c of Just x -> do n <- unsafeWrite arr i x return (i + n) Nothing -> do n <- unsafeWrite arr i st m <- unsafeWrite arr (i + n) c return (i + n + m) RegMany st0 c0 cs0 -> go [] st0 (c0 : cs0) where -- arguments: uncombined chars, starter, unprocessed str go uncs st [] = writeStr arr i (st : uncs) go uncs st (c : cs) = case C.composePair st c of Nothing -> go (uncs ++ (c : same)) st bigger Just x -> go uncs x cs where cc = CC.getCombiningClass c (same, bigger) = span ((== cc) . CC.getCombiningClass) cs {-# INLINE flushComposeState #-} flushComposeState :: A.MArray s -> Int -> ComposeState -> ST s Int flushComposeState arr i = \case ComposeNone -> pure i ComposeReg rbuf -> writeRegBuf arr i rbuf ComposeJamo jbuf -> writeJamoBuf arr i jbuf {-# INLINE composeChar #-} composeChar :: D.DecomposeMode -> A.MArray s -- destination array for composition -> Char -- input char -> Int -- array index -> ComposeState -> ST s (Int, ComposeState) composeChar mode marr = go0 where go0 ch !i !st = case st of ComposeReg rbuf | ich < H.jamoLFirst -> composeReg rbuf ch i st | ich <= H.jamoLast -> do j <- writeRegBuf marr i rbuf initJamo ch j | ich < H.hangulFirst -> composeReg rbuf ch i st | ich <= H.hangulLast -> do j <- writeRegBuf marr i rbuf initHangul ch j | otherwise -> composeReg rbuf ch i st ComposeJamo jbuf | ich < H.jamoLFirst -> do jamoToReg marr i jbuf ch | ich <= H.jamoLast -> do insertJamo marr i jbuf ch | ich < H.hangulFirst -> jamoToReg marr i jbuf ch | ich <= H.hangulLast -> do insertHangul marr i jbuf ch | otherwise -> jamoToReg marr i jbuf ch ComposeNone | ich < H.jamoLFirst -> initReg ch i | ich <= H.jamoLast -> initJamo ch i | ich < H.hangulFirst -> initReg ch i | ich <= H.hangulLast -> initHangul ch i | otherwise -> initReg ch i where ich = ord ch {-# INLINE jamoToReg #-} jamoToReg arr i jbuf ch = do j <- writeJamoBuf arr i jbuf initReg ch j {-# INLINE initReg #-} initReg !ch !i | D.isDecomposable mode ch = go (D.decomposeChar mode ch) i ComposeNone | otherwise = pure (i, ComposeReg (RegOne ch)) {-# INLINE composeReg #-} composeReg rbuf !ch !i !st | D.isDecomposable mode ch = go (D.decomposeChar mode ch) i st | CC.isCombining ch = do pure (i, ComposeReg (insertIntoRegBuf ch rbuf)) -- The first char in RegBuf may or may not be a starter. In -- case it is not we rely on composeStarterPair failing. | RegOne s <- rbuf , C.isSecondStarter ch , Just x <- C.composeStarterPair s ch = pure (i, (ComposeReg (RegOne x))) | otherwise = do j <- writeRegBuf marr i rbuf pure (j, ComposeReg (RegOne ch)) go [] !i !st = pure (i, st) go (ch : rest) i st = case st of ComposeReg rbuf | H.isHangul ch -> do j <- writeRegBuf marr i rbuf (k, s) <- initHangul ch j go rest k s | H.isJamo ch -> do j <- writeRegBuf marr i rbuf (k, s) <- initJamo ch j go rest k s | D.isDecomposable mode ch -> go (D.decomposeChar mode ch ++ rest) i st | CC.isCombining ch -> do go rest i (ComposeReg (insertIntoRegBuf ch rbuf)) | RegOne s <- rbuf , C.isSecondStarter ch , Just x <- C.composeStarterPair s ch -> go rest i (ComposeReg (RegOne x)) | otherwise -> do j <- writeRegBuf marr i rbuf go rest j (ComposeReg (RegOne ch)) ComposeJamo jbuf | H.isJamo ch -> do (j, s) <- insertJamo marr i jbuf ch go rest j s | H.isHangul ch -> do (j, s) <- insertHangul marr i jbuf ch go rest j s | otherwise -> do j <- writeJamoBuf marr i jbuf case () of _ | D.isDecomposable mode ch -> go (D.decomposeChar mode ch ++ rest) j ComposeNone | otherwise -> go rest j (ComposeReg (RegOne ch)) ComposeNone | H.isHangul ch -> do (j, s) <- initHangul ch i go rest j s | H.isJamo ch -> do (j, s) <- initJamo ch i go rest j s | D.isDecomposable mode ch -> go (D.decomposeChar mode ch ++ rest) i st | otherwise -> go rest i (ComposeReg (RegOne ch)) -- | /O(n)/ Convert a 'Stream Char' into a composed normalized 'Text'. unstreamC :: D.DecomposeMode -> Stream Char -> Text unstreamC mode (Stream next0 s0 len) = runText $ \done -> do -- Before encoding each char we perform a buffer realloc check assuming -- worst case encoding size of two 16-bit units for the char. Just add an -- extra space to the buffer so that we do not end up reallocating even when -- all the chars are encoded as single unit. let margin = 1 + maxDecomposeLen mlen = (upperBound 4 len + margin) arr0 <- A.new mlen let outer !arr !maxi = encode SPEC where -- keep the common case loop as small as possible encode !_ !si !di st = -- simply check for the worst case if maxi < di + margin then realloc si di st else case next0 si of Done -> do di' <- flushComposeState arr di st done arr di' Skip si' -> encode SPEC si' di st Yield c si' -> do (di', st') <- composeChar mode arr c di st encode SPEC si' di' st' -- keep uncommon case separate from the common case code {-# NOINLINE realloc #-} realloc !si !di st = do let newlen = maxi * 2 arr' <- A.new newlen A.copyM arr' 0 arr 0 di outer arr' (newlen - 1) si di st outer arr0 (mlen - 1) s0 0 ComposeNone {-# INLINE [0] unstreamC #-}