{-# OPTIONS_GHC -funbox-strict-fields #-}
{-# LANGUAGE BangPatterns #-}
-- |
-- Module      : Data.Unicode.Internal.NormalizeStream
-- Copyright   : (c) 2016 Harendra Kumar
--
-- License     : BSD-style
-- Maintainer  : harendra.kumar@gmail.com
-- Stability   : experimental
--
-- Stream based normalization.
--
module Data.Unicode.Internal.NormalizeStream
    (
      D.DecomposeMode(..)
    , stream
    , unstream
    , unstreamC
    )
    where

import           Control.Monad                          (ap)
import           Data.Char                              (chr, ord)
import           Data.List                              (sortBy)
import           Data.Ord                               (comparing)
import qualified Data.Text.Array                        as A
import           Data.Text.Internal                     (Text (..))
import qualified Data.Text.Internal.Encoding.Utf16      as U16
import           Data.Text.Internal.Fusion.Size         (betweenSize,
                                                         upperBound)
import           Data.Text.Internal.Fusion.Types        (Step (..), Stream (..))
import           Data.Text.Internal.Private             (runText)
import           Data.Text.Internal.Unsafe.Char         (unsafeWrite)
import           Data.Text.Internal.Unsafe.Char         (unsafeChr)
import           Data.Text.Internal.Unsafe.Shift        (shiftR)
import           GHC.ST                                 (ST (..))

import qualified Data.Unicode.Properties.CombiningClass  as CC
import qualified Data.Unicode.Properties.Compositions    as C
import qualified Data.Unicode.Properties.Decompose       as D
import qualified Data.Unicode.Properties.DecomposeHangul as H

-------------------------------------------------------------------------------
-- Reorder buffer to hold characters till the next starter boundary
-------------------------------------------------------------------------------

data ReBuf = Empty | One {-# UNPACK #-} !Char | Many [Char]

writeStr :: A.MArray s -> Int -> [Char] -> ST s Int
writeStr marr di str = go di str
    where
        go i [] = return i
        go i (c : cs) = do
            n <- unsafeWrite marr i c
            go (i + n) cs

{-# INLINE writeReorderBuffer #-}
writeReorderBuffer :: A.MArray s -> Int -> ReBuf -> ST s Int
writeReorderBuffer _ di Empty = return di

writeReorderBuffer marr di (One c) = do
    n <- unsafeWrite marr di c
    return (di + n)

writeReorderBuffer marr di (Many str) = writeStr marr di str

-------------------------------------------------------------------------------
-- Decomposition of Hangul characters is done algorithmically
-------------------------------------------------------------------------------

-- {-# INLINE decomposeCharHangul #-}
decomposeCharHangul :: A.MArray s -> Int -> Char -> ST s (Int, ReBuf)
decomposeCharHangul marr j c = do
    case D.decomposeCharHangul c of
        Left  (l, v)    -> do
            n1 <- unsafeWrite marr j l
            n2 <- unsafeWrite marr (j + n1) v
            return ((j + n1 + n2), Empty)
        Right (l, v, t) -> do
            n1 <- unsafeWrite marr j l
            n2 <- unsafeWrite marr (j + n1) v
            n3 <- unsafeWrite marr (j + n1 + n2) t
            return (j + n1 + n2 + n3, Empty)

{-# INLINE decomposeChar #-}
decomposeChar
    :: D.DecomposeMode
    -> A.MArray s       -- destination array for decomposition
    -> Int              -- array index
    -> ReBuf            -- reorder buffer
    -> Char             -- char to be decomposed
    -> ST s (Int, ReBuf)
decomposeChar _ marr i reBuf c | D.isHangul c = do
    j <- writeReorderBuffer marr i reBuf
    decomposeCharHangul marr j c

-------------------------------------------------------------------------------
-- Decomposition of characters other than Hangul
-------------------------------------------------------------------------------

decomposeChar mode marr index reBuf ch = do
    -- TODO: return fully decomposed form
    case D.isDecomposable mode ch of
      D.FalseA -> reorder marr index reBuf ch
      D.TrueA  -> decomposeAll marr index reBuf (D.decomposeChar mode ch)
      _ -> reorder marr index reBuf ch

    where
        {-# INLINE decomposeAll #-}
        decomposeAll _ i rbuf [] = return (i, rbuf)
        decomposeAll arr i rbuf (x : xs)  =
            case D.isDecomposable mode x of
                D.TrueA  -> do
                    (i', rbuf') <- decomposeAll arr i rbuf
                                                (D.decomposeChar mode x)
                    decomposeAll arr i' rbuf' xs
                _ -> do
                    -- XXX calling reorder is wrong if decomposition results in
                    -- a further decomposable Hangul char. In that case we will
                    -- not go through the Hangul decompose for that char.
                    -- To be strictly correct we have to call decomposeChar
                    -- recursively here.
                    (i', rbuf') <- reorder arr i rbuf x
                    decomposeAll arr i' rbuf' xs

        -- Unicode 9.0.0: 3.11
        -- D108 Reorderable pair: Two adjacent characters A and B in a coded
        -- character sequence <A,B> are a Reorderable Pair if and only if
        -- ccc(A) > ccc(B) > 0.
        --
        -- (array) (array index) (reorder buffer) (input char)
        {-# INLINE reorder #-}
        reorder _ i Empty c = return (i, One c)

        -- input char is a starter, flush the reorder buffer
        reorder arr i (One c0) c | not (CC.isCombining c) = do
            n1 <- unsafeWrite arr i c0
            n2 <- unsafeWrite arr (i + n1) c
            return ((i + n1 + n2), Empty)

        -- input char is combining and there is a starter char in the buffer
        -- flush the starter char and add the combining char to the buffer
        reorder arr i (One c0) c | not (CC.isCombining c0) = do
            n <- unsafeWrite arr i c0
            return (i + n, One c)

        -- optimized ordering for common case of two combining chars
        -- XXX replace many with Two here
        reorder  _ i (One c0) c = return (i, Many orderedPair)
            where
                -- {-# INLINE orderedPair #-}
                orderedPair =
                    case inOrder c0 c of
                        True  -> [c0, c]
                        False -> [c, c0]

                inOrder c1 c2 =
                    CC.getCombiningClass c1 <= CC.getCombiningClass c2

        -- input char is a starter, flush the reorder buffer
        reorder arr i rbuf c | not (CC.isCombining c) = do
            j <- writeReorderBuffer arr i rbuf
            n <- unsafeWrite arr j c
            return (j + n, Empty)

        -- unoptimized generic sort for more than two combining chars
        reorder _ i (Many str) c = return (i, Many (sortCluster (str ++ [c])))
            where
                {-# INLINE sortCluster #-}
                sortCluster =   map fst
                              . sortBy (comparing snd)
                              . map (ap (,) CC.getCombiningClass)

-- | /O(n)/ Convert a 'Text' into a 'Stream Char'.
stream :: Text -> Stream Char
stream (Text arr off len) = Stream next off (betweenSize (len `shiftR` 1) len)
    where
      !end = off+len
      {-# INLINE next #-}
      next !i
          | i >= end                   = Done
          -- shift generates only two branches instead of three in case of
          -- range check, works quite a bit faster with llvm backend.
          | (n `shiftR` 10) == 0x36    = Yield (U16.chr2 n n2) (i + 2)
          | otherwise                  = Yield (unsafeChr n) (i + 1)
          where
            n  = A.unsafeIndex arr i
            n2 = A.unsafeIndex arr (i + 1)
{-# INLINE [0] stream #-}

-- | /O(n)/ Convert a 'Stream Char' into a decompose-normalized 'Text'.
unstream :: D.DecomposeMode -> Stream Char -> Text
unstream mode (Stream next0 s0 len) = runText $ \done -> do
  -- Before encoding each char we perform a buffer realloc check assuming
  -- worst case encoding size of two 16-bit units for the char. Just add an
  -- extra space to the buffer so that we do not end up reallocating even when
  -- all the chars are encoded as single unit.
  let margin = 1 + maxDecomposeLen
      mlen = (upperBound 4 len + margin)
  arr0 <- A.new mlen
  let outer !arr !maxi = encode
       where
        -- keep the common case loop as small as possible
        encode !si !di rbuf =
            -- simply check for the worst case
            if maxi < di + margin
            then realloc si di rbuf
            else
                case next0 si of
                    Done -> do
                        di' <- writeReorderBuffer arr di rbuf
                        done arr di'
                    Skip si'    -> encode si' di rbuf
                    Yield c si' -> do
                                (di', rbuf') <- decomposeChar mode arr di rbuf c
                                encode si' di' rbuf'
                                -- n <- unsafeWrite arr di c
                                -- encode si' (di + n) rbuf

        -- keep uncommon case separate from the common case code
        {-# NOINLINE realloc #-}
        realloc !si !di rbuf = do
            let newlen = maxi * 2
            arr' <- A.new newlen
            A.copyM arr' 0 arr 0 di
            outer arr' (newlen - 1) si di rbuf

  outer arr0 (mlen - 1) s0 0 Empty
{-# INLINE [0] unstream #-}

-- we can generate this from UCD
maxDecomposeLen :: Int
maxDecomposeLen = 32

-------------------------------------------------------------------------------
-- Composition
-------------------------------------------------------------------------------

composeAndWrite
    :: A.MArray s
    -> Int
    -> Char
    -> ReBuf
    -> Char
    -> ST s (Int, Char) -- return new index, new starter
composeAndWrite arr di st1 Empty st2 = do
    n <- unsafeWrite arr di st1
    return (di + n, st2)

composeAndWrite arr di st1 (One c) st2 =
    composeAndWrite' arr di st1 [c] st2

composeAndWrite arr di st1 (Many str) st2 =
    composeAndWrite' arr di st1 str st2

composeAndWrite'
    :: A.MArray s
    -> Int
    -> Char
    -> [Char]
    -> Char
    -> ST s (Int, Char)
composeAndWrite' arr di st1 str st2 = go di st1 [] 0 str
    where
        -- arguments: index, starter, uncombined chars,
        -- cc of prev uncombined char, unprocessed str
        go i st [] _ [] =
                case C.composePair st st2 of
                    Just x  -> return (i, x)
                    Nothing -> do
                        n <- unsafeWrite arr i st
                        return (i + n, st2)

        go i st uncs _ [] = do
            j <- writeStr arr i (st : uncs)
            return (j, st2)

        go i st [] _ (c : cs) = do
            case C.composePair st c of
                Just x  -> go i x [] 0 cs
                Nothing -> do
                    go i st [c] (CC.getCombiningClass c) cs

        go i st uncs cc (c : cs) = do
            let ccc = CC.getCombiningClass c
            if ccc > cc then
                case C.composePair st c of
                    Just x  -> go i x uncs cc cs
                    Nothing -> do
                        go i st (uncs ++ [c]) ccc cs
            else go i st (uncs ++ [c]) ccc cs

writeStarterRbuf :: A.MArray s
                 -> Int
                 -> Maybe Char
                 -> ReBuf
                 -> ST s Int
writeStarterRbuf marr di st rbuf =
    case st of
        Nothing -> writeReorderBuffer marr di rbuf
        Just starter ->
            -- XXX null char hack
            composeAndWrite marr di starter rbuf '\0' >>= (return . fst)

-------------------------------------------------------------------------------
-- Composition of Hangul Jamo characters, done algorithmically
-------------------------------------------------------------------------------

-- Hold an L to wait for V, hold an LV to wait for T.
data JamoBuf
    = JamoEmpty
    | JamoLIndex {-# UNPACK #-} !Int
    | JamoLV     {-# UNPACK #-} !Char

{-# INLINE writeJamoBuf #-}
writeJamoBuf :: A.MArray s -> Int -> JamoBuf -> ST s Int
writeJamoBuf _ di JamoEmpty = return di

writeJamoBuf marr di (JamoLIndex i) = do
    n <- unsafeWrite marr di (chr (D.jamoLFirst + i))
    return (di + n)

writeJamoBuf marr di (JamoLV c) = do
    n <- unsafeWrite marr di c
    return (di + n)

-- TODO Unify compose and decompose if possible with good perf
-- TODO try unifying st, rbuf
-- TODO try using Either for (st, rbuf)/jbuf
-- or we can use different functions for hangul and non-hangul composition with
-- diff signatures. In an outer function we check if the char is hangul and
-- flush and switch the buffer before calling the appropriate function.

-- If we are composing we do not need to first decompose Hangul. We can just
-- compose assuming there could be some partially composed syllables e.g. LV
-- syllable followed by a jamo T. We need to compose this case as well.
--
-- XXX The unicode normalization test suite does not seem to have tests for a
-- LV composed hangul syllable followed by a jamo T.

{-# INLINE composeChar #-}

composeChar
    :: D.DecomposeMode
    -> A.MArray s       -- destination array for decomposition
    -> Int              -- array index
    -> Maybe Char       -- last starter
    -> ReBuf            -- reorder buffer
    -> JamoBuf          -- jamo buffer
    -> Char             -- char to be decomposed
    -> ST s (Int, Maybe Char, ReBuf, JamoBuf)
    -- ^ index, starter, reorder buf, jamobuf
composeChar _ marr index st rbuf jbuf ch | H.isHangul ch || H.isJamo ch = do
    j <- writeStarterRbuf marr index st rbuf
    (k, jbuf') <- if H.isJamo ch then
        composeCharJamo marr j jbuf ch
    else
        composeCharHangul marr j jbuf ch
    return (k, Nothing, Empty, jbuf')
    where
        composeCharJamo arr i JamoEmpty c =
            case H.jamoLIndex c of
                Just li -> return (i, JamoLIndex li)
                Nothing -> do
                    n <- unsafeWrite arr i c
                    return (i + n, JamoEmpty)

        composeCharJamo arr i jb@(JamoLIndex li) c =
            case H.jamoVIndex c of
                Just vi -> do
                    let lvi = li * H.jamoNCount + vi * H.jamoTCount
                    return (i, JamoLV (chr (H.hangulFirst + lvi)))
                Nothing -> do
                    ix <- writeJamoBuf arr i jb
                    composeCharJamo arr ix JamoEmpty c

        composeCharJamo arr i jb@(JamoLV lv) c =
            case H.jamoTIndex c of
                Just ti -> do
                    n <- unsafeWrite arr i (chr ((ord lv) + ti))
                    return (i + n, JamoEmpty)
                Nothing -> do
                    ix <- writeJamoBuf arr i jb
                    composeCharJamo arr ix JamoEmpty c

        composeCharHangul arr i jb c = do
            ix <- writeJamoBuf arr i jb
            case H.isHangulLV c of
                True -> return (ix, JamoLV c)
                False -> do
                    n <- unsafeWrite arr ix c
                    return (ix + n, JamoEmpty)

-------------------------------------------------------------------------------
-- Composition of characters other than Hangul
-------------------------------------------------------------------------------

composeChar mode marr index starter reBuf jbuf ch = do
    index' <- writeJamoBuf marr index jbuf
    case D.isDecomposable mode ch of
        D.FalseA -> do
            (i, st, rbuf) <- reorder marr index' starter reBuf ch
            return (i, st, rbuf, JamoEmpty)
        D.TrueA  -> do
            decomposeAll marr index' starter reBuf jbuf (D.decomposeChar mode ch)
        _ -> do
            (i, st, rbuf) <- reorder marr index' starter reBuf ch
            return (i, st, rbuf, JamoEmpty)
    where
        {-# INLINE decomposeAll #-}
        decomposeAll _ i st rbuf jb [] = return (i, st, rbuf, jb)
        decomposeAll arr i st rbuf jb (x : xs)  =
            case D.isDecomposable mode x of
                D.TrueA  -> do
                    (i', st', rbuf', jb') <- decomposeAll arr i st rbuf jb
                                                (D.decomposeChar mode x)
                    decomposeAll arr i' st' rbuf' jb' xs
                _ -> do
                    -- XXX this recursive call here hurts performance
                    -- We can make the hangul composition a separate function
                    -- and call that or reorder here based on the type fo char
                    (i', st', rbuf', jb') <- composeChar mode arr i st rbuf jb x
                    decomposeAll arr i' st' rbuf' jb' xs

        -- Unicode 9.0.0: 3.11
        -- D108 Reorderable pair: Two adjacent characters A and B in a coded
        -- character sequence <A,B> are a Reorderable Pair if and only if
        -- ccc(A) > ccc(B) > 0.
        --
        -- (array) (array index) (reorder buffer) (input char)
        {-# INLINE reorder #-}
        reorder _ i st Empty c = return (i, st, One c)

        -- Unicode 9.0.0: 3.11
        -- D111: a starter can never become a non-starter after
        -- combining. If that happens we will potentially have to remember all
        -- previous starters so that the new non-starter can be combined with
        -- the previous starter.
        --
        -- To compose, try to combine an unblocked char with the last starter
        -- and remove if combined. A char with combining class equal or lower
        -- than the previous char is blocked. It implies that only adjacent
        -- starters can be combined.
        --
        -- input char is a starter
        -- does it combine with the previous starter?
        -- if no then flush and replace the last starter
        reorder arr i (Just st) (One c0) c | not (CC.isCombining c) = do
            case C.composePair st c0 of
                Just x  -> case C.composePair x c of
                    Just y -> return (i, Just y, Empty)
                    Nothing -> do
                        n <- unsafeWrite arr i x
                        return (i + n, Just c, Empty)
                Nothing -> case CC.isCombining c0 of
                    -- starter1 combining starter2
                    True -> do
                        n1 <- unsafeWrite arr i st
                        n2 <- unsafeWrite arr (i + n1) c0
                        return (i + n1 + n2, Just c, Empty)
                    -- starter1 starter2 starter3
                    False -> do
                        n1 <- unsafeWrite arr i st
                        case C.composePair c0 c of
                            Just y -> return (i + n1, Just y, Empty)
                            Nothing -> do
                                n2 <- unsafeWrite arr (i + n1) c0
                                return (i + n1 + n2, Just c, Empty)

        reorder arr i Nothing (One c0) c | not (CC.isCombining c) =
            case C.composePair c0 c of
                Just x  -> return (i, Just x, Empty)
                Nothing -> do
                    n <- unsafeWrite arr i c0
                    return (i + n, Just c, Empty)

        reorder arr i (Just st) (One c0) c | not (CC.isCombining c0) = do
            case C.composePair st c0 of
                Just x  -> return (i, Just x, One c)
                Nothing -> do
                    n <- unsafeWrite arr i st
                    return (i + n, Just c0, One c)

        -- input char is combining and there is a starter char in the buffer
        -- flush the starter char and add the combining char to the buffer
        reorder _arr i Nothing (One c0) c | not (CC.isCombining c0) = do
            return (i, Just c0, One c)

        -- optimized ordering for common case of two combining chars
        -- XXX replace many with Two here
        reorder  _ i st (One c0) c = return (i, st, Many orderedPair)
            where
                -- {-# INLINE orderedPair #-}
                orderedPair =
                    case inOrder c0 c of
                        True  -> [c0, c]
                        False -> [c, c0]

                inOrder c1 c2 =
                    CC.getCombiningClass c1 <= CC.getCombiningClass c2

        -- input char is a starter, flush the reorder buffer
        reorder arr i (Just st) rbuf c | not (CC.isCombining c) = do
            (j, st2) <- composeAndWrite arr i st rbuf c
            return (j, Just st2, Empty)

        reorder arr i Nothing rbuf c | not (CC.isCombining c) = do
            j <- writeReorderBuffer arr i rbuf
            return (j, Just c, Empty)

        -- unoptimized generic sort for more than two combining chars
        reorder _ i st (Many str) c =
            return (i, st, Many (sortCluster (str ++ [c])))
            where
                {-# INLINE sortCluster #-}
                sortCluster =   map fst
                              . sortBy (comparing snd)
                              . map (ap (,) CC.getCombiningClass)

-- | /O(n)/ Convert a 'Stream Char' into a composed normalized 'Text'.
unstreamC :: D.DecomposeMode -> Stream Char -> Text
unstreamC mode (Stream next0 s0 len) = runText $ \done -> do
  -- Before encoding each char we perform a buffer realloc check assuming
  -- worst case encoding size of two 16-bit units for the char. Just add an
  -- extra space to the buffer so that we do not end up reallocating even when
  -- all the chars are encoded as single unit.
  let margin = 1 + maxDecomposeLen
      mlen = (upperBound 4 len + margin)
  arr0 <- A.new mlen
  let outer !arr !maxi = encode
       where
        -- keep the common case loop as small as possible
        encode !si !di st rbuf jbuf =
            -- simply check for the worst case
            if maxi < di + margin
               then realloc si di st rbuf jbuf
            else
                case next0 si of
                    Done -> do
                        -- Flush any leftover buffers, only one of rbuf/jbuf
                        -- will have contents
                        di'  <- writeStarterRbuf arr di st rbuf
                        di'' <- writeJamoBuf arr di' jbuf
                        done arr di''
                    Skip si'    -> encode si' di st rbuf jbuf
                    Yield c si' -> do
                        (di', st', rbuf', jbuf') <- composeChar mode arr di st rbuf jbuf c
                        encode si' di' st' rbuf' jbuf'

        -- keep uncommon case separate from the common case code
        {-# NOINLINE realloc #-}
        realloc !si !di st rbuf jbuf = do
            let newlen = maxi * 2
            arr' <- A.new newlen
            A.copyM arr' 0 arr 0 di
            outer arr' (newlen - 1) si di st rbuf jbuf

  outer arr0 (mlen - 1) s0 0 Nothing Empty JamoEmpty
{-# INLINE [0] unstreamC #-}