{-# OPTIONS_HADDOCK show-extensions #-}
{-# OPTIONS_GHC -funbox-strict-fields -fobject-code #-}
{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE MultiParamTypeClasses #-}
{-# LANGUAGE UnboxedTuples #-}
{-# LANGUAGE MagicHash #-}
{-# LANGUAGE FlexibleInstances #-}

-- |
-- Module      :  Data.Phonetic.Languages.Syllables
-- Copyright   :  (c) OleksandrZhabenko 2021
-- License     :  MIT
-- Stability   :  Experimental
-- Maintainer  :  olexandr543@yahoo.com
--
-- This module works with syllable segmentation. The generalized version for the module
-- 'Languages.Phonetic.Ukrainian.Syllable.Arr' from @ukrainian-phonetics-basic-array@ package.
-- 

module Data.Phonetic.Languages.Syllables (
  -- * Data types and type synonyms
  PRS(..)
  , PhoneticType(..)
  , CharPhoneticClassification
  , StringRepresentation
  , SegmentationInfo1(..)
  , SegmentationPredFunction(..)
  , SegmentationPredFData(..)
  , SegmentationFDP
  , Eval2Bool(..)
  , DListFunctionResult
  , SegmentationLineFunction(..)
  , SegmentationRules1(..)
  , SegmentRulesG
  , DListRepresentation(..)
  -- * Basic functions
  , str2PRSs
  , sndGroups
  , groupSnds
  , divCnsnts
  , reSyllableCntnts
  , divVwls
  , createSyllablesPL
  -- * Auxiliary functions
  , gBF4
  , findC
  , isVowel1
  , isSonorous1
  , isVoicedC1
  , isVoicelessC1
  , isNotVowel2
  , notEqC
  , fromPhoneticType
) where

import Prelude hiding (mappend)
import Data.Monoid
import qualified Data.List as L (groupBy,find,intercalate)
import Data.Phonetic.Languages.Base
import CaseBi.Arr
import GHC.Arr
import GHC.Exts
import Data.List.InnToOut.Basic (mapI)
import Data.Maybe (mapMaybe,fromJust)
import GHC.Int
import Text.Read (readMaybe)
import Data.Char (isLetter)

-- Inspired by: https://github.com/OleksandrZhabenko/mm1/releases/tag/0.2.0.0

-- CAUTION: Please, do not mix with the show7s functions, they are not interoperable.

data PRS = SylS {
  charS :: !Char, -- ^ Phonetic languages phenomenon representation. Usually, a phoneme, but it can be otherwise something different.
  phoneType :: !PhoneticType -- ^ Some encoded type.
} deriving ( Eq, Read )

instance Ord PRS where
  compare (SylS x1 y1) (SylS x2 y2) =
    case compare x1 x2 of
      EQ -> compare y1 y2
      ~z -> z

instance Show PRS where
  show (SylS c (P x)) = "SylS \'" `mappend` (c:'\'':' ':show x)

data PhoneticType = P !Int8 deriving (Eq, Ord, Read)

instance Show PhoneticType where
  show (P x) = "P " `mappend` show x

fromPhoneticType :: PhoneticType -> Int
fromPhoneticType (P x) = fromEnum x

{-| The 'Array' 'Int' must be sorted in the ascending order to be used in the module correctly.
-}
type CharPhoneticClassification = Array Int PRS

{-| The 'String' of converted phonetic language representation 'Char' data is converted to this type to apply syllable
segmentation or other transformations.
-}
type StringRepresentation = [PRS]

-- | Is somewhat rewritten from the 'CaseBi.Arr.gBF3' function (not exported) from the @mmsyn2-array@ package.
gBF4
  :: (Ix i) => (# Int#, PRS #)
  -> (# Int#, PRS #)
  -> Char
  -> Array i PRS
  -> Maybe PRS
gBF4 (# !i#, k #) (# !j#, m #) c arr
 | isTrue# ((j# -# i#) ># 1# ) = 
    case compare c (charS p) of
     GT -> gBF4 (# n#, p #) (# j#, m #) c arr
     LT  -> gBF4 (# i#, k #) (# n#, p #) c arr
     _ -> Just p
 | c == charS m = Just m
 | c == charS k = Just k
 | otherwise = Nothing
     where !n# = (i# +# j#) `quotInt#` 2#
           !p = unsafeAt arr (I# n#)
{-# INLINABLE gBF4 #-}

findC
  :: Char
  -> Array Int PRS
  -> Maybe PRS
findC c arr = gBF4 (# i#, k #) (# j#, m #) c arr 
     where !(I# i#,I# j#) = bounds arr
           !k = unsafeAt arr (I# i#)
           !m = unsafeAt arr (I# i#)

str2PRSs :: CharPhoneticClassification -> String -> StringRepresentation
str2PRSs arr = map (\c -> fromJust . findC c $ arr)
  
-- | Function-predicate 'isVowel1' checks whether its argument is a vowel representation in the 'PRS' format.
isVowel1 :: PRS -> Bool
isVowel1 = (== P 0) . phoneType
{-# INLINE isVowel1 #-}

-- | Function-predicate 'isSonorous1' checks whether its argument is a sonorous consonant representation in the 'PRS' format.
isSonorous1 :: PRS -> Bool
isSonorous1 =  (`elem` [P 1,P 2]) . phoneType
{-# INLINE isSonorous1 #-}

-- | Function-predicate 'isVoicedC1' checks whether its argument is a voiced consonant representation in the 'PRS' format.
isVoicedC1 ::  PRS -> Bool
isVoicedC1 = (`elem` [P 3,P 4]) . phoneType
{-# INLINE isVoicedC1 #-}

-- | Function-predicate 'isVoiceless1' checks whether its argument is a voiceless consonant representation in the 'PRS' format.
isVoicelessC1 ::  PRS -> Bool
isVoicelessC1 =  (`elem` [P 5,P 6]) . phoneType
{-# INLINE isVoicelessC1 #-}

-- | Binary function-predicate 'isNotVowel2' checks whether its arguments are both consonant representations in the 'PRS' format.
isNotVowel2 :: PRS -> PRS -> Bool
isNotVowel2 x y
  | phoneType x == P 0 || phoneType y == P 0 = False
  | otherwise = True
{-# INLINE isNotVowel2 #-}

-- | Binary function-predicate 'notEqC' checks whether its arguments are not the same consonant sound representations (not taking palatalization into account).
notEqC
 :: [(Char,Char)] -- ^ The pairs of the 'Char' that corresponds to the similar phonetic languages consonant phenomenon (e. g. allophones). Must be sorted in the ascending order to be used correctly. 
 -> PRS
 -> PRS
 -> Bool
notEqC xs x y
  | (== cy) . getBFstLSorted' cx xs $ cx = False
  | otherwise = cx /= cy
      where !cx = charS x
            !cy = charS y

-- | Function 'sndGroups' converts a word being a list of 'PRS' to the list of phonetically similar (consonants grouped with consonants and each vowel separately)
-- sounds representations in 'PRS' format.
sndGroups :: [PRS] -> [[PRS]]
sndGroups ys@(_:_) = L.groupBy isNotVowel2 ys
sndGroups _ = []

groupSnds :: [PRS] -> [[PRS]]
groupSnds = L.groupBy (\x y -> ((== P 0) . phoneType $ x) == ((== P 0) . phoneType $ y))

data SegmentationInfo1 = SI {
 fieldN :: !Int8,  -- ^ Number of fields in the pattern matching that are needed to apply the segmentation rules. Not less than 1.
 predicateN :: Int8 -- ^ Number of predicates in the definition for the 'fieldN' that are needed to apply the segmentation rules.
} deriving (Eq)

instance Show SegmentationInfo1 where
  show (SI m n) = L.intercalate " " ["SI", show m, show n]

instance PhoneticElement SegmentationInfo1 where
  readPEMaybe rs
    | not . any isLetter $ rs = Nothing
    | otherwise = let (ys:yss) = words rs in case ys of
        "SI" -> case yss of
           [xs,ts] -> case (readMaybe xs::Maybe Int8) of
               Just m -> case (readMaybe ts::Maybe Int8) of
                 Just n -> Just (SI m n)
                 _ -> Nothing
               _ -> Nothing
           _ -> Nothing
        _ -> Nothing

{-| We can think of 'SegmentationPredFunction' in terms of @f ('SI' fN pN) ks [x_{1},x_{2},...,x_{i},...,x_{fN}]@. Comparing with
'divCnsnts' from the @ukrainian-phonetics-basics-array@ we can postulate that it consists of the following logical terms in
the symbolic form:

1) 'phoneType' x_{i} \`'elem'\` (X{...} = 'map' 'P' ['Int8'])

2) 'notEqC' ks x_{i} x_{j} (j /= i)

combined with the standard logic Boolean operations of '(&&)', '(||)' and 'not'. Further, the 'not' can be transformed into the
positive (affirmative) form using the notion of the universal set for the task. This transformation needs that the similar
phonetic phenomenae (e. g. the double sounds -- the prolonged ones) belong to the one syllable and not to the different ones
(so they are not related to different syllables, but just to the one and the same). Since such assumption has been used,
we can further represent the function by the following data type and operations with it, see 'SegmentationPredFData'.
-}
data SegmentationPredFunction = PF (SegmentationInfo1 -> [(Char, Char)] -> [PRS] -> Bool)

data SegmentationPredFData a b = L Int [Int] (Array Int a) | NEC Int Int (Array Int a) [b] | C (SegmentationPredFData a b) (SegmentationPredFData a b) |
  D (SegmentationPredFData a b) (SegmentationPredFData a b) deriving (Eq, Read, Show)

class Eval2Bool a where
  eval2Bool :: a -> Bool

type SegmentationFDP = SegmentationPredFData PRS (Char, Char)

instance Eval2Bool (SegmentationPredFData PRS (Char, Char)) where
  eval2Bool (L i js arr)
    | all (<= n) js && i <= n && i >= 1 && all (>=1) js = fromPhoneticType (phoneType (unsafeAt arr $ i - 1)) `elem` js
    | otherwise = error "Data.Phonetic.Languages.Syllables.eval2Bool: 'L' element is not properly defined. "
        where n = numElements arr
  eval2Bool (NEC i j arr ks)
    | i >= 1 && j >= 1 && i /= j && i <= n && j <= n = notEqC ks (unsafeAt arr $ i - 1) (unsafeAt arr $ j - 1)
    | otherwise = error "Data.Phonetic.Languages.Syllables.eval2Bool: 'NEC' element is not properly defined. "
        where n = numElements arr
  eval2Bool (C x y) = eval2Bool x && eval2Bool y
  eval2Bool (D x y) = eval2Bool x || eval2Bool y

type DListFunctionResult = ([PRS] -> [PRS],[PRS] -> [PRS])

class DListRepresentation a b where
  toDLR :: b -> [a] -> ([a] -> [a], [a] -> [a])

instance DListRepresentation PRS Int8 where
  toDLR left xs
    | null xs = (id,id)
    | null ts =  (id,(zs `mappend`))
    | null zs = ((`mappend` ts), id)
    | otherwise = ((`mappend` ts), (zs `mappend`))
        where (ts,zs) = splitAt (fromEnum left) xs
           
data SegmentationLineFunction = LFS {
  infoSP :: SegmentationInfo1,
  predF :: SegmentationFDP,  -- ^ The predicate to check the needed rule for segmentation.
  resF :: Int8 -- ^ The result argument to be appended to the left of the group of consonants if the 'predF' returns 'True' for its arguments. Is an argument to the 'toDLR'.
}

data SegmentationRules1 = SR1 {
  infoS :: SegmentationInfo1, 
  lineFs :: [SegmentationLineFunction] -- ^ The list must be sorted in the appropriate order of the guards usage for the predicates.
  -- The length of the list must be equal to the ('fromEnum' . 'predicateN' . 'infoS') value.
}  

{-| List of the 'SegmentationRules1' sorted in the descending order by the 'fieldN' 'SegmentationInfo1' data and where the
length of all the 'SegmentationPredFunction' lists of 'PRS' are equal to the 'fieldN' 'SegmentationInfo1' data by definition.
-}
type SegmentRulesG = [SegmentationRules1]

-- | Function 'divCnsnts' is used to divide groups of consonants into two-elements lists that later are made belonging to
-- different neighbour syllables if the group is between two vowels in a word. The group must be not empty, but this is not checked.
-- The example phonetical information for the proper performance in Ukrainian can be found from the:
-- https://msn.khnu.km.ua/pluginfile.php/302375/mod_resource/content/1/%D0%9B.3.%D0%86%D0%86.%20%D0%A1%D0%BA%D0%BB%D0%B0%D0%B4.%D0%9D%D0%B0%D0%B3%D0%BE%D0%BB%D0%BE%D1%81.pdf
-- The example of the 'divCnsnts' can be found at: https://hackage.haskell.org/package/ukrainian-phonetics-basic-array-0.1.2.0/docs/src/Languages.Phonetic.Ukrainian.Syllable.Arr.html#divCnsnts
divCnsnts :: [(Char,Char)] -> SegmentRulesG -> [PRS] -> DListFunctionResult
divCnsnts ks gs xs@(_:_) = toDLR left xs
  where !js = fromJust . L.find ((== length xs) . fromEnum . fieldN . infoS) $ gs -- js :: SegmentationRules1
        !left = resF . fromJust . L.find (eval2Bool . predF). lineFs $ js
divCnsnts _ _ [] = (id,id)

reSyllableCntnts :: [(Char,Char)] -> SegmentRulesG -> [[PRS]] -> [[PRS]]
reSyllableCntnts ks gs (xs:ys:zs:xss)
  | (/= P 0) . phoneType . last $ ys = fst (divCnsnts ks gs ys) xs:reSyllableCntnts ks gs (snd (divCnsnts ks gs ys) zs:xss)
  | otherwise = reSyllableCntnts ks gs ((xs `mappend` ys):zs:xss)
reSyllableCntnts _ _ (xs:ys:_) = [(xs `mappend` ys)]
reSyllableCntnts _ _ xss = xss

divVwls :: [[PRS]] -> [[PRS]]
divVwls = mapI (\ws -> (length . filter ((== P 0) . phoneType) $ ws) > 1) h3
  where h3 us = [ys `mappend` take 1 zs] `mappend` (L.groupBy (\x y -> phoneType x == P 0 && phoneType y /= P 0) . drop 1 $ zs)
                  where (ys,zs) = span (\t -> phoneType t /= P 0) us

createSyllablesPL
  :: GWritingSystemPRPLX
  -> [(Char,Char)]
  -> CharPhoneticClassification
  -> SegmentRulesG
  -> String -- ^ Corresponds to the \'0\' symbol delimiter in the @ukrainian-phonetics-basic-array@ package.
  -> String -- ^ Corresponds to the \'1\' and \'-\' symbol delimiters in the @ukrainian-phonetics-basic-array@ package.
  -> String -- ^ Actually the converted 'String'.
  -> [[[PRS]]]
createSyllablesPL wrs ks arr gs us vs = map (divVwls . reSyllableCntnts ks gs . groupSnds . str2PRSs arr) . words1 . mapMaybe g . convertToProperPL . map (\x -> if x == '-' then ' ' else x)
  where g x
          | x `elem` us = Nothing
          | x `notElem` vs = Just x
          | otherwise = Just ' '
        words1 xs = if null ts then [] else w : words1 s'' -- Practically this is an optimized version for this case 'words' function from Prelude.
          where ts = dropWhile (== ' ') xs
                (w, s'') = span (/= ' ') ts
        {-# NOINLINE words1 #-}
        convertToProperPL = map char . stringToXG wrs
{-# INLINE createSyllablesPL #-}