{-# OPTIONS_HADDOCK show-extensions #-} {-# OPTIONS_GHC -funbox-strict-fields -fobject-code #-} {-# LANGUAGE BangPatterns #-} {-# LANGUAGE MultiParamTypeClasses #-} {-# LANGUAGE UnboxedTuples #-} {-# LANGUAGE MagicHash #-} {-# LANGUAGE FlexibleInstances #-} -- | -- Module : Data.Phonetic.Languages.Syllables -- Copyright : (c) OleksandrZhabenko 2021 -- License : MIT -- Stability : Experimental -- Maintainer : olexandr543@yahoo.com -- -- This module works with syllable segmentation. The generalized version for the module -- 'Languages.Phonetic.Ukrainian.Syllable.Arr' from @ukrainian-phonetics-basic-array@ package. -- module Data.Phonetic.Languages.Syllables ( -- * Data types and type synonyms PRS(..) , PhoneticType(..) , CharPhoneticClassification , StringRepresentation , SegmentationInfo1(..) , SegmentationPredFunction(..) , SegmentationPredFData(..) , SegmentationFDP , Eval2Bool(..) , DListFunctionResult , SegmentationLineFunction(..) , SegmentationRules1(..) , SegmentRulesG , DListRepresentation(..) -- * Basic functions , str2PRSs , sndGroups , groupSnds , divCnsnts , reSyllableCntnts , divVwls , createSyllablesPL -- * Auxiliary functions , gBF4 , findC , isVowel1 , isSonorous1 , isVoicedC1 , isVoicelessC1 , isNotVowel2 , notEqC , fromPhoneticType ) where import Prelude hiding (mappend) import Data.Monoid import qualified Data.List as L (groupBy,find,intercalate) import Data.Phonetic.Languages.Base import CaseBi.Arr import GHC.Arr import GHC.Exts import Data.List.InnToOut.Basic (mapI) import Data.Maybe (mapMaybe,fromJust) import GHC.Int import Text.Read (readMaybe) import Data.Char (isLetter) -- Inspired by: https://github.com/OleksandrZhabenko/mm1/releases/tag/0.2.0.0 -- CAUTION: Please, do not mix with the show7s functions, they are not interoperable. data PRS = SylS { charS :: !Char, -- ^ Phonetic languages phenomenon representation. Usually, a phoneme, but it can be otherwise something different. phoneType :: !PhoneticType -- ^ Some encoded type. } deriving ( Eq, Read ) instance Ord PRS where compare (SylS x1 y1) (SylS x2 y2) = case compare x1 x2 of EQ -> compare y1 y2 ~z -> z instance Show PRS where show (SylS c (P x)) = "SylS \'" `mappend` (c:'\'':' ':show x) data PhoneticType = P !Int8 deriving (Eq, Ord, Read) instance Show PhoneticType where show (P x) = "P " `mappend` show x fromPhoneticType :: PhoneticType -> Int fromPhoneticType (P x) = fromEnum x {-| The 'Array' 'Int' must be sorted in the ascending order to be used in the module correctly. -} type CharPhoneticClassification = Array Int PRS {-| The 'String' of converted phonetic language representation 'Char' data is converted to this type to apply syllable segmentation or other transformations. -} type StringRepresentation = [PRS] -- | Is somewhat rewritten from the 'CaseBi.Arr.gBF3' function (not exported) from the @mmsyn2-array@ package. gBF4 :: (Ix i) => (# Int#, PRS #) -> (# Int#, PRS #) -> Char -> Array i PRS -> Maybe PRS gBF4 (# !i#, k #) (# !j#, m #) c arr | isTrue# ((j# -# i#) ># 1# ) = case compare c (charS p) of GT -> gBF4 (# n#, p #) (# j#, m #) c arr LT -> gBF4 (# i#, k #) (# n#, p #) c arr _ -> Just p | c == charS m = Just m | c == charS k = Just k | otherwise = Nothing where !n# = (i# +# j#) `quotInt#` 2# !p = unsafeAt arr (I# n#) {-# INLINABLE gBF4 #-} findC :: Char -> Array Int PRS -> Maybe PRS findC c arr = gBF4 (# i#, k #) (# j#, m #) c arr where !(I# i#,I# j#) = bounds arr !k = unsafeAt arr (I# i#) !m = unsafeAt arr (I# i#) str2PRSs :: CharPhoneticClassification -> String -> StringRepresentation str2PRSs arr = map (\c -> fromJust . findC c $ arr) -- | Function-predicate 'isVowel1' checks whether its argument is a vowel representation in the 'PRS' format. isVowel1 :: PRS -> Bool isVowel1 = (== P 0) . phoneType {-# INLINE isVowel1 #-} -- | Function-predicate 'isSonorous1' checks whether its argument is a sonorous consonant representation in the 'PRS' format. isSonorous1 :: PRS -> Bool isSonorous1 = (`elem` [P 1,P 2]) . phoneType {-# INLINE isSonorous1 #-} -- | Function-predicate 'isVoicedC1' checks whether its argument is a voiced consonant representation in the 'PRS' format. isVoicedC1 :: PRS -> Bool isVoicedC1 = (`elem` [P 3,P 4]) . phoneType {-# INLINE isVoicedC1 #-} -- | Function-predicate 'isVoiceless1' checks whether its argument is a voiceless consonant representation in the 'PRS' format. isVoicelessC1 :: PRS -> Bool isVoicelessC1 = (`elem` [P 5,P 6]) . phoneType {-# INLINE isVoicelessC1 #-} -- | Binary function-predicate 'isNotVowel2' checks whether its arguments are both consonant representations in the 'PRS' format. isNotVowel2 :: PRS -> PRS -> Bool isNotVowel2 x y | phoneType x == P 0 || phoneType y == P 0 = False | otherwise = True {-# INLINE isNotVowel2 #-} -- | Binary function-predicate 'notEqC' checks whether its arguments are not the same consonant sound representations (not taking palatalization into account). notEqC :: [(Char,Char)] -- ^ The pairs of the 'Char' that corresponds to the similar phonetic languages consonant phenomenon (e. g. allophones). Must be sorted in the ascending order to be used correctly. -> PRS -> PRS -> Bool notEqC xs x y | (== cy) . getBFstLSorted' cx xs $ cx = False | otherwise = cx /= cy where !cx = charS x !cy = charS y -- | Function 'sndGroups' converts a word being a list of 'PRS' to the list of phonetically similar (consonants grouped with consonants and each vowel separately) -- sounds representations in 'PRS' format. sndGroups :: [PRS] -> [[PRS]] sndGroups ys@(_:_) = L.groupBy isNotVowel2 ys sndGroups _ = [] groupSnds :: [PRS] -> [[PRS]] groupSnds = L.groupBy (\x y -> ((== P 0) . phoneType $ x) == ((== P 0) . phoneType $ y)) data SegmentationInfo1 = SI { fieldN :: !Int8, -- ^ Number of fields in the pattern matching that are needed to apply the segmentation rules. Not less than 1. predicateN :: Int8 -- ^ Number of predicates in the definition for the 'fieldN' that are needed to apply the segmentation rules. } deriving (Eq) instance Show SegmentationInfo1 where show (SI m n) = L.intercalate " " ["SI", show m, show n] instance PhoneticElement SegmentationInfo1 where readPEMaybe rs | not . any isLetter $ rs = Nothing | otherwise = let (ys:yss) = words rs in case ys of "SI" -> case yss of [xs,ts] -> case (readMaybe xs::Maybe Int8) of Just m -> case (readMaybe ts::Maybe Int8) of Just n -> Just (SI m n) _ -> Nothing _ -> Nothing _ -> Nothing _ -> Nothing {-| We can think of 'SegmentationPredFunction' in terms of @f ('SI' fN pN) ks [x_{1},x_{2},...,x_{i},...,x_{fN}]@. Comparing with 'divCnsnts' from the @ukrainian-phonetics-basics-array@ we can postulate that it consists of the following logical terms in the symbolic form: 1) 'phoneType' x_{i} \`'elem'\` (X{...} = 'map' 'P' ['Int8']) 2) 'notEqC' ks x_{i} x_{j} (j /= i) combined with the standard logic Boolean operations of '(&&)', '(||)' and 'not'. Further, the 'not' can be transformed into the positive (affirmative) form using the notion of the universal set for the task. This transformation needs that the similar phonetic phenomenae (e. g. the double sounds -- the prolonged ones) belong to the one syllable and not to the different ones (so they are not related to different syllables, but just to the one and the same). Since such assumption has been used, we can further represent the function by the following data type and operations with it, see 'SegmentationPredFData'. -} data SegmentationPredFunction = PF (SegmentationInfo1 -> [(Char, Char)] -> [PRS] -> Bool) data SegmentationPredFData a b = L Int [Int] (Array Int a) | NEC Int Int (Array Int a) [b] | C (SegmentationPredFData a b) (SegmentationPredFData a b) | D (SegmentationPredFData a b) (SegmentationPredFData a b) deriving (Eq, Read, Show) class Eval2Bool a where eval2Bool :: a -> Bool type SegmentationFDP = SegmentationPredFData PRS (Char, Char) instance Eval2Bool (SegmentationPredFData PRS (Char, Char)) where eval2Bool (L i js arr) | all (<= n) js && i <= n && i >= 1 && all (>=1) js = fromPhoneticType (phoneType (unsafeAt arr $ i - 1)) `elem` js | otherwise = error "Data.Phonetic.Languages.Syllables.eval2Bool: 'L' element is not properly defined. " where n = numElements arr eval2Bool (NEC i j arr ks) | i >= 1 && j >= 1 && i /= j && i <= n && j <= n = notEqC ks (unsafeAt arr $ i - 1) (unsafeAt arr $ j - 1) | otherwise = error "Data.Phonetic.Languages.Syllables.eval2Bool: 'NEC' element is not properly defined. " where n = numElements arr eval2Bool (C x y) = eval2Bool x && eval2Bool y eval2Bool (D x y) = eval2Bool x || eval2Bool y type DListFunctionResult = ([PRS] -> [PRS],[PRS] -> [PRS]) class DListRepresentation a b where toDLR :: b -> [a] -> ([a] -> [a], [a] -> [a]) instance DListRepresentation PRS Int8 where toDLR left xs | null xs = (id,id) | null ts = (id,(zs `mappend`)) | null zs = ((`mappend` ts), id) | otherwise = ((`mappend` ts), (zs `mappend`)) where (ts,zs) = splitAt (fromEnum left) xs data SegmentationLineFunction = LFS { infoSP :: SegmentationInfo1, predF :: SegmentationFDP, -- ^ The predicate to check the needed rule for segmentation. resF :: Int8 -- ^ The result argument to be appended to the left of the group of consonants if the 'predF' returns 'True' for its arguments. Is an argument to the 'toDLR'. } data SegmentationRules1 = SR1 { infoS :: SegmentationInfo1, lineFs :: [SegmentationLineFunction] -- ^ The list must be sorted in the appropriate order of the guards usage for the predicates. -- The length of the list must be equal to the ('fromEnum' . 'predicateN' . 'infoS') value. } {-| List of the 'SegmentationRules1' sorted in the descending order by the 'fieldN' 'SegmentationInfo1' data and where the length of all the 'SegmentationPredFunction' lists of 'PRS' are equal to the 'fieldN' 'SegmentationInfo1' data by definition. -} type SegmentRulesG = [SegmentationRules1] -- | Function 'divCnsnts' is used to divide groups of consonants into two-elements lists that later are made belonging to -- different neighbour syllables if the group is between two vowels in a word. The group must be not empty, but this is not checked. -- The example phonetical information for the proper performance in Ukrainian can be found from the: -- https://msn.khnu.km.ua/pluginfile.php/302375/mod_resource/content/1/%D0%9B.3.%D0%86%D0%86.%20%D0%A1%D0%BA%D0%BB%D0%B0%D0%B4.%D0%9D%D0%B0%D0%B3%D0%BE%D0%BB%D0%BE%D1%81.pdf -- The example of the 'divCnsnts' can be found at: https://hackage.haskell.org/package/ukrainian-phonetics-basic-array-0.1.2.0/docs/src/Languages.Phonetic.Ukrainian.Syllable.Arr.html#divCnsnts divCnsnts :: [(Char,Char)] -> SegmentRulesG -> [PRS] -> DListFunctionResult divCnsnts ks gs xs@(_:_) = toDLR left xs where !js = fromJust . L.find ((== length xs) . fromEnum . fieldN . infoS) $ gs -- js :: SegmentationRules1 !left = resF . fromJust . L.find (eval2Bool . predF). lineFs $ js divCnsnts _ _ [] = (id,id) reSyllableCntnts :: [(Char,Char)] -> SegmentRulesG -> [[PRS]] -> [[PRS]] reSyllableCntnts ks gs (xs:ys:zs:xss) | (/= P 0) . phoneType . last $ ys = fst (divCnsnts ks gs ys) xs:reSyllableCntnts ks gs (snd (divCnsnts ks gs ys) zs:xss) | otherwise = reSyllableCntnts ks gs ((xs `mappend` ys):zs:xss) reSyllableCntnts _ _ (xs:ys:_) = [(xs `mappend` ys)] reSyllableCntnts _ _ xss = xss divVwls :: [[PRS]] -> [[PRS]] divVwls = mapI (\ws -> (length . filter ((== P 0) . phoneType) $ ws) > 1) h3 where h3 us = [ys `mappend` take 1 zs] `mappend` (L.groupBy (\x y -> phoneType x == P 0 && phoneType y /= P 0) . drop 1 $ zs) where (ys,zs) = span (\t -> phoneType t /= P 0) us createSyllablesPL :: GWritingSystemPRPLX -> [(Char,Char)] -> CharPhoneticClassification -> SegmentRulesG -> String -- ^ Corresponds to the \'0\' symbol delimiter in the @ukrainian-phonetics-basic-array@ package. -> String -- ^ Corresponds to the \'1\' and \'-\' symbol delimiters in the @ukrainian-phonetics-basic-array@ package. -> String -- ^ Actually the converted 'String'. -> [[[PRS]]] createSyllablesPL wrs ks arr gs us vs = map (divVwls . reSyllableCntnts ks gs . groupSnds . str2PRSs arr) . words1 . mapMaybe g . convertToProperPL . map (\x -> if x == '-' then ' ' else x) where g x | x `elem` us = Nothing | x `notElem` vs = Just x | otherwise = Just ' ' words1 xs = if null ts then [] else w : words1 s'' -- Practically this is an optimized version for this case 'words' function from Prelude. where ts = dropWhile (== ' ') xs (w, s'') = span (/= ' ') ts {-# NOINLINE words1 #-} convertToProperPL = map char . stringToXG wrs {-# INLINE createSyllablesPL #-}