-- | -- Module : Languages.Phonetic.Ukrainian.PrepareText -- Copyright : (c) OleksandrZhabenko 2020 -- License : MIT -- Stability : Experimental -- Maintainer : olexandr543@yahoo.com -- -- Helps to order the 7 or less Ukrainian words (or their concatenations) -- to obtain (to some extent) suitable for poetry or music text. -- Earlier it has been a module DobutokO.Poetry.Ukrainian.PrepareText -- from the @dobutokO-poetry@ package. -- In particular, this module can be used to prepare the Ukrainian text -- by applying the most needed grammar to avoid misunderstanding -- for the produced text. The attention is paid to the prepositions, pronouns, conjunctions -- and particles that are most commonly connected (or not) in a significant way -- with the next text. -- Uses the information from: -- https://uk.wikipedia.org/wiki/%D0%A1%D0%BF%D0%BE%D0%BB%D1%83%D1%87%D0%BD%D0%B8%D0%BA -- and -- https://uk.wikipedia.org/wiki/%D0%A7%D0%B0%D1%81%D1%82%D0%BA%D0%B0_(%D0%BC%D0%BE%D0%B2%D0%BE%D0%B7%D0%BD%D0%B0%D0%B2%D1%81%D1%82%D0%B2%D0%BE) -- module Languages.Phonetic.Ukrainian.PrepareText ( -- * Basic functions prepareText , prepareTextN , complexWords , splitLines , splitLinesN , auxiliary1 , isPreposition , isConcatenated , isSpC , concatenated2 , jottedConv -- * Used to transform after convertToProperUkrainian from mmsyn6ukr package , aux4 , aux0 ) where import CaseBi (getBFst') import Data.List.InnToOut.Basic (mapI) import Data.Char (isAlpha,toLower) import qualified Data.Vector as V -- | Is used to convert a Ukrainian text into list of 'String' each of which is ready to be -- used by the functions from the other modules in the package. -- It applies minimal grammar links and connections between the most commonly used Ukrainian -- words that \"should\" be paired and not dealt with separately -- to avoid the misinterpretation and preserve maximum of the semantics for the -- \"phonetic\" language on the Ukrainian basis. prepareText :: String -> [String] prepareText = filter (any isUkrainianL) . splitLines . map (unwords . concatenated2. auxiliary1 . complexWords . words . filter (\t -> isAlpha t || isSpC t)) . filter (not . null) . lines -- | Concatenates complex words in Ukrainian so that they are not separated further by possible words order rearrangements (because they are treated -- as a single word). This is needed to preserve basic grammar in phonetic languages. complexWords :: [String] -> [String] complexWords (xs:ys:zs:ts:xss) = getBFst' (xs:complexWords (ys:zs:ts:xss),V.fromList [("\1074", if ys == "\1084\1110\1088\1091" && zs == "\1090\1086\1075\1086" && ts == "\1103\1082" then (xs ++ ys ++ zs ++ ts ++ (concat . take 1 $ xss)): complexWords (drop 1 xss) else (xs ++ ys):complexWords (zs:ts:xss)), ("\1076\1072\1088\1084\1072", if ys == "\1097\1086" then (xs ++ ys ++ zs):complexWords (ts:xss) else (xs ++ ys):complexWords (zs:ts:xss)), ("\1076\1083\1103", if ys == "\1090\1086\1075\1086" && zs == "\1097\1086\1073" then (xs ++ ys ++ zs ++ ts):complexWords xss else (xs ++ ys):complexWords (zs:ts:xss)), ("\1079", case ys of "\1090\1080\1084" -> if zs == "\1097\1086\1073" then (xs ++ ys ++ zs):complexWords (ts:xss) else (xs ++ ys):complexWords (zs:ts:xss) "\1090\1086\1075\1086" -> if zs == "\1095\1072\1089\1091" && ts == "\1103\1082" then (xs ++ ys ++ zs ++ "\1081\1072\1082" ++ (concat . take 1 $ xss)):complexWords (drop 1 xss) else (xs ++ ys):complexWords (zs:ts:xss) _ -> (xs ++ ys):complexWords (zs:ts:xss)), ("\1079\1072\1084\1110\1089\1090\1100", if ys == "\1090\1086\1075\1086" && zs == "\1097\1086\1073" then (xs ++ ys ++ zs ++ ts):complexWords xss else (xs ++ ys):complexWords (zs:ts:xss)), ("\1087\1086\1087\1088\1080", if ys == "\1090\1077" && zs == "\1097\1086" then (xs ++ ys ++ zs ++ ts):complexWords xss else (xs ++ ys):complexWords (zs:ts:xss)), ("\1087\1088\1080", if ys == "\1094\1100\1086\1084\1091" then (xs ++ ys ++ zs):complexWords (ts:xss) else (xs ++ ys):complexWords (zs:ts:xss)), ("\1087\1110\1089\1083\1103", if ys == "\1090\1086\1075\1086" && zs == "\1103\1082" then (xs ++ ys ++ zs ++ ts):complexWords xss else (xs ++ ys):complexWords (zs:ts:xss)), ("\1090\1072\1082", if ys == "\1097\1086" then (xs ++ ys ++ zs):complexWords (ts:xss) else (xs ++ ys):complexWords (zs:ts:xss)), ("\1090\1080\1084\1095\1072\1089\1086\1084", if ys == "\1103\1082" then (xs ++ ys ++ zs):complexWords (ts:xss) else (xs ++ ys):complexWords (zs:ts:xss)), ("\1090\1086\1084\1091", if ys == "\1103\1082" then (xs ++ "\1081\1072\1082" ++ zs):complexWords (ts:xss) else (xs ++ ys):complexWords (zs:ts:xss)), ("\1091", if ys == "\1084\1110\1088\1091" && zs == "\1090\1086\1075\1086" && ts == "\1103\1082" then (xs ++ ys ++ zs ++ ts ++ (concat . take 1 $ xss)):complexWords (drop 1 xss) else (xs ++ ys):complexWords (zs:ts:xss)), ("\1093\1086\1095", if ys == "\1073\1080" then (xs ++ ys ++ zs):complexWords (ts:xss) else (xs ++ ys):complexWords (zs:ts:xss)), ("\1093\1086\1095", if ys == "\1073\1080" then (xs ++ ys ++ zs):complexWords (ts:xss) else (xs ++ ys):complexWords (zs:ts:xss)), ("\1095\1077\1088\1077\1079", if ys == "\1090\1077" && zs == "\1097\1086" then (xs ++ ys ++ zs ++ ts):complexWords xss else (xs ++ ys):complexWords (zs:ts:xss))]) xs complexWords (xs:ys:zs:_) = getBFst' ([xs,ys,zs],V.fromList [ ("\1076\1072\1088\1084\1072", if ys == "\1097\1086" then [xs ++ ys ++ zs] else [xs ++ ys,zs]), ("\1079", case ys of "\1090\1080\1084" -> if zs == "\1097\1086\1073" then [xs ++ ys ++ zs] else [xs ++ ys,zs] _ -> [xs ++ ys,zs]), ("\1087\1088\1080", if ys == "\1094\1100\1086\1084\1091" then [xs ++ ys ++ zs] else [xs ++ ys,zs]), ("\1090\1072\1082", if ys == "\1097\1086" then [xs ++ ys ++ zs] else [xs ++ ys,zs]), ("\1090\1080\1084\1095\1072\1089\1086\1084", if ys == "\1103\1082" then [xs ++ ys ++ zs] else [xs ++ ys, zs]), ("\1090\1086\1084\1091", if ys == "\1103\1082" then [xs ++ "\1081\1072\1082" ++ zs] else [xs ++ ys,zs]), ("\1093\1086\1095", if ys == "\1073\1080" then [xs ++ ys ++ zs] else [xs ++ ys,zs]), ("\1093\1086\1095", if ys == "\1073\1080" then [xs ++ ys ++ zs] else [xs ++ ys,zs])]) xs complexWords xss = xss -- | Since 0.2.1.0 version the function is recursive and is applied so that all returned elements ('String') are no longer than 7 words in them. splitLines :: [String] -> [String] splitLines xss | null xss = [] | otherwise = mapI (\xs -> compare (length . words $ xs) 7 == GT) (\xs -> let yss = words xs in splitLines . map unwords . (\(q,r) -> [q,r]) . splitAt (length yss `quot` 2) $ yss) $ xss -- | A generalized variant of the 'splitLines' with the arbitrary maximum number of the words in the lines given as the first argument. splitLinesN :: Int -> [String] -> [String] splitLinesN n xss | null xss || n <= 0 = [] | otherwise = mapI (\xs -> compare (length . words $ xs) n == GT) (\xs -> let yss = words xs in splitLines . map unwords . (\(q,r) -> [q,r]) . splitAt (length yss `quot` 2) $ yss) $ xss -- | A generalized variant of the 'prepareText' with the arbitrary maximum number of the words in the lines given as the first argument. prepareTextN :: Int -> String -> [String] prepareTextN n = filter (any isUkrainianL) . splitLinesN n . map (unwords . concatenated2. auxiliary1 . complexWords . words . filter (\t -> isAlpha t || isSpC t)) . filter (not . null) . lines auxiliary1 :: [String] -> [String] auxiliary1 (xs:ys:zs:xss) | isConcatenated ys || isPreposition ys = auxiliary1 (xs:auxiliary1 ((ys ++ (drop 1 . jottedConv $ ' ':zs)):xss)) | isConcatenated xs || isPreposition xs = auxiliary1 ((xs ++ (drop 1 . jottedConv $ ' ':ys)):zs:xss) | otherwise = xs:auxiliary1 (ys:zs:xss) auxiliary1 x@(xs:ys:xss) | isConcatenated xs || isPreposition xs = auxiliary1 ((xs ++ (drop 1 . jottedConv $ ' ':ys)):xss) | otherwise = x auxiliary1 xss = xss isPreposition :: String -> Bool isPreposition ts = getBFst' (False, V.fromList . zip ["\1030\1079", "\1041\1077\1079", "\1041\1110\1083\1103", "\1042", "\1042\1110\1076", "\1044\1083\1103", "\1044\1086", "\1047", "\1047\1072", "\1047\1072\1088\1072\1076\1080", "\1047\1110", "\1050", "\1050\1086\1083\1086", "\1050\1088\1110\1079\1100", "\1050\1088\1110\1084", "\1052\1077\1078", "\1052\1077\1078\1080", "\1052\1110\1078", "\1053\1072", "\1053\1072\1076", "\1054", "\1054\1073", "\1054\1076", "\1054\1082\1088\1110\1084", "\1055\1077\1088\1077\1076", "\1055\1086", "\1055\1088\1080", "\1055\1088\1086", "\1055\1088\1086\1090\1080", "\1055\1110\1076", "\1055\1110\1089\1083\1103", "\1057\1077\1088\1077\1076", "\1057\1077\1088\1077\1076\1080", "\1059", "\1063\1077\1088\1077\1079", "\1073\1077\1079", "\1073\1110\1083\1103", "\1074", "\1074\1110\1076", "\1076\1083\1103", "\1076\1086", "\1079", "\1079\1072", "\1079\1072\1088\1072\1076\1080", "\1079\1110", "\1082", "\1082\1086\1083\1086", "\1082\1088\1110\1079\1100", "\1082\1088\1110\1084", "\1084\1077\1078", "\1084\1077\1078\1080", "\1084\1110\1078", "\1085\1072", "\1085\1072\1076", "\1086", "\1086\1073", "\1086\1076", "\1086\1082\1088\1110\1084", "\1087\1077\1088\1077\1076", "\1087\1086", "\1087\1088\1080", "\1087\1088\1086", "\1087\1088\1086\1090\1080", "\1087\1110\1076", "\1087\1110\1089\1083\1103", "\1089\1077\1088\1077\1076", "\1089\1077\1088\1077\1076\1080", "\1091", "\1095\1077\1088\1077\1079", "\1110\1079"] $ replicate 200 True) ts {-# INLINE isPreposition #-} -- | Since the dobutokO-poetry version 0.16.3.0 the (||) operator has been changed to the (&&). -- The idea is that these words are the ones that are pronouns and they \"should\" be treated -- (by the author's understanding) as independent words. isConcatenated :: String -> Bool isConcatenated ts | null ts = False | otherwise = compare (length ts) 2 /= GT && getBFst' (True, V.fromList . zip ["\1028", "\1042\1080", "\1052\1080", "\1058\1080", "\1058\1110", "\1062\1110", "\1071", "\1074\1080", "\1084\1080", "\1090\1080", "\1090\1110", "\1094\1110", "\1103", "\1108"] $ replicate 14 False) ts && (head ts `notElem` "\1031\1111") {-# INLINE isConcatenated #-} concatenated2 :: [String] -> [String] concatenated2 (xs:ys:xss) = getBFst' (xs:concatenated2 (ys:xss), V.fromList . zip ["\1040\1073\1086","\1040\1076\1078\1077", "\1040\1083\1077","\1040\1085\1110\1078","\1041\1086\1076\1072\1081", "\1041\1091\1094\1110\1084\1090\1086","\1042\1078\1077","\1042\1080\1082\1083\1102\1095\1085\1086", "\1042\1083\1072\1089\1085\1077","\1042\1090\1110\1084","\1044\1072\1074\1072\1081", "\1047\1072\1090\1077","\1050\1086\1083\1080","\1051\1077\1076\1074\1077","\1051\1080\1096\1077", "\1052\1072\1081\1078\1077","\1052\1086\1074","\1052\1086\1074\1073\1080", "\1052\1086\1074\1073\1080\1090\1086","\1053\1072\1074\1110\1090\1100", "\1053\1072\1089\1082\1110\1083\1100\1082\1080","\1053\1072\1095\1077","\1053\1072\1095\1077\1073", "\1053\1072\1095\1077\1073\1090\1086","\1053\1077\1074\1078\1077","\1053\1077\1084\1086\1074", "\1053\1077\1084\1086\1074\1073\1080","\1053\1077\1084\1086\1074\1073\1080\1090\1086", "\1053\1077\1085\1072\1095\1077","\1053\1077\1085\1072\1095\1077\1073\1090\1086", "\1053\1077\1093\1072\1081","\1053\1090\1078\1077","\1053\1110\1073\1080", "\1053\1110\1073\1080\1090\1086","\1053\1110\1078","\1054\1090\1086\1078", "\1055\1088\1080\1090\1086\1084\1091","\1055\1088\1080\1090\1110\1084", "\1055\1088\1080\1095\1086\1084\1091","\1055\1088\1080\1095\1110\1084", "\1055\1088\1086\1090\1077","\1057\1072\1084\1077","\1057\1077\1073\1090\1086", "\1058\1072\1082\1080","\1058\1086\1073\1090\1086","\1058\1110\1083\1100\1082\1080", "\1061\1072\1081","\1061\1086\1095","\1061\1110\1073\1072","\1062\1077\1073\1090\1086", "\1065\1086\1073","\1071\1082\1073\1080","\1071\1082\1088\1072\1079","\1071\1082\1097\1086", "\1072\1073\1086","\1072\1076\1078\1077","\1072\1083\1077","\1072\1085\1110\1078", "\1073\1086\1076\1072\1081","\1073\1091\1094\1110\1084\1090\1086","\1074\1078\1077", "\1074\1080\1082\1083\1102\1095\1085\1086","\1074\1083\1072\1089\1085\1077", "\1074\1090\1110\1084","\1076\1072\1074\1072\1081","\1079\1072\1090\1077","\1082\1086\1083\1080", "\1083\1077\1076\1074\1077","\1083\1080\1096\1077","\1084\1072\1081\1078\1077","\1084\1086\1074", "\1084\1086\1074\1073\1080","\1084\1086\1074\1073\1080\1090\1086","\1085\1072\1074\1110\1090\1100", "\1085\1072\1089\1082\1110\1083\1100\1082\1080","\1085\1072\1095\1077","\1085\1072\1095\1077\1073", "\1085\1072\1095\1077\1073\1090\1086","\1085\1077\1074\1078\1077","\1085\1077\1084\1086\1074", "\1085\1077\1084\1086\1074\1073\1080","\1085\1077\1084\1086\1074\1073\1080\1090\1086", "\1085\1077\1085\1072\1095\1077","\1085\1077\1085\1072\1095\1077\1073\1090\1086", "\1085\1077\1093\1072\1081","\1085\1110\1073\1080","\1085\1110\1073\1080\1090\1086", "\1085\1110\1078","\1086\1090\1078\1077","\1086\1090\1086\1078","\1087\1088\1080\1090\1086\1084\1091", "\1087\1088\1080\1090\1110\1084","\1087\1088\1080\1095\1086\1084\1091","\1087\1088\1080\1095\1110\1084", "\1087\1088\1086\1090\1077","\1089\1072\1084\1077","\1089\1077\1073\1090\1086","\1090\1072\1082\1080", "\1090\1086\1073\1090\1086","\1090\1110\1083\1100\1082\1080","\1093\1072\1081","\1093\1086\1095", "\1093\1110\1073\1072","\1094\1077\1073\1090\1086","\1097\1086\1073","\1103\1082\1073\1080", "\1103\1082\1088\1072\1079","\1103\1082\1097\1086"] $ replicate 200 ((xs ++ (drop 1 . jottedConv $ ' ':ys)):concatenated2 xss)) xs concatenated2 xss = xss isSpC :: Char -> Bool isSpC x = x == '\'' || x == ' ' || x == '\x2019' || x == '\x02BC' || x == '-' {-# INLINE isSpC #-} jottedConv :: String -> String jottedConv (x:y:xs) | isSpC x = x:(getBFst' (jottedConv (y:xs), V.fromList [('\1028', '\1049':'\1077':jottedConv xs), ('\1031', '\1049':'\1110':jottedConv xs), ('\1070', '\1049':'\1091':jottedConv xs), ('\1071', '\1049':'\1072':jottedConv xs), ('\1102', '\1081':'\1091':jottedConv xs), ('\1103', '\1081':'\1072':jottedConv xs), ('\1108', '\1081':'\1077':jottedConv xs), ('\1111', '\1081':'\1110':jottedConv xs)]) y) | otherwise = x:jottedConv (y:xs) jottedConv xs = xs -- | Can be used to prepare the text after 'convertToProperUkrainian' from 'Melodics.Ukrainian' module from @mmsyn6ukr@ package so that all the 'String' can -- be represented as unique 'Char'. aux4 :: String -> Char aux4 xs | xs == "\1076\1078" = 'j' | xs == "\1076\1079" = 'z' | xs == "\1089\1100" = 's' | xs == "\1094\1100" = 'c' | null xs = error "Languages.Phonetic.Ukrainian.PrepareText.aux4: Empty String. " | otherwise = head xs -- | Can be used to prepare the text after 'convertToProperUkrainian' from 'Melodics.Ukrainian' module from @mmsyn6ukr@ package so that all the 'String' can -- be represented as unique 'Char'. Afterwards, all the palatalized or semi-palatalized sounds are represented just the same as not palatalized ones. aux0 :: V.Vector String -> V.Vector Char aux0 v | V.null v = V.empty | otherwise = V.map head . V.filter (not. null) . V.map (\xs -> case xs of "\1089\1100" -> "\1089" "\1094\1100" -> "\1094" "\1076\1078" -> "j" "\1076\1079" -> "z" _ -> xs) . V.filter (/= "\1100") $ v -- | Is taken from the @mmsyn6ukr@ package version 0.8.1.0 so that the amount of dependencies are reduced (and was slightly modified). isUkrainianL :: Char -> Bool isUkrainianL y | (y >= '\1040' && y <= '\1065') || (y >= '\1070' && y <= '\1097') = True | otherwise = getBFst' (False, V.fromList . map (\x -> (x, True)) $ "\1028\1030\1031\1068\1100\1102\1103\1108\1110\1111\1168\1169\8217") y