-- |
-- Module      :  Phonetic.Languages.Ukrainian.PrepareText
-- Copyright   :  (c) OleksandrZhabenko 2020-2021
-- License     :  MIT
-- Stability   :  Experimental
-- Maintainer  :  olexandr543@yahoo.com
--
-- Helps to order the 7 or less Ukrainian words (or their concatenations)
-- to obtain (to some extent) suitable for poetry or music text.
-- Earlier it has been a module DobutokO.Poetry.Ukrainian.PrepareText
-- from the @dobutokO-poetry@ package.
-- In particular, this module can be used to prepare the Ukrainian text
-- by applying the most needed grammar to avoid misunderstanding
-- for the produced text. The attention is paid to the prepositions, pronouns, conjunctions
-- and particles that are most commonly connected (or not) in a significant way
-- with the next text.
-- Uses the information from:
-- https://uk.wikipedia.org/wiki/%D0%A1%D0%BF%D0%BE%D0%BB%D1%83%D1%87%D0%BD%D0%B8%D0%BA
-- and
-- https://uk.wikipedia.org/wiki/%D0%A7%D0%B0%D1%81%D1%82%D0%BA%D0%B0_(%D0%BC%D0%BE%D0%B2%D0%BE%D0%B7%D0%BD%D0%B0%D0%B2%D1%81%D1%82%D0%B2%D0%BE)
--
-- Uses arrays instead of vectors.

module Phonetic.Languages.Ukrainian.PrepareText (
  -- * Basic functions
  complexWords
  , splitLines
  , splitLinesN
  , auxiliary1
  , isPreposition
  , isConcatenated
  , isSpC
  , concatenated2
  , jottedConv
  -- * The end-user functions
  , prepareText
  , prepareTextN
  , growLinesN
  , prepareGrowTextMN
  -- * Used to transform after convertToProperUkrainian from mmsyn6ukr package
  , aux4
) where

import CaseBi.Arr (getBFstL')
import Data.List.InnToOut.Basic (mapI)
import Data.Char (isAlpha,toLower)
import GHC.Arr
import GHC.List (scanl1)

-- | Is used to convert a Ukrainian text into list of 'String' each of which is ready to be
-- used by the functions of the modules for the phonetic languages approach.
-- It applies minimal grammar links and connections between the most commonly used Ukrainian
-- words that \"should\" be paired and not dealt with separately
-- to avoid the misinterpretation and preserve maximum of the semantics for the
-- \"phonetic\" language on the Ukrainian basis.
prepareText :: String -> [String]
prepareText = prepareTextN 7
{-# INLINE prepareText #-}

-- | Concatenates complex words in Ukrainian so that they are not separated further by possible words order rearrangements (because they are treated
-- as a single word). This is needed to preserve basic grammar in phonetic languages.
complexWords :: [String] -> [String]
complexWords wwss@(xs:ys:zs:ts:xss) =
 getBFstL' (xs:complexWords (ys:zs:ts:xss)) ([("\1074",
    if ys == "\1084\1110\1088\1091" && zs == "\1090\1086\1075\1086" &&
    ts == "\1103\1082" then auxiliary2Inner (xs ++ ys ++ zs ++ ts) xss else auxiliary4 wwss),
    ("\1076\1072\1088\1084\1072", if ys == "\1097\1086"
    then auxiliary3 wwss else auxiliary4 wwss), ("\1076\1083\1103",
    if ys == "\1090\1086\1075\1086" && zs == "\1097\1086\1073"
    then auxiliary2Inner (xs ++ ys ++ zs ++ ts) xss
    else auxiliary4 wwss), ("\1079",
      case ys of
        "\1090\1080\1084" -> if zs == "\1097\1086\1073"
          then auxiliary2Inner (xs ++ ys ++ zs) (ts:xss)
          else auxiliary4 wwss
        "\1090\1086\1075\1086" -> if zs == "\1095\1072\1089\1091" && ts == "\1103\1082"
          then auxiliary2Inner (xs ++ ys ++ zs ++ "\1081\1072\1082") xss
          else auxiliary4 wwss
        _ -> auxiliary4 wwss), ("\1079\1072\1083\1077\1078\1085\1086",
    if ys == "\1074\1110\1076" then auxiliary3 wwss else auxiliary4 wwss),
    ("\1079\1072\1084\1110\1089\1090\1100",
    if ys == "\1090\1086\1075\1086" && zs == "\1097\1086\1073"
    then auxiliary2Inner (xs ++ ys ++ zs) (ts:xss)
    else auxiliary4 wwss), ("\1087\1086\1087\1088\1080",
    if ys == "\1090\1077" && zs == "\1097\1086" then auxiliary2Inner (xs ++ ys ++ zs) (ts:xss)
    else auxiliary4 wwss),("\1085\1077\1079\1072\1083\1077\1078\1085\1086",
    if ys == "\1074\1110\1076" then auxiliary3 wwss else auxiliary4 wwss),
    ("\1085\1077\1079\1074\1072\1078\1072\1102\1095\1080", if ys == "\1085\1072" then
    if zs == "\1090\1077" && ts == "\1097\1086" then auxiliary2Inner (xs ++ ys ++ zs ++ ts) xss
    else auxiliary3 wwss else auxiliary4 wwss),
    ("\1087\1088\1080",
    if ys == "\1094\1100\1086\1084\1091" then auxiliary3 wwss
    else auxiliary4 wwss), ("\1087\1110\1089\1083\1103",
    if ys == "\1090\1086\1075\1086" && zs == "\1103\1082" then auxiliary2Inner (xs ++ ys ++ zs) (ts:xss)
    else auxiliary4 wwss), ("\1090\1072\1082", if ys == "\1097\1086" then auxiliary3 wwss else auxiliary4 wwss),
    ("\1090\1072\1082", if ys == "\1103\1082" then auxiliary3 wwss else auxiliary4 wwss),
    ("\1090\1080\1084\1095\1072\1089\1086\1084", if ys == "\1103\1082"
    then auxiliary3 wwss else auxiliary4 wwss),
    ("\1090\1086\1084\1091", if ys == "\1103\1082" then auxiliary2Inner (xs ++ "\1081\1072\1082") (zs:ts:xss)
    else auxiliary4 wwss), ("\1091",
    if (ys == "\1079\1074'\1103\1079\1082\1091" || ys == "\1079\1074\x02BC\1103\1079\1082\1091") && zs == "\1079"
    then auxiliary2Inner (xs ++ "\1079\1074\1081\1072\1079\1082\1091" ++ zs) (ts:xss)
    else auxiliary2Inner "\1091" (ys:zs:ts:xss)),
    ("\1091", if ys == "\1084\1110\1088\1091" &&
    zs == "\1090\1086\1075\1086" && ts == "\1103\1082" then auxiliary2Inner (xs ++ ys ++ zs ++ ts) xss
    else auxiliary4 wwss),
    ("\1093\1086\1095", if ys == "\1073\1080" then auxiliary3 wwss
    else auxiliary4 wwss), ("\1093\1086\1095\1072", if ys == "\1073"
    then auxiliary3 wwss else auxiliary4 wwss),
    ("\1095\1077\1088\1077\1079", if ys == "\1090\1077" && zs == "\1097\1086"
    then auxiliary2Inner (xs ++ ys ++ zs) (ts:xss) else auxiliary4 wwss)]) xs
complexWords wwss@(xs:ys:zs:xss) =
 getBFstL' (xs:complexWords [ys,zs]) ([("\1076\1072\1088\1084\1072", if ys == "\1097\1086"
    then auxiliary3 wwss else auxiliary4 wwss), ("\1079\1072\1083\1077\1078\1085\1086",
    if ys == "\1074\1110\1076" then auxiliary3 wwss else auxiliary4 wwss),
    ("\1085\1077\1079\1072\1083\1077\1078\1085\1086",
    if ys == "\1074\1110\1076" then auxiliary3 wwss else auxiliary4 wwss),
    ("\1087\1088\1080",
    if ys == "\1094\1100\1086\1084\1091" then auxiliary3 wwss
    else auxiliary4 wwss), ("\1090\1072\1082", if ys == "\1097\1086" then auxiliary3 wwss else auxiliary4 wwss),
    ("\1090\1072\1082", if ys == "\1103\1082" then auxiliary3 wwss else auxiliary4 wwss),
    ("\1090\1080\1084\1095\1072\1089\1086\1084", if ys == "\1103\1082"
    then auxiliary3 wwss else auxiliary4 wwss),
    ("\1090\1086\1084\1091", if ys == "\1103\1082" then auxiliary2Inner (xs ++ "\1081\1072\1082") (zs:xss)
    else auxiliary4 wwss), ("\1093\1086\1095", if ys == "\1073\1080" then auxiliary3 wwss
    else auxiliary4 wwss), ("\1093\1086\1095\1072", if ys == "\1073"
    then auxiliary3 wwss else auxiliary4 wwss)]) xs
complexWords [xs,ys]
 | ys == "\1073" || ys == "\1073\1060" = [xs ++ ys]
 | xs == "\1076\1072\1088\1084\1072" && ys == "\1097\1086" = [xs ++ ys]
 | otherwise = [xs,ys]
complexWords xss = xss

auxiliary2Inner :: String -> [String] -> [String]
auxiliary2Inner ts (xs:ys:xss)
  | (concatenated2 . auxiliary1 $ [xs,ys]) /= [xs,ys] = let (rs,ws) = splitAt 1 (concatenated2 . auxiliary1 $ [xs,ys]) in
       (ts ++ head rs):complexWords (concat [ws,xss])
  | otherwise = (ts ++ xs):complexWords (ys:xss)
auxiliary2Inner ts [xs] = [ts ++ xs]
auxiliary2Inner ts _ = [ts]

auxiliary3 :: [String] -> [String]
auxiliary3 (xs:ys:xss) = auxiliary2Inner (xs ++ ys) xss
auxiliary3 xss = xss
{-# INLINE auxiliary3 #-}

auxiliary4 :: [String] -> [String]
auxiliary4 (xs:xss) = auxiliary2Inner xs xss
auxiliary4 xss = xss
{-# INLINE auxiliary4 #-}

-- | Since 0.2.1.0 version the function is recursive and is applied so that all returned elements ('String') are no longer than 7 words in them.
splitLines :: [String] -> [String]
splitLines = splitLinesN 7
{-# INLINE splitLines #-}

-- | A generalized variant of the 'splitLines' with the arbitrary maximum number of the words in the lines given as the first argument.
splitLinesN :: Int -> [String] -> [String]
splitLinesN n xss
 | null xss || n <= 0 = []
 | otherwise = mapI (\xs -> compare (length . words $ xs) n == GT) (\xs -> let yss = words xs in
     splitLines . map unwords . (\(q,r) -> [q,r]) . splitAt (length yss `quot` 2) $ yss) $ xss

-- | A generalized variant of the 'prepareText' with the arbitrary maximum number of the words in the lines given as the first argument.
prepareTextN :: Int -> String -> [String]
prepareTextN n = filter (any isUkrainianL) . splitLinesN n .
  map (unwords . concatenated2. auxiliary1 . complexWords . words . filter (\t -> isAlpha t || isSpC t)) .
       filter (not . null) . lines

auxiliary1 :: [String] -> [String]
auxiliary1 (xs:ys:zs:xss)
  | isConcatenated ys || isPreposition ys =
      auxiliary1 (xs:auxiliary1 ((ys ++ (drop 1 . jottedConv $ ' ':zs)):xss))
  | isConcatenated xs || isPreposition xs = auxiliary1 ((xs ++ (drop 1 . jottedConv $ ' ':ys)):zs:xss)
  | otherwise = xs:auxiliary1 (ys:zs:xss)
auxiliary1 x@(xs:ys:xss)
  | isConcatenated xs || isPreposition xs = auxiliary1 ((xs ++ (drop 1 . jottedConv $ ' ':ys)):xss)
  | otherwise = x
auxiliary1 xss = xss

isPreposition :: String -> Bool
isPreposition ts =
  getBFstL' False
   (zip ["\1030\1079", "\1041\1077\1079", "\1041\1110\1083\1103", "\1042",
    "\1042\1110\1076", "\1044\1083\1103", "\1044\1086", "\1047",
    "\1047\1072", "\1047\1072\1088\1072\1076\1080", "\1047\1110",
    "\1050", "\1050\1086\1083\1086", "\1050\1088\1110\1079\1100",
    "\1050\1088\1110\1084", "\1052\1077\1078", "\1052\1077\1078\1080",
    "\1052\1110\1078", "\1053\1072", "\1053\1072\1076", "\1054",
    "\1054\1073", "\1054\1076", "\1054\1082\1088\1110\1084",
    "\1055\1077\1088\1077\1076", "\1055\1086", "\1055\1088\1080",
    "\1055\1088\1086", "\1055\1088\1086\1090\1080",
    "\1055\1110\1076", "\1055\1110\1089\1083\1103",
    "\1057\1077\1088\1077\1076", "\1057\1077\1088\1077\1076\1080",
    "\1059", "\1063\1077\1088\1077\1079", "\1073\1077\1079",
    "\1073\1110\1083\1103", "\1074", "\1074\1110\1076",
    "\1076\1083\1103", "\1076\1086", "\1079", "\1079\1072",
    "\1079\1072\1088\1072\1076\1080", "\1079\1110",
    "\1082", "\1082\1086\1083\1086", "\1082\1088\1110\1079\1100",
    "\1082\1088\1110\1084", "\1084\1077\1078", "\1084\1077\1078\1080",
    "\1084\1110\1078", "\1085\1072", "\1085\1072\1076", "\1086",
    "\1086\1073", "\1086\1076", "\1086\1082\1088\1110\1084",
    "\1087\1077\1088\1077\1076", "\1087\1086", "\1087\1088\1080",
    "\1087\1088\1086", "\1087\1088\1086\1090\1080", "\1087\1110\1076",
    "\1087\1110\1089\1083\1103", "\1089\1077\1088\1077\1076",
    "\1089\1077\1088\1077\1076\1080", "\1091",
    "\1095\1077\1088\1077\1079", "\1110\1079"] $
                           replicate 200 True) ts
{-# INLINE isPreposition #-}

-- | Since the dobutokO-poetry version 0.16.3.0 the (||) operator has been changed to the (&&).
-- The idea is that these words are the ones that are pronouns and they \"should\" be treated
-- (by the author's understanding) as independent words.
isConcatenated :: String -> Bool
isConcatenated ts
 | null ts = False
 | otherwise = compare (length ts) 2 /= GT &&
    getBFstL' True (zip ["\1028", "\1042\1080", "\1052\1080", "\1058\1080", "\1058\1110",
     "\1062\1110", "\1071", "\1074\1080", "\1084\1080", "\1090\1080", "\1090\1110",
      "\1094\1110", "\1103", "\1108"] $ replicate 14 False) ts && (head ts `notElem` "\1031\1111")
{-# INLINE isConcatenated #-}

concatenated2 :: [String] -> [String]
concatenated2 (xs:ys:xss) =
 getBFstL' (xs:concatenated2 (ys:xss)) (zip ["\1040\1073\1086","\1040\1076\1078\1077",
 "\1040\1083\1077","\1040\1085\1110\1078","\1041\1086\1076\1072\1081",
 "\1041\1091\1094\1110\1084\1090\1086","\1042\1078\1077","\1042\1080\1082\1083\1102\1095\1085\1086",
 "\1042\1083\1072\1089\1085\1077","\1042\1090\1110\1084","\1044\1072\1074\1072\1081",
 "\1047\1072\1090\1077","\1050\1086\1083\1080","\1051\1077\1076\1074\1077","\1051\1080\1096\1077",
 "\1052\1072\1081\1078\1077","\1052\1086\1074","\1052\1086\1074\1073\1080",
 "\1052\1086\1074\1073\1080\1090\1086","\1053\1072\1074\1110\1090\1100",
 "\1053\1072\1089\1082\1110\1083\1100\1082\1080","\1053\1072\1095\1077","\1053\1072\1095\1077\1073",
 "\1053\1072\1095\1077\1073\1090\1086","\1053\1077\1074\1078\1077","\1053\1077\1084\1086\1074",
 "\1053\1077\1084\1086\1074\1073\1080","\1053\1077\1084\1086\1074\1073\1080\1090\1086",
 "\1053\1077\1085\1072\1095\1077","\1053\1077\1085\1072\1095\1077\1073\1090\1086",
 "\1053\1077\1093\1072\1081","\1053\1090\1078\1077","\1053\1110\1073\1080",
 "\1053\1110\1073\1080\1090\1086","\1053\1110\1078","\1054\1090\1086\1078",
 "\1055\1088\1080\1090\1086\1084\1091","\1055\1088\1080\1090\1110\1084",
 "\1055\1088\1080\1095\1086\1084\1091","\1055\1088\1080\1095\1110\1084",
 "\1055\1088\1086\1090\1077","\1057\1072\1084\1077","\1057\1077\1073\1090\1086",
 "\1058\1072\1082\1080","\1058\1086\1073\1090\1086","\1058\1110\1083\1100\1082\1080",
 "\1061\1072\1081","\1061\1086\1095","\1061\1110\1073\1072","\1062\1077\1073\1090\1086",
 "\1065\1086\1073","\1071\1082\1073\1080","\1071\1082\1088\1072\1079","\1071\1082\1097\1086",
 "\1072\1073\1086","\1072\1076\1078\1077","\1072\1083\1077","\1072\1085\1110\1078",
 "\1073\1086\1076\1072\1081","\1073\1091\1094\1110\1084\1090\1086","\1074\1078\1077",
 "\1074\1080\1082\1083\1102\1095\1085\1086","\1074\1083\1072\1089\1085\1077",
 "\1074\1090\1110\1084","\1076\1072\1074\1072\1081","\1079\1072\1090\1077","\1082\1086\1083\1080",
 "\1083\1077\1076\1074\1077","\1083\1080\1096\1077","\1084\1072\1081\1078\1077","\1084\1086\1074",
 "\1084\1086\1074\1073\1080","\1084\1086\1074\1073\1080\1090\1086","\1085\1072\1074\1110\1090\1100",
 "\1085\1072\1089\1082\1110\1083\1100\1082\1080","\1085\1072\1095\1077","\1085\1072\1095\1077\1073",
 "\1085\1072\1095\1077\1073\1090\1086","\1085\1077\1074\1078\1077","\1085\1077\1084\1086\1074",
 "\1085\1077\1084\1086\1074\1073\1080","\1085\1077\1084\1086\1074\1073\1080\1090\1086",
 "\1085\1077\1085\1072\1095\1077","\1085\1077\1085\1072\1095\1077\1073\1090\1086",
 "\1085\1077\1093\1072\1081","\1085\1110\1073\1080","\1085\1110\1073\1080\1090\1086",
 "\1085\1110\1078","\1086\1090\1078\1077","\1086\1090\1086\1078","\1087\1088\1080\1090\1086\1084\1091",
 "\1087\1088\1080\1090\1110\1084","\1087\1088\1080\1095\1086\1084\1091","\1087\1088\1080\1095\1110\1084",
 "\1087\1088\1086\1090\1077","\1089\1072\1084\1077","\1089\1077\1073\1090\1086","\1090\1072\1082\1080",
 "\1090\1086\1073\1090\1086","\1090\1110\1083\1100\1082\1080","\1093\1072\1081","\1093\1086\1095",
 "\1093\1110\1073\1072","\1094\1077\1073\1090\1086","\1097\1086\1073","\1103\1082\1073\1080",
 "\1103\1082\1088\1072\1079","\1103\1082\1097\1086"] $ replicate 200 ((xs ++ (drop 1 . jottedConv $ ' ':ys)):concatenated2 xss)) xs
concatenated2 xss = xss

isSpC :: Char -> Bool
isSpC x = x == '\'' || x == ' ' || x == '\x2019' || x == '\x02BC' || x == '-'
{-# INLINE isSpC #-}

jottedConv :: String -> String
jottedConv (x:y:xs)
  | isSpC x = x:(getBFstL' (jottedConv (y:xs))
     [('\1028', '\1049':'\1077':jottedConv xs),
      ('\1031', '\1049':'\1110':jottedConv xs),
      ('\1070', '\1049':'\1091':jottedConv xs),
      ('\1071', '\1049':'\1072':jottedConv xs),
      ('\1102', '\1081':'\1091':jottedConv xs),
      ('\1103', '\1081':'\1072':jottedConv xs),
      ('\1108', '\1081':'\1077':jottedConv xs),
      ('\1111', '\1081':'\1110':jottedConv xs)] y)
  | otherwise = x:jottedConv (y:xs)
jottedConv xs = xs

-- | Can be used to prepare the text after 'convertToProperUkrainian' from 'Melodics.Ukrainian' module from @mmsyn6ukr@ package so that all the 'String' can
-- be represented as unique 'Char'.
aux4 :: String -> Char
aux4 xs
  | xs == "\1076\1078" = 'j'
  | xs == "\1076\1079" = 'z'
  | xs == "\1089\1100" = 's'
  | xs == "\1094\1100" = 'c'
  | null xs = error "Phonetic.Languages.Ukrainian.PrepareText.aux4: Empty String. "
  | otherwise = head xs

-- | Is taken from the @mmsyn6ukr@ package version 0.8.1.0 so that the amount of dependencies are reduced (and was slightly modified).
isUkrainianL :: Char -> Bool
isUkrainianL y | (y >= '\1040' && y <= '\1065') || (y >= '\1070' && y <= '\1097') = True
               | otherwise = getBFstL' False (map (\x -> (x, True)) $ "\1028\1030\1031\1068\1100\1102\1103\1108\1110\1111\1168\1169\8217") y

-------------------------------------

{-| @ since 0.2.0.0
Given a positive number and a list tries to rearrange the list's 'String's by concatenation of the several elements of the list
so that the number of words in every new 'String' in the resulting list is not greater than the 'Int' argument. If some of the
'String's have more than that number quantity of the words then these 'String's are preserved.
-}
growLinesN :: Int -> [String] -> [String]
growLinesN n xss
 | null xss || n < 0 = []
 | otherwise = unwords yss : growLinesN n zss
     where l = length . takeWhile (<= n) . scanl1 (+) . map (length . words) $ xss -- the maximum number of lines to be taken
           (yss,zss) = splitAt (max l 1) xss

{-| @ since 0.2.0.0
The function combines the 'prepareTextN' and 'growLinesN' function. Applies needed phonetic language preparations
to the Ukrainian text and tries to \'grow\' the resulting 'String's in the list so that the number of the words in every
of them is no greater than the given first 'Int' number.
-}
prepareGrowTextMN
  :: Int -- ^ A maximum number of the words or their concatenations in the resulting list of 'String's.
  -> Int -- ^ A number of words in every 'String' that the function firstly forms. To have some sense of usage, must be less than the first argument.
  -> String
  -> [String]
prepareGrowTextMN m n = growLinesN m . prepareTextN n
{-# INLINE prepareGrowTextMN #-}