-- |
-- Module      :  Data.Phonetic.Languages.PrepareText
-- Copyright   :  (c) OleksandrZhabenko 2020-2021
-- License     :  MIT
-- Stability   :  Experimental
-- Maintainer  :  olexandr543@yahoo.com
--
-- Helps to order the 7 or less phonetic language words (or their concatenations)
-- to obtain (to some extent) suitable for poetry or music text.
-- Earlier it has been a module DobutokO.Poetry.Ukrainian.PrepareText
-- from the @dobutokO-poetry@ package.
-- In particular, this module can be used to prepare the phonetic language text
-- by applying the most needed grammar to avoid misunderstanding
-- for the produced text. The attention is paid to the prepositions, pronouns, conjunctions
-- and particles that are most commonly connected (or not) in a significant way
-- with the next text.
-- Uses the information from:
-- https://uk.wikipedia.org/wiki/%D0%A1%D0%BF%D0%BE%D0%BB%D1%83%D1%87%D0%BD%D0%B8%D0%BA
-- and
-- https://uk.wikipedia.org/wiki/%D0%A7%D0%B0%D1%81%D1%82%D0%BA%D0%B0_(%D0%BC%D0%BE%D0%B2%D0%BE%D0%B7%D0%BD%D0%B0%D0%B2%D1%81%D1%82%D0%B2%D0%BE)
--
-- Uses arrays instead of vectors.
-- A list of basic (but, probably not complete and needed to be extended as needed) English words (the articles, pronouns,
-- particles, conjunctions etc.) the corresponding phonetic language translations of which are intended to be used as a
-- 'Concatenations' here is written to the file EnglishConcatenated.txt in the source tarball.

module Data.Phonetic.Languages.PrepareText (
  Concatenations
  -- * Basic functions
  , prepareText
  , prepareTextN
  , complexWords
  , splitLines
  , splitLinesN
  , isSpC
  -- * Used to transform after convertToProperphonetic language from mmsyn6ukr package
  , isPLL
) where

import CaseBi.Arr (getBFstL')
import Data.List.InnToOut.Basic (mapI)
import Data.Char (isAlpha,toLower)
import GHC.Arr

{-| The lists in the list are sorted in the descending order by the word counts in the inner 'String's. All the 'String's
in each inner list have the same number of words, and if there is no 'String' with some intermediate number of words (e. g. there
are not empty 'String's for 4 and 2 words, but there is no one for 3 words 'String's) then such corresponding list is empty, but
it is, nevertheless, present. Probably the maximum number of words can be no more than 4, and the minimum number can be
probably no less than 1, but it depends (especially for the maximum). The 'String's in the inner lists must be (unlike the inner
lists themselves) sorted in the ascending order for the data type to work correctly in the functions of the module.
-}
type Concatenations = [[String]]

-- | Is used to convert a phonetic language text into list of 'String' each of which is ready to be
-- used by the functions from the other modules in the package.
-- It applies minimal grammar links and connections between the most commonly used phonetic language
-- words that \"should\" be paired and not dealt with separately
-- to avoid the misinterpretation and preserve maximum of the semantics for the
-- \"phonetic\" language on the phonetic language basis.
prepareText :: Concatenations -> String -> String -> [String]
prepareText ysss xs = filter (any (isPLL xs)) . splitLines . map (unwords . complexWords ysss ysss . words .
  filter (\t -> isAlpha t || isSpC t)) . filter (not . null) . lines

-- | Concatenates complex words in phonetic language so that they are not separated further by possible words order rearrangements (because they are treated
-- as a single word). This is needed to preserve basic grammar in phonetic languages.
complexWords :: Concatenations -> Concatenations -> [String] -> [String]
complexWords rsss ysss@(yss:tsss) zss@(ts:xss)
 | null yss = complexWords rsss tsss zss
 | otherwise =
    let y = length . words . head $ yss
        uwxs = unwords . take y $ zss in getBFstL' (complexWords rsss tsss zss) (map (\ys ->
           (ys,complexWords rsss tsss ((filter (/= ' ') ys ++ ts):xss))) yss) uwxs
complexWords rsss [] zss@(xs:xss) = xs:complexWords rsss rsss xss
complexWords _ _ [] = []

-- | A generalized variant of the 'prepareText' with the arbitrary maximum number of the words in the lines given as the first argument.
prepareTextN :: Int -> Concatenations -> String -> String -> [String]
prepareTextN n ysss xs = filter (any (isPLL xs)) . splitLinesN n . map (unwords . complexWords ysss ysss . words .
  filter (\t -> isAlpha t || isSpC t)) . filter (not . null) . lines

isSpC :: Char -> Bool
isSpC x = x == '\'' || x == ' ' || x == '\x2019' || x == '\x02BC' || x == '-'
{-# INLINE isSpC #-}

-- | The first argument must be a 'String' of sorted 'Char's in the ascending order of all possible symbols that can be
-- used for the text in the phonetic language selected. Can be prepared beforehand, or read from the file. 
isPLL :: String -> Char -> Bool
isPLL xs y = getBFstL' False (zip xs . replicate 10000 $ True) y

-- | The function is recursive and is applied so that all returned elements ('String') are no longer than 7 words in them.
splitLines :: [String] -> [String]
splitLines xss
 | null xss = []
 | otherwise = mapI (\xs -> compare (length . words $ xs) 7 == GT) (\xs -> let yss = words xs in
     splitLines . map unwords . (\(q,r) -> [q,r]) . splitAt (length yss `quot` 2) $ yss) $ xss

-- | A generalized variant of the 'splitLines' with the arbitrary maximum number of the words in the lines given as the first argument.
splitLinesN :: Int -> [String] -> [String]
splitLinesN n xss
 | null xss || n <= 0 = []
 | otherwise = mapI (\xs -> compare (length . words $ xs) n == GT) (\xs -> let yss = words xs in
     splitLines . map unwords . (\(q,r) -> [q,r]) . splitAt (length yss `quot` 2) $ yss) $ xss