{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE TemplateHaskell #-}

-- |
-- Module      :  SLynx.Simulate.Simulate
-- Description :  Simulate multiple sequence alignments
-- Copyright   :  (c) Dominik Schrempf 2020
-- License     :  GPL-3.0-or-later
--
-- Maintainer  :  dominik.schrempf@gmail.com
-- Stability   :  unstable
-- Portability :  portable
--
-- Creation date: Mon Jan 28 14:12:52 2019.
module SLynx.Simulate.Simulate
  ( simulateCmd,
  )
where

import Control.Applicative ((<|>))
import Control.Concurrent
import Control.Concurrent.Async
import Control.Monad
  ( unless,
    when,
  )
import Control.Monad.IO.Class
import Control.Monad.Logger
import Control.Monad.Trans.Class
import Control.Monad.Trans.Reader (ask)
import qualified Data.ByteString.Builder as BB
import qualified Data.ByteString.Lazy.Char8 as BL
import Data.List.NonEmpty (toList)
import Data.Maybe
import qualified Data.Set as Set
import qualified Data.Text as T
import qualified Data.Text.Lazy as LT
import qualified Data.Text.Lazy.Encoding as LT
import qualified Data.Vector.Unboxed as V
import ELynx.Data.Alphabet.Alphabet as A
import ELynx.Data.MarkovProcess.GammaRateHeterogeneity
import qualified ELynx.Data.MarkovProcess.MixtureModel as M
import qualified ELynx.Data.MarkovProcess.PhyloModel as P
import qualified ELynx.Data.MarkovProcess.SubstitutionModel as SM
import qualified ELynx.Data.Sequence.Alignment as A
import qualified ELynx.Data.Sequence.Sequence as Seq hiding
  ( name,
  )
import ELynx.Tree
import ELynx.Export.Sequence.Fasta
import ELynx.Import.MarkovProcess.EDMModelPhylobayes
import ELynx.Import.MarkovProcess.SiteprofilesPhylobayes
import ELynx.Simulate.MarkovProcessAlongTree
import ELynx.Tools
import Numeric.LinearAlgebra hiding
  ( toList,
    (<>),
  )
import SLynx.Simulate.Options
import SLynx.Simulate.PhyloModel
import System.Random.MWC
import Text.Printf

-- Simulate a 'Alignment' for a given phylogenetic model,
-- phylogenetic tree, and alignment length.
simulateAlignment ::
  (Measurable e, Named a) =>
  P.PhyloModel ->
  Tree e a ->
  Int ->
  GenIO ->
  IO A.Alignment
simulateAlignment pm t' n g = do
  let t = getLen <$> toTreeBranchLabels t'
  c <- getNumCapabilities
  gs <- splitGen c g
  let chunks = getChunks c n
  leafStatesS <- case pm of
    -- TODO @performace: This parallelization is not very intelligent, because
    -- the matrix exponentiation is done in all threads. So ten threads will
    -- exponentiate the same matrix ten times.
    P.SubstitutionModel sm ->
      mapConcurrently
        (\(num, gen) -> simulateAndFlatten num d e t gen)
        (zip chunks gs)
      where
        d = SM.stationaryDistribution sm
        e = SM.exchangeabilityMatrix sm
    -- P.MixtureModel mm      -> mapConcurrently
    --   (\(num, gen) -> simulateAndFlattenNSitesAlongTreeMixtureModel num ws ds es t gen) (zip chunks gs)
    P.MixtureModel mm -> simulateAndFlattenMixtureModelPar n ws ds es t g
      where
        ws = vector . toList $ M.getWeights mm
        ds = map SM.stationaryDistribution $ toList $ M.getSubstitutionModels mm
        es = map SM.exchangeabilityMatrix $ toList $ M.getSubstitutionModels mm
  -- XXX @performace. The horizontal concatenation might be slow. If so,
  -- 'concatenateSeqs' or 'concatenateAlignments' can be used, which directly
  -- appends vectors.
  let leafStates = horizontalConcat leafStatesS
      leafNames = map getName $ leaves t'
      code = P.getAlphabet pm
      -- XXX: Probably use type safe stuff here?
      alph = A.all $ alphabetSpec code
      sequences =
        [ Seq.Sequence sName "" code (V.fromList $ map (`Set.elemAt` alph) ss)
          | (sName, ss) <- zip leafNames leafStates
        ]
  return $ either error id $ A.fromSequences sequences

-- Summarize EDM components; line to be printed to screen or log.
summarizeEDMComponents :: [EDMComponent] -> BL.ByteString
summarizeEDMComponents cs =
  BL.pack $
    "Empiricial distribution mixture model with "
      ++ show (length cs)
      ++ " components."

-- XXX. Maybe provide human readable model file. But then, why is this
-- necessary. A human readable summary is reported anyways, and for Protein
-- models the exchangeabilities are too many.
reportModel :: P.PhyloModel -> ELynx SimulateArguments ()
reportModel m = do
  bn <- outFileBaseName . global <$> ask
  case bn of
    Nothing ->
      $(logInfo)
        "No output file provided; omit writing machine-readable phylogenetic model."
    Just _ ->
      out "model definition (machine readable)" (BL.pack (show m) <> "\n") ".model.gz"

pretty :: BranchLength -> String
pretty = printf "%.5f"

prettyRow :: String -> String -> BL.ByteString
prettyRow name val = alignLeft 33 n <> alignRight 8 v
  where
    n = BL.pack name
    v = BL.pack val

-- | Examine branches of a tree.
summarizeBranchLengths :: Measurable e => Tree e a -> BL.ByteString
summarizeBranchLengths t =
  BL.intercalate
    "\n"
    [ prettyRow "Origin height: " $ pretty h,
      prettyRow "Average distance origin to leaves: " $ pretty h',
      prettyRow "Total branch length: " $ pretty b
    ]
  where
    n = length $ leaves t
    h = height t
    h' = sum (distancesOriginLeaves t) / fromIntegral n
    b = totalBranchLength t

-- Summarize a substitution model; lines to be printed to screen or log.
summarizeSM :: SM.SubstitutionModel -> [BL.ByteString]
summarizeSM sm =
  map BL.pack $
    (show (SM.alphabet sm) ++ " substitution model: " ++ SM.name sm ++ ".") :
    ["Parameters: " ++ show (SM.params sm) ++ "." | not (null (SM.params sm))]
      ++ case SM.alphabet sm of
        DNA ->
          [ "Stationary distribution: "
              ++ dispv precision (SM.stationaryDistribution sm)
              ++ ".",
            "Exchangeability matrix:\n"
              ++ dispmi 2 precision (SM.exchangeabilityMatrix sm)
              ++ ".",
            "Scale: " ++ show (roundN precision $ SM.totalRate sm) ++ "."
          ]
        Protein ->
          [ "Stationary distribution: "
              ++ dispv precision (SM.stationaryDistribution sm)
              ++ ".",
            "Scale: " ++ show (roundN precision $ SM.totalRate sm) ++ "."
          ]
        _ ->
          error
            "Extended character sets are not supported with substitution models."

-- Summarize a mixture model component; lines to be printed to screen or log.
summarizeMMComponent :: M.Component -> [BL.ByteString]
summarizeMMComponent c =
  BL.pack "Weight: "
    <> (BB.toLazyByteString . BB.doubleDec $ M.weight c) :
  summarizeSM (M.substModel c)

-- Summarize a mixture model; lines to be printed to screen or log.
summarizeMM :: M.MixtureModel -> [BL.ByteString]
summarizeMM m =
  [ BL.pack $ "Mixture model: " ++ M.name m ++ ".",
    BL.pack $ "Number of components: " ++ show n ++ "."
  ]
    ++ detail
  where
    n = length $ M.components m
    detail =
      if n <= 100
        then
          concat
            [ BL.pack ("Component " ++ show i ++ ":") : summarizeMMComponent c
              | (i, c) <- zip [1 :: Int ..] (toList $ M.components m)
            ]
        else []

-- Summarize a phylogenetic model; lines to be printed to screen or log.
summarizePM :: P.PhyloModel -> [BL.ByteString]
summarizePM (P.MixtureModel mm) = summarizeMM mm
summarizePM (P.SubstitutionModel sm) = summarizeSM sm

-- | Simulate sequences.
simulateCmd :: ELynx SimulateArguments ()
simulateCmd = do
  l <- local <$> ask
  let treeFile = argsTreeFile l
  $(logInfo) ""
  $(logInfo) $ T.pack $ "Read tree from file '" ++ treeFile ++ "'."
  tree <- liftIO $ parseFileWith (newick Standard) treeFile
  let t' = either error id $ phyloToLengthTree tree
  $(logInfo) $ T.pack $ "Number of leaves: " ++ show (length $ leaves t')
  $(logInfo) $ LT.toStrict $ LT.decodeUtf8 $ summarizeBranchLengths t'
  let edmFile = argsEDMFile l
  let sProfileFiles = argsSiteprofilesFiles l
  $(logInfo) ""
  $(logDebug) "Read EDM file or siteprofile files."
  when (isJust edmFile && isJust sProfileFiles) $
    error "Got both: --edm-file and --siteprofile-files."
  edmCs <- case edmFile of
    Nothing -> return Nothing
    Just edmF -> do
      $(logInfo) "Read EDM file."
      liftIO $ Just <$> parseFileWith phylobayes edmF
  maybe
    (return ())
    ($(logInfo) . LT.toStrict . LT.decodeUtf8 . summarizeEDMComponents)
    edmCs
  sProfiles <- case sProfileFiles of
    Nothing -> return Nothing
    Just fns -> do
      $(logInfo) $
        T.pack $
          "Read siteprofiles from "
            ++ show (length fns)
            ++ " file(s)."
      $(logDebug) $ T.pack $ "The file names are:" ++ show fns
      xs <- liftIO $ mapM (parseFileWith siteprofiles) fns
      return $ Just $ concat xs
  maybe
    (return ())
    ($(logInfo) . LT.toStrict . LT.decodeUtf8 . summarizeEDMComponents)
    sProfiles
  let edmCsOrSiteprofiles = edmCs <|> sProfiles
  $(logInfo) "Read model string."
  let ms = argsSubstitutionModelString l
      mm = argsMixtureModelString l
      mws = argsMixtureWeights l
      eitherPhyloModel' = getPhyloModel ms mm mws edmCsOrSiteprofiles
  phyloModel' <- case eitherPhyloModel' of
    Left err -> lift $ error err
    Right pm -> return pm
  let maybeGammaParams = argsGammaParams l
  phyloModel <- case maybeGammaParams of
    Nothing -> do
      $(logInfo) $
        LT.toStrict $
          LT.decodeUtf8 $
            BL.unlines $
              summarizePM
                phyloModel'
      return phyloModel'
    Just (n, alpha) -> do
      $(logInfo) $
        LT.toStrict $
          LT.decodeUtf8 $
            BL.intercalate "\n" $
              summarizePM phyloModel'
      $(logInfo) ""
      $(logInfo) $
        LT.toStrict $
          LT.decodeUtf8 $
            BL.intercalate "\n" $
              summarizeGammaRateHeterogeneity n alpha
      return $ expand n alpha phyloModel'
  -- -- XXX: Do not report possibly huge empirical distribution mixture models
  -- -- for now, because it takes too long and uses too much disk space :).
  unless
    (isJust sProfiles)
    ( do
        $(logInfo) ""
        reportModel phyloModel
        $(logInfo) ""
    )
  $(logInfo) "Simulate alignment."
  let alignmentLength = argsLength l
  $(logInfo) $ T.pack $ "Length: " <> show alignmentLength <> "."
  gen <- case argsSeed l of
    Random ->
      error "simulateCmd: seed not available; please contact maintainer."
    Fixed s -> liftIO $ initialize s
  alignment <- liftIO $ simulateAlignment phyloModel t' alignmentLength gen
  let output = (sequencesToFasta . A.toSequences) alignment
  $(logInfo) ""
  out "simulated multi sequence alignment" output ".fasta"