{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE TemplateHaskell #-}

-- |
--
-- Module      :  Analyze.Analyze
-- Description :  Parse sequence file formats and analyze them
-- Copyright   :  (c) Dominik Schrempf 2018
-- License     :  GPL-3.0-or-later
--
-- Maintainer  :  dominik.schrempf@gmail.com
-- Stability   :  unstable
-- Portability :  portable
--
-- Creation date: Fri Oct  5 08:41:05 2018.
module SLynx.Examine.Examine
  ( examineCmd,
  )
where

import Control.Monad.Logger
import Control.Monad.Trans.Reader (ask)
import qualified Data.ByteString.Lazy.Char8 as BL
import qualified Data.Set as S
import qualified Data.Vector.Unboxed as V
import qualified ELynx.Data.Alphabet.Alphabet as A
import qualified ELynx.Data.Alphabet.Character as C
import qualified ELynx.Data.Sequence.Alignment as M
import qualified ELynx.Data.Sequence.Distance as D
import qualified ELynx.Data.Sequence.Sequence as Seq
import ELynx.Tools
import SLynx.Examine.Options
import SLynx.Tools
import qualified Statistics.Sample as Sm
import Text.Printf

pRow :: String -> String -> BL.ByteString
pRow name val = alignLeft 50 n <> alignRight 10 v
  where
    n = BL.pack name
    v = BL.pack val

examineAlignment :: Bool -> M.Alignment -> BL.ByteString
examineAlignment perSiteFlag a =
  BL.unlines
    [ BL.pack
        "Sequences have equal length (multi sequence alignment, or single sequence).",
      BL.pack "Number of columns in alignment:",
      pRow "  Total:" $ show aL,
      pRow "  Constant:" $ show nConstant,
      pRow "  Constant (including gaps or unknowns):" $ show nConstantSoft,
      pRow "  Without gaps:" $ show (M.length aNoGaps),
      pRow "  With standard characters only:" $
        show (M.length aOnlyStd),
      BL.empty,
      pRow "Total number of characters:" $ show nTot,
      pRow "Standard (i.e., not extended IUPAC) characters:" $
        show (nTot - nIUPAC - nGaps - nUnknowns),
      pRow "Extended IUPAC characters:" $ show nIUPAC,
      pRow "Gaps:" $ show nGaps,
      pRow "Unknowns:" $ show nUnknowns,
      BL.empty,
      pRow "Percentage of standard characters:" $
        printf "%2.2f" (100.0 - percentIUPAC - percentGaps - percentUnknowns),
      pRow "Percentage of extended IUPAC characters:" $
        printf "%2.2f" percentIUPAC,
      pRow "Percentage of gaps:" $ printf "%2.2f" percentGaps,
      pRow "Percentage of unknowns:" $ printf "%2.2f" percentUnknowns,
      BL.empty,
      BL.pack "Distribution of characters:",
      -- Holy crap.
      BL.pack $
        concatMap ((: "     ") . C.toChar) $
          S.toList $
            A.std $
              A.alphabetSpec $
                M.alphabet a,
      BL.pack $ unwords $ map (printf "%.3f") charFreqs,
      BL.empty,
      BL.pack "Pairwise hamming distances (per site):",
      pRow "  Mean:" $ printf "%.3f" hMean,
      pRow "  Standard deviation:" $ printf "%.3f" $ sqrt hVar,
      pRow "  Minimum:" $ printf "%.3f" hMin,
      pRow "  Maximum:" $ printf "%.3f" hMax,
      BL.empty,
      BL.pack "Mean effective number of states (measured using entropy):",
      pRow "Across whole alignment:" $ printf "%.3f" kEffMean,
      pRow "Across columns without gaps:" $ printf "%.3f" kEffMeanNoGaps,
      pRow "Across columns without extended IUPAC characters:" $
        printf "%.3f" kEffMeanOnlyStd,
      BL.empty,
      BL.pack "Mean effective number of states (measured using homoplasy):",
      pRow "Across whole alignment:" $ printf "%.3f" kEffMeanHomo,
      pRow "Across columns without gaps:" $ printf "%.3f" kEffMeanNoGapsHomo,
      pRow "Across columns without extended IUPAC characters:" $
        printf "%.3f" kEffMeanOnlyStdHomo
    ]
    <> perSiteBS
  where
    aL = M.length a
    nConstant = M.length $ M.filterColsConstant a
    nConstantSoft = M.length $ M.filterColsConstantSoft a
    nTot = M.length a * M.nSequences a
    nIUPAC = M.countIUPACChars a
    nGaps = M.countGaps a
    nUnknowns = M.countUnknowns a
    percentIUPAC = 100 * fromIntegral nIUPAC / fromIntegral nTot :: Double
    percentGaps = 100 * fromIntegral nGaps / fromIntegral nTot :: Double
    percentUnknowns = 100 * fromIntegral nUnknowns / fromIntegral nTot :: Double
    aNoGaps = M.filterColsNoGaps a
    aNoGapsFreq = M.toFrequencyData aNoGaps
    aOnlyStd = M.filterColsOnlyStd aNoGaps
    aOnlyStdFreq = M.toFrequencyData aOnlyStd
    charFreqsPerSite = M.toFrequencyData a
    charFreqs = M.distribution charFreqsPerSite
    seqs = M.toSequences a
    normlz x = fromIntegral x / fromIntegral aL
    pairwiseHamming =
      V.fromList
        [ either error normlz $ D.hamming x y
          | x <- seqs,
            y <- seqs,
            x /= y
        ]
    (hMean, hVar) = Sm.meanVariance pairwiseHamming
    hMin = V.minimum pairwiseHamming
    hMax = V.maximum pairwiseHamming
    kEffs = M.kEffEntropy charFreqsPerSite
    kEffsNoGaps = M.kEffEntropy aNoGapsFreq
    kEffsOnlyStd = M.kEffEntropy aOnlyStdFreq
    kEffMean = sum kEffs / fromIntegral (length kEffs)
    kEffMeanNoGaps = sum kEffsNoGaps / fromIntegral (length kEffsNoGaps)
    kEffMeanOnlyStd = sum kEffsOnlyStd / fromIntegral (length kEffsOnlyStd)
    kEffsHomo = M.kEffHomoplasy charFreqsPerSite
    kEffsNoGapsHomo = M.kEffHomoplasy aNoGapsFreq
    kEffsOnlyStdHomo = M.kEffHomoplasy aOnlyStdFreq
    kEffMeanHomo = sum kEffsHomo / fromIntegral (length kEffsHomo)
    kEffMeanNoGapsHomo =
      sum kEffsNoGapsHomo / fromIntegral (length kEffsNoGapsHomo)
    kEffMeanOnlyStdHomo =
      sum kEffsOnlyStdHomo / fromIntegral (length kEffsOnlyStdHomo)
    perSiteBS =
      if perSiteFlag
        then
          BL.unlines
            [ BL.pack "Effective number of used states per site:",
              BL.pack . show $ kEffs
            ]
        else BL.empty

examine :: Bool -> [Seq.Sequence] -> BL.ByteString
examine perSiteFlag ss =
  Seq.summarizeSequences ss <> case M.fromSequences ss of
    Left _ -> BL.empty
    Right a -> BL.pack "\n" <> examineAlignment perSiteFlag a

-- | Examine sequences.
examineCmd :: ELynx ExamineArguments ()
examineCmd = do
  (ExamineArguments al inFile perSiteFlag) <- local <$> ask
  $(logInfo) "Command: Examine sequences."
  ss <- readSeqs al inFile
  let result = examine perSiteFlag ss
  out "result of examination" result ".out"