{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE TemplateHaskell   #-}

{- |

Module      :  Analyze.Analyze
Description :  Parse sequence file formats and analyze them
Copyright   :  (c) Dominik Schrempf 2018
License     :  GPL-3

Maintainer  :  dominik.schrempf@gmail.com
Stability   :  unstable
Portability :  portable

Creation date: Fri Oct  5 08:41:05 2018.

-}

module Examine.Examine
  ( examineCmd
  )
  where

import           Control.Monad.Logger
import           Control.Monad.Trans.Class
import           Control.Monad.Trans.Reader
import qualified Data.ByteString.Lazy.Char8                 as L
import           Text.Printf

import           Examine.Options
import           Tools

import           ELynx.Data.Sequence.MultiSequenceAlignment
import           ELynx.Data.Sequence.Sequence
import           ELynx.Tools.InputOutput

examineMSA :: Bool -> MultiSequenceAlignment -> L.ByteString
examineMSA perSiteFlag msa =
  L.unlines [ L.pack $ "Total number of columns in alignment: "
              ++ show (msaLength msa)
            , L.pack $ "Number of columns without gaps: "
              ++ show (msaLength msaNoGaps)
            , L.pack $ "Number of columns with standard characters only: "
              ++ show (msaLength msaOnlyStd)
            , L.empty
            , L.pack $ "Total number of characters: " ++ show nTot
            , L.pack $ "Standard (i.e., not extended IUPAC) characters: "
              ++ show (nTot - nIUPAC - nGaps - nUnknowns)
            , L.pack $ "Extended IUPAC characters: " ++ show nIUPAC
            , L.pack $ "Gaps: " ++ show nGaps
            , L.pack $ "Unknowns: " ++ show nUnknowns
            , L.pack $ "Percentage of standard characters: "
              ++ printf "%.3f" (1.0 - percentageIUPAC - percentageGaps - percentageUnknowns)
            , L.pack $ "Percentage of extended IUPAC characters: "
              ++ printf "%.3f" percentageIUPAC
            , L.pack $ "Percentage of gaps: "
              ++ printf "%.3f" percentageGaps
            , L.pack $ "Percentage of unknowns: "
              ++ printf "%.3f" percentageUnknowns
            , L.empty
            , L.pack "Mean effective number of states (measured using entropy):"
            , L.pack "Across whole alignment: "
              <> L.pack (printf "%.3f" kEffMean)
            , L.pack "Across columns without gaps: "
              <> L.pack (printf "%.3f" kEffMeanNoGaps)
            , L.pack "Across columns without extended IUPAC characters: "
              <> L.pack (printf "%.3f" kEffMeanOnlyStd)
            ]
  <> perSiteBS
  where
    nTot                = msaLength msa * msaNSequences msa
    nIUPAC              = countIUPACChars msa
    nGaps               = countGaps msa
    nUnknowns           = countUnknowns msa
    percentageIUPAC     = fromIntegral nIUPAC    / fromIntegral nTot :: Double
    percentageGaps      = fromIntegral nGaps     / fromIntegral nTot :: Double
    percentageUnknowns  = fromIntegral nUnknowns / fromIntegral nTot :: Double
    msaNoGaps           = filterColumnsNoGaps msa
    msaOnlyStd          = filterColumnsOnlyStd msaNoGaps
    kEffs               = kEffEntropy . toFrequencyData $ msa
    kEffsNoGaps         = kEffEntropy . toFrequencyData $ msaNoGaps
    kEffsOnlyStd        = kEffEntropy . toFrequencyData $ msaOnlyStd
    kEffMean            = sum kEffs / fromIntegral (length kEffs)
    kEffMeanNoGaps      = sum kEffsNoGaps  / fromIntegral (length kEffsNoGaps)
    kEffMeanOnlyStd     = sum kEffsOnlyStd / fromIntegral (length kEffsOnlyStd)
    perSiteBS           = if perSiteFlag
                          then L.unlines [ L.pack "Effective number of used states per site:"
                                         , L.pack . show $ kEffs
                                         ]
                          else L.empty

examine :: Bool -> [Sequence] -> L.ByteString
examine perSiteFlag ss = summarizeSequenceList ss <>
  case fromSequenceList ss of
    Left _    -> L.empty
    Right msa -> L.pack "\n" <> examineMSA perSiteFlag msa

-- | Examine sequences.
examineCmd :: Maybe FilePath -> Examine ()
examineCmd outFileBaseName = do
  $(logInfo) "Command: Examine sequences."
  ExamineArguments al inFile perSiteFlag <- lift ask
  ss <- readSeqs al inFile
  let result = examine perSiteFlag ss
  let outFilePath = (++ ".out") <$> outFileBaseName
  io "result of examination" result outFilePath