{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE TemplateHaskell #-}
module Examine.Examine
( examineCmd
)
where
import Control.Monad.Logger
import Control.Monad.Trans.Class
import Control.Monad.Trans.Reader
import qualified Data.ByteString.Lazy.Char8 as L
import Text.Printf
import Examine.Options
import Tools
import ELynx.Data.Sequence.MultiSequenceAlignment
import ELynx.Data.Sequence.Sequence
import ELynx.Tools.InputOutput
examineMSA :: Bool -> MultiSequenceAlignment -> L.ByteString
examineMSA perSiteFlag msa =
L.unlines [ L.pack $ "Total number of columns in alignment: "
++ show (msaLength msa)
, L.pack $ "Number of columns without gaps: "
++ show (msaLength msaNoGaps)
, L.pack $ "Number of columns with standard characters only: "
++ show (msaLength msaOnlyStd)
, L.empty
, L.pack $ "Total number of characters: " ++ show nTot
, L.pack $ "Standard (i.e., not extended IUPAC) characters: "
++ show (nTot - nIUPAC - nGaps - nUnknowns)
, L.pack $ "Extended IUPAC characters: " ++ show nIUPAC
, L.pack $ "Gaps: " ++ show nGaps
, L.pack $ "Unknowns: " ++ show nUnknowns
, L.pack $ "Percentage of standard characters: "
++ printf "%.3f" (1.0 - percentageIUPAC - percentageGaps - percentageUnknowns)
, L.pack $ "Percentage of extended IUPAC characters: "
++ printf "%.3f" percentageIUPAC
, L.pack $ "Percentage of gaps: "
++ printf "%.3f" percentageGaps
, L.pack $ "Percentage of unknowns: "
++ printf "%.3f" percentageUnknowns
, L.empty
, L.pack "Mean effective number of states (measured using entropy):"
, L.pack "Across whole alignment: "
<> L.pack (printf "%.3f" kEffMean)
, L.pack "Across columns without gaps: "
<> L.pack (printf "%.3f" kEffMeanNoGaps)
, L.pack "Across columns without extended IUPAC characters: "
<> L.pack (printf "%.3f" kEffMeanOnlyStd)
]
<> perSiteBS
where
nTot = msaLength msa * msaNSequences msa
nIUPAC = countIUPACChars msa
nGaps = countGaps msa
nUnknowns = countUnknowns msa
percentageIUPAC = fromIntegral nIUPAC / fromIntegral nTot :: Double
percentageGaps = fromIntegral nGaps / fromIntegral nTot :: Double
percentageUnknowns = fromIntegral nUnknowns / fromIntegral nTot :: Double
msaNoGaps = filterColumnsNoGaps msa
msaOnlyStd = filterColumnsOnlyStd msaNoGaps
kEffs = kEffEntropy . toFrequencyData $ msa
kEffsNoGaps = kEffEntropy . toFrequencyData $ msaNoGaps
kEffsOnlyStd = kEffEntropy . toFrequencyData $ msaOnlyStd
kEffMean = sum kEffs / fromIntegral (length kEffs)
kEffMeanNoGaps = sum kEffsNoGaps / fromIntegral (length kEffsNoGaps)
kEffMeanOnlyStd = sum kEffsOnlyStd / fromIntegral (length kEffsOnlyStd)
perSiteBS = if perSiteFlag
then L.unlines [ L.pack "Effective number of used states per site:"
, L.pack . show $ kEffs
]
else L.empty
examine :: Bool -> [Sequence] -> L.ByteString
examine perSiteFlag ss = summarizeSequenceList ss <>
case fromSequenceList ss of
Left _ -> L.empty
Right msa -> L.pack "\n" <> examineMSA perSiteFlag msa
examineCmd :: Maybe FilePath -> Examine ()
examineCmd outFileBaseName = do
$(logInfo) "Command: Examine sequences."
ExamineArguments al inFile perSiteFlag <- lift ask
ss <- readSeqs al inFile
let result = examine perSiteFlag ss
let outFilePath = (++ ".out") <$> outFileBaseName
io "result of examination" result outFilePath