{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE TupleSections #-}

-- |
--
-- Module      :  Analyze.Analyze
-- Description :  Parse sequence file formats and analyze them
-- Copyright   :  2021 Dominik Schrempf
-- License     :  GPL-3.0-or-later
--
-- Maintainer  :  dominik.schrempf@gmail.com
-- Stability   :  unstable
-- Portability :  portable
--
-- Creation date: Fri Oct  5 08:41:05 2018.
module SLynx.Examine.Examine
  ( examineCmd,
  )
where

import Control.Monad
import Control.Monad.IO.Class
import Control.Monad.Trans.Reader
import qualified Data.ByteString.Lazy.Char8 as BL
import qualified Data.Matrix.Unboxed as MU
import qualified Data.Set as S
import qualified Data.Vector.Unboxed as V
import qualified ELynx.Alphabet.Alphabet as A
import qualified ELynx.Alphabet.Character as C
import qualified ELynx.Sequence.Alignment as M
import qualified ELynx.Sequence.Distance as D
import ELynx.Sequence.Divergence
import qualified ELynx.Sequence.Sequence as Seq
import ELynx.Tools.ByteString
import ELynx.Tools.ELynx
import ELynx.Tools.Environment
import ELynx.Tools.Logger
import qualified Numeric.LinearAlgebra as L
import SLynx.Examine.Options
import SLynx.Tools
import qualified Statistics.Sample as Sm
import System.IO
import Text.Printf

pRow :: String -> String -> BL.ByteString
pRow :: String -> String -> ByteString
pRow String
name String
val = Int -> ByteString -> ByteString
alignLeft Int
50 ByteString
n forall a. Semigroup a => a -> a -> a
<> Int -> ByteString -> ByteString
alignRight Int
10 ByteString
v
  where
    n :: ByteString
n = String -> ByteString
BL.pack String
name
    v :: ByteString
v = String -> ByteString
BL.pack String
val

examineAlignment :: Bool -> M.Alignment -> BL.ByteString
examineAlignment :: Bool -> Alignment -> ByteString
examineAlignment Bool
perSiteFlag Alignment
a =
  [ByteString] -> ByteString
BL.unlines
    [ ByteString
"Sequences have equal length (multi sequence alignment, or single sequence).",
      ByteString
"Number of columns in alignment:",
      String -> String -> ByteString
pRow String
"  Total:" forall a b. (a -> b) -> a -> b
$ forall a. Show a => a -> String
show Int
aL,
      String -> String -> ByteString
pRow String
"  Constant:" forall a b. (a -> b) -> a -> b
$ forall a. Show a => a -> String
show Int
nConstant,
      String -> String -> ByteString
pRow String
"  Constant (including gaps or unknowns):" forall a b. (a -> b) -> a -> b
$ forall a. Show a => a -> String
show Int
nConstantSoft,
      String -> String -> ByteString
pRow String
"  Without gaps:" forall a b. (a -> b) -> a -> b
$ forall a. Show a => a -> String
show (Alignment -> Int
M.length Alignment
aNoGaps),
      String -> String -> ByteString
pRow String
"  With standard characters only:" forall a b. (a -> b) -> a -> b
$
        forall a. Show a => a -> String
show (Alignment -> Int
M.length Alignment
aOnlyStd),
      ByteString
"",
      String -> String -> ByteString
pRow String
"Total number of characters:" forall a b. (a -> b) -> a -> b
$ forall a. Show a => a -> String
show Int
nTot,
      String -> String -> ByteString
pRow String
"Standard (i.e., not extended IUPAC) characters:" forall a b. (a -> b) -> a -> b
$
        forall a. Show a => a -> String
show (Int
nTot forall a. Num a => a -> a -> a
- Int
nIUPAC forall a. Num a => a -> a -> a
- Int
nGaps forall a. Num a => a -> a -> a
- Int
nUnknowns),
      String -> String -> ByteString
pRow String
"Extended IUPAC characters:" forall a b. (a -> b) -> a -> b
$ forall a. Show a => a -> String
show Int
nIUPAC,
      String -> String -> ByteString
pRow String
"Gaps:" forall a b. (a -> b) -> a -> b
$ forall a. Show a => a -> String
show Int
nGaps,
      String -> String -> ByteString
pRow String
"Unknowns:" forall a b. (a -> b) -> a -> b
$ forall a. Show a => a -> String
show Int
nUnknowns,
      ByteString
"",
      String -> String -> ByteString
pRow String
"Percentage of standard characters:" forall a b. (a -> b) -> a -> b
$
        forall r. PrintfType r => String -> r
printf String
"%2.2f" (Double
100.0 forall a. Num a => a -> a -> a
- Double
percentIUPAC forall a. Num a => a -> a -> a
- Double
percentGaps forall a. Num a => a -> a -> a
- Double
percentUnknowns),
      String -> String -> ByteString
pRow String
"Percentage of extended IUPAC characters:" forall a b. (a -> b) -> a -> b
$
        forall r. PrintfType r => String -> r
printf String
"%2.2f" Double
percentIUPAC,
      String -> String -> ByteString
pRow String
"Percentage of gaps:" forall a b. (a -> b) -> a -> b
$ forall r. PrintfType r => String -> r
printf String
"%2.2f" Double
percentGaps,
      String -> String -> ByteString
pRow String
"Percentage of unknowns:" forall a b. (a -> b) -> a -> b
$ forall r. PrintfType r => String -> r
printf String
"%2.2f" Double
percentUnknowns,
      ByteString
"",
      String -> ByteString
BL.pack String
"Distribution of characters:",
      -- Holy crap.
      String -> ByteString
BL.pack forall a b. (a -> b) -> a -> b
$
        forall (t :: * -> *) a b. Foldable t => (a -> [b]) -> t a -> [b]
concatMap ((forall a. a -> [a] -> [a]
: String
"     ") forall b c a. (b -> c) -> (a -> b) -> a -> c
. Character -> Char
C.toChar) forall a b. (a -> b) -> a -> b
$
          forall a. Set a -> [a]
S.toList forall a b. (a -> b) -> a -> b
$
            AlphabetSpec -> Set Character
A.std forall a b. (a -> b) -> a -> b
$
              Alphabet -> AlphabetSpec
A.alphabetSpec forall a b. (a -> b) -> a -> b
$
                Alignment -> Alphabet
M.alphabet Alignment
a,
      String -> ByteString
BL.pack forall a b. (a -> b) -> a -> b
$ [String] -> String
unwords forall a b. (a -> b) -> a -> b
$ forall a b. (a -> b) -> [a] -> [b]
map (forall r. PrintfType r => String -> r
printf String
"%.3f") [Double]
charFreqs,
      ByteString
"",
      String -> ByteString
BL.pack String
"Pairwise hamming distances (per site):",
      String -> String -> ByteString
pRow String
"  Mean:" forall a b. (a -> b) -> a -> b
$ forall r. PrintfType r => String -> r
printf String
"%.3f" Double
hMean,
      String -> String -> ByteString
pRow String
"  Standard deviation:" forall a b. (a -> b) -> a -> b
$ forall r. PrintfType r => String -> r
printf String
"%.3f" forall a b. (a -> b) -> a -> b
$ forall a. Floating a => a -> a
sqrt Double
hVar,
      String -> String -> ByteString
pRow String
"  Minimum:" forall a b. (a -> b) -> a -> b
$ forall r. PrintfType r => String -> r
printf String
"%.3f" Double
hMin,
      String -> String -> ByteString
pRow String
"  Maximum:" forall a b. (a -> b) -> a -> b
$ forall r. PrintfType r => String -> r
printf String
"%.3f" Double
hMax,
      ByteString
"",
      ByteString
"Mean effective number of states (measured using entropy):",
      String -> String -> ByteString
pRow String
"Across whole alignment:" forall a b. (a -> b) -> a -> b
$ forall r. PrintfType r => String -> r
printf String
"%.3f" Double
kEffMean,
      String -> String -> ByteString
pRow String
"Across columns without gaps:" forall a b. (a -> b) -> a -> b
$ forall r. PrintfType r => String -> r
printf String
"%.3f" Double
kEffMeanNoGaps,
      String -> String -> ByteString
pRow String
"Across columns without extended IUPAC characters:" forall a b. (a -> b) -> a -> b
$
        forall r. PrintfType r => String -> r
printf String
"%.3f" Double
kEffMeanOnlyStd,
      ByteString
"",
      String -> ByteString
BL.pack String
"Mean effective number of states (measured using homoplasy):",
      String -> String -> ByteString
pRow String
"Across whole alignment:" forall a b. (a -> b) -> a -> b
$ forall r. PrintfType r => String -> r
printf String
"%.3f" Double
kEffMeanHomo,
      String -> String -> ByteString
pRow String
"Across columns without gaps:" forall a b. (a -> b) -> a -> b
$ forall r. PrintfType r => String -> r
printf String
"%.3f" Double
kEffMeanNoGapsHomo,
      String -> String -> ByteString
pRow String
"Across columns without extended IUPAC characters:" forall a b. (a -> b) -> a -> b
$
        forall r. PrintfType r => String -> r
printf String
"%.3f" Double
kEffMeanOnlyStdHomo,
      ByteString
"",
      ByteString
"Divergence matrix:"
    ]
    forall a. Semigroup a => a -> a -> a
<> ByteString
perSiteBS
  where
    aL :: Int
aL = Alignment -> Int
M.length Alignment
a
    nConstant :: Int
nConstant = Alignment -> Int
M.length forall a b. (a -> b) -> a -> b
$ Alignment -> Alignment
M.filterColsConstant Alignment
a
    nConstantSoft :: Int
nConstantSoft = Alignment -> Int
M.length forall a b. (a -> b) -> a -> b
$ Alignment -> Alignment
M.filterColsConstantSoft Alignment
a
    nTot :: Int
nTot = Alignment -> Int
M.length Alignment
a forall a. Num a => a -> a -> a
* Alignment -> Int
M.nSequences Alignment
a
    nIUPAC :: Int
nIUPAC = Alignment -> Int
M.countIUPACChars Alignment
a
    nGaps :: Int
nGaps = Alignment -> Int
M.countGaps Alignment
a
    nUnknowns :: Int
nUnknowns = Alignment -> Int
M.countUnknowns Alignment
a
    percentIUPAC :: Double
percentIUPAC = Double
100 forall a. Num a => a -> a -> a
* forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
nIUPAC forall a. Fractional a => a -> a -> a
/ forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
nTot :: Double
    percentGaps :: Double
percentGaps = Double
100 forall a. Num a => a -> a -> a
* forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
nGaps forall a. Fractional a => a -> a -> a
/ forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
nTot :: Double
    percentUnknowns :: Double
percentUnknowns = Double
100 forall a. Num a => a -> a -> a
* forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
nUnknowns forall a. Fractional a => a -> a -> a
/ forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
nTot :: Double
    aNoGaps :: Alignment
aNoGaps = Alignment -> Alignment
M.filterColsNoGaps Alignment
a
    aNoGapsFreq :: FrequencyData
aNoGapsFreq = Alignment -> FrequencyData
M.toFrequencyData Alignment
aNoGaps
    aOnlyStd :: Alignment
aOnlyStd = Alignment -> Alignment
M.filterColsOnlyStd Alignment
aNoGaps
    aOnlyStdFreq :: FrequencyData
aOnlyStdFreq = Alignment -> FrequencyData
M.toFrequencyData Alignment
aOnlyStd
    charFreqsPerSite :: FrequencyData
charFreqsPerSite = Alignment -> FrequencyData
M.toFrequencyData Alignment
a
    charFreqs :: [Double]
charFreqs = FrequencyData -> [Double]
M.distribution FrequencyData
charFreqsPerSite
    seqs :: [Sequence]
seqs = Alignment -> [Sequence]
M.toSequences Alignment
a
    normlz :: a -> a
normlz a
x = forall a b. (Integral a, Num b) => a -> b
fromIntegral a
x forall a. Fractional a => a -> a -> a
/ forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
aL
    pairwiseHamming :: Vector Double
pairwiseHamming =
      forall a. Unbox a => [a] -> Vector a
V.fromList
        [ forall a c b. (a -> c) -> (b -> c) -> Either a b -> c
either forall a. HasCallStack => String -> a
error forall {a} {a}. (Fractional a, Integral a) => a -> a
normlz forall a b. (a -> b) -> a -> b
$ Sequence -> Sequence -> Either String Int
D.hamming Sequence
x Sequence
y
          | Sequence
x <- [Sequence]
seqs,
            Sequence
y <- [Sequence]
seqs,
            Sequence
x forall a. Eq a => a -> a -> Bool
/= Sequence
y
        ]
    (Double
hMean, Double
hVar) = forall (v :: * -> *).
Vector v Double =>
v Double -> (Double, Double)
Sm.meanVariance Vector Double
pairwiseHamming
    hMin :: Double
hMin = forall a. (Unbox a, Ord a) => Vector a -> a
V.minimum Vector Double
pairwiseHamming
    hMax :: Double
hMax = forall a. (Unbox a, Ord a) => Vector a -> a
V.maximum Vector Double
pairwiseHamming
    kEffs :: [Double]
kEffs = FrequencyData -> [Double]
M.kEffEntropy FrequencyData
charFreqsPerSite
    kEffsNoGaps :: [Double]
kEffsNoGaps = FrequencyData -> [Double]
M.kEffEntropy FrequencyData
aNoGapsFreq
    kEffsOnlyStd :: [Double]
kEffsOnlyStd = FrequencyData -> [Double]
M.kEffEntropy FrequencyData
aOnlyStdFreq
    kEffMean :: Double
kEffMean = forall (t :: * -> *) a. (Foldable t, Num a) => t a -> a
sum [Double]
kEffs forall a. Fractional a => a -> a -> a
/ forall a b. (Integral a, Num b) => a -> b
fromIntegral (forall (t :: * -> *) a. Foldable t => t a -> Int
length [Double]
kEffs)
    kEffMeanNoGaps :: Double
kEffMeanNoGaps = forall (t :: * -> *) a. (Foldable t, Num a) => t a -> a
sum [Double]
kEffsNoGaps forall a. Fractional a => a -> a -> a
/ forall a b. (Integral a, Num b) => a -> b
fromIntegral (forall (t :: * -> *) a. Foldable t => t a -> Int
length [Double]
kEffsNoGaps)
    kEffMeanOnlyStd :: Double
kEffMeanOnlyStd = forall (t :: * -> *) a. (Foldable t, Num a) => t a -> a
sum [Double]
kEffsOnlyStd forall a. Fractional a => a -> a -> a
/ forall a b. (Integral a, Num b) => a -> b
fromIntegral (forall (t :: * -> *) a. Foldable t => t a -> Int
length [Double]
kEffsOnlyStd)
    kEffsHomo :: [Double]
kEffsHomo = FrequencyData -> [Double]
M.kEffHomoplasy FrequencyData
charFreqsPerSite
    kEffsNoGapsHomo :: [Double]
kEffsNoGapsHomo = FrequencyData -> [Double]
M.kEffHomoplasy FrequencyData
aNoGapsFreq
    kEffsOnlyStdHomo :: [Double]
kEffsOnlyStdHomo = FrequencyData -> [Double]
M.kEffHomoplasy FrequencyData
aOnlyStdFreq
    kEffMeanHomo :: Double
kEffMeanHomo = forall (t :: * -> *) a. (Foldable t, Num a) => t a -> a
sum [Double]
kEffsHomo forall a. Fractional a => a -> a -> a
/ forall a b. (Integral a, Num b) => a -> b
fromIntegral (forall (t :: * -> *) a. Foldable t => t a -> Int
length [Double]
kEffsHomo)
    kEffMeanNoGapsHomo :: Double
kEffMeanNoGapsHomo =
      forall (t :: * -> *) a. (Foldable t, Num a) => t a -> a
sum [Double]
kEffsNoGapsHomo forall a. Fractional a => a -> a -> a
/ forall a b. (Integral a, Num b) => a -> b
fromIntegral (forall (t :: * -> *) a. Foldable t => t a -> Int
length [Double]
kEffsNoGapsHomo)
    kEffMeanOnlyStdHomo :: Double
kEffMeanOnlyStdHomo =
      forall (t :: * -> *) a. (Foldable t, Num a) => t a -> a
sum [Double]
kEffsOnlyStdHomo forall a. Fractional a => a -> a -> a
/ forall a b. (Integral a, Num b) => a -> b
fromIntegral (forall (t :: * -> *) a. Foldable t => t a -> Int
length [Double]
kEffsOnlyStdHomo)
    perSiteBS :: ByteString
perSiteBS =
      if Bool
perSiteFlag
        then
          [ByteString] -> ByteString
BL.unlines
            [ String -> ByteString
BL.pack String
"Effective number of used states per site (measured using entropy):",
              String -> ByteString
BL.pack forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall a. Show a => a -> String
show forall a b. (a -> b) -> a -> b
$ [Double]
kEffs
            ]
        else ByteString
BL.empty

examine :: Bool -> [Seq.Sequence] -> BL.ByteString
examine :: Bool -> [Sequence] -> ByteString
examine Bool
perSiteFlag [Sequence]
ss =
  [Sequence] -> ByteString
Seq.summarizeSequences [Sequence]
ss forall a. Semigroup a => a -> a -> a
<> case [Sequence] -> Either String Alignment
M.fromSequences [Sequence]
ss of
    Left String
_ -> ByteString
BL.empty
    Right Alignment
a -> String -> ByteString
BL.pack String
"\n" forall a. Semigroup a => a -> a -> a
<> Bool -> Alignment -> ByteString
examineAlignment Bool
perSiteFlag Alignment
a

-- From https://stackoverflow.com/a/52602906/3536806.
tuples :: [a] -> [(a, a)]
tuples :: forall a. [a] -> [(a, a)]
tuples [] = []
tuples (a
x : [a]
xs) = forall a b. (a -> b) -> [a] -> [b]
map (a
x,) [a]
xs forall a. [a] -> [a] -> [a]
++ forall a. [a] -> [(a, a)]
tuples [a]
xs

-- This is all ugly, but who cares.
writeDivergenceMatrix :: Handle -> (Seq.Sequence, Seq.Sequence, MU.Matrix Int) -> IO ()
writeDivergenceMatrix :: Handle -> (Sequence, Sequence, Matrix Int) -> IO ()
writeDivergenceMatrix Handle
h (Sequence
x, Sequence
y, Matrix Int
xs) = do
  Handle -> ByteString -> IO ()
BL.hPutStrLn Handle
h forall a b. (a -> b) -> a -> b
$ ByteString
"> " forall a. Semigroup a => a -> a -> a
<> Sequence -> ByteString
Seq.name Sequence
x forall a. Semigroup a => a -> a -> a
<> ByteString
", " forall a. Semigroup a => a -> a -> a
<> Sequence -> ByteString
Seq.name Sequence
y
  Handle -> String -> IO ()
hPutStr Handle
h forall a b. (a -> b) -> a -> b
$ Int -> Matrix Double -> String
L.dispf Int
0 Matrix Double
m
  where
    n :: Int
n = forall a. Context a => Matrix a -> Int
MU.rows Matrix Int
xs
    m :: Matrix Double
m = Int -> [Double] -> Matrix Double
L.matrix Int
n forall a b. (a -> b) -> a -> b
$ forall a b. (a -> b) -> [a] -> [b]
map forall a b. (Integral a, Num b) => a -> b
fromIntegral forall a b. (a -> b) -> a -> b
$ forall a. Context a => Matrix a -> [a]
MU.toList Matrix Int
xs

computeDivergenceMatrices :: [Seq.Sequence] -> ELynx ExamineArguments ()
computeDivergenceMatrices :: [Sequence] -> ELynx ExamineArguments ()
computeDivergenceMatrices [Sequence]
ss = do
  forall e.
(HasLock e, HasLogHandles e, HasVerbosity e) =>
String -> Logger e ()
logInfoS String
"Compute divergence matrices."
  let xys :: [(Sequence, Sequence)]
xys = forall a. [a] -> [(a, a)]
tuples [Sequence]
ss
      ds :: [(Sequence, Sequence, Matrix Int)]
ds = forall a b. (a -> b) -> [a] -> [b]
map (\(Sequence
x, Sequence
y) -> (Sequence
x, Sequence
y, forall a c b. (a -> c) -> (b -> c) -> Either a b -> c
either forall a. HasCallStack => String -> a
error forall a. a -> a
id forall a b. (a -> b) -> a -> b
$ Sequence -> Sequence -> Either String (Matrix Int)
divergence Sequence
x Sequence
y)) [(Sequence, Sequence)]
xys
  Handle
h <- forall a. Reproducible a => String -> String -> ELynx a Handle
outHandle String
"divergence matrices" String
".div"
  forall (t :: * -> *) (m :: * -> *) a b.
(Foldable t, Monad m) =>
(a -> m b) -> t a -> m ()
mapM_ (forall (m :: * -> *) a. MonadIO m => IO a -> m a
liftIO forall b c a. (b -> c) -> (a -> b) -> a -> c
. Handle -> (Sequence, Sequence, Matrix Int) -> IO ()
writeDivergenceMatrix Handle
h) [(Sequence, Sequence, Matrix Int)]
ds
  forall (m :: * -> *) a. MonadIO m => IO a -> m a
liftIO forall a b. (a -> b) -> a -> b
$ Handle -> IO ()
hClose Handle
h

-- | Examine sequences.
examineCmd :: ELynx ExamineArguments ()
examineCmd :: ELynx ExamineArguments ()
examineCmd = do
  (ExamineArguments Alphabet
al String
inFile Bool
perSiteFlag Bool
divergenceFlag) <- forall a. Environment a -> a
localArguments forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> forall (m :: * -> *) r. Monad m => ReaderT r m r
ask
  [Sequence]
ss <- forall e.
(HasLock e, HasLogHandles e, HasVerbosity e) =>
Alphabet -> String -> Logger e [Sequence]
readSeqs Alphabet
al String
inFile
  let result :: ByteString
result = Bool -> [Sequence] -> ByteString
examine Bool
perSiteFlag [Sequence]
ss
  forall a.
Reproducible a =>
String -> ByteString -> String -> ELynx a ()
out String
"result of examination" ByteString
result String
".out"
  forall (f :: * -> *). Applicative f => Bool -> f () -> f ()
when Bool
divergenceFlag ([Sequence] -> ELynx ExamineArguments ()
computeDivergenceMatrices [Sequence]
ss)