-- |
-- Module      :  ELynx.Import.MarkovProcess.SiteprofilesPhylobayes
-- Description :  Import site profiles in Phylobayes format
-- Copyright   :  2021 Dominik Schrempf
-- License     :  GPL-3.0-or-later
--
-- Maintainer  :  dominik.schrempf@gmail.com
-- Stability   :  unstable
-- Portability :  portable
--
-- Creation date: Tue Jan 29 12:12:55 2019.
--
-- For now I just try to go with a huge empirical distribution mixture model. Let's
-- see if performance is good enough.
--
-- There are subtle differences between
-- `ELynx.Import.MarkovProcess.EDMModelPhylobayes` and this module, which collects
-- one stationary distribution for each site.
module ELynx.Import.MarkovProcess.SiteprofilesPhylobayes
  ( siteprofiles,
  )
where

import Control.Applicative
import Control.Monad
import qualified Data.Attoparsec.ByteString as AS
import qualified Data.Attoparsec.ByteString.Char8 as AC
import Data.Containers.ListUtils (nubInt)
import qualified Data.Vector.Storable as V
import ELynx.Import.MarkovProcess.EDMModelPhylobayes

-- | Parse stationary distributions from Phylobayes format.
siteprofiles :: AS.Parser [EDMComponent]
siteprofiles :: Parser [EDMComponent]
siteprofiles = (forall i a. Parser i a -> String -> Parser i a
AS.<?> String
"phylobayes siteprofiles") forall a b. (a -> b) -> a -> b
$ do
  ()
_ <- Parser ()
headerLines
  [EDMComponent]
cs <- forall (f :: * -> *) a. Alternative f => f a -> f [a]
many Parser EDMComponent
dataLine
  ()
_ <- (Char -> Bool) -> Parser ()
AC.skipWhile Char -> Bool
AC.isSpace
  ()
_ <- forall t. Chunk t => Parser t ()
AS.endOfInput
  let ls :: [Int]
ls = forall a b. (a -> b) -> [a] -> [b]
map forall (t :: * -> *) a. Foldable t => t a -> Int
length [EDMComponent]
cs
      nLs :: Int
nLs = forall (t :: * -> *) a. Foldable t => t a -> Int
length forall a b. (a -> b) -> a -> b
$ [Int] -> [Int]
nubInt [Int]
ls
  forall (f :: * -> *). Applicative f => Bool -> f () -> f ()
when
    (Int
nLs forall a. Eq a => a -> a -> Bool
/= Int
1)
    (forall a. HasCallStack => String -> a
error String
"The site profiles have a different number of entries.")
  forall (m :: * -> *) a. Monad m => a -> m a
return [EDMComponent]
cs

line :: AS.Parser ()
line :: Parser ()
line = (Word8 -> Bool) -> Parser ()
AS.skipWhile (Bool -> Bool
not forall b c a. (b -> c) -> (a -> b) -> a -> c
. Word8 -> Bool
AC.isEndOfLine)

-- For now, just ignore the header.
headerLines :: AS.Parser ()
headerLines :: Parser ()
headerLines = Parser ()
line forall (f :: * -> *) a b. Applicative f => f a -> f b -> f b
*> (Char -> Bool) -> Parser ()
AC.skipWhile Char -> Bool
AC.isSpace forall i a. Parser i a -> String -> Parser i a
AS.<?> String
"headerLine"

dataLine :: AS.Parser EDMComponent
dataLine :: Parser EDMComponent
dataLine = (forall i a. Parser i a -> String -> Parser i a
AS.<?> String
"dataLine") forall a b. (a -> b) -> a -> b
$ do
  -- Ignore site number.
  Int
_ <- forall a. Integral a => Parser a
AC.decimal :: AS.Parser Int
  ()
_ <- (Word8 -> Bool) -> Parser ()
AS.skipWhile Word8 -> Bool
AC.isHorizontalSpace
  -- Also ignore additional white space on line.
  [Double]
vals <- Parser Double
AC.double forall (f :: * -> *) a s. Alternative f => f a -> f s -> f [a]
`AS.sepBy1` (Word8 -> Bool) -> Parser ()
AS.skipWhile Word8 -> Bool
AC.isHorizontalSpace
  ()
_ <- (Char -> Bool) -> Parser ()
AC.skipWhile Char -> Bool
AC.isSpace
  -- Set the weight to 1.0 for all sites.
  forall (m :: * -> *) a. Monad m => a -> m a
return (Double
1.0, forall a. Storable a => [a] -> Vector a
V.fromList [Double]
vals)