{- |
Module      :  ELynx.Import.MarkovProcess.EDMModelPhylobayes
Description :  Import stationary distributions from Phylobayes format
Copyright   :  (c) Dominik Schrempf 2019
License     :  GPL-3

Maintainer  :  dominik.schrempf@gmail.com
Stability   :  unstable
Portability :  portable

Creation date: Tue Jan 29 12:12:55 2019.

-}

module ELynx.Import.MarkovProcess.EDMModelPhylobayes
  ( Parser
  , EDMComponent
  , phylobayes
  ) where

import           Control.Monad
import qualified Data.ByteString.Lazy.Char8        as L
import qualified Data.Vector.Storable              as V
import           Data.Void
import           Text.Megaparsec
import           Text.Megaparsec.Byte
import           Text.Megaparsec.Byte.Lexer

import           ELynx.Data.MarkovProcess.EDMModel
import           ELynx.Tools.ByteString            (c2w)

-- | Shortcut.
type Parser = Parsec Void L.ByteString

-- | Parse stationary distributions from Phylobayes format.
phylobayes :: Parser [EDMComponent]
phylobayes = do
  n <- headerLine
  k <- kComponentsLine
  cs <- count k $ dataLine n
  _ <- many newline *> eof
    <?> "phylobayes"
  return cs

horizontalSpace :: Parser ()
horizontalSpace = skipMany $ char (c2w ' ') <|> tab

headerLine :: Parser Int
headerLine = do
  n <- decimal
  _ <- horizontalSpace
  -- FIXME: This should be more general, but then we also want to ensure that
  -- the order of states is correct.
  _ <- chunk (L.pack "A C D E F G H I K L M N P Q R S T V W Y") <|> chunk (L.pack "A C G T")
  _ <- many newline
    <?> "headerLine"
  return n

kComponentsLine :: Parser Int
kComponentsLine = decimal <* newline <?> "kComponentsLine"

dataLine :: Int -> Parser EDMComponent
dataLine n = do
  weight <- float
  _ <- horizontalSpace
  vals <- float `sepBy` horizontalSpace
  when (length vals /= n) (error "Did not find correct number of entries.")
  _ <- many newline
    <?> "dataLine"
  return (weight, V.fromList vals)