{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE TemplateHaskell   #-}

{- |
Module      :  Analyze.Analyze
Description :  Parse sequence file formats and analyze them
Copyright   :  (c) Dominik Schrempf 2018
License     :  GPL-3

Maintainer  :  dominik.schrempf@gmail.com
Stability   :  unstable
Portability :  portable

Creation date: Fri Oct  5 08:41:05 2018.

-}

module SubSample.SubSample
  ( subSampleCmd
  )
where

import           Control.Monad
import           Control.Monad.IO.Class
import           Control.Monad.Logger
import           Control.Monad.Trans.Class
import           Control.Monad.Trans.Reader
import qualified Data.Text                                  as T
import qualified Data.Text.Lazy                             as LT
import qualified Data.Text.Lazy.Builder                     as LT
import qualified Data.Text.Lazy.Builder.Int                 as LT
import qualified Data.Vector                                as V
import           System.Random.MWC

import           SubSample.Options
import           Tools

import           ELynx.Data.Sequence.MultiSequenceAlignment
import           ELynx.Export.Sequence.Fasta
import           ELynx.Tools.InputOutput

-- | Get a given number of output file names with provided suffix.
--
-- > getOutFilePaths "BasePath" 11 "fasta"
--
-- Will result in @BasePath.00.fasta@ up to @BasePath.10.fasta@.
getOutFilePaths :: String -> Int -> String -> [FilePath]
getOutFilePaths file n suffix = [ file ++ "." ++ digitStr i ++ "." ++ suffix
                                | i <- [0 .. n-1] ]
  where nDigits    = ceiling $ logBase (10 :: Double) (fromIntegral n)
        digitStr i = T.unpack $ T.justifyRight nDigits '0' (LT.toStrict $ LT.toLazyText $ LT.decimal i)

-- | Sub sample sequences.
subSampleCmd :: Maybe FilePath  -- ^ Output file base name
             -> SubSample ()
subSampleCmd outFileBaseName = do
  SubSampleArguments al inFile nSites nAlignments seed <- lift ask
  $(logInfo) "Command: Sub sample from a multi sequence alignment."
  $(logInfo) $ T.pack $ "  Sample " <> show nSites <> " sites."
  $(logInfo) $ T.pack $ "  Sample " <> show nAlignments <> " multi sequence alignments."
  ss <- readSeqs al inFile
  gen <- liftIO $ maybe createSystemRandom (initialize . V.fromList) seed
  let msa = either error id (fromSequenceList ss)
  samples <- lift $ replicateM nAlignments $ randomSubSample nSites msa gen
  let results = map (sequencesToFasta . toSequenceList) samples
  outFilePaths <- case outFileBaseName of
    Nothing -> return $ repeat Nothing
    Just fn -> return $ Just <$> getOutFilePaths fn nAlignments "fasta"
  zipWithM_ (io "sub sampled multi sequence alignments") results outFilePaths