-- Diversity
-- By G.W. Schwartz

{- | Takes a fasta file and return the diversity of a certain order and
window length (to split into fragments) by position.
-}

{-# LANGUAGE BangPatterns #-}

-- Built-in
import Data.List
import Control.Monad (forever)
import qualified Data.Map.Strict as Map
import qualified System.IO as IO

-- Cabal
import Options.Applicative
import Data.Fasta.String
import Pipes
import qualified Pipes.Prelude as P

-- Local
import Math.Diversity.Types
import Math.Diversity.GenerateDiversity
import Math.Diversity.Print

-- Command line arguments
data Options = Options { inputLabel             :: String
                       , inputOrder             :: Double
                       , inputWindow            :: Int
                       , inputFasta             :: String
                       , inputSampleField       :: Int
                       , inputSubsampling       :: String
                       , fastBin                :: Bool
                       , runs                   :: Int
                       , removeNBool            :: Bool
                       , wholeSeq               :: Bool
                       , list                   :: Bool
                       , sample                 :: Bool
                       , rarefactionDF          :: Bool
                       , std                    :: Bool
                       , outputRarefaction      :: String
                       , outputRarefactionCurve :: String
                       , output                 :: String
                       }

-- Command line options
options :: Parser Options
options = Options
      <$> strOption
          ( long "input-label"
         <> short 'l'
         <> metavar "LABEL"
         <> value ""
         <> help "The label for this particular dataset (to differentiate\
                 \ the file in batch analyses)" )
      <*> option auto
          ( long "input-order"
         <> short 'r'
         <> metavar "[1]|INT"
         <> value 1
         <> help "The order of true diversity" )
      <*> option auto
          ( long "input-window"
         <> short 'w'
         <> metavar "[1]|INT"
         <> value 1
         <> help "The length of the sliding window for generating fragments" )
      <*> strOption
          ( long "input-fasta"
         <> short 'i'
         <> metavar "FILE"
         <> value ""
         <> help "The fasta file containing the germlines and clones" )
      <*> option auto
          ( long "input-sample-field"
         <> short 'S'
         <> metavar "INT"
         <> value 1
         <> help "The index for the sample ID in the header separated by '|'\
                 \ (1 indexed)" )
      <*> strOption
          ( long "input-subsampling"
         <> short 'I'
         <> metavar "INT INT"
         <> value "1 1"
         <> help "The start point and interval of subsamples in the\
                 \ rarefaction curve. For instance, '1 1' would be 1, 2, 3, ...\
                 \ '2 6' would be 2, 8, 14, ... Note: input is a string so\
                 \ use quotations around the entry and it always has the\
                 \ number of subsamples overall as the last point" )
      <*> switch
          ( long "fast-bin"
         <> short 'f'
         <> help "Whether to use a much faster, but approximated, binomial\
                 \ coefficient for the rarefaction analysis. This method\
                 \ results in NaNs for larger numbers, so in that case you\
                 \ you should use the slower, more accurate default method" )
      <*> option auto
          ( long "input-runs"
         <> short 'R'
         <> metavar "INT"
         <> value 0
         <> help "The number of runs for empirical resampling rarefaction.\
                 \ This method does not compute the theoretical, it reports the\
                 \ actual median and median absolute deviation (MAD) values of\
                 \ this many runs. If this value is not 0, empirical\
                 \ rarefaction is automatically enabled (individual based only,\
                 \ not for sample based)" )
      <*> switch
          ( long "remove-N"
         <> short 'n'
         <> help "Remove 'N' and 'n' characters" )
      <*> switch
          ( long "whole-sequence"
         <> short 'a'
         <> help "Ignore window length and only analyze the entire sequence for\
                 \ diversity and rarefaction curves." )
      <*> switch
          ( long "list"
         <> short 'L'
         <> help "Analyze a diversity of species in a list separated by lines\
                 \ instead of a fasta file" )
      <*> switch
          ( long "sample"
         <> short 's'
         <> help "Whether to use sample based rarefaction (requires sample ID\
                 \ field from input-sample-field)" )
      <*> switch
          ( long "rarefaction-df"
         <> short 'd'
         <> help "Whether to output the rarefaction curve as a data frame" )
      <*> switch
          ( long "std"
         <> short 't'
         <> help "Whether to output to stdout or to a file if no file is\
                 \ supplied" )
      <*> strOption
          ( long "output-rarefaction"
         <> short 'O'
         <> metavar "FILE"
         <> value ""
         <> help "The csv file containing the rarefaction values (the percent\
                 \ of the rarefaction curve that is above 95% of the height of\
                 \ the rarefaction curve). Expects a string, so you need a\
                 \ string even with std" )
      <*> strOption
          ( long "output-rarefaction-curve"
         <> short 'c'
         <> metavar "FILE"
         <> value ""
         <> help "The csv file containing the rarefaction curve. Expects a\
                 \ a string, so you need a string even with std" )
      <*> strOption
          ( long "output"
         <> short 'o'
         <> metavar "FILE"
         <> value ""
         <> help "The csv file containing the diversities at each position.\
                 \ expects a string, so you need a string wven with std" )

pipesPositionMap :: Options -> IO PositionMap
pipesPositionMap opts = do
    h <- if null . inputFasta $ opts
            then return IO.stdin
            else IO.openFile (inputFasta opts) IO.ReadMode
    runEffect
          $ P.fold (Map.unionWith (Map.unionWith (+))) Map.empty id
          $ P.fromHandle h
        >-> toFastaSequence (list opts) h
        >-> P.map ( generatePositionMap
                    (sample opts)
                    (inputSampleField opts)
                    (wholeSeq opts)
                    (inputWindow opts)
                  . removals )
  where
    toFastaSequence True _ = P.map ( \x -> FastaSequence { fastaHeader = ""
                                                        , fastaSeq = x } )
    toFastaSequence False h = pipesFasta h
    removals              = if removeNBool opts then removeN else id

generateDiversity :: Options -> IO ()
generateDiversity opts = do
    let contents     = parseFasta
        label        = inputLabel opts
        order        = inputOrder opts
        window       = inputWindow opts
        nFlag        = removeNBool opts
        whole        = wholeSeq opts
        start        = read . head . words . inputSubsampling $ opts
        interval     = read . last . words . inputSubsampling $ opts
        howToOutput x = if std opts then putStrLn else writeFile x

    positionMap <- pipesPositionMap opts

    if (null . output $ opts)
        then return ()
        else howToOutput (output opts)
           . printDiversity label order window
           $ positionMap
    if (null . outputRarefaction $ opts)
        then return ()
        else do
            s <- printRarefaction
                 (sample opts)
                 (fastBin opts)
                 (runs opts)
                 start
                 interval
                 label
                 window
                 positionMap
            howToOutput (outputRarefaction opts) s
    if (null . outputRarefactionCurve $ opts)
        then return ()
        else do
            s <- printRarefactionCurve
                 (rarefactionDF opts)
                 (sample opts)
                 (fastBin opts)
                 (runs opts)
                 start
                 interval
                 label
                 window
                 positionMap
            howToOutput (outputRarefactionCurve opts) s

main :: IO ()
main = execParser opts >>= generateDiversity
  where
    opts = info (helper <*> options)
      ( fullDesc
     <> progDesc "Return the diversity at each position for all sequences in a\
                 \ fasta file" )