-- Diversity
-- By G.W. Schwartz

{- | Takes a fasta file and return the diversity of a certain order and
window length (to split into fragments) by position.
-}

-- Built-in
import Data.List
import Control.Monad (forever, unless)
import qualified Data.Map.Strict as Map
import qualified System.IO as IO

-- Cabal
import Data.Semigroup ((<>))
import Data.Fasta.String
import Pipes
import qualified Pipes.Prelude as P
import Options.Applicative

-- Local
import Math.Diversity.Types
import Math.Diversity.GenerateDiversity
import Math.Diversity.Print

-- Command line arguments
data Options = Options { inputLabel             :: String
                       , inputOrder             :: Double
                       , inputWindow            :: Int
                       , inputFasta             :: String
                       , inputSampleField       :: Int
                       , inputCountField        :: Int
                       , inputSubsampling       :: String
                       , inputG                 :: Double
                       , fastBin                :: Bool
                       , runs                   :: Int
                       , gapsFlag               :: Bool
                       , removeNBool            :: Bool
                       , wholeSeq               :: Bool
                       , list                   :: Bool
                       , sample                 :: Bool
                       , rarefactionDF          :: Bool
                       , std                    :: Bool
                       , outputRarefaction      :: String
                       , outputRarefactionCurve :: String
                       , output                 :: String
                       }

-- Command line options
options :: Parser Options
options = Options
      <$> strOption
          ( long "input-label"
         <> short 'l'
         <> metavar "LABEL"
         <> value ""
         <> help "The label for this particular dataset (to differentiate\
                 \ the file in batch analyses)" )
      <*> option auto
          ( long "input-order"
         <> short 'r'
         <> metavar "[1]|INT"
         <> value 1
         <> help "The order of true diversity" )
      <*> option auto
          ( long "input-window"
         <> short 'w'
         <> metavar "[1]|INT"
         <> value 1
         <> help "The length of the sliding window for generating fragments" )
      <*> strOption
          ( long "input-fasta"
         <> short 'i'
         <> metavar "FILE"
         <> value ""
         <> help "The fasta file containing the germlines and clones" )
      <*> option auto
          ( long "input-sample-field"
         <> short 'S'
         <> metavar "INT"
         <> value 1
         <> help "The index for the sample ID in the header separated by '|'\
                 \ (1 indexed)" )
      <*> option auto
          ( long "input-count-field"
         <> short 'C'
         <> metavar "INT"
         <> value 0
         <> help "The index for the number of this type in the header separated\
                 \ by '|' (1 indexed). Used if there are multiple copies\
                 \ of one entry, so a '4' in the header would indicate that\
                 \ this entity occurred 4 times. Defaults to 0, meaning that\
                 \ this field is ignored and count each sequence as occurring\
                 \ just once" )
      <*> strOption
          ( long "input-subsampling"
         <> short 'I'
         <> metavar "INT INT (INT)"
         <> value "1 1"
         <> help "The start point, interval, and optional endpoint of\
                 \ subsamples in the rarefaction curve.\
                 \ For instance, '1 1 4' would be 1, 2, 3, 4\
                 \ '2 6 14' would be 2, 8, 14, ... Note: input is a string so\
                 \ use quotations around the entry and it always includes the\
                 \ number of subsamples overall in the result. Excluding the\
                 \ endpoint results in the number of samples the endpoint, so\
                 \ '1 1' would be 1, 2, 3, ..., N " )
      <*> option auto
          ( long "input-g"
         <> short 'g'
         <> metavar "Double"
         <> value 0.95
         <> help "Used for calculating the number of individuals\
                 \ (or samples) needed before the proportion g of the total\
                 \ number of estimated species is reached.\
                 \ Sobs / Sest < g < 1" )
      <*> switch
          ( long "fast-bin"
         <> short 'f'
         <> help "Whether to use a much faster, but approximated, binomial\
                 \ coefficient for the rarefaction analysis. This method\
                 \ results in NaNs for larger numbers, so in that case you\
                 \ you should use the slower, more accurate default method" )
      <*> option auto
          ( long "input-runs"
         <> short 'R'
         <> metavar "INT"
         <> value 0
         <> help "The number of runs for empirical resampling rarefaction.\
                 \ This method does not compute the theoretical, it reports the\
                 \ actual median and median absolute deviation (MAD) values of\
                 \ this many runs. If this value is not 0, empirical\
                 \ rarefaction is automatically enabled (individual based only,\
                 \ not for sample based)" )
      <*> switch
          ( long "keep-gaps"
         <> short 'G'
         <> help "Do not remove '.' and '-' characters from the analysis. This\
                 \ flag will thus treat these characters as additional entities\
                 \ rather than be ignored as artificial biological gaps in\
                 \ a sequence" )
      <*> switch
          ( long "remove-N"
         <> short 'n'
         <> help "Remove 'N' and 'n' characters" )
      <*> switch
          ( long "whole-sequence"
         <> short 'a'
         <> help "Ignore window length and only analyze the entire sequence for\
                 \ diversity and rarefaction curves." )
      <*> switch
          ( long "list"
         <> short 'L'
         <> help "Analyze a diversity of species in a list separated by lines\
                 \ instead of a fasta file" )
      <*> switch
          ( long "sample"
         <> short 's'
         <> help "Whether to use sample based rarefaction (requires sample ID\
                 \ field from input-sample-field)" )
      <*> switch
          ( long "rarefaction-df"
         <> short 'd'
         <> help "Whether to output the rarefaction curve as a data frame" )
      <*> switch
          ( long "std"
         <> short 't'
         <> help "Whether to output to stdout or to a file if no file is\
                 \ supplied" )
      <*> strOption
          ( long "output-rarefaction"
         <> short 'O'
         <> metavar "FILE"
         <> value ""
         <> help "The csv file containing the rarefaction values (the percent\
                 \ of the rarefaction curve that is above 95% of the height of\
                 \ the rarefaction curve). Expects a string, so you need a\
                 \ string even with std" )
      <*> strOption
          ( long "output-rarefaction-curve"
         <> short 'c'
         <> metavar "FILE"
         <> value ""
         <> help "The csv file containing the rarefaction curve. Expects a\
                 \ a string, so you need a string even with std" )
      <*> strOption
          ( long "output"
         <> short 'o'
         <> metavar "FILE"
         <> value ""
         <> help "The csv file containing the diversities at each position.\
                 \ expects a string, so you need a string even with std" )

parseSampling :: (Num a, Read a) => String -> [a]
parseSampling = map read . parsing . words
  where
    parsing [x, y] = [x, y, "0"]
    parsing x      = x

pipesPositionMap :: Options -> IO PositionMap
pipesPositionMap opts = do
    h <- if null . inputFasta $ opts
            then return IO.stdin
            else IO.openFile (inputFasta opts) IO.ReadMode
    runEffect
          $ P.fold (Map.unionWith (Map.unionWith (+))) Map.empty id
          $ P.fromHandle h
        >-> toFastaSequence (list opts) h
        >-> P.map ( generatePositionMap
                    (gapsFlag opts)
                    (sample opts)
                    (inputSampleField opts)
                    (inputCountField opts)
                    (wholeSeq opts)
                    (inputWindow opts)
                  . removals )
  where
    toFastaSequence True _ = P.map ( \x -> FastaSequence { fastaHeader = ""
                                                        , fastaSeq = x } )
    toFastaSequence False h = pipesFasta h
    removals              = if removeNBool opts then removeN else id

generateDiversity :: Options -> IO ()
generateDiversity opts = do
    let contents     = parseFasta
        label        = inputLabel opts
        order        = inputOrder opts
        window       = inputWindow opts
        nFlag        = removeNBool opts
        whole        = wholeSeq opts
        [start, interval, end] = parseSampling . inputSubsampling $ opts
        howToOutput x = if std opts then putStrLn else writeFile x

    positionMap <- pipesPositionMap opts

    unless (null . output $ opts)
         $ howToOutput (output opts)
         . printDiversity label order window
         $ positionMap
    unless (null . outputRarefaction $ opts)
         $ do
            s <- printRarefaction
                 (sample opts)
                 (inputG opts)
                 label
                 window
                 positionMap
            howToOutput (outputRarefaction opts) s
    unless (null . outputRarefactionCurve $ opts)
         $ do
            s <- printRarefactionCurve
                 (rarefactionDF opts)
                 (sample opts)
                 (fastBin opts)
                 (runs opts)
                 start
                 interval
                 end
                 label
                 window
                 positionMap
            howToOutput (outputRarefactionCurve opts) s

main :: IO ()
main = execParser opts >>= generateDiversity
  where
    opts = info (helper <*> options)
      ( fullDesc
     <> progDesc "Quantify the diversity of a population" )