{- integreat
Gregory W. Schwartz

Integrate data from multiple sources to find consistent (or inconsistent)
entities.
-}

{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE DeriveGeneric #-}
{-# LANGUAGE DataKinds #-}
{-# LANGUAGE TypeOperators #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE TupleSections #-}
{-# LANGUAGE QuasiQuotes #-}

module Main where

-- Standard
import Control.Monad
import Data.Bool
import qualified Data.Map.Strict as Map
import Data.Maybe
import Data.Monoid
import qualified Data.Set as Set
import System.IO

-- Cabal
import qualified Control.Lens as L
import Control.Monad.Trans
import qualified Data.ByteString.Lazy.Char8 as CL
import qualified Data.Csv as CSV
import qualified Data.Text as T
import qualified Data.Text.IO as T
import qualified Data.Vector as V
import Options.Generic

-- Local
import Types
import Utility
import Load
import Edge.Correlation
import Integrate
import Print

-- | Command line arguments
data Options = Options { dataInput          :: Maybe String
                                           <?> "([STDIN] | FILE) The input file containing the data intensities. Follows the format: dataLevel,dataReplicate,vertex,intensity. dataLevel is the level (the base level for the experiment, like \"proteomic_cancer\" or \"RNA_cancer\" for instance, requires at least two levels), dataReplicate is the replicate in that experiment that the entity is from (the name of that data set with the replicate name, like \"RNA_cancer_1\"), and vertex is the name of the entity (must match those in the vertex-input), and the intensity is the value of this entity in this data set."
                       , vertexInput        :: Maybe String
                                           <?> "([Nothing] | FILE) The input file containing similarities between entities. Follows the format: vertexLevel1,vertexLevel2, vertex1,vertex2,similarity. vertexLevel1 is the level (the base title for the experiment, \"data set\") that vertex1 is from, vertexLevel2 is the level that vertex2 is from, and the similarity is a number representing the similarity between those two entities. If not specified, then the same entity (determined by vertex in data-input) will have a similarity of 1, different entities will have a similarity of 0."
                       , entityDiff         :: Maybe T.Text
                                           <?> "([Nothing] | STRING) When comparing entities that are the same, ignore the text after this separator. Used for comparing phosphorylated positions with another level. For example, if we have a strings ARG29 and ARG29_7 that we want to compare, we want to say that their value is the highest in correlation, so this string would be \"_\""
                       , alignmentMethod    :: Maybe String
                                           <?> "([CosineSimilarity] | RandomWalker | RandomWalkerSim) The method to get integrated vertex similarity between levels. CosineSimilarity uses the cosine similarity of each  vertex in each network compared to the other vertices in  other networks. RandomWalker uses a random walker with restart based network algnment algorithm in order to get similarity. RandomWalkerSim uses a random walker with restart and actually simulates the walker to get a stochastic result."
                       , edgeMethod         :: Maybe String
                                           <?> "([SpearmanCorrelation] | PearsonCorrelation ) The method to use for the edges between entities in the coexpression matrix."
                       , walkerRestart      :: Maybe Double
                                           <?> "([0.25] | PROBABILITY) For the random walker algorithm, the probability of making  a jump to a random vertex. Recommended to be the ratio of  the total number of vertices in the top 99% smallest subnetworks to the total number of nodes in the reduced  product graph (Jeong, 2015)."
                       , steps              :: Maybe Int
                                           <?> "([100] | STEPS) For the random walker algorithm, the number of steps to take  before stopping."
                       , premade            :: Bool
                                           <?> "([False] | BOOL) Whether the input data (dataInput) is a pre-made network of the format \"[([\"VERTEX\"], [(\"SOURCE\", \"DESTINATION\", WEIGHT)])]\", where VERTEX, SOURCE, and DESTINATION are of type INT starting at 0, in order, and WEIGHT is a DOUBLE representing the weight of the edge between SOURCE and DESTINATION."
                       , test               :: Bool
                                           <?> "([False] | BOOL) Whether the input data from premade is from a test run. If supplied, the output is changed to an accuracy measure. In this case, we get the total rank below the number of permuted vertices divided by the theoretical maximum (so if there were five changed vertices out off 10 and two were rank 8 and 10 while the others were in the top five, we would have (1 - ((3 + 5) / (10 + 9 + 8 + 7 + 6))) as the accuracy."
                       , entityFilter       :: Maybe Int
                                           <?> "([Nothing] | INT) The minimum number of samples an entity must appear in, otherwise the entity is removed from the analysis."
                       , entityFilterStdDev :: Maybe Double
                                           <?> "([Nothing] | DOUBLE) Remove entities that have less than this value for their standard deviation among all samples."
                       , permutations       :: Maybe Int
                                           <?> "([1000] | INT) The number of permutations for cosine similarity permutation test or bootstrap. Right now just does bootstrap and only shows the first comparison if there are multiple comparisons."
                       }
               deriving (Generic)

instance ParseRecord Options

-- | Get all of the required information for integration.
getIntegrationInput
    :: Options
    -> IO (Maybe (Set.Set ID), Maybe UnifiedData, IDMap, IDVec, VertexSimMap, EdgeSimMap, GrMap)
getIntegrationInput opts = do
    let processCsv = snd . either error id

    hPutStrLn stderr "Getting data input."
    dataEntries   <- fmap (processCsv . CSV.decodeByName)
                   . maybe CL.getContents CL.readFile
                   . unHelpful
                   . dataInput
                   $ opts

    let numSamples     = fmap NumSamples . unHelpful . entityFilter $ opts
        stdDevThresh   =
            fmap StdDevThreshold . unHelpful . entityFilterStdDev $ opts
        levels         = (\x -> maybe x (flip filterEntitiesStdDev x) stdDevThresh)
                       . entitiesToLevels
                       . (\x -> maybe x (flip filterEntities x) numSamples)
                       . datasToEntities
                       . V.toList
                       $ dataEntries
        unifiedData    = unifyAllLevels . fmap snd $ levels
        levelNames     = Set.toList . Set.fromList . fmap fst $ levels
        idMap          = getIDMap unifiedData
        idVec          = getIDVec unifiedData
        size           = Size . Map.size . unIDMap $ idMap
        eDiff          = fmap EntityDiff . unHelpful . entityDiff $ opts
        vertexContents =
            fmap (fmap (processCsv . CSV.decodeByName) . CL.readFile)
                . unHelpful
                . vertexInput
                $ opts

    hPutStrLn stderr $ "Level information (Name, Number of entities):"
    hPutStrLn stderr
        . show
        . fmap (L.over L._2 (Map.size . unLevel))
        $ levels

    when (isJust vertexContents)
        $ hPutStrLn stderr "Getting vertex similarities."

    vertexSimMap <- maybe
                        (return . defVertexSimMap size $ levelNames)
                        (fmap (vertexCsvToLevels idMap . V.toList))
                  $ vertexContents

    let edgeSimMethod = maybe SpearmanCorrelation read
                      . unHelpful
                      . edgeMethod
                      $ opts
        getSimMat SpearmanCorrelation = return . getSimMatCorrelation edgeSimMethod
        getSimMat PearsonCorrelation  = return . getSimMatCorrelation edgeSimMethod

    liftIO $ hPutStrLn stderr "Getting edge similarities."

    edgeSimMap   <- fmap (EdgeSimMap . Map.fromList)
                  . mapM ( L.sequenceOf L._2
                         . L.over L._2 ( getSimMat edgeSimMethod
                                       . standardizeLevel idMap
                                       )
                         )
                  $ levels

    let grMap = GrMap Map.empty

    return (Nothing, Just unifiedData, idMap, idVec, vertexSimMap, edgeSimMap, grMap)

-- | Get all of the network info that is pre-made for input into the integration method.
getPremadeIntegrationInput
    :: Options
    -> IO (Maybe (Set.Set ID), Maybe UnifiedData, IDMap, IDVec, VertexSimMap, EdgeSimMap, GrMap)
getPremadeIntegrationInput opts = do

    contents <- maybe getContents readFile
              . unHelpful
              . dataInput
              $ opts

    if unHelpful . test $ opts
        then return
           . getPremadeNetworks
           . L.over L._1 Just
           $ (read contents :: ([String], [([String], [(String, String, Double)])]))
        else return
           . getPremadeNetworks
           . (Nothing,)
           $ (read contents :: [([String], [(String, String, Double)])])

-- | Show the accuracy of a test analysis.
showAccuracy :: Set.Set ID -> IDVec -> V.Vector NodeCorrScoresInfo -> T.Text
showAccuracy truthSet idVec nodeCorrScoresInfo =
    T.pack . show . getAccuracy truthSet idVec $ nodeCorrScoresInfo

main :: IO ()
main = do
    opts <- getRecord "integreat, Gregory W. Schwartz.\
                      \ Integrate data from multiple sources to find consistent\
                      \ (or inconsistent) entities."

    (truthSet, unifiedData, idMap, idVec, vertexSimMap, edgeSimMap, grMap) <-
        bool (getIntegrationInput opts) (getPremadeIntegrationInput opts)
            . unHelpful
            . premade
            $ opts

    let alignment =
            maybe CosineSimilarity read . unHelpful . alignmentMethod $ opts
        size      = Size . Map.size . unIDMap $ idMap
        nPerm     =
            Permutations . fromMaybe 1000 . unHelpful . permutations $ opts

    hPutStrLn
        stderr
        "Calculating vertex similarities and bootstraps between networks."

    nodeCorrScoresMap <- case alignment of
        CosineSimilarity ->
            integrateCosineSim nPerm size vertexSimMap edgeSimMap
        RandomWalker     ->
            integrateWalker
                nPerm
                size
                ( WalkerRestart
                . fromMaybe 0.25
                . unHelpful
                . walkerRestart
                $ opts
                )
                edgeSimMap
        RandomWalkerSim ->
            integrateWalkerSim
                ( WalkerRestart
                . fromMaybe 0.25
                . unHelpful
                . walkerRestart
                $ opts
                )
                (Counter . fromMaybe 100 . unHelpful . steps $ opts)
                grMap

    hPutStrLn stderr "Calculating node correspondence scores."

    nodeCorrScoresInfo <- getNodeCorrScoresInfo nodeCorrScoresMap

    if unHelpful . test $ opts
        then T.putStr
            . maybe
                    (error "Truth set not found.")
                    (\x -> showAccuracy x idVec nodeCorrScoresInfo)
            $ truthSet
        else T.putStr
            . printNodeCorrScores idVec unifiedData nodeCorrScoresMap
            $ nodeCorrScoresInfo

    return ()