{- integreat Gregory W. Schwartz Integrate data from multiple sources to find consistent (or inconsistent) entities. -} {-# LANGUAGE BangPatterns #-} {-# LANGUAGE DeriveGeneric #-} {-# LANGUAGE DataKinds #-} {-# LANGUAGE TypeOperators #-} {-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE TupleSections #-} {-# LANGUAGE QuasiQuotes #-} module Main where -- Standard import Control.Monad import Data.Bool import qualified Data.Map.Strict as Map import Data.Maybe import Data.Monoid import qualified Data.Set as Set import System.IO -- Cabal import qualified Control.Lens as L import Control.Monad.Trans import qualified Data.ByteString.Lazy.Char8 as CL import qualified Data.Csv as CSV import qualified Data.Text as T import qualified Data.Text.IO as T import qualified Data.Vector as V import Options.Generic -- Local import Types import Utility import Load import Edge.Correlation import Integrate import Print -- | Command line arguments data Options = Options { dataInput :: Maybe String "([STDIN] | FILE) The input file containing the data intensities. Follows the format: dataLevel,dataReplicate,vertex,intensity. dataLevel is the level (the base level for the experiment, like \"proteomic_cancer\" or \"RNA_cancer\" for instance, requires at least two levels), dataReplicate is the replicate in that experiment that the entity is from (the name of that data set with the replicate name, like \"RNA_cancer_1\"), and vertex is the name of the entity (must match those in the vertex-input), and the intensity is the value of this entity in this data set." , vertexInput :: Maybe String "([Nothing] | FILE) The input file containing similarities between entities. Follows the format: vertexLevel1,vertexLevel2, vertex1,vertex2,similarity. vertexLevel1 is the level (the base title for the experiment, \"data set\") that vertex1 is from, vertexLevel2 is the level that vertex2 is from, and the similarity is a number representing the similarity between those two entities. If not specified, then the same entity (determined by vertex in data-input) will have a similarity of 1, different entities will have a similarity of 0." , entityDiff :: Maybe T.Text "([Nothing] | STRING) When comparing entities that are the same, ignore the text after this separator. Used for comparing phosphorylated positions with another level. For example, if we have a strings ARG29 and ARG29_7 that we want to compare, we want to say that their value is the highest in correlation, so this string would be \"_\"" , alignmentMethod :: Maybe String "([CosineSimilarity] | RandomWalker | RandomWalkerSim) The method to get integrated vertex similarity between levels. CosineSimilarity uses the cosine similarity of each vertex in each network compared to the other vertices in other networks. RandomWalker uses a random walker with restart based network algnment algorithm in order to get similarity. RandomWalkerSim uses a random walker with restart and actually simulates the walker to get a stochastic result." , edgeMethod :: Maybe String "([SpearmanCorrelation] | PearsonCorrelation ) The method to use for the edges between entities in the coexpression matrix." , walkerRestart :: Maybe Double "([0.25] | PROBABILITY) For the random walker algorithm, the probability of making a jump to a random vertex. Recommended to be the ratio of the total number of vertices in the top 99% smallest subnetworks to the total number of nodes in the reduced product graph (Jeong, 2015)." , steps :: Maybe Int "([100] | STEPS) For the random walker algorithm, the number of steps to take before stopping." , premade :: Bool "([False] | BOOL) Whether the input data (dataInput) is a pre-made network of the format \"[([\"VERTEX\"], [(\"SOURCE\", \"DESTINATION\", WEIGHT)])]\", where VERTEX, SOURCE, and DESTINATION are of type INT starting at 0, in order, and WEIGHT is a DOUBLE representing the weight of the edge between SOURCE and DESTINATION." , test :: Bool "([False] | BOOL) Whether the input data from premade is from a test run. If supplied, the output is changed to an accuracy measure. In this case, we get the total rank below the number of permuted vertices divided by the theoretical maximum (so if there were five changed vertices out off 10 and two were rank 8 and 10 while the others were in the top five, we would have (1 - ((3 + 5) / (10 + 9 + 8 + 7 + 6))) as the accuracy." , entityFilter :: Maybe Int "([Nothing] | INT) The minimum number of samples an entity must appear in, otherwise the entity is removed from the analysis." , entityFilterStdDev :: Maybe Double "([Nothing] | DOUBLE) Remove entities that have less than this value for their standard deviation among all samples." , permutations :: Maybe Int "([1000] | INT) The number of permutations for cosine similarity permutation test or bootstrap. Right now just does bootstrap and only shows the first comparison if there are multiple comparisons." } deriving (Generic) instance ParseRecord Options -- | Get all of the required information for integration. getIntegrationInput :: Options -> IO (Maybe (Set.Set ID), Maybe UnifiedData, IDMap, IDVec, VertexSimMap, EdgeSimMap, GrMap) getIntegrationInput opts = do let processCsv = snd . either error id hPutStrLn stderr "Getting data input." dataEntries <- fmap (processCsv . CSV.decodeByName) . maybe CL.getContents CL.readFile . unHelpful . dataInput $ opts let numSamples = fmap NumSamples . unHelpful . entityFilter $ opts stdDevThresh = fmap StdDevThreshold . unHelpful . entityFilterStdDev $ opts levels = (\x -> maybe x (flip filterEntitiesStdDev x) stdDevThresh) . entitiesToLevels . (\x -> maybe x (flip filterEntities x) numSamples) . datasToEntities . V.toList $ dataEntries unifiedData = unifyAllLevels . fmap snd $ levels levelNames = Set.toList . Set.fromList . fmap fst $ levels idMap = getIDMap unifiedData idVec = getIDVec unifiedData size = Size . Map.size . unIDMap $ idMap eDiff = fmap EntityDiff . unHelpful . entityDiff $ opts vertexContents = fmap (fmap (processCsv . CSV.decodeByName) . CL.readFile) . unHelpful . vertexInput $ opts hPutStrLn stderr $ "Level information (Name, Number of entities):" hPutStrLn stderr . show . fmap (L.over L._2 (Map.size . unLevel)) $ levels when (isJust vertexContents) $ hPutStrLn stderr "Getting vertex similarities." vertexSimMap <- maybe (return . defVertexSimMap size $ levelNames) (fmap (vertexCsvToLevels idMap . V.toList)) $ vertexContents let edgeSimMethod = maybe SpearmanCorrelation read . unHelpful . edgeMethod $ opts getSimMat SpearmanCorrelation = return . getSimMatCorrelation edgeSimMethod getSimMat PearsonCorrelation = return . getSimMatCorrelation edgeSimMethod liftIO $ hPutStrLn stderr "Getting edge similarities." edgeSimMap <- fmap (EdgeSimMap . Map.fromList) . mapM ( L.sequenceOf L._2 . L.over L._2 ( getSimMat edgeSimMethod . standardizeLevel idMap ) ) $ levels let grMap = GrMap Map.empty return (Nothing, Just unifiedData, idMap, idVec, vertexSimMap, edgeSimMap, grMap) -- | Get all of the network info that is pre-made for input into the integration method. getPremadeIntegrationInput :: Options -> IO (Maybe (Set.Set ID), Maybe UnifiedData, IDMap, IDVec, VertexSimMap, EdgeSimMap, GrMap) getPremadeIntegrationInput opts = do contents <- maybe getContents readFile . unHelpful . dataInput $ opts if unHelpful . test $ opts then return . getPremadeNetworks . L.over L._1 Just $ (read contents :: ([String], [([String], [(String, String, Double)])])) else return . getPremadeNetworks . (Nothing,) $ (read contents :: [([String], [(String, String, Double)])]) -- | Show the accuracy of a test analysis. showAccuracy :: Set.Set ID -> IDVec -> V.Vector NodeCorrScoresInfo -> T.Text showAccuracy truthSet idVec nodeCorrScoresInfo = T.pack . show . getAccuracy truthSet idVec $ nodeCorrScoresInfo main :: IO () main = do opts <- getRecord "integreat, Gregory W. Schwartz.\ \ Integrate data from multiple sources to find consistent\ \ (or inconsistent) entities." (truthSet, unifiedData, idMap, idVec, vertexSimMap, edgeSimMap, grMap) <- bool (getIntegrationInput opts) (getPremadeIntegrationInput opts) . unHelpful . premade $ opts let alignment = maybe CosineSimilarity read . unHelpful . alignmentMethod $ opts size = Size . Map.size . unIDMap $ idMap nPerm = Permutations . fromMaybe 1000 . unHelpful . permutations $ opts hPutStrLn stderr "Calculating vertex similarities and bootstraps between networks." nodeCorrScoresMap <- case alignment of CosineSimilarity -> integrateCosineSim nPerm size vertexSimMap edgeSimMap RandomWalker -> integrateWalker nPerm size ( WalkerRestart . fromMaybe 0.25 . unHelpful . walkerRestart $ opts ) edgeSimMap RandomWalkerSim -> integrateWalkerSim ( WalkerRestart . fromMaybe 0.25 . unHelpful . walkerRestart $ opts ) (Counter . fromMaybe 100 . unHelpful . steps $ opts) grMap hPutStrLn stderr "Calculating node correspondence scores." nodeCorrScoresInfo <- getNodeCorrScoresInfo nodeCorrScoresMap if unHelpful . test $ opts then T.putStr . maybe (error "Truth set not found.") (\x -> showAccuracy x idVec nodeCorrScoresInfo) $ truthSet else T.putStr . printNodeCorrScores idVec unifiedData nodeCorrScoresMap $ nodeCorrScoresInfo return ()