{- normalize
Gregory W. Schwartz

Normalizes the data (entities, for instance genes or proteins) by column
(samples).
-}

{-# LANGUAGE DeriveGeneric #-}
{-# LANGUAGE DataKinds #-}
{-# LANGUAGE TypeOperators #-}
{-# LANGUAGE OverloadedStrings #-}

module Main where

-- Standard
import Data.Maybe
import qualified Data.Map.Strict as Map
import Text.Read (readMaybe)

-- Cabal
import qualified Data.Vector as V
import qualified Data.ByteString.Lazy.Char8 as CL
import qualified Data.Text as T
import qualified Data.Csv as CSV
import Options.Generic

-- Local
import Types
import Load
import Normalize
import Filter

-- | Command line arguments. Every field is wrapped with '<?>' so that the
-- help text is attached to the generated option; values are unwrapped with
-- 'unHelpful' in 'main'.
data Options = Options
    { labelField :: Maybe T.Text
        <?> "The column containing the label for the entry."
    , sampleField :: T.Text
        <?> "The column containing the sample for the entry."
    , entityField :: T.Text
        <?> "The column containing the id for the entity in the entry."
    , valueField :: T.Text
        <?> "The column field containing the value for the entry."
    , entityDiff :: Maybe T.Text
        <?> "When comparing entities that are the same, ignore the text after this separator. Used for the bySample normalization. For example, if we have a strings ARG29_5 and ARG29_7 that we both want to be divided by another entity in another sample called ARG29, we would set this string to be \"_\""
    , bySample :: Maybe T.Text
        <?> "Normalize as usual, but at the end use this string to differentiate the sample field from the normalization samples, then divide the matching samples with these samples and renormalize. For instance, if we want to normalize \"normalizeMe\" by \"normalizeMeByThis\", we would set this string to be \"ByThis\" so the normalized values from \"normalizeMe\" are divided by the normalized values from \"normalizeMeByThis\". This string must make the latter become the former, so \"By\" would not work as it would become \"normalizeMeThis\". If there is no divisor, we remove that entity."
    , bySampleRemoveSynonyms :: Bool
        <?> "When normalizing by sample, if the divisor appears multiple times we assume those are synonyms. Here, we would remove the synonym with the smaller intensity. If not set, errors out and provides the synonym name."
    , method :: Maybe String
        <?> "([StandardScore] | UpperQuartile | QuantileMedian | QuantileAverage | None) The method for standardization of the samples. The Quantile* methods expect the same number of entities for each sample. "
    , filterEntitiesMissing :: Maybe Int
        <?> "([0] | INT) Whether to remove entities that appear less than this many times after normalizing."
    , filterEntitiesValue :: Maybe Double
        <?> "([Nothing] | DOUBLE) Whether to remove entities in filterEntitiesMissing but also counting entities with a value of this or less as missing."
    , filterEntitiesStdDev :: Maybe Double
        <?> "([Nothing] | DOUBLE) Remove entities that have less than this value for their standard deviation among all samples they appear in, after normalization."
    , base :: Maybe Double
        <?> "([Nothing] | DOUBLE) Log transform the data at the end using this base but before filtering."
    } deriving (Generic, Show)

instance ParseRecord Options

-- | Entry point: parse the command line, read a CSV with a header row from
-- stdin, normalize the values, optionally divide by the matching
-- normalization samples, optionally log transform, filter the entities, and
-- print the result as CSV on stdout.
main :: IO ()
main = do
    opts <- getRecord "normalize, Gregory W. Schwartz.\
                      \ Normalizes the data (entities, for instance genes or\
                      \ proteins) by column (samples). Can read stdin."

    -- Rows are decoded by column name; the decoded header itself is not
    -- needed afterwards and is discarded. A CSV parse failure aborts with
    -- the decoder's message.
    (_, csvContents) <-
        fmap (either error id . CSV.decodeByName) CL.getContents

    let synonymFlag      =
            SynonymFlag . unHelpful . bySampleRemoveSynonyms $ opts
        eSep             = fmap EntitySep . unHelpful . entityDiff $ opts
        sampleDiff       = fmap NormSampleString . unHelpful . bySample $ opts
        filterNumSamples =
            NumSamples . fromMaybe 0 . unHelpful . filterEntitiesMissing $ opts
        filterValue      =
            fmap ValueThreshold . unHelpful . filterEntitiesValue $ opts
        filterStdDev     =
            fmap StdDevThreshold . unHelpful . filterEntitiesStdDev $ opts
        logBaseTransform = fmap Base . unHelpful . base $ opts

        -- Defaults to StandardScore when no method is given. 'readMaybe'
        -- is used instead of 'read' so that an unrecognized name reports
        -- the offending value rather than "Prelude.read: no parse".
        normMethod = case unHelpful (method opts) of
            Nothing -> StandardScore
            Just s  -> fromMaybe (error (badMethod s)) (readMaybe s)
        badMethod s =
            "Unrecognized method: " ++ s
                ++ " (expected StandardScore | UpperQuartile |"
                ++ " QuantileMedian | QuantileAverage | None)"

        entities = V.map
            ( csvRowToEntity
                (fmap Field . unHelpful $ labelField opts)
                (Field . unHelpful $ sampleField opts)
                (Field . unHelpful $ entityField opts)
                (Field . unHelpful $ valueField opts)
            )
            csvContents
        sampleMap    = toSampleMap entities
        normalizeMap = normalize normMethod

        -- The pipeline runs bottom to top: normalize, then (only when
        -- bySample is set) divide by the normalization samples and
        -- normalize again, then (only when base is set) log transform,
        -- and finally filter.
        result = filterEntitiesBy filterValue filterStdDev filterNumSamples
               . (\x -> maybe x (flip logTransform x) logBaseTransform)
               . (\x -> maybe
                            x
                            ( normalizeMap
                            . flip (normalizeBySample synonymFlag eSep) x
                            )
                            sampleDiff
                 )
               . normalizeMap
               $ sampleMap

        -- Everything up to the first newline of the encoder's output (its
        -- generated header line) is dropped and replaced by a fixed header.
        formatted = CL.append (CL.pack "label,sample,entity,numSamples,value")
                  . CL.dropWhile (/= '\n')
                  . CSV.encodeDefaultOrderedByName
                  . concatMap V.toList
                  . Map.elems
                  $ result

    CL.putStrLn formatted