{- Normalize
Gregory W. Schwartz

Collections the functions pertaining to the normalization of biological
data, where the rows are entities (genes or proteins) while the columns are
samples.
-}

{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE TupleSections #-}

module Normalize
    ( logTransform
    , normalize
    , normalizeBySample
    ) where

-- Standard
import Data.Ord
import Data.List
import Data.Maybe (fromMaybe)
import qualified Data.Map.Strict as Map
import qualified Data.Sequence as Seq
import qualified Data.Foldable as F
import Data.Function (on)

-- Cabal
import Control.Lens
import Statistics.Quantile
import qualified Data.Text as T
import qualified Data.Vector as V
import qualified Statistics.Sample as Stat

-- Local
import Types
import Utility

-- | Log transform the normalize map.
logTransform :: Base -> Map.Map Sample (V.Vector Entity) -> Map.Map Sample (V.Vector Entity)
logTransform (Base base) = (fmap . fmap) (over value (logBase 2))

-- | Normalize a sample by standard scores.
standardScore :: V.Vector Entity -> V.Vector Entity
standardScore xs = V.map (over value (\x -> (x - mu) / sigma)) xs
  where
    mu    = Stat.mean . V.map _value $ xs
    sigma = Stat.stdDev . V.map _value $ xs

-- | Normalize all samples by a specific method.
normalize :: Method
          -> Map.Map Sample (V.Vector Entity)
          -> Map.Map Sample (V.Vector Entity)
normalize StandardScore = Map.map standardScore
normalize UpperQuartile = Map.map upperQuartileNormalize
normalize method@QuantileMedian  = quantileNormalize method
normalize method@QuantileAverage = quantileNormalize method
normalize None          = id

-- | Normalize a sample (1) by another sample (2) by division. The
-- NormSampleString contains the string that differentiates (1) from (2).
-- NormSampleString must be within (2) and must make, upon its removal from (2),
-- (1). For instance, if we want to normalize "normalizeMe" by
-- "normalizeMeByThis", we would set this string to be "ByThis" so the
-- normalized values from "normalizeMe" are divided by the normalized values
-- from "normalizeMeByThis". This string must make the latter become the former,
-- so "By" would not work as it would become "normalizeMeThis".
normalizeBySample :: SynonymFlag
                  -> Maybe EntitySep
                  -> NormSampleString
                  -> Map.Map Sample (V.Vector Entity)
                  -> Map.Map Sample (V.Vector Entity)
normalizeBySample synonymFlag entitySep normSampleString =
    Map.map ( V.fromList
            . concatMap (divideBySample synonymFlag . reverse . sort)
            )
        . Map.map groupDivisors
        . Map.mapKeysWith
            (V.++)
            ( Sample
            . T.replace (unNormSampleString normSampleString) ""
            . unSample
            )
        . Map.mapWithKey (tagDivisors entitySep normSampleString)

-- | Partition divisors and dividends and divide.
groupDivisors :: V.Vector (EntityName, (Divisor, Entity))
              -> V.Vector [(Divisor, Entity)]
groupDivisors = V.fromList
              . fmap (F.toList . snd)
              . Map.toAscList
              . Map.fromListWith (Seq.><)
              . fmap (over _2 Seq.singleton)
              . V.toList

-- | The actual subtraction (Z scores) of dividends by divisor. If there are too
-- many divisors, we assume they are a synonym if the SynonymFlag is true, so
-- we only take into account the highest intensity synonym.
divideBySample :: SynonymFlag -> [(Divisor, Entity)] -> [Entity]
divideBySample _ []                                    =
    error $ "Empty division in divideBySample."
divideBySample _ [(Divisor True, _)]                   = []
divideBySample _ ((Divisor False, _):_)                = []
divideBySample (SynonymFlag True) all@((Divisor True, x):(Divisor True, y):_) =
    divideBySample (SynonymFlag False)
        . (: (filter (not . unDivisor . fst) all))
        . maximumBy (comparing (_value . snd))
        . filter (unDivisor . fst)
        $ all
divideBySample (SynonymFlag False) ((Divisor True, x):(Divisor True, y):_) =
    error $ "Too many divisors found including: "
         ++ (show x)
         ++ " and "
         ++ (show y)
divideBySample _ ((Divisor True, x):xs)                                    =
    fmap ((-~) value (_value x) . snd) xs

-- | Tag all divisors in a sample.
tagDivisors :: Maybe EntitySep
            -> NormSampleString
            -> Sample
            -> V.Vector Entity
            -> V.Vector (EntityName, (Divisor, Entity))
tagDivisors entitySep needle haystack =
    fmap (tagDivisor entitySep needle haystack)

-- | Tag divisor in a sample.
tagDivisor :: Maybe EntitySep
           -> NormSampleString
           -> Sample
           -> Entity
           -> (EntityName, (Divisor, Entity))
tagDivisor sep (NormSampleString needle) (Sample haystack) !e =
    ( entityName sep
    , ( Divisor . T.isInfixOf needle $ haystack
      , over sample (T.replace needle "") e
      )
    )
  where
    entityName :: (Maybe EntitySep) -> EntityName
    entityName Nothing              = EntityName . _entity $ e
    entityName (Just (EntitySep s)) =
        EntityName . head . T.splitOn s . _entity $ e

-- | Normalize by the upper quartile method, log 2 transformed.
upperQuartileNormalize :: V.Vector Entity -> V.Vector Entity
upperQuartileNormalize xs =
    fmap (over value (/ uqVal zeroFiltered)) zeroFiltered
  where
    zeroFiltered = V.filter ((> 0) . _value) xs
    uqVal = continuousBy (ContParam 1 1) 3 4 . fmap _value

-- | Quantile normalization. Important: assumes the same number of entities in
-- each sample.
quantileNormalize :: Method
                  -> Map.Map Sample (V.Vector Entity)
                  -> Map.Map Sample (V.Vector Entity)
quantileNormalize method mat =
    Map.map (fmap (over value (fromMaybe 0 . (V.!?) summaryVec . round))) rankMat
  where
    summaryFunc QuantileMedian  = medianVector
    summaryFunc QuantileAverage = avgVector
    summaryFunc _ = error "Unsupported method for quantile normalization."
    summaryVec =
        V.fromList . fmap (summaryFunc method) . transposeSamples $ sortMat
    sortMat    = Map.map sortVector mat
    rankMat    = Map.map rankVector mat