{-|
Module: MachineLearning.PCA
Description: Principal Component Analysis.
Copyright: (c) Alexander Ignatyev, 2017
License: BSD-3
Stability: experimental
Portability: POSIX

Principal Component Analysis (PCA) - dimensionality reduction algorithm.
It is mostly used to speed up supervising learning (Regression, Classification, etc) and visualization of data.
-}

module MachineLearning.PCA
(
  getDimReducer
  , getDimReducer_rv
)

where

import Data.Maybe (fromMaybe)
import qualified Data.Vector.Storable as V
import qualified Numeric.LinearAlgebra as LA
import Numeric.LinearAlgebra ((<>), (??))
import qualified MachineLearning as ML

import MachineLearning.Types (R, Vector, Matrix)


-- | Computes "covariance matrix".
covarianceMatrix :: Matrix -> Matrix
covarianceMatrix x = ((LA.tr x) <> x) / (fromIntegral $ LA.rows x)


-- | Compute eigenvectors (matrix U) and singular values (matrix S) of the given covariance matrix.
pca :: Matrix -> (Matrix, Vector)
pca x = (u, s)
  where sigma = covarianceMatrix x
        (u, s, _) = LA.svd sigma


-- | Gets dimensionality reduction function, retained variance (0..1) and reduced X
-- for given matrix X and number of dimensions to retain.
getDimReducer :: Matrix -> Int -> (Matrix -> Matrix, R, Matrix)
getDimReducer x k = (reducer, retainedVariance, reducer xNorm)
  where muSigma = ML.meanStddev x
        xNorm = ML.featureNormalization muSigma x
        (u, s) = pca xNorm
        u' = u ?? (LA.All, LA.Take k)
        reducer xx = (ML.featureNormalization muSigma xx) <> u'
        retainedVariance = (V.sum $ V.slice 0 k s) / (V.sum s)


-- | Gets dimensionality reduction function, retained number of dimensions and reduced X
-- for given matrix X and variance to retain (0..1].
getDimReducer_rv :: Matrix -> R -> (Matrix -> Matrix, Int, Matrix)
getDimReducer_rv x rv = (reducer, k, reducer xNorm)
  where muSigma = ML.meanStddev x
        xNorm = ML.featureNormalization muSigma x
        (u, s) = pca xNorm
        sum_s = V.sum s
        variances = (V.scanl' (+) 0 s) / (LA.scalar sum_s)
        k = fromMaybe ((V.length s) - 1) $ V.findIndex (\v -> v >= rv) variances
        u' = u ?? (LA.All, LA.Take k)
        reducer xx = (ML.featureNormalization muSigma xx) <> u'