{-# OPTIONS_GHC -fglasgow-exts #-} ----------------------------------------------------------------------------- -- | -- Module : Numeric.Statistics.PCA -- Copyright : (c) Alexander Vivian Hugh McPhail 2010 -- License : GPL-style -- -- Maintainer : haskell.vivian.mcphail gmail com -- Stability : provisional -- Portability : portable -- -- Principal Components Analysis -- ----------------------------------------------------------------------------- module Numeric.Statistics.PCA ( pca, pcaTransform, pcaReduce ) where ----------------------------------------------------------------------------- import qualified Data.Array.IArray as I import Data.Packed.Vector import Data.Packed.Matrix import Numeric.LinearAlgebra.Interface import Numeric.LinearAlgebra.Algorithms import Numeric.GSL.Statistics import Numeric.Statistics ----------------------------------------------------------------------------- -- | find the n principal components of multidimensional data pca :: I.Array Int (Vector Double) -- the data -> Double -- eigenvalue threshold -> Matrix Double pca d q = let d' = fmap (\x -> x - (scalar $ mean x)) d -- remove the mean from each dimension cv = covarianceMatrix d' (val',vec') = eigSH cv -- the covariance matrix is real symmetric val = toList val' vec = toColumns vec' v' = zip val vec v = filter (\(x,_) -> x > q) v' -- keep only eigens > than parameter in fromColumns $ snd $ unzip v -- | perform a PCA transform of the original data (remove mean) -- | Final = M^T Data^T pcaTransform :: I.Array Int (Vector Double) -- ^ the data -> Matrix Double -- ^ the principal components -> I.Array Int (Vector Double) -- ^ the transformed data pcaTransform d m = let d' = fmap (\x -> x - (scalar $ mean x)) d -- remove the mean from each dimension in I.listArray (1,cols m) $ toRows $ (trans m) <> (fromRows $ I.elems d') -- | perform a dimension-reducing PCA modification pcaReduce :: I.Array Int (Vector Double) -- ^ the data -> Double -- ^ eigenvalue threshold -> I.Array Int (Vector Double) -- ^ the reduced data, with n principal components pcaReduce d q = let u = fmap (scalar . mean) d d' = zipWith (-) (I.elems d) (I.elems u) cv = covarianceMatrix $ I.listArray (I.bounds d) d' (val',vec') = eigSH cv -- the covariance matrix is real symmetric val = toList val' vec = toColumns vec' v' = zip val vec v = filter (\(x,_) -> x > q) v' -- keep only eigens > than parameter m = fromColumns $ snd $ unzip v in I.listArray (I.bounds d) $ zipWith (+) (toRows $ m <> (trans m) <> fromRows d') (I.elems u) -----------------------------------------------------------------------------