src/Numeric/LinearAlgebra/NIPALS.hs

{-# LANGUAGE ScopedTypeVariables, FlexibleContexts #-}
-- | Nonlinear Iterative Partial Least Squares
module Numeric.LinearAlgebra.NIPALS
    ( -- * Simplified Interface
      firstPC
    , firstPCFromScores
      -- * Monadic interface
    , firstPCFromScoresM
    , residual
    ) where

import Data.List (foldl1')
import Numeric.LinearAlgebra

-- | Calculate the first principal component of a set of samples.
--
-- Each row in the matrix is one sample. Note that this is transposed
-- compared to the implementation of principal components using 'svd'
-- or 'leftSV'
--
-- Example:
-- 
-- > let (pc,scores,residuals) = firstPC $ fromRows samples
-- 
-- This is calculated by providing a default estimate of the scores to
-- 'firstPCFromScores'
firstPC :: Matrix Double -> (Vector Double, Vector Double, Matrix Double)
firstPC m = firstPCFromScores m t0
    where
      t0 = head $ toColumns m

-- | Calculate the first principal component of a set of samples given
-- a starting estimate of the scores.
--
-- Each row in the matrix is one sample. Note that this is transposed
-- compared to the implementation of principal components using 'svd'
-- or 'leftSV'
--
-- The second argument is a starting guess for the score vector. If
-- this is close to the actual score vector, then this will cause the
-- algorthm to converge much faster.
--
-- Example:
-- 
-- > let (pc,scores,residuals) = firstPCFromScores (fromRows samples) scoresGuess
-- 
firstPCFromScores :: Matrix Double
                  -> Vector Double
                  -> (Vector Double, Vector Double, Matrix Double)
firstPCFromScores m t0 = (p,t,r)
    where
      steps = iterate refine (t0,undefined)
      convergence = let scores = map fst steps
                        dscores = zipWith diffScores scores $ tail scores
                    in dscores
      (t,p) = let steps' = zip convergence $ tail steps
                  steps'' = dropWhile (\(c,_) -> c > threshold) steps'
              in snd $ head steps''
      r = m `sub` (t `outer` p)

      refine (t,_) = (t', p')
          where
            p' = toUnit (trans m <> t)
            t' = m <> p'

      threshold = 16*eps

diffScores :: Vector Double -> Vector Double -> Double
diffScores ta tb = let xa = ta<.>ta
                       xb = tb<.>tb
                       x = abs $ xa - xb
                   in (x / (eps+xb)) :: Double

toUnit :: Vector Double -> Vector Double
toUnit v = if mag <= 0.0 then dim v |> (1 : repeat 0) else scale (1/mag) v
    where
      mag = norm2 v

-- | Calculate the first principal component -- calculating the
-- samples fresh on every pass.
--
-- This function calculates the exact same results as
-- 'firstPCFromScores' (minus the residual), but instead of an input
-- 'Matrix', it takes a monad action that yields the list of samples,
-- and it guarantees that the list returned by the action will be
-- consumed in a single pass. However the action may be demanded many
-- times.
--
-- The residual can't be calculated lazily, like it is in
-- 'firstPCFromScores', because the samples would need to be
-- demanded. Instead, to calculate the residual use 'residual'.
--
-- There is no corresponding @firstPCM@ that guesses the initial score
-- vector for you because if you need to use this function instead of
-- 'firstPC', then you really should come up with a reasonable
-- starting point or it will take forever.
firstPCFromScoresM :: Monad m =>
                      m [Vector Double]
                   -> Vector Double
                   -> m (Vector Double, Vector Double)
firstPCFromScoresM samplesM t0 = do
  (t,p) <- refine t0
  iter t0 t p
      where
        refine t = do
          p <- samplesM >>= (\s -> return $ transMult1Pass s t)
          let p' = toUnit p
          t' <- samplesM >>= (\s -> return $ mult1Pass s p')
          return (t',p')
        threshold = 16 * eps
        iter t0 t1 p1 | diff < threshold = return $ (p1,t1)
                      | otherwise = do
                                   (t2,p2) <- refine t1
                                   iter t1 t2 p2
                     where
                       diff = diffScores t0 t1

-- Single pass versions of (trans m <> v) and (m <> v) respectively,
-- where m is a list of rows
transMult1Pass, mult1Pass :: [Vector Double] -> Vector Double -> Vector Double
mult1Pass rows vec = fromList $ map (dot vec) rows
transMult1Pass cols vec = foldl1' add $ zipWith scale (toList vec) cols

-- | Calculate the residuals of a series of samples given a component
-- and score vector.
--
-- > (p,t) <- firstPCFromScoresM samplesM (randomVector 0 Gaussian numSamples)
-- > samples <- samplesM
-- > let r = residual samples p t
residual :: [Vector Double] -- ^ The samples
         -> Vector Double   -- ^ The component (also called the loading)
         -> Vector Double   -- ^ The scores
         -> [Vector Double] -- ^ The residuals for each sample
residual s p t = r
    where
      ts = toList t
      comp = map (`scale` p) ts
      r = zipWith sub s comp