```{-# LANGUAGE ScopedTypeVariables, FlexibleContexts #-}
-- | Nonlinear Iterative Partial Least Squares
module Numeric.LinearAlgebra.NIPALS
( -- * Simplified Interface
firstPC
, firstPCFromScores
, firstPCFromScoresM
, residual
) where

import Data.List (foldl1')
import Numeric.LinearAlgebra

-- | Calculate the first principal component of a set of samples.
--
-- Each row in the matrix is one sample. Note that this is transposed
-- compared to the implementation of principal components using 'svd'
-- or 'leftSV'
--
-- Example:
--
-- > let (pc,scores,residuals) = firstPC \$ fromRows samples
--
-- This is calculated by providing a default estimate of the scores to
-- 'firstPCFromScores'
firstPC :: Matrix Double -> (Vector Double, Vector Double, Matrix Double)
firstPC m = firstPCFromScores m t0
where
t0 = head \$ toColumns m

-- | Calculate the first principal component of a set of samples given
-- a starting estimate of the scores.
--
-- Each row in the matrix is one sample. Note that this is transposed
-- compared to the implementation of principal components using 'svd'
-- or 'leftSV'
--
-- The second argument is a starting guess for the score vector. If
-- this is close to the actual score vector, then this will cause the
-- algorthm to converge much faster.
--
-- Example:
--
-- > let (pc,scores,residuals) = firstPCFromScores (fromRows samples) scoresGuess
--
firstPCFromScores :: Matrix Double
-> Vector Double
-> (Vector Double, Vector Double, Matrix Double)
firstPCFromScores m t0 = (p,t,r)
where
steps = iterate refine (t0,undefined)
convergence = let scores = map fst steps
dscores = zipWith diffScores scores \$ tail scores
in dscores
(t,p) = let steps' = zip convergence \$ tail steps
steps'' = dropWhile (\(c,_) -> c > threshold) steps'
r = m `sub` (t `outer` p)

refine (t,_) = (t', p')
where
p' = toUnit (trans m <> t)
t' = m <> p'

threshold = 16*eps

diffScores :: Vector Double -> Vector Double -> Double
diffScores ta tb = let xa = ta<.>ta
xb = tb<.>tb
x = abs \$ xa - xb
in (x / (eps+xb)) :: Double

toUnit :: Vector Double -> Vector Double
toUnit v = if mag <= 0.0 then dim v |> (1 : repeat 0) else scale (1/mag) v
where
mag = norm2 v

-- | Calculate the first principal component -- calculating the
-- samples fresh on every pass.
--
-- This function calculates the exact same results as
-- 'firstPCFromScores' (minus the residual), but instead of an input
-- 'Matrix', it takes a monad action that yields the list of samples,
-- and it guarantees that the list returned by the action will be
-- consumed in a single pass. However the action may be demanded many
-- times.
--
-- The residual can't be calculated lazily, like it is in
-- 'firstPCFromScores', because the samples would need to be
-- demanded. Instead, to calculate the residual use 'residual'.
--
-- There is no corresponding @firstPCM@ that guesses the initial score
-- vector for you because if you need to use this function instead of
-- 'firstPC', then you really should come up with a reasonable
-- starting point or it will take forever.
m [Vector Double]
-> Vector Double
-> m (Vector Double, Vector Double)
firstPCFromScoresM samplesM t0 = do
(t,p) <- refine t0
iter t0 t p
where
refine t = do
p <- samplesM >>= (\s -> return \$ transMult1Pass s t)
let p' = toUnit p
t' <- samplesM >>= (\s -> return \$ mult1Pass s p')
return (t',p')
threshold = 16 * eps
iter t0 t1 p1 | diff < threshold = return \$ (p1,t1)
| otherwise = do
(t2,p2) <- refine t1
iter t1 t2 p2
where
diff = diffScores t0 t1

-- Single pass versions of (trans m <> v) and (m <> v) respectively,
-- where m is a list of rows
transMult1Pass, mult1Pass :: [Vector Double] -> Vector Double -> Vector Double
mult1Pass rows vec = fromList \$ map (dot vec) rows
transMult1Pass cols vec = foldl1' add \$ zipWith scale (toList vec) cols

-- | Calculate the residuals of a series of samples given a component
-- and score vector.
--
-- > (p,t) <- firstPCFromScoresM samplesM (randomVector 0 Gaussian numSamples)
-- > samples <- samplesM
-- > let r = residual samples p t
residual :: [Vector Double] -- ^ The samples