{-|
Module      : TimeSeries.Correlation
Copyright   : (C) 2013 Parallel Scientific Labs, LLC
License     : GPL-2
Stability   : experimental
Portability : portable

Functions for correlation analysis.
-}

module TimeSeries.Correlation where

import Data.Array.Base
import Data.List (sort)

import TimeSeries.PRG64
import TimeSeries.Window (Window(..))
import qualified TimeSeries.Window as W

-- | Direct computation of correlation of two windows X and Y.
-- Resulting value /r/ is:
--
-- <<http://bit.ly/15XPwC3>>
--
-- See: <http://en.wikipedia.org/wiki/Pearson_product-moment_correlation_coefficient#Definition>
--
-- Where: X̄ is mean of X, Xi is i-th element of X.
--
direct :: Window -> Window -> Double
direct (Window x) (Window y) = sigma_x_y_minus_means / sqrt_sigma_x_y_squared
  where
    sigma_x_y_minus_means =
      sum [ (xi-x_mean)*(yi-y_mean)
          | (xi,yi) <- zip (elems x) (elems y)]
    sqrt_sigma_x_y_squared = sqrt (sum xs' * sum ys') where
      xs' = [square (xi-x_mean) | xi <- elems x]
      ys' = [square (yi-y_mean) | yi <- elems y]
    x_mean = mean (elems x)
    y_mean = mean (elems y)

{-

Or:
       1            Xi-X̄      Yi-Ȳ
 r = ————— Σi=1,n( —————— )( –————— )
      n-1            sx        sy

where sx is standard deviation of x.
...r*2?

-}

-- | Naive correlation from sketch distance, using:
--
-- <<http://bit.ly/1fWIkjp>>
--
-- sketch distance is computated without incremental update.
--
sketch_naive ::
  [Window]
  -- ^ Random vectors, length of list is /d/ (the sketch size).
  -- Length of 'Window' is /sw/ (sliding window size).
  -> Int    -- ^ Size of sliding window.
  -> Window -- ^ First sliding window.
  -> Window -- ^ Second sliding window.
  -> Double
sketch_naive rs sz (Window wx) (Window wy) = 1 - (d'/len')
  where
    d' = sqrt $ sum ds
    ds = [ square (xj-yj)
         | (xi,yi) <- zip skX skY
         , let xj = (xi-avg_x) / var_x
         , let yj = (yi-avg_y) / var_y
         ]
    skX = [dotp (elems r) (elems wx)|Window r <- rs]
    skY = [dotp (elems r) (elems wy)|Window r <- rs]
    len' = 2 * (len x-1)
    avg_x = mean x
    avg_y = mean y
    var_x = (sum [square (xi - avg_x)|xi<-x]) / sz'
    var_y = (sum [square (yi - avg_y)|yi<-y]) / sz'
    x = elems wx
    y = elems wy
    sz' = fromIntegral sz

-- | Sketch distance based correlation.
sketch ::
  Window
  -- ^ Sketch of sliding window X
  -> Window
  -- ^ Sketch of sliding window Y
  -> Double
sketch wx wy = 1 - (d'/len') where
  d' = sum [square (x-y)|(x,y)<-zip (W.toList wx) (W.toList wy)]
  len' = 2 * fromIntegral (W.length wx)

{-
-- | Correlation from sketch distance with ∑i=0,nb-1(sum(X)) and
-- ∑i=0,nb-1(sum(X²)).
sketch' ::
  Int
  -- ^ Sketch size.
  -> Int
  -- ^ Sliding window size.
  -> (Window, Double, Double)
  -- ^ Triple of window X, ∑i=0,nb-1(sum(X)), and ∑i=0,nb-1(sum(X²)).
  -> (Window, Double, Double)
  -- ^ Triple of window Y, ∑i=0,nb-1(sum(Y)), and ∑i=0,nb-1(sum(Y²)).
  -> Double
sketch' d sw (Window wx, sx, sqx) (Window wy, sy, sqy) = 1 - (d'/len') where
  -- XXX: Not using sumx/sumsqx in sketch_distance.
  d' = square (sketch_distance d x' y')
  x' = [(xi-avg_x)/var_x|xi<-x]
  y' = [(yi-avg_y)/var_y|yi<-y]
  len' = 2 * sw'
  avg_x = sx / sw'
  avg_y = sy / sw'
  var_x = sqrt ((sqx/sw') - (square avg_x))
  var_y = sqrt ((sqy/sw') - (square avg_y))
  x = elems wx
  y = elems wy
  sw' = fromIntegral sw
-}

-- | Sketch distance.
sketch_distance ::
  Int         -- ^ Sketch size /d/.
  -> [Double] -- ^ First vector.
  -> [Double] -- ^ Second vector.
  -> Double
sketch_distance d x w = median norms / sqrt (fromIntegral d) where
  norms = map norm $ zipWith (zipWith (-)) ys zs
  median xs = sort xs !! (length xs `div` 2)
  --
  -- d random vectors with m elements:
  --
  -- > ri = {1, -1}ᵐ, i = [1,2,..,2b+1]
  --
  -- Random vectors are whole random vector, which means that not
  -- separated to pair of unit random vector and control vector.
  --
  b = 8
  m = length x
  prg64s = chunks m $ concat $ prg64Bits (prg64Init 0x9834219)
  (ys,zs) = foldr f ([],[]) [0..2*b] where
    f n (yacc,zacc) =
      let y = [dotp x r | r <- rs]
          z = [dotp w r | r <- rs]
          rs = take d $ drop (n*d) prg64s
      in  (y:yacc,z:zacc)

-- | Vector norm:
--
-- > norm as = sqrt $ sum [a^2|a <- as]
--
norm :: Floating a => [a] -> a
norm as = sqrt $ sum [square a|a <- as]

mean :: Fractional a => [a] -> a
mean zs = sum zs / len zs

stddev :: [Double] -> Double
stddev zs = sqrt (mean [square zi|zi<-zs] - (square (mean zs)))

stddev' :: Window -> Double
stddev' (Window x) = stddev $ elems x

-- XXX: Use i2f from or1200 somewhere to do conversion.
-- Length may handled with counting up a state, then used in floating
-- point division...
len :: Num b => [a] -> b
len zs = fromIntegral (length zs)

dotp :: Num a => [a] -> [a] -> a
dotp a b = sum $ zipWith (*) a b

chunks :: Int -> [a] -> [[a]]
chunks sz ys
  | null ys   = []
  | otherwise = let (pre,post) = splitAt sz ys in pre : chunks sz post

square :: Fractional a => a -> a
square = (^ (2::Int))