src/Math/Statistics/KSTest.hs

{-# LANGUAGE 
        MultiParamTypeClasses,
        FlexibleContexts, FlexibleInstances,
        GADTs,
        ParallelListComp
  #-}
module Math.Statistics.KSTest (ks, ksTest, KS(..)) where

import Control.Monad
import Data.Random
import Data.Random.Distribution.Kolmogorov (kCdf, kCdfQuick)
import Data.List (sort)

-- |Kolmogorov-Smirnov statistic for a set of data relative to a (continuous)
-- distribution with the given CDF.  Returns 3 common forms of the statistic:
-- (K+, K-, D), where K+ and K- are Smirnov's one-sided forms as presented in
-- Knuth's Semi-Numerical Algorithms (TAOCP, vol. 2)  and D is Kolmogorov's
-- undirected version.
-- 
-- In particular,
--
-- *   K+ = sup(\x -> F_n(x) - F(x))
-- *   K- = sup(\x -> F(x) - F_n(x))
-- *   D  = sup(\x -> abs(F_n(x) - F(x)))
--
ks f n xs = (kPlus, kMinus, d)
    where
        sqrt_n = sqrt (fromIntegral n)
        sorted = sort (map f xs)
        
        kPlus   = maximum [ j//n -  fx  | j <- [1..] | fx <- sorted]
        kMinus  = maximum [ fx   - j//n | j <- [0..] | fx <- sorted]
        d       = max kPlus kMinus
        
        infixl 7 //
        x//y = fromIntegral x / fromIntegral y

-- | @ksTest cdf xs@ 
-- Computes the probability of a random data set (of the same size as xs)
-- drawn from a continuous distribution with the given CDF having the same
-- Kolmogorov statistic as xs.
--
-- The statistic is the greatest absolute deviation of the empirical CDF of
-- XS from the assumed CDF @cdf@.
--
-- If the data were, in fact, drawn from a distribution with the given CDF,
-- then the resulting p-value should be uniformly distributed over (0,1].
ksTest f xs = 1 - kCdf n d
    where
        n = length xs
        
        (kPlus, kMinus, d) = ks f n xs

-- |'KS' distribution: not really a standard mathematical concept, but still
-- a nice conceptual shift.  @KS n d@ is the distribution of a random
-- variable constructed as a list of @n@ independent random variables of
-- distribution @d@.
-- 
-- The corresponding 'CDF' instance implements the K-S test for such lists.
-- For example, if @xs@ is a list of length 100 believed to contain Beta(2,5)
-- variates, then @cdf (KS 100 (Beta 2 5))@ is the K-S test for that distribution.
-- (Note that if @length xs@ is not 100, then the result will be 0 because
-- such lists cannot arise from that 'KS' distribution.  Somewhat arbitrarily,
-- all lists of \"impossible\" length are grouped at the bottom of the ordering
-- encoded by the 'CDF' instance.)
-- 
-- The 'KS' test can easily be applied recursively.
-- For example, if @d@ is a 'Distribution' of interest and you have 100 trials
-- each with 100 data points, you can test it by calling @cdf (KS 100 (KS 100 d))@.
data KS d a where
     KS :: !Int -> !(d a) -> KS d [a]

instance Eq (d a) => Eq (KS d [a]) where
    KS n1 d1 == KS n2 d2    = n1 == n2 && d1 == d2
instance Show (d a) => Show (KS d [a]) where
    showsPrec p (KS n d) = showParen (p > 10) 
        ( showString "KS " 
        . showsPrec 11 n
        . showChar ' '
        . showsPrec 11 d
        )

instance Distribution d a => Distribution (KS d) [a] where
    rvar (KS n d) = replicateM n (rvar d)

instance CDF d a => CDF (KS d) [a] where
    cdf (KS n dist) xs 
        | length (take (n+1) xs) == n       = 1 - kCdf n d
        | otherwise                         = 0
        where
            (kPlus, kMinus, d) = ks (cdf dist) n xs