-- |
-- Module    : Statistics.Test.KruskalWallis
-- Copyright : (c) 2014 Danny Navarro
-- License   : BSD3
--
-- Maintainer  : bos@serpentine.com
-- Stability   : experimental
-- Portability : portable
--
module Statistics.Test.KruskalWallis
  ( -- * Kruskal-Wallis test
    kruskalWallisTest
    -- ** Building blocks
  , kruskalWallisRank
  , kruskalWallis
  , module Statistics.Test.Types
  ) where

import Data.Ord (comparing)
import qualified Data.Vector.Unboxed as U
import Statistics.Function (sort, sortBy, square)
import Statistics.Distribution (complCumulative)
import Statistics.Distribution.ChiSquared (chiSquared)
import Statistics.Types
import Statistics.Test.Types
import Statistics.Test.Internal (rank)
import Statistics.Sample
import qualified Statistics.Sample.Internal as Sample(sum)


-- | Kruskal-Wallis ranking.
--
-- All values are replaced by the absolute rank in the combined samples.
--
-- The samples and values need not to be ordered but the values in the result
-- are ordered. Assigned ranks (ties are given their average rank).
kruskalWallisRank :: (U.Unbox a, Ord a) => [U.Vector a] -> [U.Vector Double]
kruskalWallisRank :: [Vector a] -> [Vector Double]
kruskalWallisRank [Vector a]
samples = Vector (Int, Double) -> [Vector Double]
forall a. (Eq a, Unbox a) => Vector (a, Double) -> [Vector Double]
groupByTags
                          (Vector (Int, Double) -> [Vector Double])
-> (Vector Double -> Vector (Int, Double))
-> Vector Double
-> [Vector Double]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Comparison (Int, Double)
-> Vector (Int, Double) -> Vector (Int, Double)
forall (v :: * -> *) e. Vector v e => Comparison e -> v e -> v e
sortBy (((Int, Double) -> Int) -> Comparison (Int, Double)
forall a b. Ord a => (b -> a) -> b -> b -> Ordering
comparing (Int, Double) -> Int
forall a b. (a, b) -> a
fst)
                          (Vector (Int, Double) -> Vector (Int, Double))
-> (Vector Double -> Vector (Int, Double))
-> Vector Double
-> Vector (Int, Double)
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Vector Int -> Vector Double -> Vector (Int, Double)
forall a b.
(Unbox a, Unbox b) =>
Vector a -> Vector b -> Vector (a, b)
U.zip Vector Int
tags
                          (Vector Double -> [Vector Double])
-> Vector Double -> [Vector Double]
forall a b. (a -> b) -> a -> b
$ (a -> a -> Bool) -> Vector a -> Vector Double
forall (v :: * -> *) a.
(Vector v a, Vector v Double) =>
(a -> a -> Bool) -> v a -> v Double
rank a -> a -> Bool
forall a. Eq a => a -> a -> Bool
(==) Vector a
joinSample
  where
    (Vector Int
tags,Vector a
joinSample) = Vector (Int, a) -> (Vector Int, Vector a)
forall a b.
(Unbox a, Unbox b) =>
Vector (a, b) -> (Vector a, Vector b)
U.unzip
                      (Vector (Int, a) -> (Vector Int, Vector a))
-> (Vector (Int, a) -> Vector (Int, a))
-> Vector (Int, a)
-> (Vector Int, Vector a)
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Comparison (Int, a) -> Vector (Int, a) -> Vector (Int, a)
forall (v :: * -> *) e. Vector v e => Comparison e -> v e -> v e
sortBy (((Int, a) -> a) -> Comparison (Int, a)
forall a b. Ord a => (b -> a) -> b -> b -> Ordering
comparing (Int, a) -> a
forall a b. (a, b) -> b
snd)
                      (Vector (Int, a) -> (Vector Int, Vector a))
-> Vector (Int, a) -> (Vector Int, Vector a)
forall a b. (a -> b) -> a -> b
$ ((Int, Vector a) -> Vector (Int, a))
-> [(Int, Vector a)] -> Vector (Int, a)
forall (t :: * -> *) m a.
(Foldable t, Monoid m) =>
(a -> m) -> t a -> m
foldMap ((Int -> Vector a -> Vector (Int, a))
-> (Int, Vector a) -> Vector (Int, a)
forall a b c. (a -> b -> c) -> (a, b) -> c
uncurry Int -> Vector a -> Vector (Int, a)
forall b a. (Unbox b, Unbox a) => a -> Vector b -> Vector (a, b)
tagSample) ([(Int, Vector a)] -> Vector (Int, a))
-> [(Int, Vector a)] -> Vector (Int, a)
forall a b. (a -> b) -> a -> b
$ [Int] -> [Vector a] -> [(Int, Vector a)]
forall a b. [a] -> [b] -> [(a, b)]
zip [(Int
1::Int)..] [Vector a]
samples
    tagSample :: a -> Vector b -> Vector (a, b)
tagSample a
t = (b -> (a, b)) -> Vector b -> Vector (a, b)
forall a b. (Unbox a, Unbox b) => (a -> b) -> Vector a -> Vector b
U.map (\b
x -> (a
t,b
x))

    groupByTags :: Vector (a, Double) -> [Vector Double]
groupByTags Vector (a, Double)
xs
        | Vector (a, Double) -> Bool
forall a. Unbox a => Vector a -> Bool
U.null Vector (a, Double)
xs = []
        | Bool
otherwise = Vector Double -> Vector Double
sort (((a, Double) -> Double) -> Vector (a, Double) -> Vector Double
forall a b. (Unbox a, Unbox b) => (a -> b) -> Vector a -> Vector b
U.map (a, Double) -> Double
forall a b. (a, b) -> b
snd Vector (a, Double)
ys) Vector Double -> [Vector Double] -> [Vector Double]
forall a. a -> [a] -> [a]
: Vector (a, Double) -> [Vector Double]
groupByTags Vector (a, Double)
zs
      where
        (Vector (a, Double)
ys,Vector (a, Double)
zs) = ((a, Double) -> Bool)
-> Vector (a, Double) -> (Vector (a, Double), Vector (a, Double))
forall a.
Unbox a =>
(a -> Bool) -> Vector a -> (Vector a, Vector a)
U.span (a -> a -> Bool
forall a. Eq a => a -> a -> Bool
(==) ((a, Double) -> a
forall a b. (a, b) -> a
fst ((a, Double) -> a) -> (a, Double) -> a
forall a b. (a -> b) -> a -> b
$ Vector (a, Double) -> (a, Double)
forall a. Unbox a => Vector a -> a
U.head Vector (a, Double)
xs) (a -> Bool) -> ((a, Double) -> a) -> (a, Double) -> Bool
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (a, Double) -> a
forall a b. (a, b) -> a
fst) Vector (a, Double)
xs


-- | The Kruskal-Wallis Test.
--
-- In textbooks the output value is usually represented by 'K' or 'H'. This
-- function already does the ranking.
kruskalWallis :: (U.Unbox a, Ord a) => [U.Vector a] -> Double
kruskalWallis :: [Vector a] -> Double
kruskalWallis [Vector a]
samples = (Double
nTot Double -> Double -> Double
forall a. Num a => a -> a -> a
- Double
1) Double -> Double -> Double
forall a. Num a => a -> a -> a
* Double
numerator Double -> Double -> Double
forall a. Fractional a => a -> a -> a
/ Double
denominator
  where
    -- Total number of elements in all samples
    nTot :: Double
nTot    = Int -> Double
forall a b. (Integral a, Num b) => a -> b
fromIntegral (Int -> Double) -> Int -> Double
forall a b. (a -> b) -> a -> b
$ [Vector Double] -> (Vector Double -> Int) -> Int
forall a. Num a => [Vector Double] -> (Vector Double -> a) -> a
sumWith [Vector Double]
rsamples Vector Double -> Int
forall a. Unbox a => Vector a -> Int
U.length
    -- Average rank of all samples
    avgRank :: Double
avgRank = (Double
nTot Double -> Double -> Double
forall a. Num a => a -> a -> a
+ Double
1) Double -> Double -> Double
forall a. Fractional a => a -> a -> a
/ Double
2
    --
    numerator :: Double
numerator = [Vector Double] -> (Vector Double -> Double) -> Double
forall a. Num a => [Vector Double] -> (Vector Double -> a) -> a
sumWith [Vector Double]
rsamples ((Vector Double -> Double) -> Double)
-> (Vector Double -> Double) -> Double
forall a b. (a -> b) -> a -> b
$ \Vector Double
sample ->
        let n :: Double
n = Int -> Double
forall a b. (Integral a, Num b) => a -> b
fromIntegral (Int -> Double) -> Int -> Double
forall a b. (a -> b) -> a -> b
$ Vector Double -> Int
forall a. Unbox a => Vector a -> Int
U.length Vector Double
sample
        in  Double
n Double -> Double -> Double
forall a. Num a => a -> a -> a
* Double -> Double
square (Vector Double -> Double
forall (v :: * -> *). Vector v Double => v Double -> Double
mean Vector Double
sample Double -> Double -> Double
forall a. Num a => a -> a -> a
- Double
avgRank)
    denominator :: Double
denominator = [Vector Double] -> (Vector Double -> Double) -> Double
forall a. Num a => [Vector Double] -> (Vector Double -> a) -> a
sumWith [Vector Double]
rsamples ((Vector Double -> Double) -> Double)
-> (Vector Double -> Double) -> Double
forall a b. (a -> b) -> a -> b
$ \Vector Double
sample ->
        Vector Double -> Double
forall (v :: * -> *). Vector v Double => v Double -> Double
Sample.sum (Vector Double -> Double) -> Vector Double -> Double
forall a b. (a -> b) -> a -> b
$ (Double -> Double) -> Vector Double -> Vector Double
forall a b. (Unbox a, Unbox b) => (a -> b) -> Vector a -> Vector b
U.map (\Double
r -> Double -> Double
square (Double
r Double -> Double -> Double
forall a. Num a => a -> a -> a
- Double
avgRank)) Vector Double
sample

    rsamples :: [Vector Double]
rsamples = [Vector a] -> [Vector Double]
forall a. (Unbox a, Ord a) => [Vector a] -> [Vector Double]
kruskalWallisRank [Vector a]
samples


-- | Perform Kruskal-Wallis Test for the given samples and required
-- significance. For additional information check 'kruskalWallis'. This is just
-- a helper function.
--
-- It uses /Chi-Squared/ distribution for approximation as long as the sizes are
-- larger than 5. Otherwise the test returns 'Nothing'.
kruskalWallisTest :: (Ord a, U.Unbox a) => [U.Vector a] -> Maybe (Test ())
kruskalWallisTest :: [Vector a] -> Maybe (Test ())
kruskalWallisTest []      = Maybe (Test ())
forall a. Maybe a
Nothing
kruskalWallisTest [Vector a]
samples
  -- We use chi-squared approximation here
  | (Int -> Bool) -> [Int] -> Bool
forall (t :: * -> *) a. Foldable t => (a -> Bool) -> t a -> Bool
all (Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
>Int
4) [Int]
ns = Test () -> Maybe (Test ())
forall a. a -> Maybe a
Just Test :: forall distr. PValue Double -> Double -> distr -> Test distr
Test { testSignificance :: PValue Double
testSignificance = Double -> PValue Double
forall a. (Ord a, Num a) => a -> PValue a
mkPValue (Double -> PValue Double) -> Double -> PValue Double
forall a b. (a -> b) -> a -> b
$ ChiSquared -> Double -> Double
forall d. Distribution d => d -> Double -> Double
complCumulative ChiSquared
d Double
k
                            , testStatistics :: Double
testStatistics   = Double
k
                            , testDistribution :: ()
testDistribution = ()
                            }
  | Bool
otherwise   = Maybe (Test ())
forall a. Maybe a
Nothing
  where
    k :: Double
k  = [Vector a] -> Double
forall a. (Unbox a, Ord a) => [Vector a] -> Double
kruskalWallis [Vector a]
samples
    ns :: [Int]
ns = (Vector a -> Int) -> [Vector a] -> [Int]
forall a b. (a -> b) -> [a] -> [b]
map Vector a -> Int
forall a. Unbox a => Vector a -> Int
U.length [Vector a]
samples
    d :: ChiSquared
d  = Int -> ChiSquared
chiSquared ([Int] -> Int
forall (t :: * -> *) a. Foldable t => t a -> Int
length [Int]
ns Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
1)

-- * Helper functions

sumWith :: Num a => [Sample] -> (Sample -> a) -> a
sumWith :: [Vector Double] -> (Vector Double -> a) -> a
sumWith [Vector Double]
samples Vector Double -> a
f = [a] -> a
forall (t :: * -> *) a. (Foldable t, Num a) => t a -> a
Prelude.sum ([a] -> a) -> [a] -> a
forall a b. (a -> b) -> a -> b
$ (Vector Double -> a) -> [Vector Double] -> [a]
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap Vector Double -> a
f [Vector Double]
samples
{-# INLINE sumWith #-}