-- |
-- Module    : Statistics.Test.KruskalWallis
-- Copyright : (c) 2014 Danny Navarro
-- License   : BSD3
--
-- Maintainer  : bos@serpentine.com
-- Stability   : experimental
-- Portability : portable
--
module Statistics.Test.KruskalWallis
  ( kruskalWallisRank
  , kruskalWallis
  , kruskalWallisSignificant
  , kruskalWallisTest
  ) where

import Data.Ord (comparing)
import Data.Foldable (foldMap)
import qualified Data.Vector.Unboxed as U
import Statistics.Function (sort, sortBy, square)
import Statistics.Distribution (quantile)
import Statistics.Distribution.ChiSquared (chiSquared)
import Statistics.Test.Types (TestResult(..), significant)
import Statistics.Test.Internal (rank)
import Statistics.Sample
import qualified Statistics.Sample.Internal as Sample(sum)


-- | Kruskal-Wallis ranking.
--
-- All values are replaced by the absolute rank in the combined samples.
--
-- The samples and values need not to be ordered but the values in the result
-- are ordered. Assigned ranks (ties are given their average rank).
kruskalWallisRank :: [Sample] -> [Sample]
kruskalWallisRank samples = groupByTags
                          . sortBy (comparing fst)
                          . U.zip tags
                          $ rank (==) joinSample
  where
    (tags,joinSample) = U.unzip
                      . sortBy (comparing snd)
                      $ foldMap (uncurry tagSample) $ zip [(1::Int)..] samples
    tagSample t = U.map (\x -> (t,x))

    groupByTags xs
        | U.null xs = []
        | otherwise = sort (U.map snd ys) : groupByTags zs
      where
        (ys,zs) = U.span ((==) (fst $ U.head xs) . fst) xs


-- | The Kruskal-Wallis Test.
--
-- In textbooks the output value is usually represented by 'K' or 'H'. This
-- function already does the ranking.
kruskalWallis :: [Sample] -> Double
kruskalWallis samples = (nTot - 1) * numerator / denominator
  where
    -- Total number of elements in all samples
    nTot    = fromIntegral $ sumWith rsamples U.length
    -- Average rank of all samples
    avgRank = (nTot + 1) / 2
    --
    numerator = sumWith rsamples $ \sample ->
        let n = fromIntegral $ U.length sample
        in  n * square (mean sample - avgRank)
    denominator = sumWith rsamples $ \sample ->
        Sample.sum $ U.map (\r -> square (r - avgRank)) sample

    rsamples = kruskalWallisRank samples


-- | Calculates whether the Kruskal-Wallis test is significant.
--
-- It uses /Chi-Squared/ distribution for aproximation as long as the sizes are
-- larger than 5. Otherwise the test returns 'Nothing'.
kruskalWallisSignificant ::
       [Int]  -- ^ The samples' size
    -> Double -- ^ The p-value at which to test (e.g. 0.05)
    -> Double -- ^ K value from 'kruskallWallis'
    -> Maybe TestResult
kruskalWallisSignificant ns p k
    -- Use chi-squared approximation
    | all (>4) ns = Just . significant $ k > x
    -- TODO: Implement critical value calculation: kruskalWallisCriticalValue
    | otherwise = Nothing
  where
    x = quantile (chiSquared (length ns - 1)) (1 - p)

-- | Perform Kruskal-Wallis Test for the given samples and required
-- significance. For additional information check 'kruskalWallis'. This is just
-- a helper function.
kruskalWallisTest :: Double -> [Sample] -> Maybe TestResult
kruskalWallisTest p samples =
    kruskalWallisSignificant (map U.length samples) p $ kruskalWallis samples

-- * Helper functions

sumWith :: Num a => [Sample] -> (Sample -> a) -> a
sumWith samples f = Prelude.sum $ fmap f samples
{-# INLINE sumWith #-}