```{-# LANGUAGE TypeFamilies, PatternGuards #-}
-----------------------------------------------------------------------------
-- |
-- Module      :  Statistics.Order
-- Copyright   :  (C) 2012 Edward Kmett,
--
-- Maintainer  :  Edward Kmett <ekmett@gmail.com>
-- Stability   :  provisional
-- Portability :  Haskell 2011 + TypeFamilies
--
----------------------------------------------------------------------------

module Statistics.Order
(
-- * L-Estimator
L(..)
-- ** Applying an L-estimator
, (@@), (@!)
-- ** Analyzing an L-estimator
, (@#)
, breakdown
-- ** Robust L-Estimators
, trimean  -- Tukey's trimean
, midhinge -- average of q1 and q3
, iqr      -- interquartile range
, iqm      -- interquartile mean
, lscale   -- second L-moment
-- ** L-Estimator Combinators
, trimmed
, winsorized, winsorised
, jackknifed
-- ** Trivial L-Estimators
, mean
, total
, lmin, lmax
, midrange
-- ** Sample-size-dependent L-Estimators
, nthSmallest
, nthLargest
-- ** Quantiles
-- *** Common quantiles
, quantile
, median
, tercile, t1, t2
, quartile, q1, q2, q3
, quintile, qu1, qu2, qu3, qu4
, percentile
, permille
-- *** Harrell-Davis Quantile Estimator
, hdquantile
-- *** Compute a quantile using a specified quantile estimation strategy
, quantileBy
-- * Sample Quantile Estimators
, Estimator
, Estimate(..)
, r1,r2,r3,r4,r5,r6,r7,r8,r9,r10
) where

import Data.Ratio
import Data.List (sort)
import Data.IntMap (IntMap)
import qualified Data.IntMap as IM
import Data.Vector (Vector, (!))
import qualified Data.Vector as V
import Data.VectorSpace
import Statistics.Distribution.Beta
import qualified Statistics.Distribution as D

-- | L-estimators are linear combinations of order statistics used by 'robust' statistics.
newtype L r = L { runL :: Int -> IntMap r }

-- TODO: Write code to test if the result of a given L-estimator will be always than or equal
-- to the result of another L-estimator at a given sample size.

-- | Calculate the result of applying an L-estimator after sorting list into order statistics
(@@) :: (Num r, Ord r) => L r -> [r] -> r
l @@ xs = l @! V.fromList (sort xs)

-- | Calculate the result of applying an L-estimator to a *pre-sorted* vector of order statistics
(@!) :: Num r => L r -> Vector r -> r
L f @! v = IM.foldrWithKey (\k x y -> (v ! k) * x + y) 0 \$ f (V.length v)

-- | get a vector of the coefficients of an L estimator when applied to an input of a given length
(@#) :: Num r => L r -> Int -> [r]
L f @# n = map (\k -> IM.findWithDefault 0 k fn) [0..n-1] where fn = f n

-- | calculate the breakdown % of an L-estimator
breakdown :: (Num r, Eq r) => L r -> Int
breakdown (L f)
| IM.null m = 50
| otherwise = fst (IM.findMin m) `min` (100 - fst (IM.findMax m))
where m = IM.filter (/= 0) \$ f 101

instance Num r => AdditiveGroup (L r) where
zeroV = L \$ \_ -> IM.empty
L fa ^+^ L fb = L \$ \n -> IM.unionWith (+) (fa n) (fb n)
negateV (L fa) = L \$ fmap negate . fa

instance Num r => VectorSpace (L r) where
type Scalar (L r) = r
x *^ L y = L \$ fmap (x *) . y

clamp :: Int -> Int -> Int
clamp n k
| k <= 0 = 0
| k >= n = n - 1
| otherwise = k

-- | The average of all of the order statistics. Not robust.
--
-- > breakdown mean = 0%
mean :: Fractional r => L r
mean = L \$ \n -> IM.fromList [ (i, 1 / fromIntegral n) | i <- [0..n-1]]

-- | The sum of all of the order statistics. Not robust.
--
-- > breakdown total = 0%
total :: Num r => L r
total = L \$ \n -> IM.fromList [ (i, 1) | i <- [0..n-1]]

-- | Calculate a trimmed L-estimator. If the sample size isn't evenly divided, linear interpolation is used
-- as described in <http://en.wikipedia.org/wiki/Trimmed_mean#Interpolation>

-- Trimming can increase the robustness of a statistic by removing outliers.

trimmed :: Fractional r => Rational -> L r -> L r
trimmed p (L g) = L \$ \n -> case properFraction (fromIntegral n * p) of
(w, 0)               -> IM.fromDistinctAscList [ (k + w, v)                        | (k,v) <- IM.toAscList \$ g (n - w*2)]
(w, f) | w' <- w + 1 -> IM.fromListWith (+) \$  [ (k + w, fromRational (1 - f) * v) | (k,v) <- IM.toList \$ g (n - w*2)] ++
[ (k + w', fromRational f  * v)     | (k,v) <- IM.toList \$ g (n - w'*2)]

-- | Calculates an interpolated winsorized L-estimator in a manner analogous to the trimmed estimator.
-- Unlike trimming, winsorizing replaces the extreme values.
winsorized, winsorised :: Fractional r => Rational -> L r -> L r
winsorised = winsorized
winsorized p (L g) = L \$ \n -> case properFraction (fromIntegral n * p) of
(w, 0)               -> IM.fromAscListWith (+) [ (w `max` min (n - 1 - w) k, v) | (k,v) <- IM.toAscList (g n) ]
(w, f) | w' <- w + 1 -> IM.fromListWith (+) \$ do
(k,v) <- IM.toList (g n)
[ (w  `max` min (n - 1 - w ) k, v * fromRational (1 - f)),
(w' `max` min (n - 1 - w') k, v * fromRational f)]

-- | Jackknifes the statistic by removing each sample in turn and recalculating the L-estimator,
-- requires at least 2 samples!
jackknifed :: Fractional r => L r -> L r
jackknifed (L g) = L \$ \n -> IM.fromAscListWith (+) \$ do
let n' = fromIntegral n
(k,v) <- IM.toAscList (g (n - 1))
let k' = fromIntegral k + 1
[(k, (n' - k') * v / n'), (k + 1, k' * v / n')]

-- | The most robust L-estimator possible.
--
-- > breakdown median = 50
median :: Fractional r => L r
median = L go where
go n
| odd n        = IM.singleton (div (n - 1) 2) 1
| k <- div n 2 = IM.fromList [(k-1, 0.5), (k, 0.5)]

-- | Sample quantile estimators
data Estimate r  = Estimate {-# UNPACK #-} !Rational (IntMap r)
type Estimator r = Rational -> Int -> Estimate r

-- | Compute a quantile using the given estimation strategy to interpolate when an exact quantile isn't available
quantileBy :: Num r => Estimator r -> Rational -> L r
quantileBy f p = L \$ \n -> case f p n of
Estimate h qp -> case properFraction h of
(w, 0) -> IM.singleton (clamp n (w - 1)) 1
_      -> qp

-- | Compute a quantile with traditional direct averaging
quantile :: Fractional r => Rational -> L r
quantile = quantileBy r2

tercile :: Fractional r => Rational -> L r
tercile n = quantile (n/3)

-- | terciles 1 and 2
--
-- > breakdown t1 = breakdown t2 = 33%
t1, t2 :: Fractional r => L r
t1 = quantile (1/3)
t2 = quantile (2/3)

quartile :: Fractional r => Rational -> L r
quartile n = quantile (n/4)

-- | quantiles, with breakdown points 25%, 50%, and 25% respectively
q1, q2, q3 :: Fractional r => L r
q1 = quantile 0.25
q2 = median
q3 = quantile 0.75

quintile :: Fractional r => Rational -> L r
quintile n = quantile (n/5)

-- | quintiles 1 through 4
qu1, qu2, qu3, qu4 :: Fractional r => L r
qu1 = quintile 1
qu2 = quintile 2
qu3 = quintile 3
qu4 = quintile 4

-- |
--
-- > breakdown (percentile n) = min n (100 - n)
percentile :: Fractional r => Rational -> L r
percentile n = quantile (n/100)

permille :: Fractional r => Rational -> L r
permille n = quantile (n/1000)

nthSmallest :: Num r => Int -> L r
nthSmallest k = L \$ \n -> IM.singleton (clamp n k) 1

nthLargest :: Num r => Int -> L r
nthLargest k = L \$ \n -> IM.singleton (clamp n (n - 1 - k)) 1

-- |
--
-- > midhinge = trimmed 0.25 midrange
-- > breakdown midhinge = 25%
midhinge :: Fractional r => L r
midhinge = (q1 ^+^ q3) ^/ 2

-- | Tukey's trimean
--
-- > breakdown trimean = 25
trimean :: Fractional r => L r
trimean = (q1 ^+^ 2 *^ q2 ^+^ q3) ^/ 4

-- | The maximum value in the sample
lmax :: Num r => L r
lmax = L \$ \n -> IM.singleton (n-1) 1

-- | The minimum value in the sample
lmin :: Num r => L r
lmin = L \$ \_ -> IM.singleton 0 1

-- |
-- > midrange = lmax - lmin
-- > breakdown midrange = 0%
midrange :: Fractional r => L r
midrange = L \$ \n -> IM.fromList [(0,-1),(n-1,1)]

-- | interquartile range
--
-- > breakdown iqr = 25%
-- > iqr = trimmed 0.25 midrange
iqr :: Fractional r => L r
iqr = q3 ^-^ q1

-- | interquartile mean
--
-- > iqm = trimmed 0.25 mean
iqm :: Fractional r => L r
iqm = trimmed 0.25 mean

-- | Direct estimator for the second L-moment given a sample
lscale :: Fractional r => L r
lscale = L \$ \n -> let
r = fromIntegral n
scale = 1 / (r * (r-1))
in IM.fromList [ (i - 1, scale * (2 * fromIntegral i - 1 - r)) | i <- [1..n] ]

-- | The Harrell-Davis quantile estimate. Uses multiple order statistics to approximate the quantile
-- to reduce variance.
hdquantile :: Fractional r => Rational -> L r
hdquantile q = L \$ \n ->
let n' = fromIntegral n
np1 = n' + 1
q' = fromRational q
d = betaDistr (q'*np1) (np1*(1-q')) in
if q == 0 then IM.singleton 0 1
else if q == 1 then IM.singleton (n - 1) 1
else IM.fromAscList
[ (i, realToFrac \$ D.cumulative d ((fromIntegral i + 1) / n') -
D.cumulative d (fromIntegral i / n'))
| i <- [0 .. n - 1]
]

-- More information on the individual estimators used below can be found in:
-- http://stat.ethz.ch/R-manual/R-devel/library/stats/html/quantile.html
-- and
-- http://en.wikipedia.org/wiki/Quantile#Estimating_the_quantiles_of_a_population

-- | Inverse of the empirical distribution function
r1 :: Num r => Estimator r
r1 p n = Estimate (np + 1%2) \$ IM.singleton (clamp n (ceiling np - 1)) 1
where np = fromIntegral n * p

-- | .. with averaging at discontinuities
r2 :: Fractional r => Estimator r
r2 p n =  Estimate (np + 1%2) \$
if p == 0      then IM.singleton 0       1
else if p == 1 then IM.singleton (n - 1) 1
else IM.fromList [(ceiling np - 1, 0.5), (floor np, 0.5)]
where np = fromIntegral n * p

-- | The observation numbered closest to Np. NB: does not yield a proper median
r3 :: Num r => Estimator r
r3 p n = Estimate np \$ IM.singleton (clamp n (round np - 1)) 1
where np = fromIntegral n * p

-- continuous sample quantiles
continuousEstimator ::
Fractional r =>
(Rational -> (Rational, Rational)) -> -- take the number of samples, and return upper and lower bounds on 'p = k/n' for which this interpolation should be used
(Rational -> Rational -> Rational) -> -- take p = k/q, and n the number of samples, and return the coefficient h which will be used for interpolation when h is not integral
Estimator r
continuousEstimator bds f p n = Estimate h \$
if p < lo then IM.singleton 0 1
else if p >= hi then IM.singleton (n - 1) 1
else case properFraction h of
(w,frac) | frac' <- fromRational frac -> IM.fromList [(w - 1, frac'), (w, 1 - frac')]
where
r = fromIntegral n
h = f p r
(lo, hi) = bds r

-- | Linear interpolation of the empirical distribution function. NB: does not yield a proper median.
r4 :: Fractional r => Estimator r
r4 = continuousEstimator (\n -> (1 / n, 1)) (*)

-- | .. with knots midway through the steps as used in hydrology. This is the simplest continuous estimator that yields a correct median
r5 :: Fractional r => Estimator r
r5 = continuousEstimator (\n -> let tn = 2 * n in (1 / tn, (tn - 1) / tn)) \$ \p n -> p*n + 0.5

-- | Linear interpolation of the expectations of the order statistics for the uniform distribution on [0,1]
r6 :: Fractional r => Estimator r
r6 = continuousEstimator (\n -> (1 / (n + 1), n / (n + 1))) \$ \p n -> p*(n+1)

-- | Linear interpolation of the modes for the order statistics for the uniform distribution on [0,1]
r7 :: Fractional r => Estimator r
r7 = continuousEstimator (\_ -> (0, 1)) \$ \p n -> p*(n-1) + 1

-- | Linear interpolation of the approximate medans for order statistics.
r8 :: Fractional r => Estimator r
r8 = continuousEstimator (\n -> (2/3 / (n + 1/3), (n - 1/3)/(n + 1/3))) \$ \p n -> p*(n + 1/3) + 1/3

-- | The resulting quantile estimates are approximately unbiased for the expected order statistics if x is normally distributed.
r9 :: Fractional r => Estimator r
r9 = continuousEstimator (\n -> (0.625 / (n + 0.25), (n - 0.375)/(n + 0.25))) \$ \p n -> p*(n + 0.25) + 0.375

-- | When rounding h, this yields the order statistic with the least expected square deviation relative to p.
r10 :: Fractional r => Estimator r
r10 = continuousEstimator (\n -> (1.5 / (n + 2), (n + 0.5)/(n + 2))) \$ \p n -> p*(n + 2) - 0.5

```