```{-# LANGUAGE TypeOperators #-}
-- |
-- Module    : Statistics.Quantile
-- Copyright : (c) 2009 Bryan O'Sullivan
--
-- Maintainer  : bos@serpentine.com
-- Stability   : experimental
-- Portability : portable
--
-- Functions for approximating quantiles, i.e. points taken at regular
-- intervals from the cumulative distribution function of a random
-- variable.
--
-- The number of quantiles is described below by the variable /q/, so
-- with /q/=4, a 4-quantile (also known as a /quartile/) has 4
-- intervals, and contains 5 points.  The parameter /k/ describes the
-- desired point, where 0 &#8804; /k/ &#8804; /q/.

module Statistics.Quantile
(
-- * Quantile estimation functions
weightedAvg
, ContParam(..)
, continuousBy

-- * Parameters for the continuous sample method
, hazen
, s
, spss
, medianUnbiased
, normalUnbiased

-- * References
-- \$references
) where

import Control.Exception (assert)
import Data.Array.Vector (allU, indexU, lengthU)
import Statistics.Constants (m_epsilon)
import Statistics.Function (partialSort)
import Statistics.Types (Sample)

-- | O(/n/ log /n/). Estimate the /k/th /q/-quantile of a sample,
-- using the weighted average method.
weightedAvg :: Int              -- ^ /k/, the desired quantile.
-> Int              -- ^ /q/, the number of quantiles.
-> Sample           -- ^ /x/, the sample data.
-> Double
weightedAvg k q x =
assert (q >= 2) .
assert (k >= 0) .
assert (k < q) .
assert (allU (not . isNaN) x) \$
xj + g * (xj1 - xj)
where
j   = floor idx
idx = fromIntegral (lengthU x - 1) * fromIntegral k / fromIntegral q
g   = idx - fromIntegral j
xj  = indexU sx j
xj1 = indexU sx (j+1)
sx  = partialSort (j+2) x
{-# INLINE weightedAvg #-}

-- | Parameters /a/ and /b/ to the 'continuousBy' function.
data ContParam = ContParam {-# UNPACK #-} !Double {-# UNPACK #-} !Double

-- | O(/n/ log /n/). Estimate the /k/th /q/-quantile of a sample /x/,
-- using the continuous sample method with the given parameters.  This
-- is the method used by most statistical software, such as R,
-- Mathematica, SPSS, and S.
continuousBy :: ContParam       -- ^ Parameters /a/ and /b/.
-> Int             -- ^ /k/, the desired quantile.
-> Int             -- ^ /q/, the number of quantiles.
-> Sample          -- ^ /x/, the sample data.
-> Double
continuousBy (ContParam a b) k q x =
assert (q >= 2) .
assert (k >= 0) .
assert (k <= q) .
assert (allU (not . isNaN) x) \$
(1-h) * item (j-1) + h * item j
where
j               = floor (t + eps)
t               = a + p * (fromIntegral n + 1 - a - b)
p               = fromIntegral k / fromIntegral q
h | abs r < eps = 0
| otherwise   = r
where r       = t - fromIntegral j
eps             = m_epsilon * 4
n               = lengthU x
item            = indexU sx . bracket
sx              = partialSort (bracket j + 1) x
bracket m       = min (max m 0) (n - 1)
{-# INLINE continuousBy #-}

-- | O(/n/ log /n/). Estimate the range between /q/-quantiles 1 and
-- /q/-1 of a sample /x/, using the continuous sample method with the
-- given parameters.
--
-- For instance, the interquartile range (IQR) can be estimated as
-- follows:
--
-- > midspread medianUnbiased 4 (toU [1,1,2,2,3])
-- > ==> 1.333333
midspread :: ContParam       -- ^ Parameters /a/ and /b/.
-> Int             -- ^ /q/, the number of quantiles.
-> Sample          -- ^ /x/, the sample data.
-> Double
midspread (ContParam a b) k x =
assert (allU (not . isNaN) x) .
assert (k > 0) \$
quantile (1-frac) - quantile frac
where
quantile i        = (1-h i) * item (j i-1) + h i * item (j i)
j i               = floor (t i + eps) :: Int
t i               = a + i * (fromIntegral n + 1 - a - b)
h i | abs r < eps = 0
| otherwise   = r
where r       = t i - fromIntegral (j i)
eps               = m_epsilon * 4
n                 = lengthU x
item              = indexU sx . bracket
sx                = partialSort (bracket (j (1-frac)) + 1) x
bracket m         = min (max m 0) (n - 1)
frac              = 1 / fromIntegral k

-- | California Department of Public Works definition, /a/=0, /b/=1.
-- Gives a linear interpolation of the empirical CDF.  This
-- corresponds to method 4 in R and Mathematica.

-- | Hazen's definition, /a/=0.5, /b/=0.5.  This is claimed to be
-- popular among hydrologists.  This corresponds to method 5 in R and
-- Mathematica.
hazen :: ContParam
hazen = ContParam 0.5 0.5
{-# INLINE hazen #-}

-- | Definition used by the SPSS statistics application, with /a/=0,
-- /b/=0 (also known as Weibull's definition).  This corresponds to
-- method 6 in R and Mathematica.
spss :: ContParam
spss = ContParam 0 0
{-# INLINE spss #-}

-- | Definition used by the S statistics application, with /a/=1,
-- /b/=1.  The interpolation points divide the sample range into @n-1@
-- intervals.  This corresponds to method 7 in R and Mathematica.
s :: ContParam
s = ContParam 1 1
{-# INLINE s #-}

-- | Median unbiased definition, /a/=1\/3, /b/=1\/3. The resulting
-- quantile estimates are approximately median unbiased regardless of
-- the distribution of /x/.  This corresponds to method 8 in R and
-- Mathematica.
medianUnbiased :: ContParam
medianUnbiased = ContParam third third
where third = 1/3
{-# INLINE medianUnbiased #-}

-- | Normal unbiased definition, /a/=3\/8, /b/=3\/8.  An approximately
-- unbiased estimate if the empirical distribution approximates the
-- normal distribution.  This corresponds to method 9 in R and
-- Mathematica.
normalUnbiased :: ContParam
normalUnbiased = ContParam ta ta
where ta = 3/8
{-# INLINE normalUnbiased #-}

-- \$references
--
-- * Weisstein, E.W. Quantile. /MathWorld/.
--   <http://mathworld.wolfram.com/Quantile.html>
--
-- * Hyndman, R.J.; Fan, Y. (1996) Sample quantiles in statistical
--   packages. /American Statistician/
--   50(4):361&#8211;365. <http://www.jstor.org/stable/2684934>
```