{-# LANGUAGE FlexibleContexts #-}
-- |
-- Module    : Statistics.Quantile
-- Copyright : (c) 2009 Bryan O'Sullivan
-- License   : BSD3
--
-- Maintainer  : bos@serpentine.com
-- Stability   : experimental
-- Portability : portable
--
-- Functions for approximating quantiles, i.e. points taken at regular
-- intervals from the cumulative distribution function of a random
-- variable.
--
-- The number of quantiles is described below by the variable /q/, so
-- with /q/=4, a 4-quantile (also known as a /quartile/) has 4
-- intervals, and contains 5 points.  The parameter /k/ describes the
-- desired point, where 0 ≤ /k/ ≤ /q/.

module Statistics.Quantile
    (
    -- * Quantile estimation functions
      weightedAvg
    , ContParam(..)
    , continuousBy
    , midspread

    -- * Parameters for the continuous sample method
    , cadpw
    , hazen
    , s
    , spss
    , medianUnbiased
    , normalUnbiased

    -- * References
    -- $references
    ) where

import Data.Vector.Generic ((!))
import Numeric.MathFunctions.Constants (m_epsilon)
import Statistics.Function (partialSort)
import qualified Data.Vector as V
import qualified Data.Vector.Generic as G
import qualified Data.Vector.Unboxed as U

-- | O(/n/ log /n/). Estimate the /k/th /q/-quantile of a sample,
-- using the weighted average method.
--
-- The following properties should hold:
--   * the length of the input is greater than @0@
--   * the input does not contain @NaN@
--   * k ≥ 0 and k ≤ q
--
-- otherwise an error will be thrown.
weightedAvg :: G.Vector v Double =>
               Int        -- ^ /k/, the desired quantile.
            -> Int        -- ^ /q/, the number of quantiles.
            -> v Double   -- ^ /x/, the sample data.
            -> Double
weightedAvg k q x
  | G.any isNaN x   = modErr "weightedAvg" "Sample contains NaNs"
  | n == 0          = modErr "weightedAvg" "Sample is empty"
  | n == 1          = G.head x
  | q < 2           = modErr "weightedAvg" "At least 2 quantiles is needed"
  | k == q          = G.maximum x
  | k >= 0 || k < q = xj + g * (xj1 - xj)
  | otherwise       = modErr "weightedAvg" "Wrong quantile number"
  where
    j   = floor idx
    idx = fromIntegral (n - 1) * fromIntegral k / fromIntegral q
    g   = idx - fromIntegral j
    xj  = sx ! j
    xj1 = sx ! (j+1)
    sx  = partialSort (j+2) x
    n   = G.length x
{-# SPECIALIZE weightedAvg :: Int -> Int -> U.Vector Double -> Double #-}
{-# SPECIALIZE weightedAvg :: Int -> Int -> V.Vector Double -> Double #-}

-- | Parameters /a/ and /b/ to the 'continuousBy' function.
data ContParam = ContParam {-# UNPACK #-} !Double {-# UNPACK #-} !Double

-- | O(/n/ log /n/). Estimate the /k/th /q/-quantile of a sample /x/,
-- using the continuous sample method with the given parameters.  This
-- is the method used by most statistical software, such as R,
-- Mathematica, SPSS, and S.
continuousBy :: G.Vector v Double =>
                ContParam  -- ^ Parameters /a/ and /b/.
             -> Int        -- ^ /k/, the desired quantile.
             -> Int        -- ^ /q/, the number of quantiles.
             -> v Double   -- ^ /x/, the sample data.
             -> Double
continuousBy (ContParam a b) k q x
  | q < 2          = modErr "continuousBy" "At least 2 quantiles is needed"
  | k < 0 || k > q = modErr "continuousBy" "Wrong quantile number"
  | G.any isNaN x  = modErr "continuousBy" "Sample contains NaNs"
  | otherwise      = (1-h) * item (j-1) + h * item j
  where
    j               = floor (t + eps)
    t               = a + p * (fromIntegral n + 1 - a - b)
    p               = fromIntegral k / fromIntegral q
    h | abs r < eps = 0
      | otherwise   = r
      where r       = t - fromIntegral j
    eps             = m_epsilon * 4
    n               = G.length x
    item            = (sx !) . bracket
    sx              = partialSort (bracket j + 1) x
    bracket m       = min (max m 0) (n - 1)
{-# SPECIALIZE
    continuousBy :: ContParam -> Int -> Int -> U.Vector Double -> Double #-}
{-# SPECIALIZE
    continuousBy :: ContParam -> Int -> Int -> V.Vector Double -> Double #-}

-- | O(/n/ log /n/). Estimate the range between /q/-quantiles 1 and
-- /q/-1 of a sample /x/, using the continuous sample method with the
-- given parameters.
--
-- For instance, the interquartile range (IQR) can be estimated as
-- follows:
--
-- > midspread medianUnbiased 4 (U.fromList [1,1,2,2,3])
-- > ==> 1.333333
midspread :: G.Vector v Double =>
             ContParam  -- ^ Parameters /a/ and /b/.
          -> Int        -- ^ /q/, the number of quantiles.
          -> v Double   -- ^ /x/, the sample data.
          -> Double
midspread (ContParam a b) k x
  | G.any isNaN x = modErr "midspread" "Sample contains NaNs"
  | k <= 0        = modErr "midspread" "Nonpositive number of quantiles"
  | otherwise     = quantile (1-frac) - quantile frac
  where
    quantile i        = (1-h i) * item (j i-1) + h i * item (j i)
    j i               = floor (t i + eps) :: Int
    t i               = a + i * (fromIntegral n + 1 - a - b)
    h i | abs r < eps = 0
        | otherwise   = r
        where r       = t i - fromIntegral (j i)
    eps               = m_epsilon * 4
    n                 = G.length x
    item              = (sx !) . bracket
    sx                = partialSort (bracket (j (1-frac)) + 1) x
    bracket m         = min (max m 0) (n - 1)
    frac              = 1 / fromIntegral k
{-# SPECIALIZE midspread :: ContParam -> Int -> U.Vector Double -> Double #-}
{-# SPECIALIZE midspread :: ContParam -> Int -> V.Vector Double -> Double #-}

-- | California Department of Public Works definition, /a/=0, /b/=1.
-- Gives a linear interpolation of the empirical CDF.  This
-- corresponds to method 4 in R and Mathematica.
cadpw :: ContParam
cadpw = ContParam 0 1

-- | Hazen's definition, /a/=0.5, /b/=0.5.  This is claimed to be
-- popular among hydrologists.  This corresponds to method 5 in R and
-- Mathematica.
hazen :: ContParam
hazen = ContParam 0.5 0.5

-- | Definition used by the SPSS statistics application, with /a/=0,
-- /b/=0 (also known as Weibull's definition).  This corresponds to
-- method 6 in R and Mathematica.
spss :: ContParam
spss = ContParam 0 0

-- | Definition used by the S statistics application, with /a/=1,
-- /b/=1.  The interpolation points divide the sample range into @n-1@
-- intervals.  This corresponds to method 7 in R and Mathematica.
s :: ContParam
s = ContParam 1 1

-- | Median unbiased definition, /a/=1\/3, /b/=1\/3. The resulting
-- quantile estimates are approximately median unbiased regardless of
-- the distribution of /x/.  This corresponds to method 8 in R and
-- Mathematica.
medianUnbiased :: ContParam
medianUnbiased = ContParam third third
    where third = 1/3

-- | Normal unbiased definition, /a/=3\/8, /b/=3\/8.  An approximately
-- unbiased estimate if the empirical distribution approximates the
-- normal distribution.  This corresponds to method 9 in R and
-- Mathematica.
normalUnbiased :: ContParam
normalUnbiased = ContParam ta ta
    where ta = 3/8

modErr :: String -> String -> a
modErr f err = error $ "Statistics.Quantile." ++ f ++ ": " ++ err

-- $references
--
-- * Weisstein, E.W. Quantile. /MathWorld/.
--   <http://mathworld.wolfram.com/Quantile.html>
--
-- * Hyndman, R.J.; Fan, Y. (1996) Sample quantiles in statistical
--   packages. /American Statistician/
--   50(4):361&#8211;365. <http://www.jstor.org/stable/2684934>