{-# LANGUAGE FlexibleContexts #-} -- | -- Module : Statistics.Quantile -- Copyright : (c) 2009 Bryan O'Sullivan -- License : BSD3 -- -- Maintainer : bos@serpentine.com -- Stability : experimental -- Portability : portable -- -- Functions for approximating quantiles, i.e. points taken at regular -- intervals from the cumulative distribution function of a random -- variable. -- -- The number of quantiles is described below by the variable /q/, so -- with /q/=4, a 4-quantile (also known as a /quartile/) has 4 -- intervals, and contains 5 points. The parameter /k/ describes the -- desired point, where 0 ≤ /k/ ≤ /q/. module Statistics.Quantile ( -- * Quantile estimation functions weightedAvg , ContParam(..) , continuousBy , midspread -- * Parameters for the continuous sample method , cadpw , hazen , s , spss , medianUnbiased , normalUnbiased -- * References -- $references ) where import Control.Exception (assert) import Data.Vector.Generic ((!)) import Numeric.MathFunctions.Constants (m_epsilon) import Statistics.Function (partialSort) import qualified Data.Vector.Generic as G -- | O(/n/ log /n/). Estimate the /k/th /q/-quantile of a sample, -- using the weighted average method. weightedAvg :: G.Vector v Double => Int -- ^ /k/, the desired quantile. -> Int -- ^ /q/, the number of quantiles. -> v Double -- ^ /x/, the sample data. -> Double weightedAvg k q x | n == 1 = G.head x | otherwise = assert (q >= 2) . assert (k >= 0) . assert (k < q) . assert (G.all (not . isNaN) x) $ xj + g * (xj1 - xj) where j = floor idx idx = fromIntegral (n - 1) * fromIntegral k / fromIntegral q g = idx - fromIntegral j xj = sx ! j xj1 = sx ! (j+1) sx = partialSort (j+2) x n = G.length x {-# INLINE weightedAvg #-} -- | Parameters /a/ and /b/ to the 'continuousBy' function. data ContParam = ContParam {-# UNPACK #-} !Double {-# UNPACK #-} !Double -- | O(/n/ log /n/). Estimate the /k/th /q/-quantile of a sample /x/, -- using the continuous sample method with the given parameters. This -- is the method used by most statistical software, such as R, -- Mathematica, SPSS, and S. continuousBy :: G.Vector v Double => ContParam -- ^ Parameters /a/ and /b/. -> Int -- ^ /k/, the desired quantile. -> Int -- ^ /q/, the number of quantiles. -> v Double -- ^ /x/, the sample data. -> Double continuousBy (ContParam a b) k q x = assert (q >= 2) . assert (k >= 0) . assert (k <= q) . assert (G.all (not . isNaN) x) $ (1-h) * item (j-1) + h * item j where j = floor (t + eps) t = a + p * (fromIntegral n + 1 - a - b) p = fromIntegral k / fromIntegral q h | abs r < eps = 0 | otherwise = r where r = t - fromIntegral j eps = m_epsilon * 4 n = G.length x item = (sx !) . bracket sx = partialSort (bracket j + 1) x bracket m = min (max m 0) (n - 1) {-# INLINE continuousBy #-} -- | O(/n/ log /n/). Estimate the range between /q/-quantiles 1 and -- /q/-1 of a sample /x/, using the continuous sample method with the -- given parameters. -- -- For instance, the interquartile range (IQR) can be estimated as -- follows: -- -- > midspread medianUnbiased 4 (U.fromList [1,1,2,2,3]) -- > ==> 1.333333 midspread :: G.Vector v Double => ContParam -- ^ Parameters /a/ and /b/. -> Int -- ^ /q/, the number of quantiles. -> v Double -- ^ /x/, the sample data. -> Double midspread (ContParam a b) k x = assert (G.all (not . isNaN) x) . assert (k > 0) $ quantile (1-frac) - quantile frac where quantile i = (1-h i) * item (j i-1) + h i * item (j i) j i = floor (t i + eps) :: Int t i = a + i * (fromIntegral n + 1 - a - b) h i | abs r < eps = 0 | otherwise = r where r = t i - fromIntegral (j i) eps = m_epsilon * 4 n = G.length x item = (sx !) . bracket sx = partialSort (bracket (j (1-frac)) + 1) x bracket m = min (max m 0) (n - 1) frac = 1 / fromIntegral k {-# INLINE midspread #-} -- | California Department of Public Works definition, /a/=0, /b/=1. -- Gives a linear interpolation of the empirical CDF. This -- corresponds to method 4 in R and Mathematica. cadpw :: ContParam cadpw = ContParam 0 1 {-# INLINE cadpw #-} -- | Hazen's definition, /a/=0.5, /b/=0.5. This is claimed to be -- popular among hydrologists. This corresponds to method 5 in R and -- Mathematica. hazen :: ContParam hazen = ContParam 0.5 0.5 {-# INLINE hazen #-} -- | Definition used by the SPSS statistics application, with /a/=0, -- /b/=0 (also known as Weibull's definition). This corresponds to -- method 6 in R and Mathematica. spss :: ContParam spss = ContParam 0 0 {-# INLINE spss #-} -- | Definition used by the S statistics application, with /a/=1, -- /b/=1. The interpolation points divide the sample range into @n-1@ -- intervals. This corresponds to method 7 in R and Mathematica. s :: ContParam s = ContParam 1 1 {-# INLINE s #-} -- | Median unbiased definition, /a/=1\/3, /b/=1\/3. The resulting -- quantile estimates are approximately median unbiased regardless of -- the distribution of /x/. This corresponds to method 8 in R and -- Mathematica. medianUnbiased :: ContParam medianUnbiased = ContParam third third where third = 1/3 {-# INLINE medianUnbiased #-} -- | Normal unbiased definition, /a/=3\/8, /b/=3\/8. An approximately -- unbiased estimate if the empirical distribution approximates the -- normal distribution. This corresponds to method 9 in R and -- Mathematica. normalUnbiased :: ContParam normalUnbiased = ContParam ta ta where ta = 3/8 {-# INLINE normalUnbiased #-} -- $references -- -- * Weisstein, E.W. Quantile. /MathWorld/. -- -- -- * Hyndman, R.J.; Fan, Y. (1996) Sample quantiles in statistical -- packages. /American Statistician/ -- 50(4):361–365.