```{-# LANGUAGE FlexibleContexts #-}
-- |
-- Module    : Statistics.KernelDensity
-- Copyright : (c) 2009 Bryan O'Sullivan
--
-- Maintainer  : bos@serpentine.com
-- Stability   : experimental
-- Portability : portable
--
-- Kernel density estimation code, providing non-parametric ways to
-- estimate the probability density function of a sample.

module Statistics.KernelDensity
(
-- * Simple entry points
epanechnikovPDF
, gaussianPDF
-- * Building blocks
-- These functions may be useful if you need to construct a kernel
-- density function estimator other than the ones provided in this
-- module.

-- ** Choosing points from a sample
, Points(..)
, choosePoints
-- ** Bandwidth estimation
, Bandwidth
, bandwidth
, epanechnikovBW
, gaussianBW
-- ** Kernels
, Kernel
, epanechnikovKernel
, gaussianKernel
-- ** Low-level estimation
, estimatePDF
, simplePDF
) where

import Statistics.Constants (m_1_sqrt_2, m_2_sqrt_pi)
import Statistics.Function (minMax)
import Statistics.Sample (stdDev)
import qualified Data.Vector.Unboxed as U
import qualified Data.Vector.Generic as G

-- | Points from the range of a 'Sample'.
newtype Points = Points {
fromPoints :: U.Vector Double
} deriving (Eq, Show)

-- | Bandwidth estimator for an Epanechnikov kernel.
epanechnikovBW :: Double -> Bandwidth
epanechnikovBW n = (80 / (n * m_2_sqrt_pi)) ** 0.2

-- | Bandwidth estimator for a Gaussian kernel.
gaussianBW :: Double -> Bandwidth
gaussianBW n = (4 / (n * 3)) ** 0.2

-- | The width of the convolution kernel used.
type Bandwidth = Double

-- | Compute the optimal bandwidth from the observed data for the given
-- kernel.
bandwidth :: G.Vector v Double =>
(Double -> Bandwidth)
-> v Double
-> Bandwidth
bandwidth kern values = stdDev values * kern (fromIntegral \$ G.length values)

-- | Choose a uniform range of points at which to estimate a sample's
-- probability density function.
--
-- If you are using a Gaussian kernel, multiply the sample's bandwidth
-- by 3 before passing it to this function.
--
-- If this function is passed an empty vector, it returns values of
-- positive and negative infinity.
choosePoints :: G.Vector v Double =>
Int             -- ^ Number of points to select, /n/
-> Double          -- ^ Sample bandwidth, /h/
-> v Double        -- ^ Input data
-> Points
choosePoints n h sample = Points . U.map f \$ U.enumFromTo 0 n'
where lo     = a - h
hi     = z + h
(a, z) = minMax sample
d      = (hi - lo) / fromIntegral n'
f i    = lo + fromIntegral i * d
n'     = n - 1

-- | The convolution kernel.  Its parameters are as follows:
--
-- * Scaling factor, 1\//nh/
--
-- * Bandwidth, /h/
--
-- * A point at which to sample the input, /p/
--
-- * One sample value, /v/
type Kernel =  Double
-> Double
-> Double
-> Double
-> Double

-- | Epanechnikov kernel for probability density function estimation.
epanechnikovKernel :: Kernel
epanechnikovKernel f h p v
| abs u <= 1 = f * (1 - u * u)
| otherwise  = 0
where u = (v - p) / (h * 0.75)

-- | Gaussian kernel for probability density function estimation.
gaussianKernel :: Kernel
gaussianKernel f h p v = exp (-0.5 * u * u) * g
where u = (v - p) / h
g = f * m_2_sqrt_pi * m_1_sqrt_2

-- | Kernel density estimator, providing a non-parametric way of
-- estimating the PDF of a random variable.
estimatePDF :: G.Vector v Double =>
Kernel           -- ^ Kernel function
-> Bandwidth        -- ^ Bandwidth, /h/
-> v Double         -- ^ Sample data
-> Points           -- ^ Points at which to estimate
-> U.Vector Double
estimatePDF kernel h sample
| n < 2     = errorShort "estimatePDF"
| otherwise = U.map k . fromPoints
where
k p = G.sum . G.map (kernel f h p) \$ sample
f   = 1 / (h * fromIntegral n)
n   = G.length sample
{-# INLINE estimatePDF #-}

-- | A helper for creating a simple kernel density estimation function
-- with automatically chosen bandwidth and estimation points.
simplePDF :: G.Vector v Double =>
(Double -> Double) -- ^ Bandwidth function
-> Kernel             -- ^ Kernel function
-> Double             -- ^ Bandwidth scaling factor (3 for a Gaussian kernel, 1 for all others)
-> Int                -- ^ Number of points at which to estimate
-> v Double           -- ^ sample data
-> (Points, U.Vector Double)
simplePDF fbw fpdf k numPoints sample =
(points, estimatePDF fpdf bw sample points)
where points = choosePoints numPoints (bw*k) sample
bw     = bandwidth fbw sample
{-# INLINE simplePDF #-}

-- | Simple Epanechnikov kernel density estimator.  Returns the
-- uniformly spaced points from the sample range at which the density
-- function was estimated, and the estimates at those points.
epanechnikovPDF :: G.Vector v Double =>
Int          -- ^ Number of points at which to estimate
-> v Double     -- ^ Data sample
-> (Points, U.Vector Double)
epanechnikovPDF = simplePDF epanechnikovBW epanechnikovKernel 1

-- | Simple Gaussian kernel density estimator.  Returns the uniformly
-- spaced points from the sample range at which the density function
-- was estimated, and the estimates at those points.
gaussianPDF :: G.Vector v Double =>
Int              -- ^ Number of points at which to estimate
-> v Double         -- ^ Data sample
-> (Points, U.Vector Double)
gaussianPDF = simplePDF gaussianBW gaussianKernel 3

errorShort :: String -> a
errorShort func = error ("Statistics.KernelDensity." ++ func ++
": at least two points required")
```