{-# LANGUAGE FlexibleContexts #-}
-- |
-- Module    : Statistics.Sample.KernelDensity.Simple
-- Copyright : (c) 2009 Bryan O'Sullivan
-- License   : BSD3
--
-- Maintainer  : bos@serpentine.com
-- Stability   : experimental
-- Portability : portable
--
-- Kernel density estimation code, providing non-parametric ways to
-- estimate the probability density function of a sample.
--
-- The techniques used by functions in this module are relatively
-- fast, but they generally give inferior results to the KDE function
-- in the main 'Statistics.KernelDensity' module (due to the
-- oversmoothing documented for 'bandwidth' below).

module Statistics.Sample.KernelDensity.Simple
    {-# DEPRECATED "Use Statistics.Sample.KernelDensity instead." #-}
    (
    -- * Simple entry points
      epanechnikovPDF
    , gaussianPDF
    -- * Building blocks
    -- These functions may be useful if you need to construct a kernel
    -- density function estimator other than the ones provided in this
    -- module.

    -- ** Choosing points from a sample
    , Points(..)
    , choosePoints
    -- ** Bandwidth estimation
    , Bandwidth
    , bandwidth
    , epanechnikovBW
    , gaussianBW
    -- ** Kernels
    , Kernel
    , epanechnikovKernel
    , gaussianKernel
    -- ** Low-level estimation
    , estimatePDF
    , simplePDF
    -- * References
    -- $references
    ) where

import Numeric.MathFunctions.Constants (m_1_sqrt_2, m_2_sqrt_pi)
import Statistics.Function (minMax)
import Statistics.Sample   (stdDev)
import qualified Data.Vector.Unboxed as U
import qualified Data.Vector.Generic as G

-- | Points from the range of a 'Sample'.
newtype Points = Points {
      fromPoints :: U.Vector Double
    } deriving (Eq, Show)

-- | Bandwidth estimator for an Epanechnikov kernel.
epanechnikovBW :: Double -> Bandwidth
epanechnikovBW n = (80 / (n * m_2_sqrt_pi)) ** 0.2

-- | Bandwidth estimator for a Gaussian kernel.
gaussianBW :: Double -> Bandwidth
gaussianBW n = (4 / (n * 3)) ** 0.2

-- | The width of the convolution kernel used.
type Bandwidth = Double

-- | Compute the optimal bandwidth from the observed data for the
-- given kernel.
--
-- This function uses an estimate based on the standard deviation of a
-- sample (due to Deheuvels), which performs reasonably well for
-- unimodal distributions but leads to oversmoothing for more complex
-- ones.
bandwidth :: G.Vector v Double =>
             (Double -> Bandwidth)
          -> v Double
          -> Bandwidth
bandwidth kern values = stdDev values * kern (fromIntegral $ G.length values)

-- | Choose a uniform range of points at which to estimate a sample's
-- probability density function.
--
-- If you are using a Gaussian kernel, multiply the sample's bandwidth
-- by 3 before passing it to this function.
--
-- If this function is passed an empty vector, it returns values of
-- positive and negative infinity.
choosePoints :: G.Vector v Double =>
                Int             -- ^ Number of points to select, /n/
             -> Double          -- ^ Sample bandwidth, /h/
             -> v Double        -- ^ Input data
             -> Points
choosePoints n h sample = Points . U.map f $ U.enumFromTo 0 n'
  where lo     = a - h
        hi     = z + h
        (a, z) = minMax sample
        d      = (hi - lo) / fromIntegral n'
        f i    = lo + fromIntegral i * d
        n'     = n - 1

-- | The convolution kernel.  Its parameters are as follows:
--
-- * Scaling factor, 1\//nh/
--
-- * Bandwidth, /h/
--
-- * A point at which to sample the input, /p/
--
-- * One sample value, /v/
type Kernel =  Double
            -> Double
            -> Double
            -> Double
            -> Double

-- | Epanechnikov kernel for probability density function estimation.
epanechnikovKernel :: Kernel
epanechnikovKernel f h p v
    | abs u <= 1 = f * (1 - u * u)
    | otherwise  = 0
    where u = (v - p) / (h * 0.75)

-- | Gaussian kernel for probability density function estimation.
gaussianKernel :: Kernel
gaussianKernel f h p v = exp (-0.5 * u * u) * g
    where u = (v - p) / h
          g = f * 0.5 * m_2_sqrt_pi * m_1_sqrt_2

-- | Kernel density estimator, providing a non-parametric way of
-- estimating the PDF of a random variable.
estimatePDF :: G.Vector v Double =>
               Kernel           -- ^ Kernel function
            -> Bandwidth        -- ^ Bandwidth, /h/
            -> v Double         -- ^ Sample data
            -> Points           -- ^ Points at which to estimate
            -> U.Vector Double
estimatePDF kernel h sample
    | n < 2     = errorShort "estimatePDF"
    | otherwise = U.map k . fromPoints
  where
    k p = G.sum . G.map (kernel f h p) $ sample
    f   = 1 / (h * fromIntegral n)
    n   = G.length sample
{-# INLINE estimatePDF #-}

-- | A helper for creating a simple kernel density estimation function
-- with automatically chosen bandwidth and estimation points.
simplePDF :: G.Vector v Double =>
             (Double -> Double) -- ^ Bandwidth function
          -> Kernel             -- ^ Kernel function
          -> Double             -- ^ Bandwidth scaling factor (3 for a Gaussian kernel, 1 for all others)
          -> Int                -- ^ Number of points at which to estimate
          -> v Double           -- ^ sample data
          -> (Points, U.Vector Double)
simplePDF fbw fpdf k numPoints sample =
    (points, estimatePDF fpdf bw sample points)
  where points = choosePoints numPoints (bw*k) sample
        bw     = bandwidth fbw sample
{-# INLINE simplePDF #-}

-- | Simple Epanechnikov kernel density estimator.  Returns the
-- uniformly spaced points from the sample range at which the density
-- function was estimated, and the estimates at those points.
epanechnikovPDF :: G.Vector v Double =>
                   Int          -- ^ Number of points at which to estimate
                -> v Double     -- ^ Data sample
                -> (Points, U.Vector Double)
epanechnikovPDF = simplePDF epanechnikovBW epanechnikovKernel 1

-- | Simple Gaussian kernel density estimator.  Returns the uniformly
-- spaced points from the sample range at which the density function
-- was estimated, and the estimates at those points.
gaussianPDF :: G.Vector v Double =>
               Int              -- ^ Number of points at which to estimate
            -> v Double         -- ^ Data sample
            -> (Points, U.Vector Double)
gaussianPDF = simplePDF gaussianBW gaussianKernel 3

errorShort :: String -> a
errorShort func = error ("Statistics.KernelDensity." ++ func ++
                        ": at least two points required")

-- $references
--
-- * Deheuvels, P. (1977) Estimation non paramétrique de la densité
--   par histogrammes
--   généralisés. Mhttp://archive.numdam.org/article/RSA_1977__25_3_5_0.pdf>