{-# LANGUAGE DeriveDataTypeable, DeriveGeneric, FlexibleContexts #-}
-- |
-- Module    : Statistics.Sample.KernelDensity.Simple
-- Copyright : (c) 2009 Bryan O'Sullivan
-- License   : BSD3
--
-- Maintainer  : bos@serpentine.com
-- Stability   : experimental
-- Portability : portable
--
-- Kernel density estimation code, providing non-parametric ways to
-- estimate the probability density function of a sample.
--
-- The techniques used by functions in this module are relatively
-- fast, but they generally give inferior results to the KDE function
-- in the main 'Statistics.KernelDensity' module (due to the
-- oversmoothing documented for 'bandwidth' below).

module Statistics.Sample.KernelDensity.Simple
    {-# DEPRECATED "Use Statistics.Sample.KernelDensity instead." #-}
    (
    -- * Simple entry points
      epanechnikovPDF
    , gaussianPDF
    -- * Building blocks
    -- These functions may be useful if you need to construct a kernel
    -- density function estimator other than the ones provided in this
    -- module.

    -- ** Choosing points from a sample
    , Points(..)
    , choosePoints
    -- ** Bandwidth estimation
    , Bandwidth
    , bandwidth
    , epanechnikovBW
    , gaussianBW
    -- ** Kernels
    , Kernel
    , epanechnikovKernel
    , gaussianKernel
    -- ** Low-level estimation
    , estimatePDF
    , simplePDF
    -- * References
    -- $references
    ) where

import Data.Aeson (FromJSON, ToJSON)
import Data.Binary (Binary(..))
import Data.Data (Data, Typeable)
import Data.Vector.Binary ()
import GHC.Generics (Generic)
import Numeric.MathFunctions.Constants (m_1_sqrt_2, m_2_sqrt_pi)
import Prelude hiding (sum)
import Statistics.Function (minMax)
import Statistics.Sample (stdDev)
import Statistics.Sample.Internal (sum)
import qualified Data.Vector.Generic as G
import qualified Data.Vector.Unboxed as U

-- | Points from the range of a 'Sample'.
newtype Points = Points {
      Points -> Vector Double
fromPoints :: U.Vector Double
    } deriving (Points -> Points -> Bool
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
/= :: Points -> Points -> Bool
$c/= :: Points -> Points -> Bool
== :: Points -> Points -> Bool
$c== :: Points -> Points -> Bool
Eq, ReadPrec [Points]
ReadPrec Points
Int -> ReadS Points
ReadS [Points]
forall a.
(Int -> ReadS a)
-> ReadS [a] -> ReadPrec a -> ReadPrec [a] -> Read a
readListPrec :: ReadPrec [Points]
$creadListPrec :: ReadPrec [Points]
readPrec :: ReadPrec Points
$creadPrec :: ReadPrec Points
readList :: ReadS [Points]
$creadList :: ReadS [Points]
readsPrec :: Int -> ReadS Points
$creadsPrec :: Int -> ReadS Points
Read, Int -> Points -> ShowS
[Points] -> ShowS
Points -> String
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
showList :: [Points] -> ShowS
$cshowList :: [Points] -> ShowS
show :: Points -> String
$cshow :: Points -> String
showsPrec :: Int -> Points -> ShowS
$cshowsPrec :: Int -> Points -> ShowS
Show, Typeable, Typeable Points
Points -> DataType
Points -> Constr
(forall b. Data b => b -> b) -> Points -> Points
forall a.
Typeable a
-> (forall (c :: * -> *).
    (forall d b. Data d => c (d -> b) -> d -> c b)
    -> (forall g. g -> c g) -> a -> c a)
-> (forall (c :: * -> *).
    (forall b r. Data b => c (b -> r) -> c r)
    -> (forall r. r -> c r) -> Constr -> c a)
-> (a -> Constr)
-> (a -> DataType)
-> (forall (t :: * -> *) (c :: * -> *).
    Typeable t =>
    (forall d. Data d => c (t d)) -> Maybe (c a))
-> (forall (t :: * -> * -> *) (c :: * -> *).
    Typeable t =>
    (forall d e. (Data d, Data e) => c (t d e)) -> Maybe (c a))
-> ((forall b. Data b => b -> b) -> a -> a)
-> (forall r r'.
    (r -> r' -> r) -> r -> (forall d. Data d => d -> r') -> a -> r)
-> (forall r r'.
    (r' -> r -> r) -> r -> (forall d. Data d => d -> r') -> a -> r)
-> (forall u. (forall d. Data d => d -> u) -> a -> [u])
-> (forall u. Int -> (forall d. Data d => d -> u) -> a -> u)
-> (forall (m :: * -> *).
    Monad m =>
    (forall d. Data d => d -> m d) -> a -> m a)
-> (forall (m :: * -> *).
    MonadPlus m =>
    (forall d. Data d => d -> m d) -> a -> m a)
-> (forall (m :: * -> *).
    MonadPlus m =>
    (forall d. Data d => d -> m d) -> a -> m a)
-> Data a
forall u. Int -> (forall d. Data d => d -> u) -> Points -> u
forall u. (forall d. Data d => d -> u) -> Points -> [u]
forall r r'.
(r -> r' -> r) -> r -> (forall d. Data d => d -> r') -> Points -> r
forall r r'.
(r' -> r -> r) -> r -> (forall d. Data d => d -> r') -> Points -> r
forall (m :: * -> *).
Monad m =>
(forall d. Data d => d -> m d) -> Points -> m Points
forall (m :: * -> *).
MonadPlus m =>
(forall d. Data d => d -> m d) -> Points -> m Points
forall (c :: * -> *).
(forall b r. Data b => c (b -> r) -> c r)
-> (forall r. r -> c r) -> Constr -> c Points
forall (c :: * -> *).
(forall d b. Data d => c (d -> b) -> d -> c b)
-> (forall g. g -> c g) -> Points -> c Points
forall (t :: * -> *) (c :: * -> *).
Typeable t =>
(forall d. Data d => c (t d)) -> Maybe (c Points)
forall (t :: * -> * -> *) (c :: * -> *).
Typeable t =>
(forall d e. (Data d, Data e) => c (t d e)) -> Maybe (c Points)
gmapMo :: forall (m :: * -> *).
MonadPlus m =>
(forall d. Data d => d -> m d) -> Points -> m Points
$cgmapMo :: forall (m :: * -> *).
MonadPlus m =>
(forall d. Data d => d -> m d) -> Points -> m Points
gmapMp :: forall (m :: * -> *).
MonadPlus m =>
(forall d. Data d => d -> m d) -> Points -> m Points
$cgmapMp :: forall (m :: * -> *).
MonadPlus m =>
(forall d. Data d => d -> m d) -> Points -> m Points
gmapM :: forall (m :: * -> *).
Monad m =>
(forall d. Data d => d -> m d) -> Points -> m Points
$cgmapM :: forall (m :: * -> *).
Monad m =>
(forall d. Data d => d -> m d) -> Points -> m Points
gmapQi :: forall u. Int -> (forall d. Data d => d -> u) -> Points -> u
$cgmapQi :: forall u. Int -> (forall d. Data d => d -> u) -> Points -> u
gmapQ :: forall u. (forall d. Data d => d -> u) -> Points -> [u]
$cgmapQ :: forall u. (forall d. Data d => d -> u) -> Points -> [u]
gmapQr :: forall r r'.
(r' -> r -> r) -> r -> (forall d. Data d => d -> r') -> Points -> r
$cgmapQr :: forall r r'.
(r' -> r -> r) -> r -> (forall d. Data d => d -> r') -> Points -> r
gmapQl :: forall r r'.
(r -> r' -> r) -> r -> (forall d. Data d => d -> r') -> Points -> r
$cgmapQl :: forall r r'.
(r -> r' -> r) -> r -> (forall d. Data d => d -> r') -> Points -> r
gmapT :: (forall b. Data b => b -> b) -> Points -> Points
$cgmapT :: (forall b. Data b => b -> b) -> Points -> Points
dataCast2 :: forall (t :: * -> * -> *) (c :: * -> *).
Typeable t =>
(forall d e. (Data d, Data e) => c (t d e)) -> Maybe (c Points)
$cdataCast2 :: forall (t :: * -> * -> *) (c :: * -> *).
Typeable t =>
(forall d e. (Data d, Data e) => c (t d e)) -> Maybe (c Points)
dataCast1 :: forall (t :: * -> *) (c :: * -> *).
Typeable t =>
(forall d. Data d => c (t d)) -> Maybe (c Points)
$cdataCast1 :: forall (t :: * -> *) (c :: * -> *).
Typeable t =>
(forall d. Data d => c (t d)) -> Maybe (c Points)
dataTypeOf :: Points -> DataType
$cdataTypeOf :: Points -> DataType
toConstr :: Points -> Constr
$ctoConstr :: Points -> Constr
gunfold :: forall (c :: * -> *).
(forall b r. Data b => c (b -> r) -> c r)
-> (forall r. r -> c r) -> Constr -> c Points
$cgunfold :: forall (c :: * -> *).
(forall b r. Data b => c (b -> r) -> c r)
-> (forall r. r -> c r) -> Constr -> c Points
gfoldl :: forall (c :: * -> *).
(forall d b. Data d => c (d -> b) -> d -> c b)
-> (forall g. g -> c g) -> Points -> c Points
$cgfoldl :: forall (c :: * -> *).
(forall d b. Data d => c (d -> b) -> d -> c b)
-> (forall g. g -> c g) -> Points -> c Points
Data, forall x. Rep Points x -> Points
forall x. Points -> Rep Points x
forall a.
(forall x. a -> Rep a x) -> (forall x. Rep a x -> a) -> Generic a
$cto :: forall x. Rep Points x -> Points
$cfrom :: forall x. Points -> Rep Points x
Generic)

instance FromJSON Points
instance ToJSON Points

instance Binary Points where
    get :: Get Points
get = forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap Vector Double -> Points
Points forall t. Binary t => Get t
get
    put :: Points -> Put
put = forall t. Binary t => t -> Put
put forall b c a. (b -> c) -> (a -> b) -> a -> c
. Points -> Vector Double
fromPoints

-- | Bandwidth estimator for an Epanechnikov kernel.
epanechnikovBW :: Double -> Bandwidth
epanechnikovBW :: Double -> Double
epanechnikovBW Double
n = (Double
80 forall a. Fractional a => a -> a -> a
/ (Double
n forall a. Num a => a -> a -> a
* Double
m_2_sqrt_pi)) forall a. Floating a => a -> a -> a
** Double
0.2

-- | Bandwidth estimator for a Gaussian kernel.
gaussianBW :: Double -> Bandwidth
gaussianBW :: Double -> Double
gaussianBW Double
n = (Double
4 forall a. Fractional a => a -> a -> a
/ (Double
n forall a. Num a => a -> a -> a
* Double
3)) forall a. Floating a => a -> a -> a
** Double
0.2

-- | The width of the convolution kernel used.
type Bandwidth = Double

-- | Compute the optimal bandwidth from the observed data for the
-- given kernel.
--
-- This function uses an estimate based on the standard deviation of a
-- sample (due to Deheuvels), which performs reasonably well for
-- unimodal distributions but leads to oversmoothing for more complex
-- ones.
bandwidth :: G.Vector v Double =>
             (Double -> Bandwidth)
          -> v Double
          -> Bandwidth
bandwidth :: forall (v :: * -> *).
Vector v Double =>
(Double -> Double) -> v Double -> Double
bandwidth Double -> Double
kern v Double
values = forall (v :: * -> *). Vector v Double => v Double -> Double
stdDev v Double
values forall a. Num a => a -> a -> a
* Double -> Double
kern (forall a b. (Integral a, Num b) => a -> b
fromIntegral forall a b. (a -> b) -> a -> b
$ forall (v :: * -> *) a. Vector v a => v a -> Int
G.length v Double
values)

-- | Choose a uniform range of points at which to estimate a sample's
-- probability density function.
--
-- If you are using a Gaussian kernel, multiply the sample's bandwidth
-- by 3 before passing it to this function.
--
-- If this function is passed an empty vector, it returns values of
-- positive and negative infinity.
choosePoints :: G.Vector v Double =>
                Int             -- ^ Number of points to select, /n/
             -> Double          -- ^ Sample bandwidth, /h/
             -> v Double        -- ^ Input data
             -> Points
choosePoints :: forall (v :: * -> *).
Vector v Double =>
Int -> Double -> v Double -> Points
choosePoints Int
n Double
h v Double
sample = Vector Double -> Points
Points forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall a b. (Unbox a, Unbox b) => (a -> b) -> Vector a -> Vector b
U.map forall {a}. Integral a => a -> Double
f forall a b. (a -> b) -> a -> b
$ forall a. (Unbox a, Enum a) => a -> a -> Vector a
U.enumFromTo Int
0 Int
n'
  where lo :: Double
lo     = Double
a forall a. Num a => a -> a -> a
- Double
h
        hi :: Double
hi     = Double
z forall a. Num a => a -> a -> a
+ Double
h
        (Double
a, Double
z) = forall (v :: * -> *).
Vector v Double =>
v Double -> (Double, Double)
minMax v Double
sample
        d :: Double
d      = (Double
hi forall a. Num a => a -> a -> a
- Double
lo) forall a. Fractional a => a -> a -> a
/ forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
n'
        f :: a -> Double
f a
i    = Double
lo forall a. Num a => a -> a -> a
+ forall a b. (Integral a, Num b) => a -> b
fromIntegral a
i forall a. Num a => a -> a -> a
* Double
d
        n' :: Int
n'     = Int
n forall a. Num a => a -> a -> a
- Int
1

-- | The convolution kernel.  Its parameters are as follows:
--
-- * Scaling factor, 1\//nh/
--
-- * Bandwidth, /h/
--
-- * A point at which to sample the input, /p/
--
-- * One sample value, /v/
type Kernel =  Double
            -> Double
            -> Double
            -> Double
            -> Double

-- | Epanechnikov kernel for probability density function estimation.
epanechnikovKernel :: Kernel
epanechnikovKernel :: Kernel
epanechnikovKernel Double
f Double
h Double
p Double
v
    | forall a. Num a => a -> a
abs Double
u forall a. Ord a => a -> a -> Bool
<= Double
1 = Double
f forall a. Num a => a -> a -> a
* (Double
1 forall a. Num a => a -> a -> a
- Double
u forall a. Num a => a -> a -> a
* Double
u)
    | Bool
otherwise  = Double
0
    where u :: Double
u = (Double
v forall a. Num a => a -> a -> a
- Double
p) forall a. Fractional a => a -> a -> a
/ (Double
h forall a. Num a => a -> a -> a
* Double
0.75)

-- | Gaussian kernel for probability density function estimation.
gaussianKernel :: Kernel
gaussianKernel :: Kernel
gaussianKernel Double
f Double
h Double
p Double
v = forall a. Floating a => a -> a
exp (-Double
0.5 forall a. Num a => a -> a -> a
* Double
u forall a. Num a => a -> a -> a
* Double
u) forall a. Num a => a -> a -> a
* Double
g
    where u :: Double
u = (Double
v forall a. Num a => a -> a -> a
- Double
p) forall a. Fractional a => a -> a -> a
/ Double
h
          g :: Double
g = Double
f forall a. Num a => a -> a -> a
* Double
0.5 forall a. Num a => a -> a -> a
* Double
m_2_sqrt_pi forall a. Num a => a -> a -> a
* Double
m_1_sqrt_2

-- | Kernel density estimator, providing a non-parametric way of
-- estimating the PDF of a random variable.
estimatePDF :: G.Vector v Double =>
               Kernel           -- ^ Kernel function
            -> Bandwidth        -- ^ Bandwidth, /h/
            -> v Double         -- ^ Sample data
            -> Points           -- ^ Points at which to estimate
            -> U.Vector Double
estimatePDF :: forall (v :: * -> *).
Vector v Double =>
Kernel -> Double -> v Double -> Points -> Vector Double
estimatePDF Kernel
kernel Double
h v Double
sample
    | Int
n forall a. Ord a => a -> a -> Bool
< Int
2     = forall a. String -> a
errorShort String
"estimatePDF"
    | Bool
otherwise = forall a b. (Unbox a, Unbox b) => (a -> b) -> Vector a -> Vector b
U.map Double -> Double
k forall b c a. (b -> c) -> (a -> b) -> a -> c
. Points -> Vector Double
fromPoints
  where
    k :: Double -> Double
k Double
p = forall (v :: * -> *). Vector v Double => v Double -> Double
sum forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall (v :: * -> *) a b.
(Vector v a, Vector v b) =>
(a -> b) -> v a -> v b
G.map (Kernel
kernel Double
f Double
h Double
p) forall a b. (a -> b) -> a -> b
$ v Double
sample
    f :: Double
f   = Double
1 forall a. Fractional a => a -> a -> a
/ (Double
h forall a. Num a => a -> a -> a
* forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
n)
    n :: Int
n   = forall (v :: * -> *) a. Vector v a => v a -> Int
G.length v Double
sample
{-# INLINE estimatePDF #-}

-- | A helper for creating a simple kernel density estimation function
-- with automatically chosen bandwidth and estimation points.
simplePDF :: G.Vector v Double =>
             (Double -> Double) -- ^ Bandwidth function
          -> Kernel             -- ^ Kernel function
          -> Double             -- ^ Bandwidth scaling factor (3 for a Gaussian kernel, 1 for all others)
          -> Int                -- ^ Number of points at which to estimate
          -> v Double           -- ^ sample data
          -> (Points, U.Vector Double)
simplePDF :: forall (v :: * -> *).
Vector v Double =>
(Double -> Double)
-> Kernel -> Double -> Int -> v Double -> (Points, Vector Double)
simplePDF Double -> Double
fbw Kernel
fpdf Double
k Int
numPoints v Double
sample =
    (Points
points, forall (v :: * -> *).
Vector v Double =>
Kernel -> Double -> v Double -> Points -> Vector Double
estimatePDF Kernel
fpdf Double
bw v Double
sample Points
points)
  where points :: Points
points = forall (v :: * -> *).
Vector v Double =>
Int -> Double -> v Double -> Points
choosePoints Int
numPoints (Double
bwforall a. Num a => a -> a -> a
*Double
k) v Double
sample
        bw :: Double
bw     = forall (v :: * -> *).
Vector v Double =>
(Double -> Double) -> v Double -> Double
bandwidth Double -> Double
fbw v Double
sample
{-# INLINE simplePDF #-}

-- | Simple Epanechnikov kernel density estimator.  Returns the
-- uniformly spaced points from the sample range at which the density
-- function was estimated, and the estimates at those points.
epanechnikovPDF :: G.Vector v Double =>
                   Int          -- ^ Number of points at which to estimate
                -> v Double     -- ^ Data sample
                -> (Points, U.Vector Double)
epanechnikovPDF :: forall (v :: * -> *).
Vector v Double =>
Int -> v Double -> (Points, Vector Double)
epanechnikovPDF = forall (v :: * -> *).
Vector v Double =>
(Double -> Double)
-> Kernel -> Double -> Int -> v Double -> (Points, Vector Double)
simplePDF Double -> Double
epanechnikovBW Kernel
epanechnikovKernel Double
1

-- | Simple Gaussian kernel density estimator.  Returns the uniformly
-- spaced points from the sample range at which the density function
-- was estimated, and the estimates at those points.
gaussianPDF :: G.Vector v Double =>
               Int              -- ^ Number of points at which to estimate
            -> v Double         -- ^ Data sample
            -> (Points, U.Vector Double)
gaussianPDF :: forall (v :: * -> *).
Vector v Double =>
Int -> v Double -> (Points, Vector Double)
gaussianPDF = forall (v :: * -> *).
Vector v Double =>
(Double -> Double)
-> Kernel -> Double -> Int -> v Double -> (Points, Vector Double)
simplePDF Double -> Double
gaussianBW Kernel
gaussianKernel Double
3

errorShort :: String -> a
errorShort :: forall a. String -> a
errorShort String
func = forall a. HasCallStack => String -> a
error (String
"Statistics.KernelDensity." forall a. [a] -> [a] -> [a]
++ String
func forall a. [a] -> [a] -> [a]
++
                        String
": at least two points required")

-- $references
--
-- * Deheuvels, P. (1977) Estimation non paramétrique de la densité
--   par histogrammes
--   généralisés. Mhttp://archive.numdam.org/article/RSA_1977__25_3_5_0.pdf>