Copyright | (c) 2020 Composewell Technologies
License | Apache-2.0
Maintainer | streamly@composewell.com
Stability | experimental
Portability | GHC
Safe Haskell | Safe-Inferred
Language | Haskell2010
Statistical measures over a stream of data. All operations use numerically stable floating point arithmetic.
Measurements can be performed over the entire input stream or on a sliding window of fixed or variable size. Where possible, measures are computed online without buffering the input stream.
Currently there is no overflow detection.
Synopsis
- lmap :: forall c a (m :: Type -> Type) b. (c -> a) -> Fold m (a, Maybe a) b -> Fold m (c, Maybe c) b
- cumulative :: forall (m :: Type -> Type) a b. Fold m (a, Maybe a) b -> Fold m a b
- length :: forall (m :: Type -> Type) b a. (Monad m, Num b) => Fold m (a, Maybe a) b
- sum :: forall (m :: Type -> Type) a. (Monad m, Num a) => Fold m (a, Maybe a) a
- sumInt :: forall (m :: Type -> Type) a. (Monad m, Integral a) => Fold m (a, Maybe a) a
- powerSum :: forall (m :: Type -> Type) a. (Monad m, Num a) => Int -> Fold m (a, Maybe a) a
- minimum :: (Monad m, Ord a) => Fold m (a, Maybe a) a
- maximum :: (Monad m, Ord a) => Fold m (a, Maybe a) a
- rawMoment :: (Monad m, Fractional a) => Int -> Fold m (a, Maybe a) a
- rawMomentFrac :: (Monad m, Floating a) => a -> Fold m (a, Maybe a) a
- mean :: forall m a. (Monad m, Fractional a) => Fold m (a, Maybe a) a
- welfordMean :: forall m a. (Monad m, Fractional a) => Fold m (a, Maybe a) a
- geometricMean :: (Monad m, Floating a) => Fold m (a, Maybe a) a
- harmonicMean :: (Monad m, Fractional a) => Fold m (a, Maybe a) a
- quadraticMean :: (Monad m, Floating a) => Fold m (a, Maybe a) a
- powerMean :: (Monad m, Floating a) => Int -> Fold m (a, Maybe a) a
- powerMeanFrac :: (Monad m, Floating a) => a -> Fold m (a, Maybe a) a
- ewma :: Monad m => Double -> Fold m Double Double
- ewmaAfterMean :: Monad m => Int -> Double -> Fold m Double Double
- ewmaRampUpSmoothing :: Monad m => Double -> Double -> Fold m Double Double
- range :: (Monad m, Num a, Ord a) => Fold m (a, Maybe a) a
- md :: MonadIO m => Fold m ((Double, Maybe Double), m (MutArray Double)) Double
- variance :: (Monad m, Fractional a) => Fold m (a, Maybe a) a
- stdDev :: (Monad m, Floating a) => Fold m (a, Maybe a) a
- skewness :: (Monad m, Floating a) => Fold m (a, Maybe a) a
- kurtosis :: (Monad m, Floating a) => Fold m (a, Maybe a) a
- sampleVariance :: (Monad m, Fractional a) => Fold m (a, Maybe a) a
- sampleStdDev :: (Monad m, Floating a) => Fold m (a, Maybe a) a
- stdErrMean :: (Monad m, Floating a) => Fold m (a, Maybe a) a
- resample :: (MonadIO m, Unbox a) => Unfold m (Array a) a
- foldResamples :: (MonadIO m, Unbox a) => Int -> Array a -> Fold m a b -> Stream m b
- jackKnifeMean :: (Monad m, Fractional a, Unbox a) => Array a -> Stream m a
- jackKnifeVariance :: (Monad m, Fractional a, Unbox a) => Array a -> Stream m a
- jackKnifeStdDev :: (Monad m, Unbox a, Floating a) => Array a -> Stream m a
- frequency :: (Monad m, Ord a) => Fold m (a, Maybe a) (Map a Int)
- frequency' :: (Monad m, Ord a) => Fold m a (Map a Int)
- mode :: (Monad m, Ord a) => Fold m a (Maybe (a, Int))
- data HistBin a
- = BelowRange
- | InRange a
- | AboveRange
- binOffsetSize :: Integral a => a -> a -> a -> a
- binFromSizeN :: Integral a => a -> a -> a -> a -> HistBin a
- binFromToN :: Integral a => a -> a -> a -> a -> HistBin a
- binBoundaries :: Array a -> a -> HistBin a
- histogram :: (Monad m, Ord k) => (a -> k) -> Fold m a (Map k Int)
- fft :: MonadIO m => MutArray (Complex Double) -> m ()
Incremental Folds
Folds of type Fold m (a, Maybe a) b are incremental sliding window folds. An input of type (a, Nothing) indicates that the element a is being inserted into the window without ejecting an old value, increasing the window size by 1. An input of type (a, Just a) indicates that the first element is being inserted into the window and the second element is being ejected from it, so the window size remains the same. The window size can only increase, never decrease.
You can compute statistics over the entire stream using sliding window folds by always keeping the second element of the input tuple as Nothing.
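For example, a window fold can be driven directly by supplying the (new, Maybe old) pairs yourself. The following is a minimal sketch, assuming this module is imported qualified as Statistics (as in the frequency example later on this page) and streamly's stream module as Stream:
>>> import qualified Streamly.Data.Stream as Stream
>>> Stream.fold Statistics.sum $ Stream.fromList [(3.0, Nothing), (4.0, Nothing), (5.0, Just 3.0)]
9.0
Here the third input replaces the first one in the window, so the result is the sum of the last two elements, 4 + 5.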
lmap :: forall c a (m :: Type -> Type) b. (c -> a) -> Fold m (a, Maybe a) b -> Fold m (c, Maybe c) b #
Map a function on the incoming as well as outgoing element of a rolling window fold.
>>> lmap f = Fold.lmap (bimap f (f <$>))
cumulative :: forall (m :: Type -> Type) a b. Fold m (a, Maybe a) b -> Fold m a b #
Convert an incremental fold to a cumulative fold using the entire input stream as a single window.
>>> cumulative f = Fold.lmap (\x -> (x, Nothing)) f
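For instance, the cumulative mean of a stream can be sketched as follows (using the imports from the example above):
>>> Stream.fold (Statistics.cumulative Statistics.mean) $ Stream.fromList [1.0, 2.0, 3.0, 4.0]
2.5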
Summary Statistics
Sums
length :: forall (m :: Type -> Type) b a. (Monad m, Num b) => Fold m (a, Maybe a) b #
The number of elements in the rolling window.
This is the \(0\)th power sum.
>>> length = powerSum 0
sum :: forall (m :: Type -> Type) a. (Monad m, Num a) => Fold m (a, Maybe a) a #
Sum of all the elements in a rolling window:
\(S = \sum_{i=1}^n x_{i}\)
This is the first power sum.
>>> sum = powerSum 1
Uses Kahan-Babuska-Neumaier style summation for numerical stability with floating point arithmetic.
Space: \(\mathcal{O}(1)\)
Time: \(\mathcal{O}(n)\)
powerSum :: forall (m :: Type -> Type) a. (Monad m, Num a) => Int -> Fold m (a, Maybe a) a #
Sum of the \(k\)th power of all the elements in a rolling window:
\(S_k = \sum_{i=1}^n x_{i}^k\)
>>> powerSum k = lmap (^ k) sum
Space: \(\mathcal{O}(1)\)
Time: \(\mathcal{O}(n)\)
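As a quick sanity check, the second power sum of 1, 2 and 3 over the whole stream (a sketch, using cumulative from above):
>>> Stream.fold (Statistics.cumulative (Statistics.powerSum 2)) $ Stream.fromList [1.0, 2.0, 3.0]
14.0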
Location
minimum :: (Monad m, Ord a) => Fold m (a, Maybe a) a Source #
The minimum element in a rolling window.
For smaller window sizes (< 30) Streamly.Data.Fold.Window.minimum performs better. If you want to compute the minimum of the entire stream, Streamly.Data.Fold.minimum from the streamly package would be much faster.
Time: \(\mathcal{O}(n*w)\) where \(w\) is the window size.
maximum :: (Monad m, Ord a) => Fold m (a, Maybe a) a Source #
The maximum element in a rolling window.
For smaller window sizes (< 30) Streamly.Data.Fold.Window.maximum performs better. If you want to compute the maximum of the entire stream, Streamly.Data.Fold.maximum from the streamly package would be much faster.
Time: \(\mathcal{O}(n*w)\) where \(w\) is the window size.
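For example, the maximum over a sliding window of the last 3 elements can be sketched as below, assuming Ring refers to the same ring buffer module used in the frequency example later on this page:
>>> Stream.fold (Ring.slidingWindow 3 Statistics.maximum) $ Stream.fromList [1, 5, 2, 3, 4 :: Int]
4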
rawMoment :: (Monad m, Fractional a) => Int -> Fold m (a, Maybe a) a Source #
Raw moment is the moment about 0. The \(k\)th raw moment is defined as:
\(\mu'_k = \frac{\sum_{i=1}^n x_{i}^k}{n}\)
>>> rawMoment k = Fold.teeWith (/) (powerSum k) length
See https://en.wikipedia.org/wiki/Moment_(mathematics) .
Space: \(\mathcal{O}(1)\)
Time: \(\mathcal{O}(n)\)
mean :: forall m a. (Monad m, Fractional a) => Fold m (a, Maybe a) a Source #
Arithmetic mean of elements in a sliding window:
\(\mu = \frac{\sum_{i=1}^n x_{i}}{n}\)
This is also known as the Simple Moving Average (SMA) when used on a sliding window and as the Cumulative Moving Average (CMA) when used on the entire stream.
Mean is the same as the first raw moment.
\(\mu = \mu'_1\)
>>> mean = rawMoment 1
>>> mean = powerMean 1
>>> mean = Fold.teeWith (/) sum length
Space: \(\mathcal{O}(1)\)
Time: \(\mathcal{O}(n)\)
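For example, a simple moving average over a window of 2 elements can be sketched with streamly's Stream.postscan, assuming the Stream and Ring imports used in the other examples on this page:
>>> import qualified Streamly.Data.Fold as Fold
>>> Stream.fold Fold.toList $ Stream.postscan (Ring.slidingWindow 2 Statistics.mean) $ Stream.fromList [1.0, 2.0, 3.0, 4.0]
[1.0,1.5,2.5,3.5]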
welfordMean :: forall m a. (Monad m, Fractional a) => Fold m (a, Maybe a) a Source #
Same as mean but uses Welford's algorithm to compute the mean incrementally.
It maintains a running mean instead of a running sum and adjusts the mean based on each new value. This is slower than mean because of the division operation on each step, and it is numerically unstable (as of now). The advantage over mean could be no overflow when the numbers are large, because we do not maintain a sum, but that is a highly unlikely corner case.
Internal
geometricMean :: (Monad m, Floating a) => Fold m (a, Maybe a) a Source #
Geometric mean, defined as:
\(GM = \sqrt[n]{x_1 x_2 \cdots x_n}\)
\(GM = \left(\prod_{i=1}^n x_i\right)^\frac{1}{n}\)
or, equivalently, as the arithmetic mean in log space:
\(GM = e^{\frac{\sum_{i=1}^{n}\ln x_i}{n}}\)
>>> geometricMean = exp <$> lmap log mean
harmonicMean :: (Monad m, Fractional a) => Fold m (a, Maybe a) a Source #
The harmonic mean of the positive numbers \(x_1, x_2, \ldots, x_n\) is defined as:
\(HM = \frac{n}{\frac1{x_1} + \frac1{x_2} + \cdots + \frac1{x_n}}\)
\(HM = \left(\frac{\sum\limits_{i=1}^n x_i^{-1}}{n}\right)^{-1}\)
>>> harmonicMean = Fold.teeWith (/) length (lmap recip sum)
>>> harmonicMean = powerMeanFrac (-1)
quadraticMean :: (Monad m, Floating a) => Fold m (a, Maybe a) a Source #
The quadratic mean or root mean square (rms) of the numbers \(x_1, x_2, \ldots, x_n\) is defined as:
\(RMS = \sqrt{ \frac{1}{n} \left( x_1^2 + x_2^2 + \cdots + x_n^2 \right) }.\)
>>> quadraticMean = powerMean 2
powerMean :: (Monad m, Floating a) => Int -> Fold m (a, Maybe a) a Source #
The \(k\)th power mean of numbers \(x_1, x_2, \ldots, x_n\) is:
\(M_k = \left( \frac{1}{n} \sum_{i=1}^n x_i^k \right)^{\frac{1}{k}}\)
\(powerMean(k) = (rawMoment(k))^\frac{1}{k}\)
>>> powerMean k = (** (1 / fromIntegral k)) <$> rawMoment k
All other means can be expressed in terms of power mean. It is also known as the generalized mean.
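For example, the quadratic mean of 1 and 7 computed cumulatively (a sketch, with the imports used above):
>>> Stream.fold (Statistics.cumulative Statistics.quadraticMean) $ Stream.fromList [1.0, 7.0]
5.0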
Weighted Means
Exponential Smoothing.
ewma :: Monad m => Double -> Fold m Double Double Source #
ewma smoothingFactor.
ewma of an empty stream is 0.
Exponential weighted moving average, \(s_n\), of \(n\) values, \(x_1,\ldots,x_n\), is defined recursively as:
\(\begin{align} s_0& = x_0\\ s_n & = \alpha x_{n} + (1-\alpha)s_{n-1},\quad n>0 \end{align}\)
If we expand the recursive term it becomes an exponential series:
\(s_n = \alpha \left[x_n + (1-\alpha)x_{n-1} + (1-\alpha)^2 x_{n-2} + \cdots + (1-\alpha)^{n-1} x_1 \right] + (1-\alpha)^n x_0\)
where \(\alpha\), the smoothing factor, is in the range \(0 < \alpha < 1\). The higher the value of \(\alpha\), the more weight is given to newer values. As limiting cases, if \(\alpha\) is 0 the weighted sum is always the same as the oldest value, and if it is 1 it is always the same as the newest value.
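For example, with a smoothing factor of 0.5 (a sketch, with the imports used above):
>>> Stream.fold (Statistics.ewma 0.5) $ Stream.fromList [4.0, 8.0, 16.0]
11.0
Here \(s_0 = 4\), \(s_1 = 0.5 \times 8 + 0.5 \times 4 = 6\) and \(s_2 = 0.5 \times 16 + 0.5 \times 6 = 11\).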
ewmaAfterMean :: Monad m => Int -> Double -> Fold m Double Double Source #
ewmaAfterMean n k is like ewma but uses the arithmetic mean of the first n values as the initial value for the ewma of the rest of the values.
This can be used to reduce the effect of volatility of the initial value when k is too small.
ewmaRampUpSmoothing :: Monad m => Double -> Double -> Fold m Double Double Source #
ewmaRampUpSmoothing n k is like ewma but uses 1 as the initial smoothing factor and then exponentially smooths it to k using n as the smoothing factor.
This is significantly faster than ewmaAfterMean.
Spread
Second order central moment is a statistical measure of dispersion. The \(k\)th moment about the mean (or \(k\)th central moment) is defined as:
\(\mu_k = \frac{1}{n}\sum_{i=1}^n {(x_{i}-\mu)}^k\)
range :: (Monad m, Num a, Ord a) => Fold m (a, Maybe a) a Source #
The difference between the maximum and minimum elements of a rolling window.
>>> range = Fold.teeWith (-) maximum minimum
If you want to compute the range of the entire stream, Fold.teeWith (-) Fold.maximum Fold.minimum from the streamly package would be much faster.
Space: \(\mathcal{O}(w)\) where \(w\) is the window size.
Time: \(\mathcal{O}(n*w)\) where \(w\) is the window size.
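For example, the range over a window of the last 3 elements (a sketch, with Ring as in the frequency example below):
>>> Stream.fold (Ring.slidingWindow 3 Statistics.range) $ Stream.fromList [1, 9, 2, 4, 7 :: Int]
5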
md :: MonadIO m => Fold m ((Double, Maybe Double), m (MutArray Double)) Double Source #
md n computes the mean absolute deviation (or mean deviation) in a sliding window of the last n elements in the stream.
The mean absolute deviation of the numbers \(x_1, x_2, \ldots, x_n\) is:
\(MD = \frac{1}{n}\sum_{i=1}^n |x_i-\mu|\)
Note: It is expensive to compute MD in a sliding window. We need to maintain a ring buffer of the last n elements and a running mean; when the result is extracted we compute the difference of every element from the mean and average those differences. Using the standard deviation may be computationally cheaper.
See https://en.wikipedia.org/wiki/Average_absolute_deviation .
Pre-release
variance :: (Monad m, Fractional a) => Fold m (a, Maybe a) a Source #
The variance \(\sigma^2\) of a population of \(n\) equally likely values is defined as the average of the squares of deviations from the mean \(\mu\). In other words, second moment about the mean:
\(\sigma^2 = \frac{1}{n}\sum_{i=1}^n {(x_{i}-\mu)}^2\)
\(\sigma^2 = rawMoment(2) - \mu^2\)
\(\mu_2 = -(\mu'_1)^2 + \mu'_2\)
Note that the variance would be biased if applied to estimate the population variance from a sample of the population. See sampleVariance.
See https://en.wikipedia.org/wiki/Variance.
Space: \(\mathcal{O}(1)\)
Time: \(\mathcal{O}(n)\)
stdDev :: (Monad m, Floating a) => Fold m (a, Maybe a) a Source #
Standard deviation \(\sigma\) is the square root of variance.
This is the population standard deviation or uncorrected sample standard deviation.
>>> stdDev = sqrt <$> variance
See https://en.wikipedia.org/wiki/Standard_deviation .
Space: \(\mathcal{O}(1)\)
Time: \(\mathcal{O}(n)\)
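For example, the population variance and standard deviation of 1..5 can be computed in a single pass by pairing the two folds with Fold.tee from Streamly.Data.Fold (a sketch, with the imports used above):
>>> import qualified Streamly.Data.Fold as Fold
>>> Stream.fold (Statistics.cumulative (Fold.tee Statistics.variance Statistics.stdDev)) $ Stream.fromList [1.0, 2.0, 3.0, 4.0, 5.0]
(2.0,1.4142135623730951)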
Shape
Third and fourth order central moments are a measure of shape.
skewness :: (Monad m, Floating a) => Fold m (a, Maybe a) a Source #
Skewness \(\gamma\) is the standardized third central moment defined as:
\(\tilde{\mu}_3 = \frac{\mu_3}{\sigma^3}\)
The third central moment can be computed in terms of raw moments:
\(\mu_3 = 2(\mu'_1)^3 - 3\mu'_1\mu'_2 + \mu'_3\)
Substituting \(\mu'_1 = \mu\), and \(\mu'_2 = \mu^2 + \sigma^2\):
\(\mu_3 = -\mu^3 - 3\mu\sigma^2 + \mu'_3\)
Skewness is a measure of the symmetry of the probability distribution. It is 0 for a symmetric distribution, negative for a distribution skewed to the left, and positive for a distribution skewed to the right.
For a normal like distribution the median can be found around \(\mu - \frac{\gamma\sigma}{6}\) and the mode can be found around \(\mu - \frac{\gamma \sigma}{2}\).
kurtosis :: (Monad m, Floating a) => Fold m (a, Maybe a) a Source #
Kurtosis \(\kappa\) is the standardized fourth central moment, defined as:
\(\tilde{\mu}_4 = \frac{\mu_4}{\sigma^4}\)
The fourth central moment can be computed in terms of raw moments:
\(\mu_4 = -3(\mu'_1)^4 + 6(\mu'_1)^2\mu'_2 - 4\mu'_1\mu'_3\ + \mu'_4\)
Substituting \(\mu'_1 = \mu\), and \(\mu'_2 = \mu^2 + \sigma^2\):
\(\mu_4 = 3\mu^4 + 6\mu^2\sigma^2 - 4\mu\mu'_3 + \mu'_4\)
It is always non-negative. It is 0 for a point distribution, low for light tailed (platykurtic) distributions and high for heavy tailed (leptokurtic) distributions.
\(\kappa \geq \gamma^2 + 1\)
For a normal distribution \(\mu_4 = 3\sigma^4\) and therefore \(\kappa = 3\).
Estimation
sampleVariance :: (Monad m, Fractional a) => Fold m (a, Maybe a) a Source #
Unbiased sample variance, i.e. the variance of a sample corrected to better estimate the variance of the population, defined as:
\(s^2 = \frac{1}{n - 1}\sum_{i=1}^n {(x_{i}-\mu)}^2\)
\(s^2 = \frac{n}{n - 1} \times \sigma^2\).
sampleStdDev :: (Monad m, Floating a) => Fold m (a, Maybe a) a Source #
Sample standard deviation:
\(s = \sqrt{sampleVariance}\)
>>> sampleStdDev = sqrt <$> sampleVariance
See https://en.wikipedia.org/wiki/Unbiased_estimation_of_standard_deviation .
stdErrMean :: (Monad m, Floating a) => Fold m (a, Maybe a) a Source #
Standard error of the sample mean (SEM), defined as:
\( SEM = \frac{sampleStdDev}{\sqrt{n}} \)
See https://en.wikipedia.org/wiki/Standard_error .
Space: \(\mathcal{O}(1)\)
Time: \(\mathcal{O}(n)\)
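For example, the sample standard deviation of the two-element sample 1, 3 (a sketch, with the imports used above):
>>> Stream.fold (Statistics.cumulative Statistics.sampleStdDev) $ Stream.fromList [1.0, 3.0]
1.4142135623730951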
Resampling
resample :: (MonadIO m, Unbox a) => Unfold m (Array a) a Source #
Randomly select elements from an array, with replacement, producing a stream of the same size as the original array.
foldResamples Source #
:: (MonadIO m, Unbox a)
=> Int | Number of resamples to compute.
-> Array a | Original sample.
-> Fold m a b | Estimator fold.
-> Stream m b
Resample an array multiple times and run the supplied fold on each resampled stream, producing a stream of fold results. The fold is usually an estimator fold.
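For example, a bootstrap of the mean can be sketched as below; the number of resamples (100) and the use of the cumulative mean as the estimator fold are illustrative choices:
>>> import qualified Streamly.Data.Array as Array
>>> import qualified Streamly.Data.Fold as Fold
>>> sample = Array.fromList [1.0, 2.0, 3.0, 4.0, 5.0 :: Double]
>>> bootMeans <- Stream.fold Fold.toList $ Statistics.foldResamples 100 sample (Statistics.cumulative Statistics.mean)
>>> length bootMeans
100
Since resampling is random, the individual bootstrap means vary from run to run; their spread estimates the sampling distribution of the mean.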
jackKnifeMean :: (Monad m, Fractional a, Unbox a) => Array a -> Stream m a Source #
Given an array of n items, compute the mean of (n - 1) items at a time, producing a stream of all possible mean values, omitting a different item each time.
jackKnifeVariance :: (Monad m, Fractional a, Unbox a) => Array a -> Stream m a Source #
Given an array of n items, compute the variance of (n - 1) items at a time, producing a stream of all possible variance values, omitting a different item each time.
jackKnifeStdDev :: (Monad m, Unbox a, Floating a) => Array a -> Stream m a Source #
Standard deviation computed from jackKnifeVariance.
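For example, the jackknifed means of a 3-element sample, sorted so that the shown output does not depend on the order in which items are omitted (a sketch, with Array and Fold imported as in the resampling example above):
>>> import Data.List (sort)
>>> means <- Stream.fold Fold.toList $ Statistics.jackKnifeMean $ Array.fromList [1.0, 2.0, 3.0 :: Double]
>>> sort means
[1.5,2.0,2.5]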
Probability Distribution
frequency :: (Monad m, Ord a) => Fold m (a, Maybe a) (Map a Int) Source #
Count the frequency of elements in a sliding window.
>>> input = Stream.fromList [1,1,3,4,4::Int]
>>> f = Ring.slidingWindow 4 Statistics.frequency
>>> Stream.fold f input
fromList [(1,1),(3,1),(4,2)]
frequency' :: (Monad m, Ord a) => Fold m a (Map a Int) Source #
Determine the frequency of each element in the stream.
mode :: (Monad m, Ord a) => Fold m a (Maybe (a, Int)) Source #
Find the most frequently occurring element in the stream and its frequency.
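For example (a sketch, with the imports used above):
>>> Stream.fold Statistics.mode $ Stream.fromList [1, 1, 3, 4, 4, 4 :: Int]
Just (4,3)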
binOffsetSize :: Integral a => a -> a -> a -> a Source #
binOffsetSize offset binSize input. Given an integral input value, return its bin index, provided that each bin contains binSize items and the bins are aligned such that the bin with index 0 starts at offset from 0. If offset = 0 then the bin with index 0 would contain values from 0 to binSize - 1.
This API does not put a bound on the number of bins, therefore, the number of bins could be potentially large depending on the range of values.
binFromSizeN :: Integral a => a -> a -> a -> a -> HistBin a Source #
binFromSizeN low binSize nbins input. Classify input into bins specified by a low limit, binSize and nbins. Inputs below the lower limit are classified into BelowRange and inputs above the highest bin are classified into AboveRange. InRange inputs are classified into bins starting from bin index 0.
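For example, with low = 0, binSize = 10 and nbins = 5 the in-range values span 0 to 49; this sketch assumes HistBin has a derived Show instance:
>>> Statistics.binFromSizeN 0 10 5 (-3 :: Int)
BelowRange
>>> Statistics.binFromSizeN 0 10 5 (23 :: Int)
InRange 2
>>> Statistics.binFromSizeN 0 10 5 (75 :: Int)
AboveRange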
binFromToN :: Integral a => a -> a -> a -> a -> HistBin a Source #
binFromToN low high nbins input. Like binFromSizeN except that the lower and higher limits of the range are specified instead of the bin size. binSize is computed from the range and nbins. nbins is rounded to the range 0 < nbins < (high - low + 1). high >= low must hold.
binBoundaries :: Array a -> a -> HistBin a Source #
Classify an input value to bins using the bin boundaries specified in an array.
Unimplemented
histogram :: (Monad m, Ord k) => (a -> k) -> Fold m a (Map k Int) Source #
Given a bin classifier function and a stream of values, generate a histogram map from indices of bins to the number of items in the bin.
>>> Stream.fold (histogram (binOffsetSize 0 3)) $ Stream.fromList [1..15]
fromList [(0,2),(1,3),(2,3),(3,3),(4,3),(5,1)]