-- Hoogle documentation, generated by Haddock
-- See Hoogle, http://www.haskell.org/hoogle/


-- | Collection of useful statistical methods.
--   
--   High-level statistical methods.
--   
--   <ul>
--   <li>Confusion matrix</li>
--   <li>Confusion matrix dependent statistics (sensitivity, specificity,
--   F-measure, MCC)</li>
--   <li>EM algorithm for two-component Gaussian mixture.</li>
--   <li>GMM (Gaussian Mixture Models) with &gt;=1 Gaussians fitted to the
--   data.</li>
--   </ul>
--   
--   Note that some methods are for testing only (two-component Gaussian
--   mixture EM).
@package StatisticalMethods
@version 0.0.0.1


-- | This is a simplified version of the Expectation-Maximization algorithm
--   for a two-component Gaussian mixture model. Cf. Hastie et al., The
--   Elements of Statistical Learning, Springer, Section 8.5.1.
module Statistics.EM.TwoGaussian

-- | Finds a fixed point of the EM step iterations for the data given as
--   a <tt>Vector Double</tt>. The <tt>(Weight, Normal, Normal)</tt>
--   argument is presumably the initial estimate (mixing weight and the
--   two Gaussian components) from which iteration starts; confirm against
--   the implementation. The result has the same shape.
emFix :: Vector Double -> (Weight, Normal, Normal) -> (Weight, Normal, Normal)

-- | Finds the best fixed point, using all elements <tt>xs</tt> of the
--   input vector as starting points for the means. It holds that
--   mu_1 &lt; mu_2. The criterion by which the best fixed point is chosen
--   is not stated in this listing.
emStarts :: Vector Double -> (Weight, Normal, Normal)


-- | EM for a mixture of k one-dimensional Gaussians. This procedure tends
--   to produce <tt>NaN</tt>s whenever more Gaussians are requested than
--   the data calls for. This is rather convenient. ;-)
--   
--   TODO cite paper
module Statistics.EM.GMM

-- | Find an optimal set of parameters <a>Theta</a> for the given
--   <a>Data</a>. The <a>Theta</a> argument is presumably the starting
--   estimate; confirm against the implementation. The additional
--   <tt>takeWhile (not . isnan . fst)</tt> makes sure that <a>emFix</a>
--   terminates even in cases of overfitting. Because the check is done
--   on <tt>fst</tt> while <tt>snd</tt> is returned, the returned values
--   will be NaNs whenever NaNs occur.
emFix :: Data -> Theta -> Theta

-- | Given a number <tt>k</tt> of Gaussian peaks (the <tt>Int</tt>
--   argument) and a set of <a>Data</a>, try to find the optimal GMM. This
--   is done by trying each data point as mu for each Gaussian. Note that
--   this will be rather slow for larger <tt>k</tt> (larger than, say, 2
--   or 3). In that case, a random-drawing method should be chosen.
--   
--   TODO xs' -&gt; xs sorting makes me cry!
emStarts :: Int -> Data -> Theta


-- | This module contains test data taken from <a>Elements of Statistical
--   Learning</a>.
--   
--   TODO correct citation
module TestData.Elements
-- | Test data as a <tt>Vector Double</tt>. Judging by the name, this is
--   presumably Table 8.1 of the book cited in the module description;
--   confirm against the source.
table_8_1 :: Vector Double


-- | The confusion matrix contains four data points: the true and false
--   positives and the true and false negatives. From these four data
--   points, other statistics can be extracted.
--   
--   Fawcett, ROC Graphs: Notes and Practical Considerations for
--   Researchers, 2004, Kluwer Academic Publishers
module Statistics.ConfusionMatrix

-- | The confusion matrix. It holds four <a>WrappedDouble</a> values with
--   one accessor each; by their names these are the false negatives
--   (<a>fn</a>), false positives (<a>fp</a>), true negatives (<a>tn</a>)
--   and true positives (<a>tp</a>). The positional order of the
--   constructor arguments cannot be read off this listing, so prefer the
--   field names over positional construction.
data ConfusionMatrix
ConfusionMatrix :: WrappedDouble -> WrappedDouble -> WrappedDouble -> WrappedDouble -> ConfusionMatrix
fn :: ConfusionMatrix -> WrappedDouble
fp :: ConfusionMatrix -> WrappedDouble
tn :: ConfusionMatrix -> WrappedDouble
tp :: ConfusionMatrix -> WrappedDouble
-- | A <tt>Double</tt> wrapped in <tt>Either String</tt>. The
--   <tt>Left</tt> case presumably carries an error description; confirm
--   against the code that produces these values.
type WrappedDouble = Either String Double

-- | Class of data-set types from which a confusion matrix can be
--   created: <a>mkConfusionMatrix</a> turns a value of the instance type
--   into a <a>ConfusionMatrix</a>.
class MkConfusionMatrix a
mkConfusionMatrix :: MkConfusionMatrix a => a -> ConfusionMatrix
instance Read ConfusionMatrix
instance Show ConfusionMatrix
instance Eq ConfusionMatrix


-- | In general, it is not easy to define the whole confusion matrix
--   generically without knowing anything about the source data. For
--   certain elements however, it is possible. These instances are all
--   defined on newtypes in order to not create instances on generic data
--   types like lists.
module Statistics.ConfusionMatrix.Instances

-- | The constructor expects a single triple: the total number of
--   possibilities first (an <tt>Int</tt>), then a list of true positive
--   elements, followed by a list of predicted elements. The
--   <a>MkConfusionMatrix</a> instance requires <tt>Eq</tt> and
--   <tt>Ord</tt> on the element type.
newtype (Eq a, Ord a) => ListSimilar a
ListSimilar :: (Int, [a], [a]) -> ListSimilar a
instance (Eq a, Ord a) => MkConfusionMatrix (ListSimilar a)


-- | Common performance metrics which can be calculated using the confusion
--   matrix.
--   
--   Fawcett, ROC Graphs: Notes and Practical Considerations for
--   Researchers, 2004, Kluwer Academic Publishers
module Statistics.PerformanceMetrics

-- | Sensitivity of the given <a>ConfusionMatrix</a>. Conventionally this
--   is <tt>tp / (tp + fn)</tt> (also called recall or true positive
--   rate); the formula actually used is not visible in this listing.
sensitivity :: ConfusionMatrix -> WrappedDouble

-- | Specificity of the given <a>ConfusionMatrix</a>. Conventionally this
--   is <tt>tn / (tn + fp)</tt> (the true negative rate); the formula
--   actually used is not visible in this listing.
specificity :: ConfusionMatrix -> WrappedDouble

-- | Positive predictive value of the given <a>ConfusionMatrix</a>.
--   Conventionally this is <tt>tp / (tp + fp)</tt> (also called
--   precision); the formula actually used is not visible in this listing.
ppv :: ConfusionMatrix -> WrappedDouble

-- | Matthews correlation coefficient of the given <a>ConfusionMatrix</a>.
--   Conventionally this is
--   <tt>(tp * tn - fp * fn) / sqrt ((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))</tt>;
--   the formula actually used is not visible in this listing.
mcc :: ConfusionMatrix -> WrappedDouble

-- | F-measure of the given <a>ConfusionMatrix</a>. Conventionally this
--   combines precision and recall as their harmonic mean; since no
--   weighting parameter is taken, this is presumably the balanced
--   variant (F1). Confirm against the implementation.
fmeasure :: ConfusionMatrix -> WrappedDouble