-----------------------------------------------------------------------------
-- Copyright 2020, Ideas project team. This file is distributed under the
-- terms of the Apache License 2.0. For more information, see the files
-- "LICENSE.txt" and "NOTICE.txt", which are included in the distribution.
-----------------------------------------------------------------------------
module Domain.Hypothesis.Common
   ( TypeOfTest(..),
     dependent, independent, chiSquared,
     pickAlpha, sidedFromHA,
     h0FromHA, h0FromHAEqualSign,
     validTests, testFormulaFromTest,
     degreesOfFreedomFromTest,
     computeCritical, computeCriticalT, computeCriticalZ
     , computeCriticalR, computeCriticalF, computeCriticalChi,
     computePValue,   computePValueT,   computePValueZ,
     chooseTypeOfTest, isTTest
   ) where

import Data.Maybe
import Ideas.Common.Library (Term(TCon))
import Domain.Statistics.Symbols
import Domain.Hypothesis.Tables
import Domain.Math.Expr hiding ((^))
import Domain.Math.Data.Relation
import Domain.Statistics.Component
import Domain.Statistics.ComponentSet
import Ideas.Common.View
import Domain.Math.Numeric.Views

data TypeOfTest = TestMean
                | CompareMeans
                | CompareMeansPaired
                | TestProportion
                | CompareProportions
                | TestCorrelation
    deriving (Eq, Show)

dependent, independent, chiSquared :: Expr
dependent   = toExpr (TCon dependentSymbol [])
independent = toExpr (TCon independentSymbol [])
chiSquared  = Var "chisq"

-- Pick a value for alpha
pickAlpha :: Double
pickAlpha = 0.05

-- Determine the sidedness based on the alternative hypothesis.
sidedFromHA :: Relation Expr -> Sided
sidedFromHA ha = fromRelation (relationType ha)
 where
   fromRelation :: RelationType -> Sided
   fromRelation LessThan             = LeftSided
   fromRelation LessThanOrEqualTo    = LeftSided
   fromRelation GreaterThan          = RightSided
   fromRelation GreaterThanOrEqualTo = RightSided
   fromRelation EqualTo              = TwoSided
   fromRelation NotEqualTo           = TwoSided
   fromRelation rt                   = error $ "sidedFromHA: " ++ show rt

h0FromHA :: Relation Expr -> Relation Expr
h0FromHA ha = let h0Rel = inverseRelType (relationType ha)
              in  makeType h0Rel (leftHandSide ha) (rightHandSide ha)

h0FromHAEqualSign :: Relation Expr -> Relation Expr
h0FromHAEqualSign ha = makeType EqualTo (leftHandSide ha) (rightHandSide ha)
   
inverseRelType :: RelationType -> RelationType
inverseRelType relType = fromMaybe relType (lookup relType table)
 where
   table = pairs ++ map (\(a,b) -> (b,a)) pairs
   pairs = [(LessThan, GreaterThanOrEqualTo), (LessThanOrEqualTo, GreaterThan), (EqualTo, NotEqualTo)]

-- Returns the valid tests for this state
-- Corner case for two samples (n1,n2 instead of n)
validTests :: ComponentSet -> [TestType]
validTests cs = 
   case getTestType TestChoice (initials cs) of
      Just x -> [x]
      _ -> select (chooseTypeOfTest cs)
 where
   n = fromMaybe 0 $ do 
      expr <- getExpr SampleSize cs
      match naturalView expr
      
   psdKnown = cs `contains` PopulationSdev
  
   select TestMean 
      | psdKnown  = [ZTest]
      | n > 100   = [ZTest] -- Sietske: ignore TTestOne for now
      | otherwise = [TTestOne]
   select CompareMeans         = [TTestTwo]
   select CompareMeansPaired   = [TTestPaired]
   select TestProportion       = [ZTest]
   select CompareProportions   = [ZTest]
   select TestCorrelation      = [RPearson, TTestOne]

-- Returns the test statistic formula from the chose test.
-- Note: this only works for the case of testing the mean, other cases should 
-- give the formula as part of the exercise
testFormulaFromTest :: Monad m => TestType -> TypeOfTest -> m Expr
testFormulaFromTest testType typeOfTest = 
   case (testType, typeOfTest) of 
      (TTestOne, TestMean) -> 
         return $ (Var "M" - Var "mu") / (Var "s" / sqrt (Var "n"))
      (ZTest, TestMean) -> 
         return $ (Var "M" - Var "mu") / (Var "sigma" / sqrt (Var "n"))
      (TTestTwo, CompareMeans) -> 
         return $ (mean1 - mean2) / sqrt (toExpr PooledVariance * (1 / Var "n1" + 1 / Var "n2"))
      (TTestPaired, TestMean) ->
         return $ (Var "M" - Var "mu") / (Var "s" / sqrt (Var "n"))
      (TTestPaired, CompareMeansPaired) -> 
         return $ (mean1 - mean2) / (toExpr SampleSdev * sqrt (1 / Var "n")) -- to do: ask Sietske
      (_, TestProportion) -> 
         return $ (Var "p" - Var "p0") / sqrt (Var "p0" * (1.0 - Var "p0") / Var "n")
      (_, CompareProportions) -> 
         return $ (Var "p1" - Var "p2" - Var "d0") / sqrt (Var "p0" * (1.0 - Var "p0") / (Var "n" / 2))
      (TTestOne, TestCorrelation) -> 
         return $ (Var "r" * sqrt (Var "n" - 2)) / sqrt (1 - Var "r"**2)
      (RPearson, TestCorrelation) -> 
         return $ Var "r"
      _ -> 
         fail $ "teststatisticFromTest " ++ show (testType, typeOfTest)
 where
   mean1 = toExpr (One SampleMean)
   mean2 = toExpr (Two SampleMean)

-- Returns the formula for the degrees of degreesOfFreedom
degreesOfFreedomFromTest :: Monad m => TestType -> TypeOfTest -> m Expr
degreesOfFreedomFromTest TTestOne    TestCorrelation = return $ Var "n" - 2
degreesOfFreedomFromTest TTestOne    _ = return $ Var "n" - 1
degreesOfFreedomFromTest TTestPaired _ = return $ Var "n" - 1
degreesOfFreedomFromTest TTestTwo    _ = return $ Var "n1" + Var "n2" - 2
degreesOfFreedomFromTest RPearson    _ = return $ Var "n" - 2
degreesOfFreedomFromTest _           _ = fail "degrees of freedom test failed"

-- Returns the critical value from the given test and alpha
computeCritical :: Monad m => TestType -> Sided -> Double -> Maybe Double -> m Double
computeCritical test sided alpha mdf
   | isTTest test  = case mdf of
                        Just df -> return $ computeCriticalT sided alpha df
                        Nothing -> fail "df missing"
   | test == RPearson = case mdf of
                           Just df -> return $ computeCriticalR sided alpha df
                           Nothing -> fail "df missing"
   | test == ZTest = return $ computeCriticalZ sided alpha
   | otherwise     = fail "unknown test"

isTTest :: TestType -> Bool
isTTest TTestOne    = True
isTTest TTestTwo    = True
isTTest TTestPaired = True
isTTest _           = False

computeCriticalR :: Sided -> Double -> Double -> Double
computeCriticalR sided alpha df = sqrt (t ^ (2 :: Int) / (t ^ (2 :: Int) + df)) 
 where
   t = computeCriticalT sided alpha df

computeCriticalF :: Monad m => Double -> Double -> Double -> m Double
computeCriticalF dfBetween dfWithin alpha = 
   maybe (fail "unknown critical-f value") return (fTable dfBetween dfWithin alpha) -- TODO Sietske


computeCriticalChi :: Monad m => Sided -> Double -> Double -> m Double
computeCriticalChi TwoSided   alpha df = chivalue' df (alpha / 2)
computeCriticalChi LeftSided  alpha df = negate <$> chivalue' df alpha
computeCriticalChi RightSided alpha df = chivalue' df alpha

chivalue' :: Monad m => Double -> Double -> m Double
chivalue' df alpha = 
   maybe (fail "unknown critical-chi value") return $ chiTable alpha (round df)

computeCriticalT :: Sided -> Double -> Double -> Double
computeCriticalT TwoSided   alpha df =   tvalue' df (alpha / 2)
computeCriticalT LeftSided  alpha df = - tvalue' df alpha
computeCriticalT RightSided alpha df =   tvalue' df alpha

tvalue' :: Double -> Double -> Double
tvalue' df alpha | isJust tableLookup = fromJust tableLookup
                 | otherwise          = fromInteger (round $ findValue (tvalue df) 0.00005 (0.5 - alpha) * 1000) / 1000.0
 where
   tableLookup = tTable alpha (round df)

computeCriticalZ :: Sided -> Double -> Double
computeCriticalZ TwoSided   alpha =   zvalue' (alpha / 2)
computeCriticalZ LeftSided  alpha = - zvalue' alpha
computeCriticalZ RightSided alpha =   zvalue' alpha

zvalue' :: Double -> Double
zvalue' alpha | isJust tableLookup = fromJust tableLookup
              | otherwise          = fromInteger (round $ findValue zvalue 0.00005 (0.5 - alpha) * 1000) / 1000.0
 where
   tableLookup = zTable alpha

-- | Utils for computing t-/z-/p-values
zvalue :: Double -> Double
zvalue x = (1.0 / sqrt (2.0 * pi)) * exp 1 ** negate (x**2/2)

tvalue :: Double -> (Double -> Double)
tvalue df x = 1 / (sqrt df * beta 0.5 (df / 2.0)) * (1.0 + (x*x) / df) ** negate ((df + 1.0) / 2.0)

-- Utility functions for finding the t-value
-- Source: https://wiki.haskell.org/index.php?title=Gamma_and_Beta_function
cof :: [Double]
cof = [76.18009172947146,-86.50532032941677,24.01409824083091,-1.231739572450155,0.001208650973866179,-0.000005395239384953]
 
ser :: Double
ser = 1.000000000190015
 
gammaln :: Double -> Double
gammaln xx = let tmp' = (xx+5.5) - (xx+0.5)*log(xx+5.5)
                 ser' =  foldl (+) ser $ map (\(y,c) -> c/(xx+y)) $ zip [1..] cof
             in -tmp' + log(2.5066282746310005 * ser' / xx)
             
beta :: Double -> Double -> Double
beta z w = exp (gammaln z + gammaln w - gammaln (z+w))

findValue :: (Double -> Double) -> Double -> Double -> Double
findValue f stepSize target = fst $ until (\(_, x) -> x >= target) (\(a, x) -> (a + stepSize, x + stepSize * f a)) (0, 0)

findValue' :: (Double -> Double) -> Double -> Double -> Double
findValue' f stepSize target = snd $ until (\(a, _) -> a >= target) (\(a, x) -> (a + stepSize, x + stepSize * f a)) (0, 0)


-- Compute the P-value
computePValue :: Monad m => TestType -> Sided -> Double -> Maybe Double -> m Double
computePValue test sided ts (Just df) 
   | isTTest test = return $ computePValueT sided ts df
   | otherwise    = fail "cannot compute p-value"
computePValue ZTest sided ts _ = 
   return $ computePValueZ sided ts
computePValue _ _ _ _ = 
   fail "cannot compute p-value"

computePValueT :: Sided -> Double -> Double -> Double
computePValueT TwoSided   ts df = (0.5 - pvalueT df (abs ts)) * 2
computePValueT LeftSided  ts df | ts < 0    = 0.5 - pvalueT df (abs ts)
                                | otherwise = 0.5 + pvalueT df      ts
computePValueT RightSided ts df | ts < 0    = 1.0 - (0.5 - pvalueT df (abs ts))
                                | otherwise =         0.5 - pvalueT df      ts
   
pvalueT :: Double -> Double -> Double
pvalueT df testStatistic | testStatistic > 4.0  = 0.5 -- Performance gain, assuming precision is not needed for exceptionally high values
                         | otherwise            = findValue' (tvalue df) 0.00005 testStatistic

computePValueZ :: Sided -> Double -> Double
computePValueZ TwoSided   ts = (0.5 - pvalueZ (abs ts)) * 2
computePValueZ LeftSided  ts | ts < 0    = 0.5 - pvalueZ (abs ts)
                             | otherwise = 0.5 + pvalueZ      ts
computePValueZ RightSided ts | ts < 0    = 1.0 - (0.5 - pvalueZ (abs ts))
                             | otherwise =        0.5 - pvalueZ      ts

pvalueZ :: Double -> Double
pvalueZ testStatistic | testStatistic > 4.0  = 0.5 -- Performance gain, assuming precision is not needed for exceptionally high values
                      | otherwise            = findValue' zvalue 0.00005 testStatistic

-- | Utility function
chooseTypeOfTest :: ComponentSet -> TypeOfTest
chooseTypeOfTest cs 
   | contains cs Correlation      = TestCorrelation
   | contains cs (Two SampleMean) = if contains cs (Two SampleSize)
                                    then CompareMeans
                                    else CompareMeansPaired
   | contains cs (Two Proportion) = CompareProportions
   | contains cs Proportion       = TestProportion
   | contains cs SampleMean       = TestMean
   | otherwise                    = TestMean