----------------------------------------------------------------------------- -- Copyright 2020, Ideas project team. This file is distributed under the -- terms of the Apache License 2.0. For more information, see the files -- "LICENSE.txt" and "NOTICE.txt", which are included in the distribution. ----------------------------------------------------------------------------- module Domain.Hypothesis.Common ( TypeOfTest(..), dependent, independent, chiSquared, pickAlpha, sidedFromHA, h0FromHA, h0FromHAEqualSign, validTests, testFormulaFromTest, degreesOfFreedomFromTest, computeCritical, computeCriticalT, computeCriticalZ , computeCriticalR, computeCriticalF, computeCriticalChi, computePValue, computePValueT, computePValueZ, chooseTypeOfTest, isTTest ) where import Data.Maybe import Ideas.Common.Library (Term(TCon)) import Domain.Statistics.Symbols import Domain.Hypothesis.Tables import Domain.Math.Expr hiding ((^)) import Domain.Math.Data.Relation import Domain.Statistics.Component import Domain.Statistics.ComponentSet import Ideas.Common.View import Domain.Math.Numeric.Views data TypeOfTest = TestMean | CompareMeans | CompareMeansPaired | TestProportion | CompareProportions | TestCorrelation deriving (Eq, Show) dependent, independent, chiSquared :: Expr dependent = toExpr (TCon dependentSymbol []) independent = toExpr (TCon independentSymbol []) chiSquared = Var "chisq" -- Pick a value for alpha pickAlpha :: Double pickAlpha = 0.05 -- Determine the sidedness based on the alternative hypothesis. sidedFromHA :: Relation Expr -> Sided sidedFromHA ha = fromRelation (relationType ha) where fromRelation :: RelationType -> Sided fromRelation LessThan = LeftSided fromRelation LessThanOrEqualTo = LeftSided fromRelation GreaterThan = RightSided fromRelation GreaterThanOrEqualTo = RightSided fromRelation EqualTo = TwoSided fromRelation NotEqualTo = TwoSided fromRelation rt = error $ "sidedFromHA: " ++ show rt h0FromHA :: Relation Expr -> Relation Expr h0FromHA ha = let h0Rel = inverseRelType (relationType ha) in makeType h0Rel (leftHandSide ha) (rightHandSide ha) h0FromHAEqualSign :: Relation Expr -> Relation Expr h0FromHAEqualSign ha = makeType EqualTo (leftHandSide ha) (rightHandSide ha) inverseRelType :: RelationType -> RelationType inverseRelType relType = fromMaybe relType (lookup relType table) where table = pairs ++ map (\(a,b) -> (b,a)) pairs pairs = [(LessThan, GreaterThanOrEqualTo), (LessThanOrEqualTo, GreaterThan), (EqualTo, NotEqualTo)] -- Returns the valid tests for this state -- Corner case for two samples (n1,n2 instead of n) validTests :: ComponentSet -> [TestType] validTests cs = case getTestType TestChoice (initials cs) of Just x -> [x] _ -> select (chooseTypeOfTest cs) where n = fromMaybe 0 $ do expr <- getExpr SampleSize cs match naturalView expr psdKnown = cs `contains` PopulationSdev select TestMean | psdKnown = [ZTest] | n > 100 = [ZTest] -- Sietske: ignore TTestOne for now | otherwise = [TTestOne] select CompareMeans = [TTestTwo] select CompareMeansPaired = [TTestPaired] select TestProportion = [ZTest] select CompareProportions = [ZTest] select TestCorrelation = [RPearson, TTestOne] -- Returns the test statistic formula from the chose test. -- Note: this only works for the case of testing the mean, other cases should -- give the formula as part of the exercise testFormulaFromTest :: Monad m => TestType -> TypeOfTest -> m Expr testFormulaFromTest testType typeOfTest = case (testType, typeOfTest) of (TTestOne, TestMean) -> return $ (Var "M" - Var "mu") / (Var "s" / sqrt (Var "n")) (ZTest, TestMean) -> return $ (Var "M" - Var "mu") / (Var "sigma" / sqrt (Var "n")) (TTestTwo, CompareMeans) -> return $ (mean1 - mean2) / sqrt (toExpr PooledVariance * (1 / Var "n1" + 1 / Var "n2")) (TTestPaired, TestMean) -> return $ (Var "M" - Var "mu") / (Var "s" / sqrt (Var "n")) (TTestPaired, CompareMeansPaired) -> return $ (mean1 - mean2) / (toExpr SampleSdev * sqrt (1 / Var "n")) -- to do: ask Sietske (_, TestProportion) -> return $ (Var "p" - Var "p0") / sqrt (Var "p0" * (1.0 - Var "p0") / Var "n") (_, CompareProportions) -> return $ (Var "p1" - Var "p2" - Var "d0") / sqrt (Var "p0" * (1.0 - Var "p0") / (Var "n" / 2)) (TTestOne, TestCorrelation) -> return $ (Var "r" * sqrt (Var "n" - 2)) / sqrt (1 - Var "r"**2) (RPearson, TestCorrelation) -> return $ Var "r" _ -> fail $ "teststatisticFromTest " ++ show (testType, typeOfTest) where mean1 = toExpr (One SampleMean) mean2 = toExpr (Two SampleMean) -- Returns the formula for the degrees of degreesOfFreedom degreesOfFreedomFromTest :: Monad m => TestType -> TypeOfTest -> m Expr degreesOfFreedomFromTest TTestOne TestCorrelation = return $ Var "n" - 2 degreesOfFreedomFromTest TTestOne _ = return $ Var "n" - 1 degreesOfFreedomFromTest TTestPaired _ = return $ Var "n" - 1 degreesOfFreedomFromTest TTestTwo _ = return $ Var "n1" + Var "n2" - 2 degreesOfFreedomFromTest RPearson _ = return $ Var "n" - 2 degreesOfFreedomFromTest _ _ = fail "degrees of freedom test failed" -- Returns the critical value from the given test and alpha computeCritical :: Monad m => TestType -> Sided -> Double -> Maybe Double -> m Double computeCritical test sided alpha mdf | isTTest test = case mdf of Just df -> return $ computeCriticalT sided alpha df Nothing -> fail "df missing" | test == RPearson = case mdf of Just df -> return $ computeCriticalR sided alpha df Nothing -> fail "df missing" | test == ZTest = return $ computeCriticalZ sided alpha | otherwise = fail "unknown test" isTTest :: TestType -> Bool isTTest TTestOne = True isTTest TTestTwo = True isTTest TTestPaired = True isTTest _ = False computeCriticalR :: Sided -> Double -> Double -> Double computeCriticalR sided alpha df = sqrt (t ^ (2 :: Int) / (t ^ (2 :: Int) + df)) where t = computeCriticalT sided alpha df computeCriticalF :: Monad m => Double -> Double -> Double -> m Double computeCriticalF dfBetween dfWithin alpha = maybe (fail "unknown critical-f value") return (fTable dfBetween dfWithin alpha) -- TODO Sietske computeCriticalChi :: Monad m => Sided -> Double -> Double -> m Double computeCriticalChi TwoSided alpha df = chivalue' df (alpha / 2) computeCriticalChi LeftSided alpha df = negate <$> chivalue' df alpha computeCriticalChi RightSided alpha df = chivalue' df alpha chivalue' :: Monad m => Double -> Double -> m Double chivalue' df alpha = maybe (fail "unknown critical-chi value") return $ chiTable alpha (round df) computeCriticalT :: Sided -> Double -> Double -> Double computeCriticalT TwoSided alpha df = tvalue' df (alpha / 2) computeCriticalT LeftSided alpha df = - tvalue' df alpha computeCriticalT RightSided alpha df = tvalue' df alpha tvalue' :: Double -> Double -> Double tvalue' df alpha | isJust tableLookup = fromJust tableLookup | otherwise = fromInteger (round $ findValue (tvalue df) 0.00005 (0.5 - alpha) * 1000) / 1000.0 where tableLookup = tTable alpha (round df) computeCriticalZ :: Sided -> Double -> Double computeCriticalZ TwoSided alpha = zvalue' (alpha / 2) computeCriticalZ LeftSided alpha = - zvalue' alpha computeCriticalZ RightSided alpha = zvalue' alpha zvalue' :: Double -> Double zvalue' alpha | isJust tableLookup = fromJust tableLookup | otherwise = fromInteger (round $ findValue zvalue 0.00005 (0.5 - alpha) * 1000) / 1000.0 where tableLookup = zTable alpha -- | Utils for computing t-/z-/p-values zvalue :: Double -> Double zvalue x = (1.0 / sqrt (2.0 * pi)) * exp 1 ** negate (x**2/2) tvalue :: Double -> (Double -> Double) tvalue df x = 1 / (sqrt df * beta 0.5 (df / 2.0)) * (1.0 + (x*x) / df) ** negate ((df + 1.0) / 2.0) -- Utility functions for finding the t-value -- Source: https://wiki.haskell.org/index.php?title=Gamma_and_Beta_function cof :: [Double] cof = [76.18009172947146,-86.50532032941677,24.01409824083091,-1.231739572450155,0.001208650973866179,-0.000005395239384953] ser :: Double ser = 1.000000000190015 gammaln :: Double -> Double gammaln xx = let tmp' = (xx+5.5) - (xx+0.5)*log(xx+5.5) ser' = foldl (+) ser $ map (\(y,c) -> c/(xx+y)) $ zip [1..] cof in -tmp' + log(2.5066282746310005 * ser' / xx) beta :: Double -> Double -> Double beta z w = exp (gammaln z + gammaln w - gammaln (z+w)) findValue :: (Double -> Double) -> Double -> Double -> Double findValue f stepSize target = fst $ until (\(_, x) -> x >= target) (\(a, x) -> (a + stepSize, x + stepSize * f a)) (0, 0) findValue' :: (Double -> Double) -> Double -> Double -> Double findValue' f stepSize target = snd $ until (\(a, _) -> a >= target) (\(a, x) -> (a + stepSize, x + stepSize * f a)) (0, 0) -- Compute the P-value computePValue :: Monad m => TestType -> Sided -> Double -> Maybe Double -> m Double computePValue test sided ts (Just df) | isTTest test = return $ computePValueT sided ts df | otherwise = fail "cannot compute p-value" computePValue ZTest sided ts _ = return $ computePValueZ sided ts computePValue _ _ _ _ = fail "cannot compute p-value" computePValueT :: Sided -> Double -> Double -> Double computePValueT TwoSided ts df = (0.5 - pvalueT df (abs ts)) * 2 computePValueT LeftSided ts df | ts < 0 = 0.5 - pvalueT df (abs ts) | otherwise = 0.5 + pvalueT df ts computePValueT RightSided ts df | ts < 0 = 1.0 - (0.5 - pvalueT df (abs ts)) | otherwise = 0.5 - pvalueT df ts pvalueT :: Double -> Double -> Double pvalueT df testStatistic | testStatistic > 4.0 = 0.5 -- Performance gain, assuming precision is not needed for exceptionally high values | otherwise = findValue' (tvalue df) 0.00005 testStatistic computePValueZ :: Sided -> Double -> Double computePValueZ TwoSided ts = (0.5 - pvalueZ (abs ts)) * 2 computePValueZ LeftSided ts | ts < 0 = 0.5 - pvalueZ (abs ts) | otherwise = 0.5 + pvalueZ ts computePValueZ RightSided ts | ts < 0 = 1.0 - (0.5 - pvalueZ (abs ts)) | otherwise = 0.5 - pvalueZ ts pvalueZ :: Double -> Double pvalueZ testStatistic | testStatistic > 4.0 = 0.5 -- Performance gain, assuming precision is not needed for exceptionally high values | otherwise = findValue' zvalue 0.00005 testStatistic -- | Utility function chooseTypeOfTest :: ComponentSet -> TypeOfTest chooseTypeOfTest cs | contains cs Correlation = TestCorrelation | contains cs (Two SampleMean) = if contains cs (Two SampleSize) then CompareMeans else CompareMeansPaired | contains cs (Two Proportion) = CompareProportions | contains cs Proportion = TestProportion | contains cs SampleMean = TestMean | otherwise = TestMean