module Statistics.LinearRegression (
linearRegression,
linearRegressionRSqr,
linearRegressionTLS,
correl,
covar,
robustFit,
nonRandomRobustFit,
robustFitRSqr,
EstimationParameters(..),
ErrorFunction,
Estimator,
EstimatedRelation,
defaultEstimationParameters,
linearRegressionError,
linearRegressionTLSError,
converge,
) where
import qualified Data.Vector.Unboxed as U
import Data.Vector.Unboxed ((!))
import Safe (at)
import System.Random
import System.Random.Shuffle (shuffleM)
import Control.Monad.Random.Class
import Control.Monad.Random (evalRand)
import Control.Monad (liftM)
import Data.Function (on)
import Data.List (minimumBy, sortBy)
import Data.Maybe (fromMaybe)
import qualified Statistics.Sample as S
covar :: S.Sample -> S.Sample -> Double
covar xs ys = covar' m1 m2 n xs ys
where
!n = fromIntegral $ U.length xs
!m1 = S.mean xs
!m2 = S.mean ys
covar' :: Double -> Double -> Double -> S.Sample -> S.Sample -> Double
covar' m1 m2 n xs ys = U.sum (U.zipWith (*) (U.map (subtract m1) xs) (U.map (subtract m2) ys)) / (n1)
correl :: S.Sample -> S.Sample -> Double
correl xs ys = let !c = covar xs ys
!sx = S.stdDev xs
!sy = S.stdDev ys
in c / (sx * sy)
linearRegressionRSqr :: S.Sample -> S.Sample -> (Double, Double, Double)
linearRegressionRSqr xs ys = (alpha, beta, r2)
where
!c = covar' m1 m2 n xs ys
!r2 = c*c / (v1*v2)
!(m1,v1) = S.meanVarianceUnb xs
!(m2,v2) = S.meanVarianceUnb ys
!n = fromIntegral $ U.length xs
!beta = c / v1
!alpha = m2 beta * m1
linearRegression :: S.Sample -> S.Sample -> (Double, Double)
linearRegression xs ys = (alpha, beta)
where
(alpha, beta, _) = linearRegressionRSqr xs ys
linearRegressionTLS :: S.Sample -> S.Sample -> (Double,Double)
linearRegressionTLS xs ys = (alpha, beta)
where
!c = covar' m1 m2 n xs ys
!b = (v1 v2) / c
!(m1,v1) = S.meanVarianceUnb xs
!(m2,v2) = S.meanVarianceUnb ys
!n = fromIntegral $ U.length xs
!betas = [(b sqrt(b^2+4))/2,(b + sqrt(b^2+4)) /2]
!beta = if c > 0 then maximum betas else minimum betas
!alpha = m2 beta * m1
type EstimatedRelation = (Double,Double)
type Estimator = (S.Sample -> S.Sample -> EstimatedRelation)
type ErrorFunction = (EstimatedRelation -> (Double,Double) -> Double)
data EstimationParameters = EstimationParameters {
outlierFraction :: !Double,
shortIterationSteps :: !Int,
maxSubsetsNum :: !Int,
groupSubsets :: !Int,
mediumSetSize :: !Int,
largeSetSize :: !Int,
estimator :: Estimator,
errorFunction :: ErrorFunction
}
defaultEstimationParameters = EstimationParameters {
outlierFraction = 0.25,
shortIterationSteps = 3,
maxSubsetsNum = 500,
groupSubsets = 10,
mediumSetSize = 600,
largeSetSize = 1500,
estimator = linearRegression,
errorFunction = linearRegressionError
}
linearRegressionError :: ErrorFunction
linearRegressionError (alpha,beta) (x,y) = (y(beta*x+alpha))^2
linearRegressionTLSError :: ErrorFunction
linearRegressionTLSError (alpha,beta) (x,y) = ey/(1+beta^2)
where
ey = linearRegressionError (alpha,beta) (x,y)
setSize :: EstimationParameters -> S.Sample -> Int
setSize ep xs = max (n `div` 2 + 1) . round $ (1outlierFraction ep) * (fromIntegral n)
where
n = U.length xs
concentrationStep :: EstimationParameters -> S.Sample -> S.Sample -> (EstimatedRelation, Double) -> (EstimatedRelation, Double)
concentrationStep ep xs ys (prev, prev_err) = (new_estimate, new_err)
where
set_size = setSize ep xs
xyerrors = map (\p -> (p,errorFunction ep prev p)) $ zip (U.toList xs) (U.toList ys)
(xys,errors) = unzip . take set_size . sortBy (compare `on` snd) $ xyerrors
(good_xs,good_ys) = unzip xys
new_estimate = estimator ep (U.fromList good_xs) (U.fromList good_ys)
new_err = sum errors
concentration :: EstimationParameters -> S.Sample -> S.Sample -> EstimatedRelation -> [(EstimatedRelation, Double)]
concentration ep xs ys params = tail $ iterate (concentrationStep ep xs ys) (params,1)
converge :: EstimationParameters -> S.Sample -> S.Sample -> EstimatedRelation -> EstimatedRelation
converge ep xs ys = fst . findConvergencePoint . concentration ep xs ys
findConvergencePoint :: Ord a => [(b,a)] -> (b,a)
findConvergencePoint (x:y:ys)
| snd x <= snd y = x
| otherwise = findConvergencePoint (y:ys)
findConvergencePoint xs = error "Too short a list for conversion (size < 2)"
concentrateNSteps :: EstimationParameters -> S.Sample -> S.Sample -> EstimatedRelation -> (EstimatedRelation,Double)
concentrateNSteps ep xs ys params = concentration ep xs ys params !! shortIterationSteps ep
robustFit :: MonadRandom m => EstimationParameters -> S.Sample -> S.Sample -> m EstimatedRelation
robustFit ep xs ys = do
let n = U.length xs
if n < 2
then
error "cannot fit an input of size < 2"
else if n == 2
then return $ lineParams ((U.head xs,U.head ys),(U.last xs,U.last ys))
else
liftM (candidatesToWinner ep xs ys) $ if n < mediumSetSize ep
then
singleGroupFitCandidates ep Nothing xs ys
else if n < largeSetSize ep
then largeGroupFitCandidates ep xs ys
else do
(nxs,nys) <- liftM unzip $ randomSubset (zip (U.toList xs) (U.toList ys)) (largeSetSize ep)
largeGroupFitCandidates ep (U.fromList nxs) (U.fromList nys)
robustFitRSqr :: MonadRandom m => EstimationParameters -> S.Sample -> S.Sample -> m (EstimatedRelation,Double)
robustFitRSqr ep xs ys = do
er <- robustFit ep xs ys
let (good_xs,good_ys) = U.unzip . U.fromList . take (setSize ep xs) . sortBy (compare `on` errorFunction ep er) . U.toList $ U.zip xs ys
return (er,correl good_xs good_ys ^ 2)
nonRandomRobustFit :: EstimationParameters -> S.Sample -> S.Sample -> EstimatedRelation
nonRandomRobustFit ep xs ys = evalRand (robustFit ep xs ys) (mkStdGen 1)
candidatesToWinner :: EstimationParameters -> S.Sample -> S.Sample -> [EstimatedRelation] -> EstimatedRelation
candidatesToWinner ep xs ys = fst . minimumBy (compare `on` snd) . map (findConvergencePoint . concentration ep xs ys)
largeGroupFitCandidates :: MonadRandom m => EstimationParameters -> S.Sample -> S.Sample -> m [EstimatedRelation]
largeGroupFitCandidates ep xs ys = do
let n = U.length xs
let sub_groups_num = n `div` (mediumSetSize ep `div` 2)
let sub_groups_size = n `div` sub_groups_num
shuffled <- shuffleM $ zip (U.toList xs) (U.toList ys)
let sub_groups = map (U.unzip . U.fromList) $ splitTo sub_groups_size shuffled
let sub_groups_candidates = maxSubsetsNum ep `div` sub_groups_num
candidates_list <- mapM (applyTo $ singleGroupFitCandidates ep (Just sub_groups_candidates)) sub_groups
let candidates = concat candidates_list
return . map fst . take (groupSubsets ep) . sortBy (compare `on` snd) . map (findConvergencePoint . concentration ep xs ys) $ candidates
singleGroupFitCandidates :: MonadRandom m => EstimationParameters -> Maybe Int -> S.Sample -> S.Sample -> m [EstimatedRelation]
singleGroupFitCandidates ep m_subsets xs ys = do
let all_pairs = allPairs $ zip (U.toList xs) (U.toList ys)
let return_size = fromMaybe (maxSubsetsNum ep) m_subsets
initial_sets <- if return_size > length all_pairs
then return all_pairs
else randomSubset all_pairs return_size
return . map fst . take (groupSubsets ep) . sortBy (compare `on` snd) . map (concentrateNSteps ep xs ys . lineParams) $ initial_sets
lineParams :: ((Double,Double),(Double,Double)) -> EstimatedRelation
lineParams ((x1,y1),(x2,y2)) = (alpha,beta)
where
beta = (y2y1)/(x2x1)
alpha = y1 beta*x1
allPairs :: [a] -> [(a,a)]
allPairs [] = []
allPairs [x] = []
allPairs [x,y] = [(x,y)]
allPairs (x:xs) = (zip xs . repeat $ x) ++ allPairs xs
randomSubset :: MonadRandom m => [a] -> Int -> m [a]
randomSubset xs size = liftM (take size) $ shuffleM xs
splitTo :: Int -> [a] -> [[a]]
splitTo n = map (take n) . takeWhile (not . null) . iterate (drop n)
applyTo :: (a->b->c) -> (a,b) -> c
applyTo f (x,y) = f x y