{- |
Normalise is a module in the HasGP Gaussian process library. It contains
functions for performing basic normalisation tasks on training examples,
and for computing assorted standard statistics.

Copyright (C) 2011 Sean Holden. sbh11\@cl.cam.ac.uk.
-}
{- This file is part of HasGP.

   HasGP is free software: you can redistribute it and/or modify it under
   the terms of the GNU General Public License as published by the Free
   Software Foundation, either version 3 of the License, or (at your
   option) any later version.

   HasGP is distributed in the hope that it will be useful, but WITHOUT
   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
   FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
   for more details.

   You should have received a copy of the GNU General Public License
   along with HasGP. If not, see <http://www.gnu.org/licenses/>.
-}
module HasGP.Data.Normalise where

import Numeric.LinearAlgebra

import HasGP.Types.MainTypes
import HasGP.Support.Linear as L
import HasGP.Support.Functions as F

-- | Compute the mean for each attribute in a set of examples.
exampleMean :: Inputs  -- ^ Matrix - one row per example
            -> DVector -- ^ Vector of means for each attribute.
exampleMean xs = fromList means
  where
    n     = rows xs
    means = map (L.sumVectorDiv n) (toColumns xs)

-- | Compute the variance for each attribute in a set of examples.
--
-- NOTE(review): this is the population variance (division by n, not
-- n-1) — confirm that downstream callers expect the biased estimator.
exampleVariance :: Inputs  -- ^ Matrix - one row per example
                -> DVector -- ^ Vector of variances for each attribute.
exampleVariance xs = fromList $ map (L.sumVectorDiv n) (toColumns squaredDevs)
  where
    n  = rows xs
    mu = exampleMean xs
    -- Squared deviation of every example from the per-attribute mean.
    squaredDevs = fromRows [ mapVector (^2) (x - mu) | x <- toRows xs ]

-- | Compute the mean and variance for each attribute in a set of examples.
exampleMeanVariance :: Inputs             -- ^ Matrix - one row per example
                    -> (DVector, DVector) -- ^ Means and variances
exampleMeanVariance xs = (exampleMean xs, exampleVariance xs)

-- | Normalise a set of examples to have specified mean and variance.
normaliseMeanVariance :: DVector -- ^ Vector of new means required
                      -> DVector -- ^ Vector of new variances required
                      -> Inputs  -- ^ Matrix - one row per example
                      -> Inputs  -- ^ Normalised matrix
normaliseMeanVariance newMean newVariance examples =
    fromRows (map adjust (toRows examples))
  where
    (oldMean, oldVar) = exampleMeanVariance examples
    -- Per-attribute rescaling factor sqrt(newVar)/sqrt(oldVar).
    scaling = zipVectorWith (\new old -> (sqrt new) / (sqrt old))
                            newVariance oldVar
    -- Centre each example, rescale it, then shift to the requested mean.
    adjust x = (zipVectorWith (*) scaling (x - oldMean)) + newMean

-- | The same as normaliseMeanVariance but every column (attribute) is
-- normalised in the same way.
normaliseMeanVarianceSimple :: Double -- ^ New mean required
                            -> Double -- ^ New variance required
                            -> Inputs -- ^ Matrix - one row per example
                            -> Inputs -- ^ Normalised matrix
normaliseMeanVarianceSimple newMean newVariance examples =
    normaliseMeanVariance (constant newMean nAttributes)
                          (constant newVariance nAttributes)
                          examples
  where
    nAttributes = cols examples

-- | Normalise a set of examples to have specified maximum and minimum.
normaliseBetweenLimits :: Double -- ^ New min required
                       -> Double -- ^ New max required
                       -> Inputs -- ^ Matrix - one row per example
                       -> Inputs -- ^ Normalised matrix
normaliseBetweenLimits lo hi examples =
    fromColumns (zipWith3 rescale slopes offsets columns)
  where
    columns = toColumns examples
    mins    = map minElement columns
    maxs    = map maxElement columns
    -- Affine map y = m*x + c per column: slope m and offset c are chosen
    -- so that the column minimum maps to lo and the maximum maps to hi.
    slopes  = zipWith (\small big -> (hi - lo) / (big - small)) mins maxs
    offsets = zipWith (\small m -> lo - (m * small)) mins slopes
    rescale m c column = mapVector (c +) (scale m column)

-- | Find the columns of a matrix in which all values are equal.
findRedundantAttributes :: Inputs -- ^ Matrix - one row per example
                        -> [Bool] -- ^ List - True elements mark redundancy
findRedundantAttributes examples = map constantColumn (toColumns examples)
  where
    -- A column is redundant when every entry equals the first one
    -- (an empty column is vacuously redundant).
    constantColumn c = case toList c of
                         []     -> True
                         (v:vs) -> all (== v) vs

-- | List column numbers for redundant attributes.
listRedundantAttributes :: Inputs -- ^ Matrix - one row per example
                        -> [Int]  -- ^ List - positions of redundant attributes
listRedundantAttributes examples =
    -- Columns are numbered from 1, in ascending order.
    [ i | (i, redundant) <- zip [1..] flags, redundant ]
  where
    flags = findRedundantAttributes examples

-- | Remove any redundant columns from a matrix.
removeRedundantAttributes :: Inputs -- ^ Matrix - one row per example
                          -> Inputs -- ^ Modified matrix - one row per example
removeRedundantAttributes examples =
    fromColumns [ c | (redundant, c) <- zip flags (toColumns examples)
                    , not redundant ]
  where
    flags = findRedundantAttributes examples

-- | Specify a list of columns (matrix numbered from 1).
-- Produce a matrix with ONLY those columns in the
-- order specified in the list.
retainAttributes :: [Int]  -- ^ List of columns to keep.
                 -> Inputs -- ^ Matrix - one row per example
                 -> Inputs -- ^ Modified matrix - one row per example
retainAttributes columnNumbers m = trans (extractRows zeroBased (trans m))
  where
    -- extractRows works with 0-based indices; callers supply 1-based.
    zeroBased = map (subtract 1) columnNumbers

-- | Compute the numbers for the confusion matrix.
-- It is assumed that classes are +1 (positive) and -1 (negative).
-- Result is (a,b,c,d):
-- a - correct negatives
-- b - predict positive when correct is negative
-- c - predict negative when correct is positive
-- d - correct positives
confusionMatrix :: Targets                       -- ^ Vector of correct labels (+1/-1)
                -> Outputs                       -- ^ Vector of predicted labels (+1/-1)
                -> (Double,Double,Double,Double) -- ^ (a,b,c,d) as above
confusionMatrix correct predicted =
    cm (toList correct) (toList predicted) (0,0,0,0)
  where
    cm [] [] result = result
    cm (h1:t1) (h2:t2) (a,b,c,d) =
      case (h1, h2) of
        (1.0, 1.0)   -> cm t1 t2 (a,b,c,d+1)
        (1.0,-1.0)   -> cm t1 t2 (a,b,c+1,d)
        (-1.0, 1.0)  -> cm t1 t2 (a,b+1,c,d)
        (-1.0,-1.0)  -> cm t1 t2 (a+1,b,c,d)
        -- BUG FIX: this case was previously non-exhaustive, so any label
        -- other than +1/-1 crashed with an opaque pattern-match failure.
        -- Fail with a diagnostic message instead.
        _            -> error ("confusionMatrix: labels must be +1 or -1, saw "
                               ++ show (h1, h2))
    cm _ _ _ = error "Correct and predicted vectors must have the same length"

-- | Print the confusion matrix and some other statistics.
--
-- NOTE(review): if a class is absent (e.g. d+c == 0) the corresponding
-- ratios divide by zero and print as NaN/Infinity — confirm whether that
-- is acceptable for callers, or whether a guard should be added.
printConfusionMatrix :: Targets -- ^ Vector of targets
                     -> Outputs -- ^ Vector of actual outputs
                     -> IO ()
printConfusionMatrix correct predicted = do
  let (a,b,c,d) = confusionMatrix correct predicted
  let n = a+b+c+d
  -- Recall (true-positive rate) and precision, reused by the F measure.
  let trueP = d/(d+c)
  let precision = d/(d+b)
  putStrLn ("------------------------------------------------")
  putStrLn ("Correct -1, Predicted -1: a = " ++ (show a))
  putStrLn ("Correct -1, Predicted +1: b = " ++ (show b))
  putStrLn ("Correct +1, Predicted -1: c = " ++ (show c))
  putStrLn ("Correct +1, Predicted +1: d = " ++ (show d))
  putStrLn ("------------------------------------------------")
  putStrLn ("Number of examples: n = a+b+c+d = " ++ (show n))
  putStrLn ("Accuracy: a+d/n = " ++ (show ((a+d)/n)))
  putStrLn ("Recall/True Positive: d/d+c = " ++ (show trueP))
  putStrLn ("False Positive: b/b+a = " ++ (show (b/(b+a))))
  putStrLn ("True Negative: a/b+a = " ++ (show (a/(b+a))))
  putStrLn ("False Negative: c/d+c = " ++ (show (c/(d+c))))
  putStrLn ("Precision: d/d+b = " ++ (show precision))
  putStrLn ("F Measure (beta = 1) = "
            ++ (show ((2 * trueP * precision)/(trueP + precision))))
  putStrLn ("------------------------------------------------")

-- | Assuming the labels are +1 or -1, count how many there are of each.
countLabels :: Targets -> IO ()
countLabels v = do
  let total    = dim v
      positive = length (filter (== 1.0) (toList v))
      -- Anything that is not +1 is assumed to be -1.
      negative = total - positive
  putStrLn ("Total number of labels: " ++ show total)
  putStrLn ("Number of +1 labels: " ++ show positive)
  putStrLn ("Number of -1 labels: " ++ show negative)