{-# LANGUAGE BangPatterns , NoMonomorphismRestriction #-} -- | Scoring functions commonly used for evaluation of NLP -- systems. Most functions in this module work on sequences which are -- instances of 'Data.Foldable', but some take a precomputed table of -- 'Counts'. This will give a speedup if you want to compute multiple -- scores on the same data. For example to compute the Mutual -- Information, Variation of Information and the Adjusted Rand Index -- on the same pair of clusterings: -- -- >>> let cs = counts "abcabc" "abaaba" -- >>> mapM_ (print . ($ cs)) [mi, ari, vi] -- >>> 0.9182958340544894 -- >>> 0.4444444444444445 -- >>> 0.6666666666666663 module NLP.Scores ( -- * Scores for classification and ranking accuracy , recipRank , avgPrecision -- * Scores for clustering , ari , mi , vi -- * Auxiliary types and functions , Count , Counts , counts , sum , mean , jaccard , entropy , histogram -- * Extracting joint and marginal counts from 'Counts' , countJoint , countFst , countSnd -- * Extracting lists of values from 'Counts' , fstElems , sndElems ) where import qualified Data.Foldable as F import qualified Data.Traversable as T import Data.Monoid import Data.List hiding (sum) import qualified Data.Set as Set import qualified Data.Map as Map import Prelude hiding (sum) import NLP.Scores.Internals -- | Accuracy: the proportion of elements in the first sequence equal -- to elements at corresponding positions in second -- sequence. Sequences should be of equal lengths. accuracy :: (Eq a, Fractional c, T.Traversable t, F.Foldable s) => t a -> s a -> c accuracy xs = mean . fmap fromEnum . zipWithTF (==) xs . F.toList {-# SPECIALIZE accuracy :: [Double] -> [Double] -> Double #-} -- | Reciprocal rank: the reciprocal of the rank at which the first arguments -- occurs in the sequence given as the second argument. recipRank :: (Eq a, Fractional b, F.Foldable t) => a -> t a -> b recipRank y ys = case [ r | (r,y') <- zip [1::Int ..] . F.toList $ ys , y' == y ] of [] -> 0 r:_ -> 1/fromIntegral r {-# SPECIALIZE recipRank :: Double -> [Double] -> Double #-} -- | Average precision. -- avgPrecision :: (Fractional n, Ord a, F.Foldable t) => Set.Set a -> t a -> n avgPrecision gold _ | Set.size gold == 0 = 0 avgPrecision gold xs = (/fromIntegral (Set.size gold)) . sum . map (\(r,rel,cum) -> if rel == 0 then 0 else fromIntegral cum / fromIntegral r) . takeWhile (\(_,_,cum) -> cum <= Set.size gold) . snd . mapAccumL (\z (r,rel) -> (z+rel,(r,rel,z+rel))) 0 $ [ (r,fromEnum $ x `Set.member` gold) | (x,r) <- zip (F.toList xs) [1::Int ..]] {-# SPECIALIZE avgPrecision :: (Ord a) => Set.Set a -> [a] -> Double #-} -- | Mutual information: MI(X,Y) = H(X) - H(X|Y) = H(Y) - H(Y|X). Also -- known as information gain. mi :: (Ord a, Ord b) => Counts a b -> Double mi (Counts cxy cx cy) = let n = Map.foldl' (+) 0 cxy cell (P x y) nxy = let nx = cx Map.! x ny = cy Map.! y in nxy / n * logBase 2 (nxy * n / nx / ny) in sum [ cell (P x y) nxy | (P x y, nxy) <- Map.toList cxy ] -- | Variation of information: VI(X,Y) = H(X) + H(Y) - 2 MI(X,Y) vi :: (Ord a, Ord b) => Counts a b -> Double vi cs@(Counts _ cx cy) = entropy (elems cx) + entropy (elems cy) - 2 * mi cs where elems = Map.elems -- | Adjusted Rand Index: ari :: (Ord a, Ord b) => Counts a b -> Double ari (Counts cxy cx cy) = (sum1 - sum2*sum3/choicen2) / (1/2 * (sum2+sum3) - (sum2*sum3) / choicen2) where choicen2 = choice (sum . Map.elems $ cx) 2 sum1 = sum [ choice nij 2 | nij <- Map.elems cxy ] sum2 = sum [ choice ni 2 | ni <- Map.elems cx ] sum3 = sum [ choice nj 2 | nj <- Map.elems cy ] -- | The sum of a sequence of numbers sum :: (F.Foldable t, Num a) => t a -> a sum = F.foldl' (+) 0 {-# SPECIALIZE sum :: [Double] -> Double #-} {-# SPECIALIZE sum :: [Int] -> Int #-} {-# INLINE sum #-} -- | The mean of a sequence of numbers. mean :: (F.Foldable t, Fractional n, Real a) => t a -> n mean xs = let (P tot len) = F.foldl' (\(P s l) x -> (P (s+x) (l+1))) (P 0 0) xs in realToFrac tot/len {-# SPECIALIZE mean :: [Double] -> Double #-} -- | The binomial coefficient: C^n_k = PROD^k_i=1 (n-k-i)/i choice :: (Enum b, Fractional b) => b -> b -> b choice n k = foldl' (*) 1 [n-k+1 .. n] / foldl' (*) 1 [1 .. k] {-# SPECIALIZE choice :: Double -> Double -> Double #-} -- | Jaccard coefficient -- J(A,B) = |AB| / |A union B| jaccard :: (Fractional n, Ord a) => Set.Set a -> Set.Set a -> n jaccard a b = fromIntegral (Set.size (Set.intersection a b)) / fromIntegral (Set.size (Set.union a b)) {-# SPECIALIZE jaccard :: (Ord a) => Set.Set a -> Set.Set a -> Double #-} -- | Entropy: H(X) = -SUM_i P(X=i) log_2(P(X=i)). @entropy xs@ is the -- entropy of the random variable represented by the sequence @xs@, -- where each element of @xs@ is the count of the one particular -- value the random variable can take. If you need to compute the -- entropy from a sequence of outcomes, the following will work: -- -- > entropy . elems . histogram -- entropy :: (Floating c, F.Foldable t) => t c -> c entropy cx = negate . getSum . F.foldMap (Sum . f) $ cx where n = sum cx logn = logBase 2 n f nx = nx / n * (logBase 2 nx - logn) -- | @histogram xs@ is returns the map of the frequency counts of the -- elements in sequence @xs@ histogram :: (Num a, Ord k, F.Foldable t) => t k -> Map.Map k a histogram = F.foldl' (\ z k -> Map.insertWith' (+) k 1 z) Map.empty -- | Creates count table 'Counts' counts :: (Ord a, Ord b, T.Traversable t, F.Foldable s) => t a -> s b -> Counts a b counts xs = F.foldl' f empty . zipWithTF P xs . F.toList where f cs@(Counts cxy cx cy) p@(P x y) = cs { joint = Map.insertWith' (+) p 1 cxy , marginalFst = Map.insertWith' (+) x 1 cx , marginalSnd = Map.insertWith' (+) y 1 cy } -- | Joint count countJoint :: (Ord a, Ord b) => a -> b -> Counts a b -> Count countJoint x y = Map.findWithDefault 0 (P x y) . joint -- | Count of first element countFst :: Ord k => k -> Counts k b -> Count countFst x = Map.findWithDefault 0 x . marginalFst -- | Count of second element countSnd :: Ord k => k -> Counts a k -> Count countSnd y = Map.findWithDefault 0 y . marginalSnd -- | List of values of first element fstElems :: Counts k b -> [k] fstElems = Map.keys . marginalFst -- | List of values of second element sndElems :: Counts a k -> [k] sndElems = Map.keys . marginalSnd -- | @zipWithTF h t f@ zips the values from the traversable @t@ with -- the values from the foldable @f@ using the function @h@. zipWithTF :: (T.Traversable t, F.Foldable f) => (a -> b -> c) -> t a -> f b -> t c zipWithTF h t f = snd . T.mapAccumL map_one (F.toList f) $ t where map_one (x:xs) y = (xs, h y x)