Calculate the contingency matrix for two clusterings. For clusterings $C = {c_1,c_2,...c_n} and $K = {k_1, k_2,..k_m}$ the matrix is defined as $ a_ij = c_i U k_j $. The matrix is calculated lazily, and arbitrary functions can be applied to it to derive various scores: edit distance, jaccard index, exact count, etc. \begin{code} module Main where import qualified Data.ByteString.Lazy.Char8 as B import Data.Map hiding (map,filter,null) import qualified Data.Map as M import Data.List import Data.Maybe import System.Environment (getArgs) import Text.Printf type ByteString = B.ByteString main :: IO () main = do (a:args) <- getArgs a' <- B.readFile a let ref = mkC $ readC a' n = maximum $ map length ("clustering":args) putStrLn (pad n "clustering" ++ "\tVI\tMIF\tJaccard\tRand\tFowMal\tSkew") mapM_ (comp n ref) args pad i s = s ++ replicate (i-length s) ' ' comp :: Int -> Clustering -> FilePath -> IO () comp n k f = do f' <- B.readFile f let mx = matrix k (readC f') ct = mkPairs k mx sh = printf "%.3f" putStrLn $ concat $ intersperse "\t" $ (pad n f) : map sh [fst $ iscore mx -- VI ,snd $ iscore mx -- MIF ,jaccard ct, rand ct, fowmal ct, skew ct] -- ++"\tmatrix: "++show ct) type CID = Int type Sequence = ByteString -- hash for efficiency? type Cluster = [Sequence] type Clustering = (Map CID Cluster ,Map Sequence CID) -- optimization option cid :: Clustering -> Sequence -> Maybe CID cid k x = M.lookup x (snd k) seqs :: Clustering -> CID -> Maybe Cluster seqs k c = M.lookup c (fst k) -- read a clustering from file readC :: ByteString -> [Cluster] readC = filter (not.null) . map B.words . B.lines -- build the Clustering from a set of clusters mkC :: [Cluster] -> Clustering mkC = foldr ins (M.empty,M.empty) . zip [0..] where ins (n,ss) (i2c,s2i) = (M.insert n ss i2c, foldr (\s -> M.insert s n) s2i ss) \end{code} Matrix construction. \begin{code} type Row = [(Maybe CID,[Sequence])] type Matrix = [Row] -- reads a clustering and produces the confusion (-tingency) matrix matrix :: Clustering -> [[Sequence]] -> Matrix matrix c = map (matrixRow c) matrixRow :: Clustering -> [Sequence] -> Row matrixRow k c1 = comb $ sort $ map (\x -> (cid k x, [x])) c1 -- cid,seq where comb ((a,b):(c,d):xs) | a==c = comb ((a,b++d):xs) | otherwise = (a,b):comb ((c,d):xs) comb [(a,b)] = [(a,b)] comb [] = [] \end{code} \subsection{Jaccard index, and friends} Indices based on pair counts. This includes jaccard, rand, ...? "skew" is based on MacNemar's test. \begin{code} type Pairs = (Double,Double,Double,Double) -- a b c, and d jaccard, rand, fowmal, skew :: Pairs -> Double jaccard (a,b,c,_) = a/(a+b+c) rand (a,b,c,d) = (a+d)/(a+b+c+d) fowmal (a,b,c,d) = a/sqrt((a+b)*(a+c)) -- Fowlkes-Mallows index -- this is the same as fowmal (why?)! hubert (a,b,c,d) = (a*d - b*c)/sqrt((a+b)*(c+d)*(a+c)*(b+d)) -- Gamma stat? skew (a,b,c,d) = (b-c)/sqrt(b+c)/sqrt d mkPairs :: Clustering -> Matrix -> Pairs mkPairs k m = let (a,b,c,ct) = foldl (jc1 k) (0,0,0,0) m in (a/2,b/2,c/2,(ct*(ct-1)-a-b-c)/2) -- this is pretty ugly! jc1 :: Clustering -> Pairs -> Row -> Pairs jc1 k j row = let row' = filter (isJust.fst) row l = sum $ map (len.snd) row' in foldr (jc2 k l) j row' jc2 k lc (Just i,ss) (a,b,c,ct) = (a+ls*(ls-1),b+(ls*(lk-ls)),c+(ls*(lc-ls)),ct+ls) where ls = len ss lk = len $ fromJust (seqs k i) -- fixme! only seqs found in C! -- todo: accumulate column sizes until end? how does clusqual deal? \end{code} \subsection{Entropy-based measures} The \texttt{iscore} function calculates the variation of information between the two clustrings. Given clusterings $C$ and $K$ matrix $H$, it is calculated as: \( \sum_C H_c\log H_c + \sum_K H_k\log H_k -2\sumH_{K,C} H_{k,c} \log H_{k,c} \) Todo: implement \[I/H = (H_c - H_k)/H_{c,k} - 1\] I.e. collect H_k, H_c,k, and H_c, and use it to calculate both measures. \begin{code} type Imx = (Double,Double,Double,Double) -- iscore calculates mutual information fraction and variation of information iscore :: Matrix -> (Double,Double) -- MIF and VI iscore mx = let (n,hc,hk,hck) = iscores empty (0,0,0,0) mx vi = 1/n*(-2*hck+hc+hk) mif = (hc+hk-2*mlog n)/(hck-mlog n)-1 in (vi,mif) -- iscores - calculate n, H_c, H_k, H_c,k iscores:: Map Int Double -> Imx -> Matrix -> Imx iscores m (n,hc,hk,hck) (r:rs) = let r' = map (len.snd) $ filter (isJust.fst) r in iscores (accum m r) (n+sum r',hc,hk+mlog (sum r'),hck+sum (map mlog r')) rs iscores m (n,hc,hk,hck) [] = (n,(sum $ map mlog $ elems m),hk,hck) -- accumulate the column sums for the matrix. This must be done -- explicitly, since one clustering may contain sequences not present -- in the other accum :: Map Int Double -> Row -> Map Int Double accum m row = foldr accum1 m row where accum1 (Just cid,seqs) m = insertWith (+) cid (len seqs) m accum1 (Nothing,_) m = m mlog 0 = 0 -- error "log 0!" mlog x = x*log x/log 2 len = fromIntegral . length \end{code}