-- While working on this module you are encouraged to remove it and fix -- any warnings in the module. See -- http://hackage.haskell.org/trac/ghc/wiki/WorkingConventions#Warnings -- for details ----------------------------------------------------------------------------- -- | -- Module : Clustering -- Copyright : (c) Philipp Pribbernow -- License : BSD-style (see the file libraries/base/LICENSE) -- -- Maintainer : libraries@haskell.org -- Stability : experimental -- Portability : portable -- -- Hieraclus is a library that supports clustering of arbitrary elements in haskell. The difference to the already -- existing cluster library /hierarchical-clustering/ is the ability to work with abort criterias which allow an -- \"intelligent\" clustering. With the help of abort criterias the user can specify conditions that must be fulfilled -- in order to stop the clustering process. -- -- Another motivation of creating this library was to make the cluster process run in /O(n^2)/. However, the current -- implementation runs in /O(n^2 * log n)/. It has to be mentioned that the real runtime complexity tends to grow -- faster due to memory management, I guess. Some profiling showed that there is quite a big amount of memory -- spent managing the maps. The principle idea was not to work with a matrix, but with two maps instead. The -- first map holds the mappings from cluster pairs to distances, the second map vice versa, thus allowing to find -- the minimal distance in /O(log n)/ and not in /O(n^2)/. Two make things more efficient the data to be clustered -- initially is transformed to vector space, as all clutering operations work in vector space. The actual clustering -- thus is done with the vector representations of the input data, which finally are transformed back. -- -- The above mentioned information for the abort criterias, the maps and the element-mappings are carried through -- the cluster process in a cluster state. So the actual cluster process takes place within the state monad. -- However, the library offers a function 'cluster' that is purely functional as it returns a tuple. -- First element of the tuple is the cluster result - simply implemented as list of list. -- The second element of the tuple holds the cluster information used by the abort criterias. ----------------------------------------------------------------------------- {-# LANGUAGE DoAndIfThenElse #-} module Numeric.Statistics.Clustering.Clustering ( -- * Cluster State ClusterState(..), ClusterInfo(..), ClusterResult, -- * Cluster Map Cluster(..), ClusterMap(..), ID, singleton, fromList, getCluster, getClusterUnsafe, mergeClusters, extractClusterElements, -- * Minimum and Combination Map MinimumMap(..), CombinationMap(..), Pair(..), -- * Abort Criterias noAbort, maxTotal, nCluster, nSteps, calinski, ellbow, -- * Cluster Methods DistanceFunction(..), SimilarityFunction(..), singleLinkage, completeLinkage, averageLinkage, wardLinkage, -- ** Cluster Method Construction pairwise, clusterwise, -- ** Cost Functions addition, varianceSum, -- * Clustering Process Transformation(..), cluster, runCluster ) where -- this data structure is used to map cluster ids to clusters and has a -- space complexity of /O(n)/. import Data.IntMap (IntMap) import qualified Data.IntMap as IntMap -- this data structure is used to map and has a space complexity of -- /O(n^2)/. import Data.Map (Map) import qualified Data.Map as Map -- this data structure is used to store the calculated distances between the -- clusters and thus forms represents the distance matrix import Data.MultiSet ( MultiSet (..) ) import qualified Data.MultiSet as MS import Control.Monad.State import Maybe (fromJust) import Numeric.Statistics.Clustering.VectorUtils ( Vector(..), meanSquareV, average ) import qualified Numeric.Statistics.Clustering.VectorUtils as VU {---------------------------------------------------------------------------- ClusterMap -----------------------------------------------------------------------------} -- | the Cluster map serves to represent unions of elements. Therefore it maps -- IDs to clusters. type ClusterMap a = IntMap (Cluster a) -- | Unique ID for a cluster type ID = IntMap.Key -- | a Cluster is represented as a list of Vectors newtype Cluster a = Cluster { vals :: [Vector a] } deriving (Show) -- | the resulting clusters are represented as a lists type ClusterResult a = [[a]] -- | /O(1)/ -- creates a cluster with only one element singleton :: Maybe (Vector a) -> Cluster a singleton x = case x of Just e -> Cluster [e] otherwise -> Cluster [] -- | /O(n)/ -- creates clusters by a given map fromList :: [Vector a] -> ClusterMap a fromList = IntMap.fromList . zip [1..] . map (singleton . Just) -- /O(min(n,W))/ -- return a cluster by a given "ID" getCluster :: ClusterMap a -> ID -> Maybe (Cluster a) getCluster m id = IntMap.lookup id m -- /O(min(n,W))/ -- unsafe version of "getCluster" getClusterUnsafe :: ClusterMap a -> ID -> (Cluster a) getClusterUnsafe m = fromJust . getCluster m -- /O(log n)/ -- | merge two clusters given by their ids and return a tuple. -- The first element of the tuple is the new created cluster. -- The second element is the new resulting cluster structure mergeClusters :: ID -> ID -> ClusterMap a -> State (ClusterState a b) (Cluster a, ClusterMap a, ClusterMap a) mergeClusters i1 i2 m = do -- delete the id of the second cluster from cluster map let (oldval, newM) = IntMap.updateLookupWithKey (\_ -> const Nothing) i2 m case oldval of Nothing -> mkError $ "Cluster" ++ (show i2) ++ "not found" Just cl -> do -- delete the id of the second cluster from cluster map let (oldval', newM') = IntMap.updateLookupWithKey (\_ -> const Nothing) i1 newM case oldval' of Nothing -> mkError $ "Cluster" ++ (show i1) ++ "not found" Just cl' -> do -- insert new cluster that contains all values of i1 and i2 let newCluster = Cluster $ (vals cl' ++ vals cl) newM'' = IntMap.insert i1 newCluster newM' return (newCluster, newM', newM'') -- | extracts the original values from the cluster map. It runs in the state -- monad as it needs the mapping of vectors to original values. extractClusterElements :: Ord a => ClusterMap a -> State (ClusterState a b) [[b]] extractClusterElements clumap = do cinfo' <- return . cinfo =<< get let assocs = idents cinfo' return $ map (map (fromJust . (\v -> Map.lookup v assocs))) (map vals $ IntMap.elems clumap) {---------------------------------------------------------------------------- MinimumMap -----------------------------------------------------------------------------} -- | the minimum map saves the distance matrix as a multi set, because a distance -- can occur more than one times. The set allows to find a distance pair -- by its ids and is used to find the minimum distance in /O(log n)/ -- Note: Alternatively one could use kind of a binary heap to find -- the minimum distance in /O(1)/ -- Storage complexity is /O(n^2)/ type MinimumMap a = MultiSet (a, Pair ID) -- | a pair of ID is used for mappings from and to distances between -- two clusters. type Pair a = (a,a) -- | Like the minimum map but with the pairs as the keys, thus allowing -- to find the distance of a given pair in /O(log n)/. -- Storage complexity is /O(n^2)/ type CombinationMap a = Map (Pair ID) a -- | a Cluster Function calculates the distance between two clusters type ClusterFunction a = (Cluster a -> Cluster a -> a) -- | the cluster state contains information about all relevant maps -- that are needed for the clustering and information about the -- clustering process. The ClusterState is passed around withing -- the state monad data ClusterState a b = CS { minmap :: MinimumMap a, -- ^ holds the mappings from distances to pairs combis :: CombinationMap a, -- ^ holds the mappings from pairs to distances cinfo :: ClusterInfo a b -- ^ holds information of the clustering process that is needed by the Abort Criterias } deriving (Show) -- | the cluster process produces information about the clustering after each step. -- these information are given to functions that decide if the cluster process -- may continue or stop and return the results data ClusterInfo a b = CI { idents :: Map (Vector a) b, -- ^ holds the mapping from the representation vectors to its actual objects nElems :: Int, -- ^ the number of elements to be clustered cNew :: (Cluster a, [Cluster a]), -- ^ the new created cluster and the all other clusters costs :: a, -- ^ a quality factor of the current combining that indicates the \"costs\" of cNew total :: a, -- ^ the accmulated costs cStep :: Int, -- ^ the current clustering step cHistory :: [a] -- ^ holds a history of all costs } deriving (Show) {---------------------------------------------------------------------------- Abort Criterias -----------------------------------------------------------------------------} -- | An AbortCriterium is a constraint for the clustering process -- deciding how many cluster steps are to be done. After each cluster -- step the abort criterim is asked. /True/ means abortion of clustering. type AbortCriterium a b = ClusterInfo a b -> Bool -- | no abortion means that the cluster process is only limited by its -- maximum number of possible steps that is: /n/ - 1 where /n/ is the -- number of elements to be clustered noAbort :: AbortCriterium a b noAbort cInfo = cStep cInfo >= nElems cInfo -- | defines the max. \"costs\" of a further combining of two clusters. -- This can be the increase of the euclidean distance e.g. as -- well as the varianceSum maxTotal :: Ord a => a -> AbortCriterium a b maxTotal n cInfo = total cInfo > n -- | sets a max. number of clusters nCluster :: Int -> AbortCriterium a b nCluster n cInfo = n > (nElems cInfo - cStep cInfo) -- | sets a number of steps that has to be done nSteps :: Int -> AbortCriterium a b nSteps n cInfo = cStep cInfo > n -- | defines a tolerance for the homogeneity of the clusters -- that is the relation of the inner varianceSum of the recently -- created cluster and the outer varianceSum of all other clusters -- Developed by Calinski and Habarasz, see: calinski :: (Ord a, Floating a) => a -> AbortCriterium a b calinski tol cInfo = ( (outerV / (innerV)) * ((n-k) * (k-1)) ) > tol where k = fromIntegral $ cStep cInfo (newCluster,rest) = cNew cInfo n = (fromIntegral $ nElems cInfo) - k innerV = sum $ map (meanSquareV . vals) rest outerV = sum $ map (meanSquareV . ((++) $ vals newCluster) . vals) rest -- | calculates the ellbow criterium that is to find a cluster steps -- which costs are above average. The first parameter gives a number -- of steps that are tolerated as a kind of stabilization phase. So if -- minSteps is set to k than ellbow criterium starts calculation average -- at step k+1. The second parameter gives the max. allowed multiple of -- average inclination ellbow :: (Ord a, Num a, Floating a) => Int -> a -> AbortCriterium a b ellbow minSteps factor cInfo = (cStep cInfo) > minSteps && (not $ null history) && currInc > factor * (histAvg oldIncls) where history = cHistory cInfo (currInc:oldIncls) = inclinations history inclinations [x] = [x] inclinations xs = zipWith (-) xs (tail xs) histAvg [] = currInc histAvg [x] = x histAvg xs = average xs {---------------------------------------------------------------------------- Cluster Methods -----------------------------------------------------------------------------} -- | a distance function determines how to calculate the distance between two -- vectors type DistanceFunction a = Vector a -> Vector a -> a -- | calculates the difference of two clusters by comparing them as a whole, -- e.g. the sum of variances of the clusters can be used type SimilarityFunction a = [Vector a] -> a -- | /O(n^2 log n)/. -- Uses the single linkage method for clustering singleLinkage :: (Ord a, Eq a) => DistanceFunction a -> ClusterFunction a singleLinkage df c1 c2 = minimum $ pairwise df c1 c2 -- | /O(n^2 log n)/. -- Uses the complete linkage method for clustering completeLinkage :: (Ord a, Eq a) => DistanceFunction a -> ClusterFunction a completeLinkage df c1 c2 = maximum $ pairwise df c1 c2 -- | /O(n^2 log n)/. -- Uses the average linkage method for clustering averageLinkage :: (Ord a, Floating a) => DistanceFunction a -> ClusterFunction a averageLinkage df c1 c2 = average $ pairwise df c1 c2 -- | /O(n^2 log n)/. -- Uses the ward linkage method for clustering wardLinkage :: (Ord a) => SimilarityFunction a -> ClusterFunction a wardLinkage f = clusterwise f {---------------------------------------------------------------------------- Cluster Methods Construction -----------------------------------------------------------------------------} -- evaluates a given function for all possible element pairs of two clusters pairwise :: Ord a => DistanceFunction a -> Cluster a -> Cluster a -> [a] pairwise f e1 e2 = [ f x y | x <- vals e1, y <- vals e2 ] -- evaluates a given function for two given clusters clusterwise :: SimilarityFunction a -> ClusterFunction a clusterwise f c1 c2 = f $ (vals c1) ++ (vals c2) {---------------------------------------------------------------------------- Cost functions -----------------------------------------------------------------------------} -- | a cost function has to descide how the single results produced after each -- clustering step can be accumlated. -- First tupel element gives the costs of the current step. The second element -- gives the accumulated costs type CostFunction a = a -> a -> [[Vector a]] -> a -- the several costs of clustering may simply be added addition :: Num a => CostFunction a addition total dist _ = total + dist -- the determination of the costs are calculated by considering the -- overall varianceSum varianceSum :: Floating a => CostFunction a varianceSum _ dist cs = sum $ map meanSquareV cs {---------------------------------------------------------------------------- Clustering -----------------------------------------------------------------------------} -- | transforms the input data into a vector representation type Transformation a b = (a -> Vector b) -- executes the cluster process cluster :: (Ord a, Num a) => Transformation b a -> ClusterFunction a -> CostFunction a -> [AbortCriterium a b] -> [b] -> (ClusterResult b, ClusterInfo a b) cluster toVector f cf ac cs = let (res,cstate) = runState ( runCluster toVector f cf ac cs >>= extractClusterElements ) emptyState in (res, cinfo cstate) {---------------------------------------------------------------------------- Internal Functions -----------------------------------------------------------------------------} -- | /O(n^2)/ -- calculates the upper triangle matrix allPairs :: Ord a => [a] -> [Pair a] allPairs xs = [(x,y) | x <- xs, y <- xs, x < y] -- | Evaluates a list of pairs of ids. evalPairs :: Ord a => ClusterMap a -> ClusterFunction a -> [Pair ID] -> State (ClusterState a b) ([(a, Pair ID)]) evalPairs clumap f tupels = do ctupels <- mapM ( \p@(id1,id2) -> do let x' = getClusterUnsafe clumap id1 y' = getClusterUnsafe clumap id2 return (f x' y', p) ) tupels return ctupels -- | the main cluster routine that does most of the work clustering :: (Ord a, Num a) => Int -> ClusterFunction a -> CostFunction a -> [AbortCriterium a b] -> ClusterMap a -> State (ClusterState a b) (ClusterMap a) clustering n f cf ac xs = do cinfo' <- (\ci -> return ci{cStep = n}) . cinfo =<< get -- check abort criterias from left to right until one states true if ((not $ null ac) && (or $ map (\a -> a cinfo') ac)) || noAbort cinfo' then return xs else do (dist,(k1,k2)) <- findMin -- O (log n) (newCluster,rest,xs') <- mergeClusters k1 k2 xs -- O (log n) let total' = cf (total cinfo') dist (map vals $ IntMap.elems xs') toUpdate = (k1,k2) : updatePairs (IntMap.keys xs') k1 k2 -- O(n) cinfo'' = cinfo' { cNew = (newCluster, IntMap.elems rest), costs = dist, total = total', cHistory = total' : cHistory cinfo'} if ((not $ null ac) && (or $ map (\a -> a cinfo'') ac)) || noAbort cinfo'' then return xs else do adjustMaps xs' toUpdate f modify $ \s -> s {cinfo = cinfo'' } clustering (n+1) f cf ac xs' -- | updates the combination- and minimum map after each clustering step adjustMaps :: (Num a, Ord a) => ClusterMap a -> [Pair ID] -> ClusterFunction a -> State (ClusterState a b) () adjustMaps clumap allP@(cPair@(k1,k2):ks) f = do cstate <- get let pairsWithKey1 = filter (\(a,b) -> a == k1 || b == k1) ks pairsWithKey2 = filter (\(a,b) -> a == k2 || b == k2) ks updatedPairs <- evalPairs clumap f pairsWithKey1 -- all pairs that have to be recomputed upDistMV <- mapM (getClusterDistance clumap f) allP -- construct the tuples of the pairs to be deleted O(n) let minmap' = foldl (flip MS.delete) (minmap cstate) $ map swap upDistMV -- O (log n) minmap''= foldl (flip MS.insert) minmap' updatedPairs -- O (n) combis' = foldl (flip Map.delete) (combis cstate) $ cPair : pairsWithKey2 combis''= foldl (\m (v,pos) -> Map.update (const $ Just v) pos m) combis' updatedPairs modify $ \s -> s{minmap = minmap'', combis = combis''} return () -- | /O(n)/ -- caclulates the pairs of clusters that has to be updated by giving the -- the two recently combined cluster ids updatePairs :: [ID] -> ID -> ID -> [Pair ID] updatePairs xs a b = [ if x < y then (x,y) else (y,x) | x <- xs, y <- [a,b], x /= y && x /= a && x /= b ] -- | an empty state intializing all maps with empty emptyState :: Num a => ClusterState a b emptyState = CS { combis = Map.empty, minmap = MS.empty, cinfo = emptyInfo } -- | initializes the cluster info with default values emptyInfo :: Num a => ClusterInfo a b emptyInfo = CI Map.empty 0 (singleton Nothing,[]) 0 0 0 [] -- | a wrapper for the acutal clustering function running in the -- state monad receiving the needed parameters to transform them for it runCluster :: (Ord a, Num a) => (b -> Vector a) -> ClusterFunction a -> CostFunction a -> [AbortCriterium a b] -> [b] -> State (ClusterState a b) (ClusterMap a) runCluster toVector f cf ac xs = do let -- map values into vector space mappedValues = map toVector xs clumap = fromList $ mappedValues pairs <- evalPairs clumap f $ allPairs [1..length xs] modify $ \s -> s{ combis = Map.fromList $ map swap pairs, minmap = MS.fromList pairs, cinfo = emptyInfo{nElems = length xs, idents = Map.fromList $ zip mappedValues xs } } clustering 1 f cf ac clumap -- | /O(log n)/ -- searches for the minimum distance in the minimum map findMin :: State (ClusterState a b) (a, Pair ID) findMin = return . MS.findMin . minmap =<< get -- | calculates the distance between two clusters given by their ids getClusterDistance :: ClusterMap a -> ClusterFunction a -> (ID,ID) -> State (ClusterState a b) (Pair ID,a) getClusterDistance clumap f pair = (\m -> case Map.lookup pair m of Nothing -> do let x' = getClusterUnsafe clumap $ fst pair y' = getClusterUnsafe clumap $ snd pair res = f x' y' modify $ \s -> s{combis = Map.insert pair res $ combis s} return (pair, res) Just e -> return (pair, e) ) . combis =<< get {---------------------------------------------------------------------------- Helper functions -----------------------------------------------------------------------------} -- | swaps the elements of a tuple swap :: (a,b) -> (b, a) swap (x,y) = (y,x) -- | creates an error message mkError :: String -> a mkError = error . (++) "Clustering: "