{-# LANGUAGE CPP #-}
{-# LANGUAGE GeneralizedNewtypeDeriving #-}
{-# LANGUAGE DeriveTraversable #-}
{-# LANGUAGE BangPatterns #-}

{- |
Module     : Math.KMeans
Copyright  : (c) Alp Mestanogullari, Ville Tirronen, 2011-2015
License    : BSD3
Maintainer : Alp Mestanogullari
Stability  : experimental

An implementation of the k-means clustering algorithm based on the vector
package. The core functions of this module are 'kmeans' and 'kmeansWith'.
See some examples in the package's README.
-}

module Math.KMeans
  ( -- * The meat of this package: 'kmeans'
    kmeans
  , kmeansWith
  , -- * Types
    Distance
  , Clusters
  , Cluster(..)
  , Centroids
  , -- * Misc.
    partition
  , euclidSq
  , l1dist
  , linfdist
  ) where

import Control.Monad.Identity
import qualified Data.Vector.Unboxed as V
import qualified Data.Vector as G
import qualified Data.List as L
import Data.Function (on)

#if !MIN_VERSION_base(4, 8, 0)
import Data.Foldable
import Data.Monoid
import Data.Traversable
#endif

-- | A distance on vectors
type Distance = V.Vector Double -> V.Vector Double -> Double

-- | The euclidean distance without taking the final square root.
--   Taking the square root would waste cycles without changing the
--   behavior of the algorithm, since it is monotonic.
euclidSq :: Distance
euclidSq v1 v2 = V.sum $ V.zipWith diffsq v1 v2
  where diffsq a b = (a-b)^(2::Int)
{-# INLINE euclidSq #-}

-- | L1 distance of two vectors: d(v1, v2) = sum on i of |v1_i - v2_i|
l1dist :: Distance
l1dist v1 v2 = V.sum $ V.zipWith diffabs v1 v2
  where diffabs a b = abs (a - b)
{-# INLINE l1dist #-}

-- | L-inf distance of two vectors: d(v1, v2) = max |v1_i - v2_i|
linfdist :: Distance
linfdist v1 v2 = V.maximum $ V.zipWith diffabs v1 v2
  where diffabs a b = abs (a - b)
{-# INLINE linfdist #-}

-- | This is what 'kmeans' hands you back. It's just a 'G.Vector' of clusters
--   that will hopefully be of length @k@.
type Clusters a = G.Vector (Cluster a)

-- | This type is used internally by 'kmeans'. It represents our (hopefully)
--   @k@ centroids, obtained by computing the new centroids of a 'Cluster'.
type Centroids = G.Vector (V.Vector Double)

-- | A 'Cluster' of points is just a list of points
newtype Cluster a = Cluster
  { elements :: [a] -- ^ elements that belong to that cluster
  } deriving (Eq, Show, Functor, Monoid, Foldable, Traversable)

-- | O(1) prepend of a point to a cluster.
clusterAdd :: Cluster a -> a -> Cluster a
clusterAdd (Cluster c) x = Cluster (x:c)

-- | A cluster with no elements.
emptyCluster :: Cluster a
emptyCluster = Cluster []

-- | Component-wise sum of two coordinate vectors.
--   NOTE(review): assumes both vectors have the same dimension;
--   'V.zipWith' silently truncates to the shorter one otherwise.
addCentroids :: V.Vector Double -> V.Vector Double -> V.Vector Double
addCentroids v1 v2 = V.zipWith (+) v1 v2

-- | This is the current partitioning strategy used by 'kmeans'. If we want
--   @k@ clusters, we just try to regroup consecutive elements in @k@ buckets.
--
--   @k@ must be at least 1. An empty input yields an empty vector of
--   clusters (no spurious empty cluster is produced).
partition :: Int -> [a] -> Clusters a
partition k vs = G.fromList (go vs)
  where
    -- An empty input must not yield @[Cluster []]@: empty clusters would
    -- crash the (partial) centroid computation downstream.
    go [] = []
    go l = case L.splitAt n l of
      (vs', [])  -> [Cluster vs']
      (vs', vss) -> Cluster vs' : go vss
    -- ceiling of (length vs / k), so we get at most k buckets
    n = (length vs + k - 1) `div` k

-- | Run the kmeans clustering algorithm.
--
-- > kmeans f distance k points
--
-- will run the algorithm using 'f' to extract features from your type,
-- using 'distance' to measure the distance between vectors,
-- trying to separate 'points' in 'k' clusters.
--
-- Extracting features just means getting a 'V.Vector'
-- with 'Double' coordinates that will represent your type
-- in the space in which 'kmeans' will run.
kmeans :: (a -> V.Vector Double) -- ^ feature extraction
       -> Distance               -- ^ distance function
       -> Int                    -- ^ the 'k' to run 'k'-means with (i.e number of desired clusters)
       -> [a]                    -- ^ input list of 'points'
       -> Clusters a             -- ^ result, hopefully 'k' clusters of points
kmeans extract dist k points =
  runIdentity $ kmeansWith (\n ps -> return $ partition n ps) extract dist k points

-- | Same as 'kmeans', except that instead of using 'partition', you supply
--   your own function for choosing the initial clustering. Two important
--   things to note:
--
-- * If you don't need any kind of effect and just have a 'partition'-like
--   function you want to use, @m@ can just be 'Identity' here. If that's too
--   obnoxious to work with, please let me know and I may just provide a
--   separate 'kmeansWith' function with no monad there. But most of the time,
--   you'll probably just be interested in the following scenario.
--
-- * Most likely, you want to have something smarter than our simple
--   'partition' function. A couple of papers I have read claim very decent
--   results by using some precise probabilistic schemas for the initial
--   partitioning. In this case, your @m@ would probably be 'IO' or 'ST'
--   (e.g. using a random number generation package such as mwc-random)
--   and you could fine-tune the way the initial clusters are picked so that
--   the algorithm may give better results. Of course, if your initialization
--   is monadic, so is the result.
kmeansWith :: Monad m
           => (Int -> [a] -> m (Clusters a)) -- ^ how should we partition the points?
           -> (a -> V.Vector Double)         -- ^ get the coordinates of a "point"
           -> Distance                       -- ^ what distance do we use
           -> Int                            -- ^ number of desired clusters
           -> [a]                            -- ^ list of points
           -> m (Clusters a)                 -- ^ resulting clustering
kmeansWith initF extract dist k points = go `liftM` initF k points
  where
    -- Iterate 'kmeansStep' until the clustering no longer changes
    -- (compared by the extracted feature vectors, not by 'a' itself).
    -- go :: Clusters a -> Clusters a
    go pgroups = case kmeansStep pgroups of
      pgroups' | pgroupsEqualUnder pgroups pgroups' -> pgroups
               | otherwise -> go pgroups'

    -- One iteration: compute centroids, then reassign every point to its
    -- closest centroid, dropping any cluster that ends up empty.
    -- kmeansStep :: Clusters a -> Clusters a
    kmeansStep clusters = case centroidsOf clusters of
      centroids ->
        G.filter (not . null . elements)
          . G.unsafeAccum clusterAdd (G.replicate k emptyCluster)
          . map (pairToClosestCentroid centroids)
          $ points

    -- Mean feature vector of each non-empty cluster. Empty clusters are
    -- skipped first: 'L.foldl1'' is partial and would crash on them (they
    -- can occur when a user-supplied 'initF' produces one).
    -- centroidsOf :: Clusters a -> Centroids
    centroidsOf cs = G.map centroidOf (G.filter (not . null . elements) cs)
      where
        centroidOf (Cluster elts) =
          V.map (/n) . L.foldl1' addCentroids $ map extract elts
          where n = fromIntegral (length elts)

    -- Tag a point with the index of the centroid closest to it.
    -- pairToClosestCentroid :: Centroids -> a -> (Int, a)
    pairToClosestCentroid cs a = (minDistIndex, a)
      where !minDistIndex = G.minIndexBy (compare `on` dist (extract a)) cs

    -- Fixed-point test for the iteration, under the feature extraction.
    -- pgroupsEqualUnder :: Clusters a -> Clusters a -> Bool
    pgroupsEqualUnder g1 g2 =
      G.map (map extract . elements) g1 == G.map (map extract . elements) g2
{-# INLINE kmeansWith #-}