{-# LANGUAGE CPP #-}
{-# LANGUAGE GeneralizedNewtypeDeriving #-}
{-# LANGUAGE DeriveTraversable #-}
{-# LANGUAGE BangPatterns #-}
{- |
Module : Math.KMeans
Copyright : (c) Alp Mestanogullari, Ville Tirronen, 2011-2015
License : BSD3
Maintainer : Alp Mestanogullari
Stability : experimental
An implementation of the k-means clustering algorithm based on the vector package.
The core functions of this module are 'kmeans' and 'kmeansWith'. See some examples
on .
-}
module Math.KMeans
( -- * The meat of this package: 'kmeans'
kmeans
, kmeansWith
, -- * Types
Distance
, Clusters
, Cluster(..)
, Centroids
, -- * Misc.
partition
, euclidSq
, l1dist
, linfdist
) where
import Control.Monad.Identity
import qualified Data.Vector.Unboxed as V
import qualified Data.Vector as G
import qualified Data.List as L
import Data.Function (on)
#if !MIN_VERSION_base(4, 8, 0)
import Data.Foldable
import Data.Monoid
import Data.Traversable
#endif
-- | A distance on vectors
type Distance = V.Vector Double -> V.Vector Double -> Double
-- | The euclidean distance without taking the final square root
-- This would waste cycles without changing the behavior of the algorithm
euclidSq :: Distance
euclidSq v1 v2 = V.sum $ V.zipWith diffsq v1 v2
where diffsq a b = (a-b)^(2::Int)
{-# INLINE euclidSq #-}
-- | L1 distance of two vectors: d(v1, v2) = sum on i of |v1_i - v2_i|
l1dist :: Distance
l1dist v1 v2 = V.sum $ V.zipWith diffabs v1 v2
where diffabs a b = abs (a - b)
{-# INLINE l1dist #-}
-- | L-inf distance of two vectors: d(v1, v2) = max |v1_i - v2_i]
linfdist :: Distance
linfdist v1 v2 = V.maximum $ V.zipWith diffabs v1 v2
where diffabs a b = abs (a - b)
{-# INLINE linfdist #-}
-- | This is what 'kmeans' hands you back. It's just a 'G.Vector' of clusters
-- that will hopefully be of length 'k'.
type Clusters a = G.Vector (Cluster a)
-- | This type is used internally by 'kmeans'. It represents our (hopefully)
-- @k@ centroids, obtained by computing the new centroids of a 'Cluster'
type Centroids = G.Vector (V.Vector Double)
-- | A 'Cluster' of points is just a list of points
newtype Cluster a =
Cluster { elements :: [a] -- ^ elements that belong to that cluster
} deriving (Eq, Show, Functor, Monoid, Foldable, Traversable)
clusterAdd :: Cluster a -> a -> Cluster a
clusterAdd (Cluster c) x = Cluster (x:c)
emptyCluster :: Cluster a
emptyCluster = Cluster []
addCentroids :: V.Vector Double -> V.Vector Double -> V.Vector Double
addCentroids v1 v2 = V.zipWith (+) v1 v2
-- | This is the current partitionning strategy used
-- by 'kmeans'. If we want @k@ clusters, we just
-- try to regroup consecutive elements in @k@ buckets
partition :: Int -> [a] -> Clusters a
partition k vs = G.fromList $ go vs
where go l = case L.splitAt n l of
(vs', []) -> [Cluster vs']
(vs', vss) -> Cluster vs' : go vss
n = (length vs + k - 1) `div` k
-- | Run the kmeans clustering algorithm.
--
-- > kmeans f distance k points
--
-- will run the algorithm using 'f' to extract features from your type,
-- using 'distance' to measure the distance between vectors,
-- trying to separate 'points' in 'k' clusters.
--
-- Extracting features just means getting a 'V.Vector'
-- with 'Double' coordinates that will represent your type
-- in the space in which 'kmeans' will run.
kmeans :: (a -> V.Vector Double) -- ^ feature extraction
-> Distance -- ^ distance function
-> Int -- ^ the 'k' to run 'k'-means with (i.e number of desired clusters)
-> [a] -- ^ input list of 'points'
-> Clusters a -- ^ result, hopefully 'k' clusters of points
kmeans extract dist k points =
runIdentity $ kmeansWith (\n ps -> return $ partition n ps) extract dist k points
-- | Same as 'kmeans', except that instead of using 'partition', you supply your own
-- function for choosing the initial clustering. Two important things to note:
--
-- * If you don't need any kind of effect and just have a 'partition'-like function
-- you want to use, @m@ will can just be 'Identity' here. If that's too
-- obnoxious to work with, please let me know and I may just provide a separate
-- 'kmeansWith' function with no there. But most of the time, you'll probably just
-- be interested in the following scenario.
--
-- * Most likely, you want to have something smarter than our simple 'partition' function.
-- A couple of papers I have read claim very decent results by using some precise
-- probabilistic schemas for the initial partitionning. In this case, your @m@ would
-- probably be 'IO' or 'ST' (e.g using my package)
-- and you could fine-tune the way the initial clusters are picked so that the algorithm
-- may give better results. Of course, if your initialization is monadic, so is the result.
kmeansWith :: Monad m
=> (Int -> [a] -> m (Clusters a)) -- ^ how should we partition the points?
-> (a -> V.Vector Double) -- ^ get the coordinates of a "point"
-> Distance -- ^ what distance do we use
-> Int -- ^ number of desired clusters
-> [a] -- ^ list of points
-> m (Clusters a) -- ^ resulting clustering
kmeansWith initF extract dist k points = go `liftM` initF k points
where
-- go :: Clusters a -> Clusters a
go pgroups =
case kmeansStep pgroups of
pgroups' | pgroupsEqualUnder pgroups pgroups' -> pgroups
| otherwise -> go pgroups'
-- kmeansStep :: Clusters a -> Clusters a
kmeansStep clusters =
case centroidsOf clusters of
centroids ->
G.filter (not . null . elements)
. G.unsafeAccum clusterAdd (G.replicate k emptyCluster)
. map (pairToClosestCentroid centroids)
$ points
-- centroidsOf :: Clusters a -> Centroids
centroidsOf cs = G.map centroidOf cs
where
centroidOf (Cluster elts) =
V.map (/n)
. L.foldl1' addCentroids
$ map extract elts
where n = fromIntegral (length elts)
-- pairToClosestCentroid :: Centroids -> a -> (Int, a)
pairToClosestCentroid cs a = (minDistIndex, a)
where !minDistIndex = G.minIndexBy (compare `on` dist (extract a)) cs
-- pgroupsEqualUnder :: Clusters a -> Clusters a -> Bool
pgroupsEqualUnder g1 g2 =
G.map (map extract . elements) g1 == G.map (map extract . elements) g2
{-# INLINE kmeansWith #-}