{- | Module : Data.KMeans Copyright : (c) Keegan Carruthers-Smith, 2009 License : BSD 3 Clause Maintainer : gershomb@gmail.com Stability : experimental A simple implementation of the standard k-means clustering algorithm: . K-means clustering partitions points into clusters, with each point belonging to the cluster with th nearest mean. As the general problem is NP hard, the standard algorithm, which is relatively rapid, is heuristic and not guaranteed to converge to a global optimum. Varying the input order, from which the initial clusters are generated, can yield different results. For degenerate and malicious cases, the algorithm may take exponential time. -} module Data.KMeans (kmeans, kmeansGen) where import Data.List (transpose, sort, groupBy, minimumBy) import Data.Function (on) import Data.Ord (comparing) data WrapType a = WrapType {getVect :: [Double], getVal :: a} instance Eq (WrapType a) where (==) = (==) `on` getVect instance Ord (WrapType a) where compare = comparing getVect dist a b = sqrt . sum $ zipWith (\x y-> (x-y) ^ 2) a b centroid points = map (flip (/) l . sum) $ transpose (map getVect points) where l = fromIntegral $ length points closest points point = minimumBy (comparing $ dist point) points recluster' centroids points = map (map snd) $ groupBy ((==) `on` fst) reclustered where reclustered = sort [(closest centroids (getVect a), a) | a <- points] recluster clusters = recluster' centroids $ concat clusters where centroids = map centroid clusters part :: (Eq a) => Int -> [a] -> [[a]] part x ys | zs' == [] = [zs] | otherwise = zs : part x zs' where (zs, zs') = splitAt x ys -- | Recluster points kmeans'' clusters | clusters == clusters' = clusters | otherwise = kmeans'' clusters' where clusters' = recluster clusters kmeans' k points = kmeans'' $ part l points where l = (length points + k - 1) `div` k -- | Cluster points in a Euclidian space, represented as lists of Doubles, into at most k clusters. -- The initial clusters are chosen arbitrarily. kmeans :: Int -> [[Double]] -> [[[Double]]] kmeans = kmeansGen id -- | A generalized kmeans function. This function operates not on points, but an arbitrary type which may be projected into a Euclidian space. Since the projection may be chosen freely, this allows for weighting dimensions to different degrees, etc. kmeansGen :: (a -> [Double]) -> Int -> [a] -> [[a]] kmeansGen f k points = map (map getVal) . kmeans' k . map (\x -> WrapType (f x) x) $ points