module Math.KMeans (kmeans, Point, Cluster(..), computeClusters) where
import qualified Data.Vector.Unboxed as V
import qualified Data.Vector as G
import qualified Data.List as L
import Data.Function (on)
type Point a = (V.Vector Double, a)
data Cluster = Cluster {
cid :: !Int,
center :: !(V.Vector Double)
}
distance :: Point a -> V.Vector Double -> Double
distance (u,_) v = V.sum $ V.zipWith (\a b -> (a b)^2) u v
partition :: Int -> [a] -> [[a]]
partition k vs = go vs
where go vs = case L.splitAt n vs of
(vs', []) -> [vs']
(vs', vss) -> vs' : go vss
n = (length vs + k 1) `div` k
computeClusters :: [[V.Vector Double]] -> [Cluster]
computeClusters = zipWith Cluster [0..] . map f
where f (x:xs) = let (n, v) = L.foldl' (\(k, s) v' -> (k+1, V.zipWith (+) s v')) (1, x) xs
in V.map (\x -> x / (fromIntegral n)) v
regroupPoints :: forall a. [Cluster] -> [Point a] -> [[Point a]]
regroupPoints clusters points = L.filter (not.null) . G.toList . G.accum (flip (:)) (G.replicate (length clusters) []) . map closest $ points
where
closest p = (cid (L.minimumBy (compare `on` (distance p . center)) clusters),p)
regroupPoints' :: forall a. [Cluster] -> [Point a] -> [[Point a]]
regroupPoints' clusters points = go points
where go points = map (map snd) . L.groupBy ((==) `on` fst) . L.sortBy (compare `on` fst) $ map (\p -> (closest p, p)) points
closest p = cid $ L.minimumBy (compare `on` (distance p . center)) clusters
kmeansStep :: [Point a] -> [[Point a]] -> [[Point a]]
kmeansStep points pgroups = regroupPoints (computeClusters . map (map fst) $ pgroups) points
kmeansAux :: [Point a] -> [[Point a]] -> [[Point a]]
kmeansAux points pgroups = let pss = kmeansStep points pgroups in
case map (map fst) pss == map (map fst) pgroups of
True -> pgroups
False -> kmeansAux points pss
kmeans :: Int -> [Point a] -> [[Point a]]
kmeans k points = kmeansAux points pgroups
where pgroups = partition k points