module Data.Cluster.ROC( -- * Algorithm configuration ROCConfig , rocThreshold , rocMaxClusters , defaultROCConfig -- * Cluster definition , Prototype , newPrototype , prototypeValue , prototypeWeight -- * API , ClusterSpace(..) , ROCContext , emptyROCContext , loadROCContext , rocPrototypes , clusterize -- * Fine grain API , clusterizeAddMerge , clusterizeSingle , clusterizeMerge , clusterizeNewPrototype , clusterizePostprocess ) where import Data.Data import Data.Monoid import Data.Ord import Data.Vector (Vector) import GHC.Generics import qualified Data.Foldable as F import qualified Data.Vector as V -- | Configuration of ROC clusterization data ROCConfig = ROCConfig { -- | If weight of prototype is less than the value, it is removed at final -- step. rocThreshold :: !Double -- | Maximum count of clusters, could be less , rocMaxClusters :: !Int } deriving (Generic, Data) -- | Default configuration: -- @ -- ROCConfig { -- rocThreshold = 0 -- , rocMaxClusters = 10 -- } -- @ defaultROCConfig :: ROCConfig defaultROCConfig = ROCConfig { rocThreshold = 0 , rocMaxClusters = 10 } -- | Operations that value has to support to use in ROC clusterisation class ClusterSpace a where -- | Zero point in space pointZero :: a -- | Addition of vectors in space pointAdd :: a -> a -> a -- | Scaling by a scalar pointScale :: Double -> a -> a -- | Kernel function pointKernel :: a -> a -> Double -- | Square of distance between of points (defined via kernel) and exposed -- only for possible optimizations as for Gaussian kernel (2 - 2 * pointKernel x y) pointDistanceSquared :: a -> a -> Double pointDistanceSquared x y = pointKernel x x - 2 * pointKernel x y + pointKernel y y {-# INLINE pointDistanceSquared #-} -- | Cluster information data Prototype a = Prototype { prototypeValue :: !a , prototypeWeight :: !Double } deriving (Eq, Show, Generic, Functor) -- | Create prototype with given point as center and zero weight newPrototype :: a -> Prototype a newPrototype a = Prototype a 0 instance ClusterSpace a => Monoid (Prototype a) where mempty = Prototype pointZero 0 mappend p1 p2 = Prototype pos w where w = prototypeWeight p1 + prototypeWeight p2 pos = (1/w) `pointScale` ((prototypeWeight p1 `pointScale` prototypeValue p1) `pointAdd` (prototypeWeight p2 `pointScale` prototypeValue p2)) {-# INLINE mempty #-} {-# INLINE mappend #-} -- | Internal context of algorithm data ROCContext a = ROCContext { cntxPrototypes :: !(Vector (Prototype a)) , cntxConfig :: !ROCConfig } deriving (Generic, Functor) -- | Create new context for clusterization from scratch emptyROCContext :: ROCConfig -> ROCContext a emptyROCContext cfg = ROCContext { cntxPrototypes = mempty , cntxConfig = cfg } -- | Load context from set of prototypes loadROCContext :: Foldable f => ROCConfig -> f (Prototype a) -> ROCContext a loadROCContext cfg ps = (emptyROCContext cfg) { cntxPrototypes = V.fromList . F.toList $ ps } -- | Get collection of prototypes from ROC context rocPrototypes :: ROCContext a -> [Prototype a] rocPrototypes = F.toList . cntxPrototypes -- | Perform clusterization of next part of data clusterize :: forall a f . (ClusterSpace a, Foldable f) => f a -- ^ Set of data that need to be added to clusters -> ROCContext a -- ^ Context with current prototypes -> ROCContext a -- ^ Updated context clusterize xs cntx0 = clusterizePostprocess addAll where addAll = F.foldl' (flip clusterizeAddMerge) cntx0 xs -- | Cluster a single value (step 2-6 in original paper). Moves existing clusters, -- creates new clusters and merges close clusters. clusterizeAddMerge :: forall a . (ClusterSpace a) => a -- ^ Single point -> ROCContext a -- ^ Context with current prototypes -> ROCContext a -- ^ Updated context clusterizeAddMerge x cntx = clusterizeNewPrototype x $ if n >= nmax then clusterizeMerge cntx' else cntx' where cntx' = clusterizeSingle x cntx n = V.length . cntxPrototypes $ cntx' nmax = rocMaxClusters . cntxConfig $ cntx' {-# INLINE clusterizeAddMerge #-} -- | Cluster a single value (step 2 in original paper). This step updates only existing -- clusters. clusterizeSingle :: forall a . (ClusterSpace a) => a -- ^ Single point -> ROCContext a -- ^ Context with current prototypes -> ROCContext a -- ^ Updated context clusterizeSingle x ctx@ROCContext{..} | V.null cntxPrototypes = ctx | otherwise = ctx { cntxPrototypes = cntxPrototypes V.// [(winnerIndex, winner')] } where winnerIndex = V.minIndex . fmap (pointDistanceSquared x . prototypeValue) $ cntxPrototypes winner = cntxPrototypes V.! winnerIndex winner' = let Prototype{..} = winner сwinner = prototypeWeight + pointKernel x prototypeValue ywinner = prototypeValue `pointAdd` ( (1 / сwinner) `pointScale` (x `pointAdd` pointScale (-1) prototypeValue) ) in Prototype ywinner сwinner {-# INLINE clusterizeSingle #-} -- | Merge the most closest clusters (step 4 in original paper). clusterizeMerge :: forall a . (ClusterSpace a) => ROCContext a -- ^ Context with current prototypes -> ROCContext a -- ^ Updated context clusterizeMerge ctx@ROCContext{..} | V.length cntxPrototypes <= 1 = ctx | otherwise = ctx { cntxPrototypes = cntxPrototypes' } where -- find two prototypes that have minimum distance (warning, Vector monad!) (minxi, minyi, _) = V.minimumBy (comparing $ \(_, _, a) -> a) $ do (xi, xv) <- V.indexed cntxPrototypes (yi, yv) <- V.take xi $ V.indexed cntxPrototypes pure (xi, yi, prototypeValue yv `pointDistanceSquared` prototypeValue xv) x = cntxPrototypes V.! minxi y = cntxPrototypes V.! minyi x' = x <> y removeAt i v = V.slice 0 i v <> V.slice (i+1) (V.length v - i - 1) v cntxPrototypes' = removeAt minyi $ cntxPrototypes V.// [(minxi, x')] {-# INLINE clusterizeMerge #-} -- | Form a new prototype from single point (step 5 in original paper) clusterizeNewPrototype :: forall a . (ClusterSpace a) => a -- ^ Point -> ROCContext a -- ^ Context with current prototypes -> ROCContext a -- ^ Updated context clusterizeNewPrototype a ctx@ROCContext{..} = ctx { cntxPrototypes = cntxPrototypes `V.snoc` newProto } where newProto = Prototype a 0 {-# INLINE clusterizeNewPrototype #-} -- | Remove clusters that have negligible weights (step 6 in original paper) clusterizePostprocess :: forall a . (ClusterSpace a) => ROCContext a -- ^ Context with current prototypes -> ROCContext a -- ^ Updated context clusterizePostprocess ctx@ROCContext{..} = ctx { cntxPrototypes = V.filter isValuable cntxPrototypes } where threshold = rocThreshold cntxConfig isValuable p = prototypeWeight p >= threshold {-# INLINE clusterizePostprocess #-}