{- Math.Clustering.Hierarchical.Spectral.Eigen.FeatureMatrix Gregory W. Schwartz Collects the functions pertaining to hierarchical spectral clustering for feature matrices. -} {-# LANGUAGE BangPatterns #-} module Math.Clustering.Hierarchical.Spectral.Eigen.FeatureMatrix ( hierarchicalSpectralCluster , FeatureMatrix (..) , B (..) , Items (..) , ShowB (..) ) where -- Remote import Data.Bool (bool) import Data.Clustering.Hierarchical (Dendrogram (..)) import Data.Maybe (fromMaybe) import Data.Tree (Tree (..)) import Math.Clustering.Spectral.Eigen.FeatureMatrix (B (..), getB, spectralCluster, spectralClusterK) import Math.Modularity.Eigen.Sparse (getBModularity) import Math.Modularity.Types (Q (..)) import qualified Data.Foldable as F import qualified Data.Set as Set import qualified Data.Eigen.SparseMatrix as S import qualified Data.Vector as V import qualified Data.Vector.Storable as VS -- Local import Math.Clustering.Hierarchical.Spectral.Types import Math.Clustering.Hierarchical.Spectral.Utility type FeatureMatrix = S.SparseMatrixXd type Items a = V.Vector a type ShowB = ((Int, Int), [(Int, Int, Double)]) type NormalizeFlag = Bool -- | Check if there is more than one cluster. hasMultipleClusters :: S.SparseMatrixXd -> Bool hasMultipleClusters = (> 1) . Set.size . Set.fromList . concat . S.toDenseList -- | Generates a tree through divisive hierarchical clustering using -- Newman-Girvan modularity as a stopping criteria. Can use minimum number of -- observations in a cluster as a stopping criteria. Assumes the feature matrix -- has column features and row observations. Items correspond to rows. Can -- use FeatureMatrix or a pre-generated B matrix. See Shu et al., "Efficient -- Spectral Neighborhood Blocking for Entity Resolution", 2011. hierarchicalSpectralCluster :: EigenGroup -> NormalizeFlag -> Maybe NumEigen -> Maybe Int -> Maybe Q -> Items a -> Either FeatureMatrix B -> ClusteringTree a hierarchicalSpectralCluster eigenGroup normFlag numEigenMay minSizeMay minModMay initItems initMat = go initItems initB where initB = either (getB normFlag) id $ initMat minMod = fromMaybe (Q 0) minModMay minSize = fromMaybe 1 minSizeMay numEigen = fromMaybe 1 numEigenMay go :: Items a -> B -> ClusteringTree a go !items !b = if (S.rows $ unB b) > 1 && hasMultipleClusters clusters && ngMod > minMod && S.rows (unB left) >= minSize && S.rows (unB right) >= minSize then Node { rootLabel = vertex , subForest = [ go (subsetVector items leftIdxs) left , go (subsetVector items rightIdxs) right ] } else Node {rootLabel = vertex, subForest = []} where vertex = ClusteringVertex { _clusteringItems = items , _ngMod = ngMod } clusters :: S.SparseMatrixXd clusters = spectralClustering eigenGroup b spectralClustering :: EigenGroup -> B -> S.SparseMatrixXd spectralClustering SignGroup = spectralCluster spectralClustering KMeansGroup = spectralClusterK numEigen 2 ngMod :: Q ngMod = getBModularity clusters b getSortedIdxs :: Double -> S.SparseMatrixXd -> [Int] getSortedIdxs val = VS.ifoldr' (\ !i !v !acc -> bool acc (i:acc) $ v == val) [] . VS.fromList . concat . S.toDenseList leftIdxs :: [Int] leftIdxs = getSortedIdxs 0 clusters rightIdxs :: [Int] rightIdxs = getSortedIdxs 1 clusters left :: B left = B $ extractRows (unB b) leftIdxs right :: B right = B $ extractRows (unB b) rightIdxs extractRows :: S.SparseMatrixXd -> [Int] -> S.SparseMatrixXd extractRows mat [] = S.fromList 0 0 [] extractRows mat xs = S.fromRows . fmap (flip S.getRow mat) $ xs