module Data.Clustering.Hierarchical (-- * Dendrogram data type Dendrogram(..) ,Distance ,elements ,cutAt -- * Linkage data type ,Linkage(..) -- * Clustering function ,dendrogram ) where import Data.Clustering.Hierarchical.Internal.Types (Dendrogram(..), Linkage(..), Distance) import qualified Data.Clustering.Hierarchical.Internal.DistanceMatrix as DM import qualified Data.Clustering.Hierarchical.Internal.Optimal as O -- | List of elements in a dendrogram. elements :: Dendrogram a -> [a] elements = go [] where go acc (Leaf x) = x : acc go acc (Branch _ l r) = go (go acc r) l -- | @dendro \`cutAt\` threshold@ cuts the dendrogram @dendro@ at -- all branches which have distances strictly greater than -- @threshold@. -- -- For example, suppose we have -- -- @ -- dendro = Branch 0.8 -- (Branch 0.5 -- (Branch 0.2 -- (Leaf \'A\') -- (Leaf \'B\')) -- (Leaf \'C\')) -- (Leaf \'D\') -- @ -- -- Then: -- -- @ -- dendro \`cutAt\` 0.9 == dendro \`cutAt\` 0.8 == [dendro] -- no changes -- dendro \`cutAt\` 0.7 == dendro \`cutAt\` 0.5 == [Branch 0.5 (Branch 0.2 (Leaf \'A\') (Leaf \'B\')) (Leaf \'C\'), Leaf \'D\'] -- dendro \`cutAt\` 0.4 == dendro \`cutAt\` 0.2 == [Branch 0.2 (Leaf \'A\') (Leaf \'B\'), Leaf \'C\', Leaf \'D\'] -- dendro \`cutAt\` 0.1 == [Leaf \'A\', Leaf \'B\', Leaf \'C\', Leaf \'D\'] -- no branches at all -- @ cutAt :: Dendrogram a -> Distance -> [Dendrogram a] cutAt dendro threshold = go [] dendro where go acc x@(Leaf _) = x : acc go acc x@(Branch d l r) | d <= threshold = x : acc | otherwise = go (go acc r) l -- cut! -- | Calculates a complete, rooted dendrogram for a list of items -- and a linkage type. The following are the time and space -- complexities for each linkage: -- -- ['SingleLinkage'] /O(n^2)/ time and /O(n)/ space, using the -- SLINK algorithm. This algorithm is optimal in both space -- and time and gives the same answer as the naive algorithm -- using a distance matrix. -- -- ['CompleteLinkage'] /O(n^3)/ time and /O(n^2)/ space, using -- the naive algorithm with a distance matrix. Use 'CLINK' if -- you need more performance. -- -- [Complete linkage with 'CLINK'] /O(n^2)/ time and /O(n)/ -- space, using the CLINK algorithm. Note that this algorithm -- doesn't always give the same answer as the naive algorithm -- using a distance matrix, but it's much faster. -- -- ['UPGMA'] /O(n^3)/ time and /O(n^2)/ space, using the naive -- algorithm with a distance matrix. -- -- ['FakeAverageLinkage'] /O(n^3)/ time and /O(n^2)/ space, using -- the naive algorithm with a distance matrix. dendrogram :: Linkage -- ^ Linkage type to be used. -> [a] -- ^ Items to be clustered. -> (a -> a -> Distance) -- ^ Distance function between items. -> Dendrogram a -- ^ Complete dendrogram. dendrogram SingleLinkage = O.singleLinkage dendrogram CompleteLinkage = DM.completeLinkage dendrogram CLINK = O.completeLinkage dendrogram UPGMA = DM.upgma dendrogram FakeAverageLinkage = DM.fakeAverageLinkage