{- | Implement clustering -} module Bio.Clustering where import qualified Data.Set as S import Data.List (partition,foldl') -- | Data structure for storing hierarchical clusters data Clustered score datum = Branch score (Clustered score datum) (Clustered score datum) | Leaf datum deriving Show -- | Single linkage agglomerative clustering. -- Cluster elements by slurping a sorted list of pairs with score (i.e. triples :-) -- Keeps a set of contained elements at each branch's root, so O(n log n), -- and requires elements to be in Ord. -- For this to work, the triples must be sorted on score. Earlier scores in the list will -- make up the lower nodes, so sort descending for similarity, ascending for distance. cluster_sl :: (Ord a, Ord s) => [(s,a,a)] -> [Clustered s a] cluster_sl = map fst . foldl' csl [] where csl cs (s,a,b) = -- can be short circuited for more performance let (acs,tmp) = partition (\(_,objs) -> a `S.member` objs) cs (bcs,rest) = partition (\(_,objs) -> b `S.member` objs) tmp in case (acs,bcs) of ([(ac,ao)],[(bc,bo)]) -> (Branch s ac bc,S.union ao bo):rest ([(ac,ao)],[]) -> if b `S.member` ao then (ac,ao):rest else (Branch s ac (Leaf b),S.insert b ao):rest ([],[(bc,bo)]) -> if a `S.member` bo then (bc,bo):rest else (Branch s bc (Leaf a),S.insert a bo):rest ([],[]) -> (Branch s (Leaf a) (Leaf b),S.fromList [a,b]):rest _ -> error "Grave mistake" -- cluster_gen :: [a] -> (a->a->Bool) ->