{- | Implement clustering


module Bio.Clustering where

import qualified Data.Set as S
import Data.List (partition,foldl')

-- | Data structure for storing hierarchical clusters
data Clustered score datum = Branch score (Clustered score datum) (Clustered score datum)
                           | Leaf datum  deriving Show

-- | Single linkage agglomerative clustering.
--   Cluster elements by slurping a sorted list of pairs with score (i.e. triples :-)
--   Keeps a set of contained elements at each branch's root, so O(n log n),
--   and requires elements to be in Ord.
--   For this to work, the triples must be sorted on score. Earlier scores in the list will
--   make up the lower nodes, so sort descending for similarity, ascending for distance.
cluster_sl  :: (Ord a, Ord s) => [(s,a,a)] -> [Clustered s a]
cluster_sl = map fst . foldl' csl []
    where csl cs (s,a,b) = 
              -- can be short circuited for more performance
              let (acs,tmp)   = partition (\(_,objs) -> a `S.member` objs) cs
                  (bcs,rest)  = partition (\(_,objs) -> b `S.member` objs) tmp
              in case (acs,bcs) of 
                   ([(ac,ao)],[(bc,bo)]) -> (Branch s ac bc,S.union ao bo):rest
                   ([(ac,ao)],[])        -> if b `S.member` ao then (ac,ao):rest
                                            else (Branch s ac (Leaf b),S.insert b ao):rest
                   ([],[(bc,bo)])        -> if a `S.member` bo then (bc,bo):rest
                                            else (Branch s bc (Leaf a),S.insert a bo):rest
                   ([],[])               -> (Branch s (Leaf a) (Leaf b),S.fromList [a,b]):rest
                   _                     -> error "Grave mistake"

-- cluster_gen :: [a] -> (a->a->Bool) ->