module Data.Weighting.GSC where import Data.List (foldl') import Data.Clustering.Hierarchical (Dendrogram(..)) -- | /O(n^2)/ Calculates the Gerstein\/Sonnhammer\/Chothia -- weights for all elements of a dendrogram. Weights are -- annotated to the leafs of the dendrogram while distances in -- branches are kept unchanged. -- -- Distances @d@ in branches should be non-increasing and between -- @0@ (in the leafs) and @1@. The final weights are normalized -- to average to @1@ (i.e. sum to the number of sequences, the -- same they would sum if all weights were @1@). -- -- For example, suppose we have -- -- @ -- dendro = Branch 0.8 -- (Branch 0.5 -- (Branch 0.2 -- (Leaf 'A') -- (Leaf 'B')) -- (Leaf 'C')) -- (Leaf 'D') -- @ -- -- This is the same as GSC paper's example, however they used -- similarities while we are using distances (i.e. @fmap (1-) -- dendro@ would be exactly their example). Then @gsc dendro@ is -- -- @ -- gsc dendro == Branch 0.8 -- (Branch 0.5 -- (Branch 0.2 -- (Leaf ('A',0.7608695652173914)) -- (Leaf ('B',0.7608695652173914))) -- (Leaf ('C',1.0869565217391306))) -- (Leaf ('D',1.3913043478260871)) -- @ -- -- which is exactly what they calculated. gsc :: Fractional d => Dendrogram d a -> Dendrogram d (a, d) gsc (Leaf x) = Leaf (x,1) gsc dendrogram = ret where (wsum, nsum, ret) = go undefined [] dendrogram wfinal = (wsum / fromIntegral nsum) position (Leaf _) = 0 -- no difference from itself position (Branch d _ _) = d go _ cs (Branch d l r) = let (wl, nl, l') = go d ((el / wl) : cs) l (wr, nr, r') = go d ((er / wr) : cs) r el = d - position l -- edge length to left branch er = d - position r -- ...to right branch wsum = wl + wr + el + er nsum = nl + nr in wsum `seq` nsum `seq` (wsum, nsum, Branch d l' r') go d cs (Leaf x) = -- O(n) worst case, O(log n) best case (balanced dendrogram) let w = foldl' (\curw c -> curw + curw * c) d (tail cs) in (0, 1 :: Int, Leaf (x, w / wfinal))