module Cluster
( getClusterIdentity
, groupBy'
) where
import qualified Data.Map.Strict as Map
import qualified Data.Sequence as Seq
import qualified Data.Foldable as F
import qualified Data.Text as T
import Data.Fasta.Text
import Types
negHamming :: T.Text -> T.Text -> Double
negHamming x = sum . map (\(x, y) -> if x == y then 1 else 0) . T.zip x
getClusterIdentity :: Identity
-> Seq.Seq FastaSequence
-> ClusterMap
getClusterIdentity identity = ClusterMap
. Map.fromList
. zip [1..]
. clusterIdentityGo identity
clusterIdentityGo :: Identity
-> Seq.Seq FastaSequence
-> [Seq.Seq FastaSequence]
clusterIdentityGo identity = F.toList
. groupBy' (compareSeqs identity)
groupBy' :: (a -> a -> Bool) -> Seq.Seq a -> Seq.Seq (Seq.Seq a)
groupBy' _ (Seq.null -> True) = Seq.empty
groupBy' f (Seq.viewl -> x Seq.:< xs) = eqX Seq.<| groupBy' f neqX
where
(!eqX, !neqX) = eqTo f Seq.empty (Seq.singleton x) xs
eqTo :: (a -> a -> Bool)
-> Seq.Seq a
-> Seq.Seq a
-> Seq.Seq a
-> (Seq.Seq a, Seq.Seq a)
eqTo _ acc (Seq.null -> True) zs = (acc, zs)
eqTo f acc (Seq.viewl -> x Seq.:< xs) zs =
eqTo f (x Seq.<| acc) (eqX Seq.>< xs) neqX
where
(!eqX, !neqX) = Seq.partition (f x) zs
compareSeqs :: Identity -> FastaSequence -> FastaSequence -> Bool
compareSeqs identity x y = getIdentity x y >= identity
getIdentity :: FastaSequence -> FastaSequence -> Identity
getIdentity xs ys = Identity
. (* 100)
. (/ (fromIntegral . T.length . fastaSeq $ xs))
. negHamming (fastaSeq xs)
$ (fastaSeq ys)