{- Cluster
by Gregory Schwartz

-- | Collects the functions pertaining to the clustering of sequences by hamming
distance identity
-}

{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE ViewPatterns #-}

module Cluster
    ( getClusterIdentity
    , groupBy'
    ) where

-- Standard
import qualified Data.Map.Strict as Map
import qualified Data.Sequence as Seq
import qualified Data.Foldable as F

-- Cabal
import qualified Data.Text as T
import Data.Fasta.Text

-- Local
import Types

-- | Like hamming, but similarity rather than distance
negHamming :: T.Text -> T.Text -> Double
negHamming x = sum . map (\(x, y) -> if x == y then 1 else 0) . T.zip x

-- | Takes in an identity and fasta sequences and returns the sequences
-- grouped together by hamming distance identity
getClusterIdentity :: Identity
                -> Seq.Seq FastaSequence
                -> ClusterMap
getClusterIdentity identity = ClusterMap
                            . Map.fromList
                            . zip [1..]
                            . clusterIdentityGo identity

-- | Keep comparing clusters until no more fusions (no change in size) make
-- sense
clusterIdentityGo :: Identity
                  -> Seq.Seq FastaSequence
                  -> [Seq.Seq FastaSequence]
clusterIdentityGo identity = F.toList
                           . groupBy' (compareSeqs identity)

-- | Group together by all pairings rather than adjacent. Altered from
-- lyxia's original to use sequences.
groupBy' :: (a -> a -> Bool) -> Seq.Seq a -> Seq.Seq (Seq.Seq a)
groupBy' _ (Seq.null -> True) = Seq.empty
groupBy' f (Seq.viewl -> x Seq.:< xs) = eqX Seq.<| groupBy' f neqX
  where
    (!eqX, !neqX) = eqTo f Seq.empty (Seq.singleton x) xs

-- | Helper to groupBy'. Altered from lyxia's original to use sequences.
eqTo :: (a -> a -> Bool)
     -> Seq.Seq a
     -> Seq.Seq a
     -> Seq.Seq a
     -> (Seq.Seq a, Seq.Seq a)
eqTo _ acc (Seq.null -> True) zs = (acc, zs)
eqTo f acc (Seq.viewl -> x Seq.:< xs) zs =
    eqTo f (x Seq.<| acc) (eqX Seq.>< xs) neqX
  where
    (!eqX, !neqX) = Seq.partition (f x) zs

-- | Either the sequences are similar or not
compareSeqs :: Identity -> FastaSequence -> FastaSequence -> Bool
compareSeqs identity x y = getIdentity x y >= identity

-- | Get the identity between two sequences
getIdentity :: FastaSequence -> FastaSequence -> Identity
getIdentity xs ys = Identity
                  . (* 100)
                  . (/ (fromIntegral . T.length . fastaSeq $ xs))
                  . negHamming (fastaSeq xs)
                  $ (fastaSeq ys)