Haskell Code by HsColour

{- | Data structures and helper functions for calculating alignments

   There are two ways to view an alignment: either as a list of edits
   (i.e., insertions, deletions, or substitutions), or as a set of sequences
   with inserted gaps.

   The edit list approach is perhaps more restrictive model but doesn't generalize
   to multiple alignments.

   The gap approach is more general, and probably more commonly used by other software
   (see e.g. the ACE file format).

-}

{-# OPTIONS -fglasgow-exts #-}

module Bio.Alignment.AlignData (
    -- * Data types for gap-based alignemnts
    Dir(..), Gaps, Alignment
    -- * Helper functions
    , extractGaps, insertGaps
    -- * Data types for edit-based alignments
    , Edit(..), EditList, SubstMx, Selector, Chr
    -- * Helper functions
    , columns, eval, isRepl, on
    , showalign, toStrings
    ) where

import qualified Data.ByteString.Lazy as B
import qualified Data.ByteString.Lazy.Char8 as BC
import Bio.Sequence.SeqData
import Data.List (unfoldr)
import Data.Word
import Data.Char (chr)

-- ----------------------------------------

data Dir = Fwd | Rev deriving (Eq,Show)
type Gaps = [Offset]
type Alignment = [(Offset,Dir,Sequence,Gaps)]

-- | Gaps are coded as '*'s, this function removes them, and returns
--   the sequence along with the list of gap positions.
extractGaps :: SeqData -> (SeqData,Gaps)
extractGaps str = (BC.filter (/='*') str,BC.elemIndices '*' str)

-- todo: faster to lift concat out of the inner loop?
insertGaps :: Char -> (SeqData,Gaps) -> SeqData
insertGaps c (str',gaps) = go str' B.empty 0 gaps
    where go str acc p (next:rest) = let (a,b) = BC.splitAt (next-p) str
                                     in go b (BC.concat [acc,a,BC.pack [c]]) (next+1) rest
          go str acc _ [] = BC.append acc str

-- ----------------------------------------

-- Q&D helper function
showalign a = let (s1,s2) = toStrings a in s1++"\n"++s2

-- | turn an alignment into sequences with '-' representing gaps
-- (for checking, filtering out the '-' characters should return
-- the original sequences, provided '-' isn't part of the sequence
-- alphabet)
toStrings :: EditList -> (String,String)
toStrings [] = ("","")
toStrings (x:xs) = let (a1',a2') = toStrings xs
                       chr' = chr . fromIntegral
                   in case x of Ins c -> ('-':a1', chr' c:a2')
                                Del c -> (chr' c:a1', '-':a2')
                                Repl c1 c2 -> (chr' c1:a1', chr' c2:a2')

-- | The sequence element type, used in alignments.
type Chr = Word8

-- | An Edit is either the insertion, the deletion,
--   or the replacement of a character.
data Edit = Ins Chr | Del Chr | Repl Chr Chr deriving (Show,Eq)

-- | An alignment is a sequence of edits.
type EditList = [Edit]

-- | True if the Edit is a Repl.
isRepl :: Edit -> Bool
isRepl (Repl _ _) = True
isRepl _ = False

-- | A substitution matrix gives scores for replacing a character with another.
--   Typically, it will be symmetric.
type SubstMx a = (Chr,Chr) -> a

-- | Evaluate an Edit based on SubstMx and gap penalty
eval :: SubstMx a -> a -> Edit -> a
eval mx g c = case c of Ins _ -> g; Del _ -> g; Repl x y -> mx (x,y)

-- | A Selector consists of a zero element, and a funcition
--   that chooses a possible Edit operation, and generates an updated result.
type Selector a = [(a,Edit)] -> a

-- ------------------------------------------------------------
-- | Calculate a set of columns containing scores
--   This represents the columns of the alignment matrix, but will only require linear space
--   for score calculation.
columns :: Selector a -> a -> Sequence -> Sequence -> [[a]]
columns f z (Seq _ s1 _) (Seq _ s2 _) = columns' f z s1 s2

columns' :: Selector a -> a -> SeqData -> SeqData -> [[a]]
columns' f zero s1 s2 = let
        -- the first column consists of increasing numbers of insertions
        c0 = zero : map (f.return) (zip c0 (map Ins (B.unpack s2)))
        -- given the previous column, and the remains of s2, calculate the next column
        mkcol (p0:prev,x) = if B.null x then Nothing
                            else let xi = B.head x
                                     ys = B.unpack s2
                                     c  = f [(p0,Del xi)] : [f [del,ins,rep] | del <- zip prev $ repeat (Del xi)
                                                                             | ins <- zip c $ map Ins ys
                                                                             | rep <- zip (p0:prev) $ map (Repl xi) ys]
                                 in Just (c,(c,B.tail x))
    in c0 : unfoldr mkcol (c0,s1)

on c f x y = c (f x) (f y)