{- | Data structures and helper functions for calculating alignments There are two ways to view an alignment: either as a list of edits (i.e., insertions, deletions, or substitutions), or as a set of sequences with inserted gaps. The edit list approach is perhaps more restrictive model but doesn't generalize to multiple alignments. The gap approach is more general, and probably more commonly used by other software (see e.g. the ACE file format). -} {-# OPTIONS -fglasgow-exts #-} module Bio.Alignment.AlignData ( -- * Data types for gap-based alignemnts Dir(..), Gaps, Alignment -- * Helper functions , extractGaps, insertGaps -- * Data types for edit-based alignments , Edit(..), EditList, SubstMx, Selector, Chr -- * Helper functions , columns, eval, isRepl, on , showalign, toStrings ) where import qualified Data.ByteString.Lazy as B import qualified Data.ByteString.Lazy.Char8 as BC import Bio.Sequence.SeqData import Data.List (unfoldr) import Data.Word import Data.Char (chr) -- ---------------------------------------- data Dir = Fwd | Rev deriving (Eq,Show) type Gaps = [Offset] type Alignment = [(Offset,Dir,Sequence,Gaps)] -- | Gaps are coded as '*'s, this function removes them, and returns -- the sequence along with the list of gap positions. extractGaps :: SeqData -> (SeqData,Gaps) extractGaps str = (BC.filter (/='*') str,BC.elemIndices '*' str) -- todo: faster to lift concat out of the inner loop? insertGaps :: Char -> (SeqData,Gaps) -> SeqData insertGaps c (str',gaps) = go str' B.empty 0 gaps where go str acc p (next:rest) = let (a,b) = BC.splitAt (next-p) str in go b (BC.concat [acc,a,BC.pack [c]]) (next+1) rest go str acc _ [] = BC.append acc str -- ---------------------------------------- -- Q&D helper function showalign a = let (s1,s2) = toStrings a in s1++"\n"++s2 -- | turn an alignment into sequences with '-' representing gaps -- (for checking, filtering out the '-' characters should return -- the original sequences, provided '-' isn't part of the sequence -- alphabet) toStrings :: EditList -> (String,String) toStrings [] = ("","") toStrings (x:xs) = let (a1',a2') = toStrings xs chr' = chr . fromIntegral in case x of Ins c -> ('-':a1', chr' c:a2') Del c -> (chr' c:a1', '-':a2') Repl c1 c2 -> (chr' c1:a1', chr' c2:a2') -- | The sequence element type, used in alignments. type Chr = Word8 -- | An Edit is either the insertion, the deletion, -- or the replacement of a character. data Edit = Ins Chr | Del Chr | Repl Chr Chr deriving (Show,Eq) -- | An alignment is a sequence of edits. type EditList = [Edit] -- | True if the Edit is a Repl. isRepl :: Edit -> Bool isRepl (Repl _ _) = True isRepl _ = False -- | A substitution matrix gives scores for replacing a character with another. -- Typically, it will be symmetric. type SubstMx a = (Chr,Chr) -> a -- | Evaluate an Edit based on SubstMx and gap penalty eval :: SubstMx a -> a -> Edit -> a eval mx g c = case c of Ins _ -> g; Del _ -> g; Repl x y -> mx (x,y) -- | A Selector consists of a zero element, and a funcition -- that chooses a possible Edit operation, and generates an updated result. type Selector a = [(a,Edit)] -> a -- ------------------------------------------------------------ -- | Calculate a set of columns containing scores -- This represents the columns of the alignment matrix, but will only require linear space -- for score calculation. columns :: Selector a -> a -> Sequence -> Sequence -> [[a]] columns f z (Seq _ s1 _) (Seq _ s2 _) = columns' f z s1 s2 columns' :: Selector a -> a -> SeqData -> SeqData -> [[a]] columns' f zero s1 s2 = let -- the first column consists of increasing numbers of insertions c0 = zero : map (f.return) (zip c0 (map Ins (B.unpack s2))) -- given the previous column, and the remains of s2, calculate the next column mkcol (p0:prev,x) = if B.null x then Nothing else let xi = B.head x ys = B.unpack s2 c = f [(p0,Del xi)] : [f [del,ins,rep] | del <- zip prev $ repeat (Del xi) | ins <- zip c $ map Ins ys | rep <- zip (p0:prev) $ map (Repl xi) ys] in Just (c,(c,B.tail x)) in c0 : unfoldr mkcol (c0,s1) on c f x y = c (f x) (f y)