{- | Data structures and helper functions for calculating alignments There are two ways to view an alignment: either as a list of edits (i.e., insertions, deletions, or substitutions), or as a set of sequences with inserted gaps. The edit list approach is perhaps more restrictive model but doesn't generalize to multiple alignments. The gap approach is more general, and probably more commonly used by other software (see e.g. the ACE file format). -} {-# LANGUAGE ParallelListComp #-} module Bio.Alignment.AlignData ( -- * Data types for gap-based alignemnts Gaps, Alignment -- * Helper functions , extractGaps, insertGaps -- * Data types for edit-based alignments , Edit(..), EditList, SubstMx, Selector, Chr -- * Helper functions , columns, eval, isRepl, on , showalign, toStrings ) where import qualified Data.ByteString.Lazy as B import qualified Data.ByteString.Lazy.Char8 as BC import Bio.Core.Sequence import Bio.Core.Strand import Data.List (unfoldr) import Data.Word import Data.Char (chr) -- ---------------------------------------- data Sequence = Seq SeqData (Maybe QualData) -- ^ header and actual sequence deriving Eq type Gaps = [Offset] type Alignment a = [(Offset, Strand, Sequence, Gaps)] -- | Gaps are coded as '*'s, this function removes them, and returns -- the sequence along with the list of gap positions. -- note that gaps are positioned relative to the *gapped* sequence -- (contrast to stmassembler/Cluster.hs) extractGaps :: SeqData -> (SeqData, Gaps) extractGaps str = (SeqData {unSD = (BC.filter (/='*') (unSD str))}, map (\x -> Offset {unOff = x}) $ BC.elemIndices '*' (unSD str)) -- todo: faster to lift concat out of the inner loop? insertGaps :: Char -> (SeqData, Gaps) -> SeqData insertGaps c (str', gaps) = SeqData {unSD = go (unSD str') B.empty 0 (map unOff gaps)} where go str acc p (next:rest) = let (a,b) = BC.splitAt (next-p) str in go b (BC.concat [acc,a,BC.pack [c]]) (next+1) rest go str acc _ [] = BC.append acc str -- ---------------------------------------- -- Q&D helper function showalign a = let (s1,s2) = toStrings a in s1++"\n"++s2 -- | turn an alignment into sequences with '-' representing gaps -- (for checking, filtering out the '-' characters should return -- the original sequences, provided '-' isn't part of the sequence -- alphabet) toStrings :: EditList -> (String,String) toStrings [] = ("","") toStrings (x:xs) = let (a1',a2') = toStrings xs chr' = chr . fromIntegral in case x of Ins c -> ('-':a1', chr' c:a2') Del c -> (chr' c:a1', '-':a2') Repl c1 c2 -> (chr' c1:a1', chr' c2:a2') -- | The sequence element type, used in alignments. type Chr = Word8 -- | An Edit is either the insertion, the deletion, -- or the replacement of a character. data Edit = Ins Chr | Del Chr | Repl Chr Chr deriving (Show,Eq) -- | An alignment is a sequence of edits. type EditList = [Edit] -- | True if the Edit is a Repl. isRepl :: Edit -> Bool isRepl (Repl _ _) = True isRepl _ = False -- | A substitution matrix gives scores for replacing a character with another. -- Typically, it will be symmetric. It is type-tagged with the alphabet - Nuc or Amino. type SubstMx t a = (Chr,Chr) -> a -- | Evaluate an Edit based on SubstMx and gap penalty eval :: SubstMx t a -> a -> Edit -> a eval mx g c = case c of Ins _ -> g; Del _ -> g; Repl x y -> mx (x,y) -- | A Selector consists of a zero element, and a funcition -- that chooses a possible Edit operation, and generates an updated result. type Selector a = [(a,Edit)] -> a -- ------------------------------------------------------------ -- | Calculate a set of columns containing scores -- This represents the columns of the alignment matrix, but will only require linear space -- for score calculation. columns :: Selector a -> a -> Sequence -> Sequence -> [[a]] columns f z (Seq s1 _) (Seq s2 _) = columns' f z s1 s2 columns' :: Selector a -> a -> SeqData -> SeqData -> [[a]] columns' f zero s1 s2 = let -- the first column consists of increasing numbers of insertions c0 = zero : map (f.return) (zip c0 (map Ins (B.unpack $ unSD s2))) -- given the previous column, and the remains of s2, calculate the next column mkcol (p0:prev,x) = if B.null x then Nothing else let xi = B.head x ys = B.unpack $ unSD s2 c = f [(p0,Del xi)] : [f [del,ins,rep] | del <- zip prev $ repeat (Del xi) | ins <- zip c $ map Ins ys | rep <- zip (p0:prev) $ map (Repl xi) ys] in Just (c,(c,B.tail x)) in c0 : unfoldr mkcol (c0, unSD s1) on c f x y = c (f x) (f y)