module Bio.Sequence.SeqData
(
Sequence(..), Offset, SeqData,
Qual, QualData
, (!), seqlength,seqlabel,seqheader,seqdata
, (?), hasqual, seqqual
, appendHeader, setHeader
, fromStr, toStr
, compl, revcompl
, Amino(..), translate, fromIUPAC, toIUPAC
) where
import Data.List (unfoldr)
import Data.Char (toUpper)
import Data.Int
import Data.Word
import Data.Maybe (fromJust, isJust)
import qualified Data.ByteString.Lazy.Char8 as B
import qualified Data.ByteString.Lazy as BB
type Offset = Int64
type SeqData = B.ByteString
type Qual = Word8
type QualData = BB.ByteString
data Sequence = Seq !SeqData !SeqData !(Maybe QualData)
deriving (Show,Eq)
fromStr :: String -> SeqData
fromStr = B.pack
toStr :: SeqData -> String
toStr = B.unpack
(!) :: Sequence -> Offset -> Char
(!) (Seq _ bs _) = B.index bs
(?) :: Sequence -> Offset -> Qual
(?) (Seq _ _ (Just qs)) = BB.index qs
(?) (Seq _ _ Nothing) =
error "Attempting to use 'seqqual' on sequence without associated quality data"
seqlength :: Sequence -> Offset
seqlength (Seq _ bs _) = B.length bs
seqlabel :: Sequence -> SeqData
seqlabel (Seq l _ _) = head (B.words l)
seqheader :: Sequence -> SeqData
seqheader (Seq l _ _) = l
seqdata :: Sequence -> SeqData
seqdata (Seq _ d _) = d
seqqual :: Sequence -> QualData
seqqual (Seq _ _ (Just q)) = q
seqqual (Seq _ _ Nothing) =
error "Attempting to use 'seqqual' on sequence without associated quality data"
hasqual :: Sequence -> Bool
hasqual (Seq _ _ q) = isJust q
appendHeader, setHeader :: Sequence -> String -> Sequence
appendHeader (Seq h d q) t = (Seq (B.append h (B.pack (" "++t))) d q)
setHeader (Seq h d q) t = (Seq (B.append (head $ B.words h) (B.pack (" "++t))) d q)
revcompl :: Sequence -> Sequence
revcompl (Seq l bs qs) = Seq l (B.reverse $ B.map compl bs)
(maybe Nothing (Just . BB.reverse) qs)
compl :: Char -> Char
compl 'A' = 'T'
compl 'T' = 'A'
compl 'C' = 'G'
compl 'G' = 'C'
compl 'a' = 't'
compl 't' = 'a'
compl 'c' = 'g'
compl 'g' = 'c'
compl x = x
translate :: Sequence -> Offset -> [Amino]
translate s' o' = unfoldr triples (s',o')
where triples (s,o) =
if o > seqlength s 3 then Nothing
else Just (trans1 (map (s!) [o,o+1,o+2]),(s,o+3))
trans1 :: String -> Amino
trans1 = maybe Xaa id . flip lookup trans_tbl . map (repUT . toUpper)
where repUT x = if x == 'U' then 'T' else x
data Amino = Ala | Arg | Asn | Asp | Cys | Gln | Glu | Gly
| His | Ile | Leu | Lys | Met | Phe | Pro | Ser
| Thr | Tyr | Trp | Val
| STP | Asx | Glx | Xle | Xaa
deriving (Show,Eq)
trans_tbl :: [(String,Amino)]
trans_tbl = [("GCT",Ala),("GCC",Ala),("GCA",Ala),("GCG",Ala),
("CGT",Arg),("CGC",Arg),("CGA",Arg),("CGG",Arg),("AGA",Arg),("AGG",Arg),
("AAT",Asn),("AAC",Asn),
("GAT",Asp),("GAC",Asp),
("TGT",Cys),("TGC",Cys),
("CAA",Gln),("CAG",Gln),
("GAA",Glu),("GAG",Glu),
("GGT",Gly),("GGC",Gly),("GGA",Gly),("GGG",Gly),
("CAT",His),("CAC",His),
("ATT",Ile),("ATC",Ile),("ATA",Ile),
("TTA",Leu),("TTG",Leu),("CTT",Leu),("CTC",Leu),("CTA",Leu),("CTG",Leu),
("AAA",Lys),("AAG",Lys),
("ATG",Met),
("TTT",Phe),("TTC",Phe),
("CCT",Pro),("CCC",Pro),("CCA",Pro),("CCG",Pro),
("TCT",Ser),("TCC",Ser),("TCA",Ser),("TCG",Ser),("AGT",Ser),("AGC",Ser),
("ACT",Thr),("ACC",Thr),("ACA",Thr),("ACG",Thr),
("TAT",Tyr),("TAC",Tyr),
("TGG",Trp),
("GTT",Val),("GTC",Val),("GTA",Val),("GTG",Val),
("TAG",STP),("TGA",STP),("TAA",STP)
]
toIUPAC :: [Amino] -> SeqData
toIUPAC = B.pack . map (fromJust . flip lookup iupac)
fromIUPAC :: SeqData -> [Amino]
fromIUPAC = map (maybe Xaa id . flip lookup iupac') . B.unpack
iupac :: [(Amino,Char)]
iupac = [(Ala,'A') ,(Arg,'R') ,(Asn,'N')
,(Asp,'D') ,(Cys,'C') ,(Gln,'Q')
,(Glu,'E') ,(Gly,'G') ,(His,'H')
,(Ile,'I') ,(Leu,'L') ,(Lys,'K')
,(Met,'M') ,(Phe,'F') ,(Pro,'P')
,(Ser,'S') ,(Thr,'T') ,(Tyr,'Y')
,(Trp,'W') ,(Val,'V')
,(Asx,'B')
,(Glx,'Z')
,(Xle,'J')
,(STP,'X') ,(Xaa,'X')
]
iupac' :: [(Char,Amino)]
iupac' = map (\(a,b)->(b,a)) iupac