module Bio.Sequence.SeqData
(
Sequence(..), Offset, SeqData,
Qual, QualData
, (!), seqlength,seqlabel,seqheader,seqdata
, (?), hasqual, seqqual
, appendHeader, setHeader
, fromStr, toStr
, defragSeq, seqmap, castSeq
, compl, revcompl, revcompl', Nuc, castToNuc
, Amino(..), translate, fromIUPAC, toIUPAC
, castToAmino
, putSeqLn, seqToStr
, Unknown
) where
import Data.List (unfoldr, intercalate, isPrefixOf)
import Data.Char (toUpper, isNumber)
import Data.Int
import Data.Word
import Data.Maybe (fromJust, isJust)
import qualified Data.ByteString.Lazy.Char8 as B
import qualified Data.ByteString.Lazy as BB
import qualified Data.ByteString as BBS
import Text.Printf (printf)
type Offset = Int64
type SeqData = B.ByteString
type Qual = Word8
type QualData = BB.ByteString
data Sequence t = Seq !SeqData !SeqData !(Maybe QualData)
deriving Eq
data Nuc
data Unknown
castSeq :: Sequence a -> Sequence b
castSeq (Seq h d mq) = Seq h d mq
castToNuc :: Sequence a -> Sequence Nuc
castToNuc = castSeq
castToAmino :: Sequence a -> Sequence Amino
castToAmino = castSeq
instance Show (Sequence a) where
show s = seqToStr s 6 10 []
seqToStr :: Sequence a -> Int -> Int -> [(Int, Int)] -> [Char]
seqToStr (Seq header sd _) chunks len parts =
let strSeq = toStr sd
strHeader = toStr header
identifier = head (words strHeader)
comment = intercalate " " $ tail (words strHeader)
numLength = (length . show . length) strSeq
width = chunks * len + numLength + chunks
format = intercalate "\n" . splits (width)
in
(line "ID" width) ++ (format identifier) ++ "\n\n" ++
(line "COMMENT" width) ++ (format comment) ++ "\n\n" ++
(line "DATA" width) ++ (showDNA strSeq len chunks parts)
putSeqLn :: Sequence a -> Int -> Int -> [(Int, Int)] -> IO ()
putSeqLn s chunks len parts = putStrLn $ seqToStr s chunks len parts
line :: [Char] -> Int -> [Char]
line header width =
header ++ " " ++ (take (width (length header) 1) (repeat '-')) ++ "\n"
splits :: Int -> [t] -> [[t]]
splits _ [] = []
splits width xs =
let (a, b) = splitAt width xs in
[a] ++ (splits width b)
beginHighlight, endHighlight :: String
beginHighlight = "\ESC[7m"
endHighlight = "\ESC[0m"
showDNA :: [Char] -> Int -> Int -> [(Int, Int)] -> [Char]
showDNA xs len chunks highs =
let hs = highlight xs highs
ks = splitsSkip hs len [beginHighlight, endHighlight]
ch = chunkify ks chunks
ns = numberize ch chunks len
ns' = clean ns False
in ns'
numberize :: [[Char]] -> Int -> Int -> [Char]
numberize ls chunks len =
let
numbered = zip [0, (chunks * len)..] ls
numLength = (show . length . show ) (chunks * len * (length ls))
formatStr = "%" ++ numLength ++ "d "
toNum (a,b) = (printf formatStr a) ++ b
in intercalate "\n" (map toNum numbered)
chunkify :: [[Char]] -> Int -> [[Char]]
chunkify [] _n = []
chunkify xs n =
[intercalate " " (take n xs)] ++ (chunkify (drop n xs) n)
clean :: [Char] -> Bool -> [Char]
clean [] _ = []
clean (' ':xs) True = endHighlight ++ " " ++ beginHighlight ++ clean xs True
clean (' ':xs) False = " " ++ clean xs False
clean ('\n':xs) True =
let (num, dna) = span (\c -> isNumber c) xs
in endHighlight ++ "\n" ++ num ++ beginHighlight ++ clean dna True
clean ('\n':xs) False = "\n" ++ clean xs False
clean str@(x:xs) isHigh =
let a = isPrefixOf beginHighlight str
b = isPrefixOf endHighlight str
in if a then x : clean xs True
else if b then x : clean xs False
else x : clean xs isHigh
highlight :: [Char] -> [(Int, Int)] -> [Char]
highlight ss [] = ss
highlight ss ((start, end):rest) =
let sl = [beginHighlight, endHighlight]
a = insert ss start beginHighlight sl
b = insert a (end+1) endHighlight sl
in highlight b rest
splitsSkip :: (Eq t) => [t] -> Int -> [[t]] -> [[t]]
splitsSkip [] _ _ = []
splitsSkip xs len skips =
(takeSkip xs len skips) : splitsSkip (dropSkip xs len skips) len skips
takeSkip :: (Eq a) => [a] -> Int -> [[a]] -> [a]
takeSkip [] _ _ = []
takeSkip _ 0 _ = []
takeSkip str@(x:xs) n skips =
let n' = case (testPrefixes str skips) of
Nothing -> n1
Just len -> n + len 1
in x : takeSkip xs n' skips
dropSkip :: (Eq a) => [a] -> Int -> [[a]] -> [a]
dropSkip [] _ _ = []
dropSkip xs 0 _ = xs
dropSkip str@(_:xs) n skips =
let n' = case (testPrefixes str skips) of
Nothing -> n1
Just len -> n + len 1
in dropSkip xs n' skips
insert :: (Eq a) => [a] -> Int -> [a] -> [[a]] -> [a]
insert [] _ _ _ = []
insert xs 0 e _ = e ++ xs
insert str@(x:xs) pos e skips =
let pos' = case (testPrefixes str skips) of
Nothing -> pos1
Just len -> pos + len 1
in x : insert xs pos' e skips
testPrefixes :: (Eq a) => [a] -> [[a]] -> Maybe Int
testPrefixes _ [] = Nothing
testPrefixes xs (p:ps) =
if (isPrefixOf p xs)
then return (length p)
else testPrefixes xs ps
fromStr :: String -> SeqData
fromStr = B.pack
toStr :: SeqData -> String
toStr = B.unpack
(!) :: Sequence a -> Offset -> Char
(!) (Seq _ bs _) = B.index bs
(?) :: Sequence a -> Offset -> Qual
(?) (Seq _ _ (Just qs)) = BB.index qs
(?) (Seq _ _ Nothing) =
error "Attempting to use 'seqqual' on sequence without associated quality data"
seqlength :: Sequence a -> Offset
seqlength (Seq _ bs _) = B.length bs
seqlabel :: Sequence a -> SeqData
seqlabel (Seq l _ _) = case B.words l of (x:_) -> x; [] -> B.empty
seqheader :: Sequence a -> SeqData
seqheader (Seq l _ _) = l
seqdata :: Sequence a -> SeqData
seqdata (Seq _ d _) = d
seqqual :: Sequence a -> QualData
seqqual (Seq _ _ (Just q)) = q
seqqual (Seq _ _ Nothing) =
error "Attempting to use 'seqqual' on sequence without associated quality data"
hasqual :: Sequence a -> Bool
hasqual (Seq _ _ q) = isJust q
appendHeader, setHeader :: Sequence a -> String -> Sequence a
appendHeader (Seq h d q) t = (Seq (B.append h (B.pack (" "++t))) d q)
setHeader s@(Seq _ d q) t = (Seq (B.append (seqlabel s) (B.pack (" "++t))) d q)
defragSeq :: Sequence t -> Sequence t
defragSeq (Seq name sequ qual) = Seq (defragB name) (defragB sequ) (fmap defragBB qual)
where defragB = B.fromChunks . (: []) . BBS.concat . B.toChunks
defragBB = BB.fromChunks . (: []) . BBS.concat . BB.toChunks
seqmap :: ((Char,Qual) -> (Char,Qual)) -> Sequence t -> Sequence t
seqmap f (Seq n ds Nothing) = Seq n xs Nothing
where xs = B.map (\x -> fst $ f (x,e)) ds
e = error "seqmap: tried to access qual data, but none available"
seqmap f (Seq n ds (Just qs)) = Seq n (B.pack xs) (Just $ BB.pack ys)
where (xs,ys) = unzip $ map f $ zip (B.unpack ds) (BB.unpack qs)
revcompl :: Sequence Nuc -> Sequence Nuc
revcompl (Seq l bs qs) = Seq l (revcompl' bs)
(maybe Nothing (Just . BB.reverse) qs)
revcompl' :: SeqData -> SeqData
revcompl' = B.map compl . B.reverse
compl :: Char -> Char
compl 'A' = 'T'
compl 'T' = 'A'
compl 'C' = 'G'
compl 'G' = 'C'
compl 'a' = 't'
compl 't' = 'a'
compl 'c' = 'g'
compl 'g' = 'c'
compl x = x
translate :: Sequence Nuc -> Offset -> [Amino]
translate s' o' = unfoldr triples (s',o')
where triples (s,o) =
if o > seqlength s 3 then Nothing
else Just (trans1 (map (s!) [o,o+1,o+2]),(s,o+3))
trans1 :: String -> Amino
trans1 = maybe Xaa id . flip lookup trans_tbl . map (repUT . toUpper)
where repUT x = if x == 'U' then 'T' else x
data Amino = Ala | Arg | Asn | Asp | Cys | Gln | Glu | Gly
| His | Ile | Leu | Lys | Met | Phe | Pro | Ser
| Thr | Tyr | Trp | Val
| STP | Asx | Glx | Xle | Xaa
deriving (Show,Eq)
trans_tbl :: [(String,Amino)]
trans_tbl = [("GCT",Ala),("GCC",Ala),("GCA",Ala),("GCG",Ala),
("CGT",Arg),("CGC",Arg),("CGA",Arg),("CGG",Arg),("AGA",Arg),("AGG",Arg),
("AAT",Asn),("AAC",Asn),
("GAT",Asp),("GAC",Asp),
("TGT",Cys),("TGC",Cys),
("CAA",Gln),("CAG",Gln),
("GAA",Glu),("GAG",Glu),
("GGT",Gly),("GGC",Gly),("GGA",Gly),("GGG",Gly),
("CAT",His),("CAC",His),
("ATT",Ile),("ATC",Ile),("ATA",Ile),
("TTA",Leu),("TTG",Leu),("CTT",Leu),("CTC",Leu),("CTA",Leu),("CTG",Leu),
("AAA",Lys),("AAG",Lys),
("ATG",Met),
("TTT",Phe),("TTC",Phe),
("CCT",Pro),("CCC",Pro),("CCA",Pro),("CCG",Pro),
("TCT",Ser),("TCC",Ser),("TCA",Ser),("TCG",Ser),("AGT",Ser),("AGC",Ser),
("ACT",Thr),("ACC",Thr),("ACA",Thr),("ACG",Thr),
("TAT",Tyr),("TAC",Tyr),
("TGG",Trp),
("GTT",Val),("GTC",Val),("GTA",Val),("GTG",Val),
("TAG",STP),("TGA",STP),("TAA",STP)
]
toIUPAC :: [Amino] -> SeqData
toIUPAC = B.pack . map (fromJust . flip lookup iupac)
fromIUPAC :: SeqData -> [Amino]
fromIUPAC = map (maybe Xaa id . flip lookup iupac' . toUpper) . B.unpack
iupac :: [(Amino,Char)]
iupac = [(Ala,'A') ,(Arg,'R') ,(Asn,'N')
,(Asp,'D') ,(Cys,'C') ,(Gln,'Q')
,(Glu,'E') ,(Gly,'G') ,(His,'H')
,(Ile,'I') ,(Leu,'L') ,(Lys,'K')
,(Met,'M') ,(Phe,'F') ,(Pro,'P')
,(Ser,'S') ,(Thr,'T') ,(Tyr,'Y')
,(Trp,'W') ,(Val,'V')
,(Asx,'B')
,(Glx,'Z')
,(Xle,'J')
,(STP,'X') ,(Xaa,'X')
]
iupac' :: [(Char,Amino)]
iupac' = map (\(a,b)->(b,a)) iupac