{- | Data structures for manipulating (biological) sequences. Generally supports both nucleotide and protein sequences, some functions, like @revcompl@, only makes sense for nucleotides. -} {-# LANGUAGE EmptyDataDecls #-} module Bio.Sequence.SeqData ( -- * Data structure -- | A sequence is a header, sequence data itself, and optional quality data. -- Sequences are type-tagged to identify them as nucleotide, amino acids, -- or unknown type. -- All items are lazy bytestrings. The Offset type can be used for indexing. Sequence(..), Offset, SeqData, -- | Quality data is normally associated with nucleotide sequences Qual, QualData -- * Accessor functions , (!), seqlength,seqlabel,seqheader,seqdata , (?), hasqual, seqqual -- * Adding information to header , appendHeader, setHeader -- * Converting to and from [Char] , fromStr, toStr -- * Sequence utilities , defragSeq, seqmap, castSeq -- * Nucleotide functionality -- | Nucleotide sequences contain the alphabet [A,C,G,T]. -- IUPAC specifies an extended nucleotide alphabet with wildcards, but -- it is not supported at this point. , compl, revcompl, revcompl', Nuc, castToNuc -- * Protein functionality -- | Proteins are chains of amino acids, represented by the IUPAC alphabet. , Amino(..), translate, fromIUPAC, toIUPAC -- amino acids , castToAmino -- * Display a nicely formated sequence. , putSeqLn, seqToStr -- * Default type for sequences , Unknown ) where import Data.List (unfoldr, intercalate, isPrefixOf) import Data.Char (toUpper, isNumber) import Data.Int import Data.Word import Data.Maybe (fromJust, isJust) import qualified Data.ByteString.Lazy.Char8 as B import qualified Data.ByteString.Lazy as BB import qualified Data.ByteString as BBS import Text.Printf (printf) -- | An offset, index, or length of a 'SeqData' type Offset = Int64 -- | The basic data type used in 'Sequence's type SeqData = B.ByteString -- | Basic type for quality data. Range 0..255. Typical Phred output is in -- the range 6..50, with 20 as the line in the sand separating good from bad. type Qual = Word8 -- | Quality data is a 'Qual' vector, currently implemented as a 'ByteString'. type QualData = BB.ByteString -- mild abuse -- | A sequence consists of a header, the sequence data itself, and optional quality data. -- The type parameter is a phantom type to separate nucleotide and amino acid sequences data Sequence t = Seq !SeqData !SeqData !(Maybe QualData) -- ^ header and actual sequence deriving Eq -- | For type tagging sequences (protein sequences use 'Amino' below) data Nuc data Unknown -- | Phantom type functionality, unchecked conversion between sequence types castSeq :: Sequence a -> Sequence b castSeq (Seq h d mq) = Seq h d mq castToNuc :: Sequence a -> Sequence Nuc castToNuc = castSeq castToAmino :: Sequence a -> Sequence Amino castToAmino = castSeq -- | A more arranged show instance for Sequences reassembling the display of -- the fasta-format instance Show (Sequence a) where show s = seqToStr s 6 10 [] -- | Returns a properly formatted and probably highlighted string -- | representation of a sequence. Highlighting is done using ANSI-Escape -- | sequences. seqToStr :: Sequence a -> Int -> Int -> [(Int, Int)] -> [Char] seqToStr (Seq header sd _) chunks len parts = let strSeq = toStr sd strHeader = toStr header identifier = head (words strHeader) comment = intercalate " " $ tail (words strHeader) numLength = (length . show . length) strSeq width = chunks * len + numLength + chunks format = intercalate "\n" . splits (width) in (line "ID" width) ++ (format identifier) ++ "\n\n" ++ (line "COMMENT" width) ++ (format comment) ++ "\n\n" ++ (line "DATA" width) ++ (showDNA strSeq len chunks parts) -- | A simple function to display a sequence: we generate the sequence string and -- | call putStrLn putSeqLn :: Sequence a -> Int -> Int -> [(Int, Int)] -> IO () putSeqLn s chunks len parts = putStrLn $ seqToStr s chunks len parts -- Creates a header line of length n line :: [Char] -> Int -> [Char] line header width = header ++ " " ++ (take (width - (length header) - 1) (repeat '-')) ++ "\n" -- | Splits a string into parts of size width. The last element can be shorter. splits :: Int -> [t] -> [[t]] splits _ [] = [] splits width xs = let (a, b) = splitAt width xs in [a] ++ (splits width b) -- Highlighting and Dehighlighting ANSI-Escape sequences beginHighlight, endHighlight :: String beginHighlight = "\ESC[7m" endHighlight = "\ESC[0m" -- Creates a string representation of the dna-part of a sequence. We can either -- highlight specific areas, or, if this is empty, numberize the rows. At the -- moment, both at the same time is not possible (see comment in the numberize- -- function). If you want numberized output, take a look at the showDNANumber- -- function. showDNA :: [Char] -> Int -> Int -> [(Int, Int)] -> [Char] showDNA xs len chunks highs = let hs = highlight xs highs ks = splitsSkip hs len [beginHighlight, endHighlight] ch = chunkify ks chunks ns = numberize ch chunks len ns' = clean ns False in ns' -- Add orientation numbers to each line. numberize :: [[Char]] -> Int -> Int -> [Char] numberize ls chunks len = let -- The initial numbering of the single elements in lines numbered = zip [0, (chunks * len)..] ls -- Create the format string with the right alignment, depending on the -- number of elements in lines numLength = (show . length . show ) (chunks * len * (length ls)) formatStr = "%" ++ numLength ++ "d " toNum (a,b) = (printf formatStr a) ++ b in intercalate "\n" (map toNum numbered) -- Concatenates each n elements of a list. chunkify :: [[Char]] -> Int -> [[Char]] chunkify [] _n = [] chunkify xs n = [intercalate " " (take n xs)] ++ (chunkify (drop n xs) n) -- Highlighting should not occur on spaces or over newline borders. We walk -- through a generated string and add appropiate stop-highlight and start- -- highlight markers. -- -- This is one of the functions which can be definitely optimized by a seasoned -- Haskell programmer. clean :: [Char] -> Bool -> [Char] clean [] _ = [] -- Handling spaces clean (' ':xs) True = endHighlight ++ " " ++ beginHighlight ++ clean xs True clean (' ':xs) False = " " ++ clean xs False -- Newlines and numbering clean ('\n':xs) True = let (num, dna) = span (\c -> isNumber c) xs in endHighlight ++ "\n" ++ num ++ beginHighlight ++ clean dna True clean ('\n':xs) False = "\n" ++ clean xs False -- Checking for highlighting at all clean str@(x:xs) isHigh = -- Special cases: check for the beginning or ending of highlighting let a = isPrefixOf beginHighlight str b = isPrefixOf endHighlight str in if a then x : clean xs True else if b then x : clean xs False else x : clean xs isHigh -- Highlights a range of strings. Each start and end to be highlighted is -- described by a position-tuple (start, end). highlight :: [Char] -> [(Int, Int)] -> [Char] highlight ss [] = ss highlight ss ((start, end):rest) = let sl = [beginHighlight, endHighlight] a = insert ss start beginHighlight sl b = insert a (end+1) endHighlight sl in highlight b rest -- Generates lists of length len from xs, skipping sublist in skips while -- counting list elements. splitsSkip :: (Eq t) => [t] -> Int -> [[t]] -> [[t]] splitsSkip [] _ _ = [] splitsSkip xs len skips = (takeSkip xs len skips) : splitsSkip (dropSkip xs len skips) len skips -- Emulates the normal take behaviour, skipping over sublists in skips. takeSkip :: (Eq a) => [a] -> Int -> [[a]] -> [a] takeSkip [] _ _ = [] takeSkip _ 0 _ = [] takeSkip str@(x:xs) n skips = let n' = case (testPrefixes str skips) of Nothing -> n-1 Just len -> n + len - 1 in x : takeSkip xs n' skips -- Emulates the normal drop behaviour, skipping over sublists in skips. dropSkip :: (Eq a) => [a] -> Int -> [[a]] -> [a] dropSkip [] _ _ = [] dropSkip xs 0 _ = xs dropSkip str@(_:xs) n skips = let n' = case (testPrefixes str skips) of Nothing -> n-1 Just len -> n + len - 1 in dropSkip xs n' skips -- Another definition of insert, which provides support for handling skipping -- of strings. Inserts elem *before* pos into xs, but skips all strings in -- skips while calculating the specified position insert :: (Eq a) => [a] -> Int -> [a] -> [[a]] -> [a] insert [] _ _ _ = [] insert xs 0 e _ = e ++ xs insert str@(x:xs) pos e skips = let pos' = case (testPrefixes str skips) of Nothing -> pos-1 Just len -> pos + len - 1 in x : insert xs pos' e skips -- Test, if one of the prefixes is the prefix of xs. If yes, return it's length -- else Nothing testPrefixes :: (Eq a) => [a] -> [[a]] -> Maybe Int testPrefixes _ [] = Nothing testPrefixes xs (p:ps) = if (isPrefixOf p xs) then return (length p) else testPrefixes xs ps -- | Convert a String to 'SeqData' fromStr :: String -> SeqData fromStr = B.pack -- | Convert a 'SeqData' to a String toStr :: SeqData -> String toStr = B.unpack -- | Read the character at the specified position in the sequence. {-# INLINE (!) #-} (!) :: Sequence a -> Offset -> Char (!) (Seq _ bs _) = B.index bs {-# INLINE (?) #-} (?) :: Sequence a -> Offset -> Qual (?) (Seq _ _ (Just qs)) = BB.index qs (?) (Seq _ _ Nothing) = error "Attempting to use 'seqqual' on sequence without associated quality data" -- | Return sequence length. seqlength :: Sequence a -> Offset seqlength (Seq _ bs _) = B.length bs -- | Return sequence label (first word of header) seqlabel :: Sequence a -> SeqData seqlabel (Seq l _ _) = head (B.words l) -- | Return full header. seqheader :: Sequence a -> SeqData seqheader (Seq l _ _) = l -- | Return the sequence data. seqdata :: Sequence a -> SeqData seqdata (Seq _ d _) = d -- | Return the quality data, or error if none exist. Use hasqual if in doubt. seqqual :: Sequence a -> QualData seqqual (Seq _ _ (Just q)) = q seqqual (Seq _ _ Nothing) = error "Attempting to use 'seqqual' on sequence without associated quality data" -- | Check whether the sequence has associated quality data. hasqual :: Sequence a -> Bool hasqual (Seq _ _ q) = isJust q -- | Modify the header by appending text, or by replacing -- all but the sequence label (i.e. first word). appendHeader, setHeader :: Sequence a -> String -> Sequence a appendHeader (Seq h d q) t = (Seq (B.append h (B.pack (" "++t))) d q) setHeader (Seq h d q) t = (Seq (B.append (head $ B.words h) (B.pack (" "++t))) d q) -- | Returns a sequence with all internal storage freshly copied and -- with sequence and quality data present as a single chunk. -- -- By freshly copying internal storage, 'defragSeq' allows garbage -- collection of the original data source whence the sequence was -- read; otherwise, use of just a short sequence name can cause an -- entire sequence file buffer to be retained. -- -- By compacting sequence data into a single chunk, 'defragSeq' avoids -- linear-time traversal of sequence chunks during random access into -- sequence data. defragSeq :: Sequence t -> Sequence t defragSeq (Seq name sequ qual) = Seq (defragB name) (defragB sequ) (fmap defragBB qual) where defragB = B.fromChunks . (: []) . BBS.concat . B.toChunks defragBB = BB.fromChunks . (: []) . BBS.concat . BB.toChunks -- | map over sequences, treating them as a sequence of (char,word8) pairs. -- This will work on sequences without quality, as long as the function doesn't -- try to examine it. -- The current implementation is not very efficient. seqmap :: ((Char,Qual) -> (Char,Qual)) -> Sequence t -> Sequence t seqmap f (Seq n ds Nothing) = Seq n xs Nothing where xs = B.map (\x -> fst $ f (x,e)) ds e = error "seqmap: tried to access qual data, but none available" seqmap f (Seq n ds (Just qs)) = Seq n (B.pack xs) (Just $ BB.pack ys) where (xs,ys) = unzip $ map f $ zip (B.unpack ds) (BB.unpack qs) ------------------------------------------------------------ -- Nucleotide (DNA, RNA) specific stuff ------------------------------------------------------------ -- | Calculate the reverse complement. -- This is only relevant for the nucleotide alphabet, -- and it leaves other characters unmodified. revcompl :: Sequence Nuc -> Sequence Nuc revcompl (Seq l bs qs) = Seq l (revcompl' bs) (maybe Nothing (Just . BB.reverse) qs) -- | Calculate the reverse complent for SeqData only. revcompl' :: SeqData -> SeqData revcompl' = B.map compl . B.reverse -- | Complement a single character. I.e. identify the nucleotide it -- can hybridize with. Note that for multiple nucleotides, you usually -- want the reverse complement (see 'revcompl' for that). compl :: Char -> Char compl 'A' = 'T' compl 'T' = 'A' compl 'C' = 'G' compl 'G' = 'C' compl 'a' = 't' compl 't' = 'a' compl 'c' = 'g' compl 'g' = 'c' compl x = x ------------------------------------------------------------ -- Amino acid (protein) stuff ------------------------------------------------------------ -- | Translate a nucleotide sequence into the corresponding protein -- sequence. This works rather blindly, with no attempt to identify ORFs -- or otherwise QA the result. translate :: Sequence Nuc -> Offset -> [Amino] translate s' o' = unfoldr triples (s',o') where triples (s,o) = if o > seqlength s - 3 then Nothing else Just (trans1 (map (s!) [o,o+1,o+2]),(s,o+3)) -- prop_trans s = tail (translate s 0) == translate s 3 trans1 :: String -> Amino trans1 = maybe Xaa id . flip lookup trans_tbl . map (repUT . toUpper) where repUT x = if x == 'U' then 'T' else x -- RNA uses U for T data Amino = Ala | Arg | Asn | Asp | Cys | Gln | Glu | Gly | His | Ile | Leu | Lys | Met | Phe | Pro | Ser | Thr | Tyr | Trp | Val | STP | Asx | Glx | Xle | Xaa -- unknowns deriving (Show,Eq) trans_tbl :: [(String,Amino)] trans_tbl = [("GCT",Ala),("GCC",Ala),("GCA",Ala),("GCG",Ala), ("CGT",Arg),("CGC",Arg),("CGA",Arg),("CGG",Arg),("AGA",Arg),("AGG",Arg), ("AAT",Asn),("AAC",Asn), ("GAT",Asp),("GAC",Asp), -- ("RAT",Asx),("RAC",Asx), -- IUPAC: R is purine (A or G) ("TGT",Cys),("TGC",Cys), ("CAA",Gln),("CAG",Gln), ("GAA",Glu),("GAG",Glu), -- ("SAA",Glx),("SAG",Glx), -- IUPAC: S is C or G ("GGT",Gly),("GGC",Gly),("GGA",Gly),("GGG",Gly), ("CAT",His),("CAC",His), ("ATT",Ile),("ATC",Ile),("ATA",Ile), ("TTA",Leu),("TTG",Leu),("CTT",Leu),("CTC",Leu),("CTA",Leu),("CTG",Leu), ("AAA",Lys),("AAG",Lys), ("ATG",Met), ("TTT",Phe),("TTC",Phe), ("CCT",Pro),("CCC",Pro),("CCA",Pro),("CCG",Pro), ("TCT",Ser),("TCC",Ser),("TCA",Ser),("TCG",Ser),("AGT",Ser),("AGC",Ser), ("ACT",Thr),("ACC",Thr),("ACA",Thr),("ACG",Thr), ("TAT",Tyr),("TAC",Tyr), ("TGG",Trp), ("GTT",Val),("GTC",Val),("GTA",Val),("GTG",Val), ("TAG",STP),("TGA",STP),("TAA",STP) ] -- todo: extend with more IUPAC nucleotide wildcards? -- | Convert a list of amino acids to a sequence in IUPAC format. toIUPAC :: [Amino] -> SeqData toIUPAC = B.pack . map (fromJust . flip lookup iupac) -- | Convert a sequence in IUPAC format to a list of amino acids. fromIUPAC :: SeqData -> [Amino] fromIUPAC = map (maybe Xaa id . flip lookup iupac' . toUpper) . B.unpack iupac :: [(Amino,Char)] iupac = [(Ala,'A') ,(Arg,'R') ,(Asn,'N') ,(Asp,'D') ,(Cys,'C') ,(Gln,'Q') ,(Glu,'E') ,(Gly,'G') ,(His,'H') ,(Ile,'I') ,(Leu,'L') ,(Lys,'K') ,(Met,'M') ,(Phe,'F') ,(Pro,'P') ,(Ser,'S') ,(Thr,'T') ,(Tyr,'Y') ,(Trp,'W') ,(Val,'V') ,(Asx,'B') -- Asn or Asp ,(Glx,'Z') -- Gln or Glu ,(Xle,'J') -- Ile or Leu ,(STP,'X') ,(Xaa,'X') ] iupac' :: [(Char,Amino)] iupac' = map (\(a,b)->(b,a)) iupac