{- | Module: Bio.Sequence.Fasta This module incorporates functionality for reading and writing sequence data in the Fasta format. Each sequence consists of a header (with a '>' prefix) and a set of lines containing the sequence data. As Fasta is used for both amino acids and nucleotides, the resulting 'Sequence's are type-tagged with 'Unknown'. If you know the type of sequence you are reading, use 'castToAmino' or 'castToNuc'. -} module Bio.Sequence.Fasta ( -- * Reading and writing plain FASTA files readFasta, writeFasta, hReadFasta, hWriteFasta -- * Counting sequences in a FASTA file , countSeqs -- * Helper function for reading your own sequences , mkSeqs ) where import Data.Char (chr, isSpace) import Data.List (groupBy, intersperse) import System.IO import qualified Data.ByteString.Lazy.Char8 as B import qualified Data.ByteString.Lazy as BB import Data.ByteString.Lazy.Char8 (ByteString) import Bio.Core.Sequence splitsAt :: Offset -> ByteString -> [ByteString] splitsAt n s = let (s1,s2) = B.splitAt (unOff n) s in if B.null s2 then [s1] else s1 : splitsAt n s2 {- -- ugly? class SeqType sd where toSeq :: sd -> sd -> Sequence fromSeq :: Sequence -> (sd,sd) instance SeqType B.ByteString where toSeq = Seq fromSeq (Seq x y) = (x,y) instance SeqType BS.ByteString where toSeq h s = Seq (B.fromChunks [h]) (B.fromChunks [s]) fromSeq (Seq x y) = (c x, c y) where c = BS.concat . B.toChunks -} data Sequence = Seq SeqLabel SeqData (Maybe QualData) deriving Eq instance BioSeq Sequence where seqlabel (Seq lab seq mqual) = lab seqdata (Seq lab seq mqual) = seq seqlength (Seq lab seq mqual) = Offset {unOff = B.length $ unSD seq} instance Show Sequence where show (Seq lab seq qual) = ">" ++ (B.unpack $ unSL lab) ++ "\n" ++ (B.unpack $ unSD seq) toStr :: SeqData -> String toStr = B.unpack . unSD -- | Lazily read sequences from a FASTA-formatted file readFasta :: FilePath -> IO [Sequence] readFasta f = (mkSeqs . B.lines) `fmap` B.readFile f -- | Write sequences to a FASTA-formatted file. -- Line length is 60. writeFasta :: FilePath -> [Sequence] -> IO () writeFasta f ss = do h <- openFile f WriteMode hWriteFasta h ss hClose h -- | Lazily read sequence from handle hReadFasta :: Handle -> IO [Sequence] hReadFasta h = (mkSeqs . B.lines) `fmap` B.hGetContents h -- | Write sequences in FASTA format to a handle. hWriteFasta :: Handle -> [Sequence] -> IO () hWriteFasta h = mapM_ (wFasta h) wHead :: Handle -> SeqLabel -> IO () wHead h l = do B.hPut h $ B.pack ">" B.hPut h (unSL l) B.hPut h $ B.pack "\n" wFasta :: Handle -> Sequence -> IO () wFasta h (Seq l d _) = do wHead h l let ls = splitsAt 60 (unSD d) mapM_ (B.hPut h) $ intersperse (B.pack "\n") ls B.hPut h $ B.pack "\n" -- | Convert a list of FASTA-formatted lines into a list of sequences. -- Blank lines are ignored. -- Comment lines start with "#" are allowed between sequences (and ignored). -- Lines starting with ">" initiate a new sequence. mkSeqs :: [ByteString] -> [Sequence] mkSeqs = map mkSeq . blocks mkSeq :: [ByteString] -> Sequence mkSeq (l:ls) -- maybe check this? | B.length l < 2 || isSpace (B.head $ B.tail l) -- = error "Trying to read sequence without a name...and failing." | otherwise = Seq (SeqLabel (B.drop 1 l)) (SeqData (B.filter (not . isSpace) $ B.concat $ takeWhile isSeq ls)) Nothing where isSeq s = (not . B.null) s && ((flip elem) (['A'..'Z']++['a'..'z']) . B.head) s mkSeq [] = error "empty input to mkSeq" -- | Split lines into blocks starting with '>' characters -- Filter out # comments (but not semicolons?) blocks :: [ByteString] -> [[ByteString]] blocks = groupBy (const (('>' /=) . B.head)) . filter ((/='#') . B.head) . dropWhile (('>' /=) . B.head) . filter (not . B.null) countSeqs :: FilePath -> IO Int countSeqs f = do ss <- B.readFile f let hdrs = filter (('>'==).B.head) $ filter (not . B.null) $ B.lines ss return (length hdrs)