module Bio.Sequence.Fasta
(
readFasta, writeFasta, hReadFasta, hWriteFasta
, readQual, writeQual, hWriteQual
, readFastaQual, hWriteFastaQual, writeFastaQual
, countSeqs
, mkSeqs
, Qual
) where
import Data.Char (chr)
import Data.List (groupBy,intersperse)
import System.IO
import qualified Data.ByteString.Lazy.Char8 as B
import qualified Data.ByteString.Lazy as BB
import Data.ByteString.Lazy.Char8 (ByteString)
import Bio.Sequence.SeqData
splitsAt :: Offset -> ByteString -> [ByteString]
splitsAt n s = let (s1,s2) = B.splitAt n s
in if B.null s2 then [s1] else s1 : splitsAt n s2
readFasta :: FilePath -> IO [Sequence Unknown]
readFasta f = (mkSeqs . blines) `fmap` B.readFile f
blines :: B.ByteString -> [B.ByteString]
blines = B.lines . B.filter (/=Data.Char.chr 13)
writeFasta :: FilePath -> [Sequence a] -> IO ()
writeFasta f ss = do
h <- openFile f WriteMode
hWriteFasta h ss
hClose h
readQual :: FilePath -> IO [Sequence Unknown]
readQual f = (mkQual . blines) `fmap` B.readFile f
writeQual :: FilePath -> [Sequence a] -> IO ()
writeQual f ss = do
h <- openFile f WriteMode
hWriteQual h ss
hClose h
readFastaQual :: FilePath -> FilePath -> IO [Sequence Unknown]
readFastaQual s q = do
ss <- readFasta s
qs <- readQual q
let mkseq s1@(Seq x y _) s2@(Seq _ _ (Just z))
| seqlabel s1 /= seqlabel s2 = error ("mismatching sequence and quality: "
++toStr (seqlabel s1)++","++toStr (seqlabel s2))
| B.length y /= B.length z = error ("mismatching sequence and quality lengths:"
++ toStr (seqlabel s1)++","++show (B.length y,B.length z))
| otherwise = Seq x y (Just z)
mkseq _ _ = error "readFastaQual: could not combine Fasta and Qual information"
zipWith' f (x:xs) (y:ys) = f x y : zipWith' f xs ys
zipWith' _ [] [] = []
zipWith' _ _ _ = error "readFastaQual: Unbalanced sequence and quality"
return $ zipWith' mkseq ss qs
writeFastaQual :: FilePath -> FilePath -> [Sequence a] -> IO ()
writeFastaQual f q ss = do
hf <- openFile f WriteMode
hq <- openFile q WriteMode
hWriteFastaQual hf hq ss
hClose hq
hClose hf
hWriteFastaQual :: Handle -> Handle -> [Sequence a] -> IO ()
hWriteFastaQual hf hq = mapM_ wFQ
where wFQ s = wFasta hf s >> wQual hq s
hReadFasta :: Handle -> IO [Sequence Unknown]
hReadFasta h = (mkSeqs . blines) `fmap` B.hGetContents h
hWriteFasta :: Handle -> [Sequence a] -> IO ()
hWriteFasta h = mapM_ (wFasta h)
wHead :: Handle -> SeqData -> IO ()
wHead h l = do
B.hPut h $ B.pack ">"
B.hPut h l
B.hPut h $ B.pack "\n"
wFasta :: Handle -> Sequence a -> IO ()
wFasta h (Seq l d _) = do
wHead h l
let ls = splitsAt 60 d
mapM_ (B.hPut h) $ intersperse (B.pack "\n") ls
B.hPut h $ B.pack "\n"
hWriteQual :: Handle -> [Sequence a] -> IO ()
hWriteQual h = mapM_ (wQual h)
wQual :: Handle -> Sequence a -> IO ()
wQual h (Seq l _ (Just q)) = do
wHead h l
let qls = splitsAt 20 q
qs = B.pack . unwords . map show . BB.unpack
mapM_ ((\l' -> B.hPut h l' >> B.hPut h (B.pack "\n")) . qs) qls
wQual _ (Seq _ _ Nothing) = return ()
mkSeqs :: [ByteString] -> [Sequence Unknown]
mkSeqs = map mkSeq . blocks
mkSeq :: [ByteString] -> Sequence Unknown
mkSeq (l:ls)
| otherwise = Seq (B.drop 1 l) (B.concat $ takeWhile isSeq ls) Nothing
where isSeq s = (not . B.null) s && ((flip elem) (['A'..'Z']++['a'..'z']) . B.head) s
mkSeq [] = error "empty input to mkSeq"
mkQual :: [ByteString] -> [Sequence Unknown]
mkQual = map f . blocks
where f [] = error "mkQual on empty input - this is impossible"
f (l:ls) = Seq (B.drop 1 l) B.empty
(Just $ BB.pack $ go 0 ls)
isDigit x = x <= 58 && x >= 48
go i (s:ss) = case BB.uncons s of Just (c,rs) -> if isDigit c then go (c 48 + 10*i) (rs:ss)
else let rs' = BB.dropWhile (not . isDigit) rs
in if BB.null rs' then i : go 0 ss else i : go 0 (rs':ss)
Nothing -> i : go 0 ss
go _ [] = []
blocks :: [ByteString] -> [[ByteString]]
blocks = groupBy (const (('>' /=) . B.head)) . filter ((/='#') . B.head) . dropWhile (('>' /=) . B.head) . filter (not . B.null)
countSeqs :: FilePath -> IO Int
countSeqs f = do
ss <- B.readFile f
let hdrs = filter (('>'==).B.head) $ filter (not . B.null) $ blines ss
return (length hdrs)