module Bio.Sequence.Fasta
(
readFasta, writeFasta, hReadFasta, hWriteFasta
, readQual, writeQual, hWriteQual
, readFastaQual, hWriteFastaQual, writeFastaQual
, countSeqs
, mkSeqs
) where
import Data.List (groupBy,intersperse)
import Data.Int
import Data.Maybe
import System.IO
import System.IO.Unsafe
import Control.Monad
import qualified Data.ByteString.Char8 as BS
import qualified Data.ByteString.Lazy.Char8 as B
import qualified Data.ByteString.Lazy as BB
import Data.ByteString.Lazy.Char8 (ByteString)
import Bio.Sequence.SeqData
splitsAt :: Offset -> ByteString -> [ByteString]
splitsAt n s = let (s1,s2) = B.splitAt n s
in if B.null s2 then [s1] else s1 : splitsAt n s2
readFasta :: FilePath -> IO [Sequence]
readFasta f = do
ls <- getLines f
return (mkSeqs ls)
writeFasta :: FilePath -> [Sequence] -> IO ()
writeFasta f ss = do
h <- openFile f WriteMode
hWriteFasta h ss
hClose h
readQual :: FilePath -> IO [Sequence]
readQual f = do
ls <- getLines f
return (mkQual ls)
writeQual :: FilePath -> [Sequence] -> IO ()
writeQual f ss = do
h <- openFile f WriteMode
hWriteQual h ss
hClose h
readFastaQual :: FilePath -> FilePath -> IO [Sequence]
readFastaQual s q = do
ss <- readFasta s
qs <- readQual q
return ss
let mkseq s1@(Seq x y _) s2@(Seq _ _ (Just z))
| seqlabel s1 /= seqlabel s2 = error ("mismatching sequence and quality: "
++show (seqlabel s1,seqlabel s2))
| B.length y /= B.length z = error ("mismatching sequence and quality lengths:"
++ show (seqlabel s1,B.length y,B.length z))
| otherwise = Seq x y (Just z)
mkseq _ _ = error "readFastaQual: could not combine Fasta and Qual information"
return $ zipWith mkseq ss qs
writeFastaQual :: FilePath -> FilePath -> [Sequence] -> IO ()
writeFastaQual f q ss = do
hf <- openFile f WriteMode
hq <- openFile q WriteMode
hWriteFastaQual hf hq ss
hClose hq
hClose hf
hWriteFastaQual :: Handle -> Handle -> [Sequence] -> IO ()
hWriteFastaQual hf hq = mapM_ wFQ
where wFQ s = wFasta hf s >> wQual hq s
hReadFasta :: Handle -> IO [Sequence]
hReadFasta h = do
ls <- hGetLines h
return (mkSeqs ls)
hWriteFasta :: Handle -> [Sequence] -> IO ()
hWriteFasta h = mapM_ (wFasta h)
wHead :: Handle -> SeqData -> IO ()
wHead h l = do
B.hPut h $ B.pack ">"
B.hPut h l
B.hPut h $ B.pack "\n"
wFasta :: Handle -> Sequence -> IO ()
wFasta h (Seq l d _) = do
wHead h l
let ls = splitsAt 60 d
mapM_ (B.hPut h) $ intersperse (B.pack "\n") ls
B.hPut h $ B.pack "\n"
hWriteQual :: Handle -> [Sequence] -> IO ()
hWriteQual h = mapM_ (wQual h)
wQual :: Handle -> Sequence -> IO ()
wQual h (Seq l _ (Just q)) = do
wHead h l
let qls = splitsAt 20 q
qs = B.pack . unwords . map show . BB.unpack
mapM_ ((\l' -> B.hPut h l' >> B.hPut h (B.pack "\n")) . qs) qls
wQual _ (Seq _ _ Nothing) = return ()
getLines :: FilePath -> IO [ByteString]
getLines f = do
h <- openFile f ReadMode
hGetLines' (hClose h) h
hGetLines :: Handle -> IO [ByteString]
hGetLines = hGetLines' (return ())
hGetLines' :: IO () -> Handle -> IO [ByteString]
hGetLines' c h = do
e <- hIsEOF h
if e then c >> return []
else do l' <- BS.hGetLine h
let l = B.fromChunks $ if BS.null l' then [] else [l']
ls <- unsafeInterleaveIO $ hGetLines' c h
return (l:ls)
mkSeqs :: [ByteString] -> [Sequence]
mkSeqs = map mkSeq . blocks
mkSeq :: [ByteString] -> Sequence
mkSeq (l:ls) = Seq (B.drop 1 l) (B.concat $ takeWhile isSeq ls) Nothing
where isSeq s = (not . B.null) s && ((flip elem) (['A'..'Z']++['a'..'z']) . B.head) s
mkSeq [] = error "empty input to mkSeq"
mkQual :: [ByteString] -> [Sequence]
mkQual = map f . blocks
where f (l:ls) = Seq (B.drop 1 l) B.empty
(Just $ BB.pack (map int (B.words $ B.unlines ls)))
f [] = error "mkQual: empty quality data"
int = fromIntegral . fst . fromJust' . B.readInt
fromJust' = maybe (error "Error in qual format") id
blocks :: [ByteString] -> [[ByteString]]
blocks = groupBy (const (('>' /=) . B.head)) . filter ((/='#') . B.head) . filter (not . B.null)
countSeqs :: FilePath -> IO Int
countSeqs f = do
ss <- B.readFile f
let hdrs = filter (('>'==).B.head) $ filter (not . B.null) $ B.lines ss
return (length hdrs)