{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE OverloadedStrings #-}
module Bio.Seq.IO
( Genome
, openGenome
, closeGenome
, withGenome
, getSeq
, getChrom
, getChrSizes
, mkIndex
) where
import Control.Exception (bracket)
import Conduit
import qualified Data.ByteString.Char8 as B
import qualified Data.HashMap.Lazy as M
import Data.List.Split
import System.IO
import Bio.Seq
import Bio.Utils.Misc (readInt)
import Bio.Data.Fasta (fastaReader)
data Genome = Genome !Handle !IndexTable !Int
type IndexTable = M.HashMap B.ByteString (Int, Int)
magic :: B.ByteString
magic = "<HASKELLBIOINFORMATICS_7d2c5gxhg934>"
openGenome :: FilePath -> IO Genome
openGenome fl = do
h <- openFile fl ReadMode
sig <- B.hGetLine h
if sig == magic
then do
header <- B.hGetLine h
return $ Genome h (getIndex header) (B.length sig + B.length header + 2)
else error "Bio.Seq.Query.openGenome: Incorrect format"
where
getIndex = M.fromList . map f . chunksOf 3 . B.words
where
f [k, v, l] = (k, (readInt v, readInt l))
f _ = error "error"
{-# INLINE openGenome #-}
closeGenome :: Genome -> IO ()
closeGenome (Genome h _ _) = hClose h
{-# INLINE closeGenome #-}
withGenome :: FilePath -> (Genome -> IO a) -> IO a
withGenome fl fn = bracket (openGenome fl) closeGenome fn
{-# INLINE withGenome #-}
type Query = (B.ByteString, Int, Int)
getSeq :: BioSeq s a => Genome -> Query -> IO (Either String (s a))
getSeq (Genome h index headerSize) (chr, start, end) = case M.lookup chr index of
Just (chrStart, chrSize) ->
if end > chrSize
then return $ Left $ "Bio.Seq.getSeq: out of index: " ++ show end ++
">" ++ show chrSize
else do
hSeek h AbsoluteSeek $ fromIntegral $ headerSize + chrStart + start
fromBS <$> B.hGet h (end - start)
_ -> return $ Left $ "Bio.Seq.getSeq: Cannot find " ++ show chr
{-# INLINE getSeq #-}
getChrom :: Genome -> B.ByteString -> IO (Either String (DNA IUPAC))
getChrom g chr = case lookup chr chrSize of
Just s -> getSeq g (chr, 0, s)
_ -> return $ Left "Unknown chromosome"
where
chrSize = getChrSizes g
{-# INLINE getChrom #-}
getChrSizes :: Genome -> [(B.ByteString, Int)]
getChrSizes (Genome _ table _) = map (\(k, (_, l)) -> (k, l)) . M.toList $ table
{-# INLINE getChrSizes #-}
mkIndex :: [FilePath]
-> FilePath
-> IO ()
mkIndex fls outFl = withFile outFl WriteMode $ \outH -> do
(chrs, dnas) <- (unzip . concat) <$> mapM readSeq fls
B.hPutStr outH $ B.unlines [magic, mkHeader chrs]
mapM_ (B.hPutStr outH) dnas
where
readSeq fl = runResourceT $ runConduit $ fastaReader fl .| conduit .| sinkList
where
conduit = awaitForever $ \(chrName, seqs) -> do
let dna = B.concat seqs
yield ((head $ B.words chrName, B.length dna), dna)
{-# INLINE mkIndex #-}
mkHeader :: [(B.ByteString, Int)] -> B.ByteString
mkHeader xs = B.unwords.fst $ foldl f ([], 0) xs
where
f (s, i) (s', i') = (s ++ [s', B.pack $ show i, B.pack $ show i'], i + i')
{-# INLINE mkHeader #-}