{-# LANGUAGE BangPatterns #-} module Bio.Seq.IO ( Genome , GenomeH , gHOpen , gHClose , pack , getSeqs , getSeq , readIndex , getChrom , getChrSizes , mkIndex ) where import Bio.Seq import Bio.Utils.Misc (readInt) import Control.Applicative ((<$>)) import qualified Data.ByteString.Char8 as B import qualified Data.HashMap.Lazy as M import Data.List.Split import System.IO -- | The first 2048 bytes are header. Header consists of a magic string, -- followed by chromosome information. Example: -- \nCHR1 START SIZE newtype Genome = G FilePath newtype GenomeH = GH Handle headerSize :: Int headerSize = 2048 magic :: String magic = "" pack :: FilePath -> IO Genome pack fl = withFile fl ReadMode f where f h = do l <- hGetLine h if l == magic then return $ G fl else error "Bio.Seq.Query.pack: Incorrect format" gHOpen :: Genome -> IO GenomeH gHOpen (G fl) = do h <- openFile fl ReadMode return $ GH h gHClose :: GenomeH -> IO () gHClose (GH h) = hClose h type IndexTable = M.HashMap B.ByteString (Int, Int) type Query = (B.ByteString, Int, Int) -- (chr, start, end), zero-based index, half-close-half-open getSeqs :: BioSeq s a => Genome -> [Query] -> IO [Either String (s a)] getSeqs g querys = do gH <- gHOpen g index <- readIndex gH r <- mapM (getSeq gH index) querys gHClose gH return r {-# INLINE getSeqs #-} getSeq :: BioSeq s a => GenomeH -> IndexTable -> Query -> IO (Either String (s a)) getSeq (GH h) index (chr, start, end) = case M.lookup chr index of Just (chrStart, chrSize) -> if end > chrSize then return $ Left $ "Bio.Seq.getSeq: out of index: " ++ show end ++ ">" ++ show chrSize else do hSeek h AbsoluteSeek $ fromIntegral $ headerSize + chrStart + start (Right . fromBS) <$> B.hGet h (end - start) _ -> return $ Left $ "Bio.Seq.getSeq: Cannot find " ++ show chr {-# INLINE getSeq #-} getChrom :: Genome -> B.ByteString -> IO (Maybe (DNA IUPAC)) getChrom g chr = do chrSize <- getChrSizes g case lookup chr chrSize of Just s -> do [Right dna] <- getSeqs g [(chr, 0, s)] return $ Just dna _ -> return Nothing {-# INLINE getChrom #-} getChrSizes :: Genome -> IO [(B.ByteString, Int)] getChrSizes g = do gh <- gHOpen g table <- readIndex gh gHClose gh return . map (\(k, (_, l)) -> (k, l)) . M.toList $ table {-# INLINE getChrSizes #-} readIndex :: GenomeH -> IO IndexTable readIndex (GH h) = do header <- B.hGetLine h >> B.hGetLine h return $ M.fromList . map f . chunksOf 3 . B.words $ header where f [k, v, l] = (k, (readInt v, readInt l)) f _ = error "error" {-# INLINE readIndex #-} -- | indexing a genome. mkIndex :: [FilePath] -- ^ fasta files representing individual chromosomes -> FilePath -- ^ output file -> IO () mkIndex fls outFl = do outH <- openFile outFl WriteMode hPutStr outH $ magic ++ "\n" ++ replicate 2024 '#' -- header size: 1024 chrs <- mapM (write outH) fls hSeek outH AbsoluteSeek 24 B.hPutStrLn outH $ mkHeader chrs hClose outH where write handle fl = do h <- openFile fl ReadMode fastaHeader <- B.hGetLine h n <- loop 0 h hClose h return (B.tail fastaHeader, n) where loop !n h' = do eof <- hIsEOF h' if eof then return n else do l <- B.hGetLine h' B.hPutStr handle l loop (n + B.length l) h' {-# INLINE mkIndex #-} mkHeader :: [(B.ByteString, Int)] -> B.ByteString mkHeader xs = B.unwords.fst $ foldl f ([], 0) xs where f (s, i) (s', i') = (s ++ [s', B.pack $ show i, B.pack $ show i'], i + i') {-# INLINE mkHeader #-}