{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE Rank2Types #-}
{-# LANGUAGE LambdaCase #-}
module Data.BBI
( BbiFile(..)
, BbiFileHeader(..)
, FileType(..)
, openBbiFile
, closeBbiFile
, getBbiFileHeader
, getChromId
, overlappingBlocks
, readBlocks
) where
import Codec.Compression.Zlib (decompress)
import Control.Applicative ((<$>))
import Control.Monad.State
import qualified Data.ByteString as B
import qualified Data.ByteString.Lazy as BL
import Conduit
import Data.Maybe
import qualified Data.Map.Strict as M
import System.IO
import Data.BBI.Utils
data FileType = BigBed
| BigWig
deriving (Show)
getFileType :: Handle -> IO (FileType, Endianness)
getFileType h = do
hSeek h AbsoluteSeek 0
magic <- B.hGet h 4
let magicBE = readInt32 BE magic
magicLE = readInt32 LE magic
return $ case () of
_ | magicBE == bigBedMagic -> (BigBed, BE)
| magicBE == bigWigMagic -> (BigWig, BE)
| magicLE == bigBedMagic -> (BigBed, LE)
| magicLE == bigWigMagic -> (BigWig, LE)
| otherwise -> error "Not a bigBed or bigWig file!"
where
bigWigMagic = 0x888FFC26
bigBedMagic = 0x8789F2EB
{-# INLINE getFileType #-}
data BbiFileHeader = BbiFileHeader
{ _filetype :: !FileType
, _endian :: !Endianness
, _version :: !Int
, _zoomlevels :: !Int
, _chrTreeOffset :: !Int
, _fullDataOffset :: !Int
, _fullIndexOffset :: !Int
, _filedCount :: !Int
, _definedFiledCount :: !Int
, _autoSqlOffset :: !Int
, _totalSummaryOffset :: !Int
, _uncompressBufSize :: !Int
, _reserved :: !Int
} deriving (Show)
getBbiFileHeader :: Handle -> IO BbiFileHeader
getBbiFileHeader h = do
(ft, endi) <- getFileType h
hSeek h AbsoluteSeek 4
BbiFileHeader <$> return ft <*> return endi <*> hReadInt16 endi h <*>
hReadInt16 endi h <*> hReadInt64 endi h <*> hReadInt64 endi h <*>
hReadInt64 endi h <*> hReadInt16 endi h <*> hReadInt16 endi h <*>
hReadInt64 endi h <*> hReadInt64 endi h <*> hReadInt32 endi h <*>
hReadInt64 endi h
{-# INLINE getBbiFileHeader #-}
data RTreeHeader = RTreeHeader
{ _blockSize :: Int
, _itemCount :: Int
, _startChromIx :: Int
, _startBase :: Int
, _endChromIx :: Int
, _endBase :: Int
, _endFileOffset :: Int
, _itemsPerSlot :: Int
, _rTreeReserved :: Int
} deriving (Show)
getRTreeHeader :: Endianness
-> Handle
-> Integer
-> IO RTreeHeader
getRTreeHeader e h i = do
magic <- hSeek h AbsoluteSeek i >> hReadInt32 e h
if magic /= 0x2468ACE0
then error "incorrect RTree magic"
else RTreeHeader <$> hReadInt32 e h <*> hReadInt64 e h <*>
hReadInt32 e h <*> hReadInt32 e h <*> hReadInt32 e h <*>
hReadInt32 e h <*> hReadInt64 e h <*> hReadInt32 e h <*>
hReadInt32 e h
data BbiFile = BbiFile
{ _filehandle :: Handle
, _header :: !BbiFileHeader
, _chromTree :: !(M.Map Chromosome (ChromID, ChromSize))
, _rtree :: !RTreeHeader
} deriving (Show)
type Chromosome = B.ByteString
type ChromSize = Int
type ChromID = Int
openBbiFile :: FilePath -> IO BbiFile
openBbiFile fl = do
h <- openFile fl ReadMode
header <- getBbiFileHeader h
t <- getChromTreeAsList (_endian header) h (fromIntegral $ _chrTreeOffset header)
rtree <- getRTreeHeader (_endian header) h (fromIntegral $ _fullIndexOffset header)
return $ BbiFile h header (M.fromList t) rtree
closeBbiFile :: BbiFile -> IO ()
closeBbiFile = hClose . _filehandle
getChromTreeAsList :: Endianness
-> Handle
-> Integer
-> IO [(B.ByteString, (Int, Int))]
getChromTreeAsList e h offset = do
magic <- hSeek h AbsoluteSeek offset >> hReadInt32 e h
if magic /= 0x78CA8C91
then error "wrong chromosome tree header"
else do
_ <- hReadInt32 e h
keySize <- hReadInt32 e h
_ <- hReadInt32 e h
_ <- hReadInt64 e h
_ <- hReadInt64 e h
traverseTree keySize
where
traverseTree ks = go
where
go = readNode >>= \case
(True, n) -> replicateM n $ readLeafItem ks
(False, n) -> fmap concat $
replicateM n (readNonLeafItem ks) >>= mapM ( \(_, off) -> do
hSeek h AbsoluteSeek $ fromIntegral off
go )
readNode = do
isLeaf <- hReadBool h
_ <- hReadBool h
count <- hReadInt16 e h
return (isLeaf, count)
readLeafItem n = do
key <- B.filter (/=0) <$> B.hGet h n
chromId <- hReadInt32 e h
chromSize <- hReadInt32 e h
return (key, (chromId, chromSize))
readNonLeafItem n = do
key <- B.filter (/=0) <$> B.hGet h n
childOffset <- hReadInt64 e h
return (key, childOffset)
overlap :: Int -> Int -> Int -> Int -> Int -> Int -> Int -> Bool
overlap qChr qStart qEnd rStartChr rStart rEndChr rEnd =
(qChr, qStart) < (rEndChr, rEnd) && (qChr, qEnd) > (rStartChr, rStart)
{-# INLINE overlap #-}
getChromId :: BbiFile -> B.ByteString -> Maybe ChromID
getChromId fl chr = fmap fst . M.lookup chr . _chromTree $ fl
{-# INLINE getChromId #-}
overlappingBlocks :: BbiFile -> (ChromID, Int, Int) -> IO [Block]
overlappingBlocks fl (cid, start, end) =
if overlap cid start end (_startChromIx $ _rtree fl)
(_startBase $ _rtree fl)
(_endChromIx $ _rtree fl)
(_endBase $ _rtree fl)
then catMaybes <$> go ((fromIntegral . _fullIndexOffset . _header) fl + 48)
else return []
where
h = _filehandle fl
e = _endian . _header $ fl
go i = do
hSeek h AbsoluteSeek i
(isLeaf, n) <- readNode
if isLeaf
then replicateM n readLeafItem
else do
pos <- fromIntegral <$> hTell h
results <- mapM (readNonLeafItem . (\x -> pos + x*24)) [0..n-1]
return $ concat results
readNode = do isLeaf <- hReadBool h
_ <- hReadBool h
count <- hReadInt16 e h
return (isLeaf, count)
readLeafItem = do
stCIx <- hReadInt32 e h
st <- hReadInt32 e h
edCIx <- hReadInt32 e h
ed <- hReadInt32 e h
offset <- hReadInt64 e h
size <- hReadInt64 e h
return $ if overlap cid start end stCIx st edCIx ed
then Just (offset, size)
else Nothing
readNonLeafItem p = do
hSeek h AbsoluteSeek $ fromIntegral p
stCIx <- hReadInt32 e h
st <- hReadInt32 e h
edCIx <- hReadInt32 e h
ed <- hReadInt32 e h
next <- hReadInt64 e h
if overlap cid start end stCIx st edCIx ed
then go $ fromIntegral next
else return []
{-# INLINE overlappingBlocks #-}
type Block = (Offset, BlockSize)
type Offset = Int
type BlockSize = Int
readBlocks :: BbiFile -> [Block] -> Source IO B.ByteString
readBlocks fl blks = forM_ blks $ \(offset, size) -> do
bs <- lift $ hSeek handle AbsoluteSeek (fromIntegral offset) >>
BL.hGet handle size
yield . BL.toStrict . decompress $ bs
where
handle = _filehandle fl
{-# INLINE readBlocks #-}