{-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE Rank2Types #-} {-# LANGUAGE LambdaCase #-} module Data.BBI ( BbiFile(..) , BbiFileHeader(..) , FileType(..) , openBbiFile , closeBbiFile , getBbiFileHeader , getChromId , overlappingBlocks , readBlocks ) where import Codec.Compression.Zlib (decompress) import Control.Applicative ((<$>)) import Control.Monad.State import qualified Data.ByteString as B import qualified Data.ByteString.Lazy as BL import Conduit import Data.Maybe import qualified Data.Map.Strict as M import System.IO import Data.BBI.Utils data FileType = BigBed | BigWig deriving (Show) -- | Get the file type. getFileType :: Handle -> IO (FileType, Endianness) getFileType h = do hSeek h AbsoluteSeek 0 magic <- B.hGet h 4 let magicBE = readInt32 BE magic magicLE = readInt32 LE magic return $ case () of _ | magicBE == bigBedMagic -> (BigBed, BE) | magicBE == bigWigMagic -> (BigWig, BE) | magicLE == bigBedMagic -> (BigBed, LE) | magicLE == bigWigMagic -> (BigWig, LE) | otherwise -> error "Not a bigBed or bigWig file!" where bigWigMagic = 0x888FFC26 bigBedMagic = 0x8789F2EB {-# INLINE getFileType #-} -- | The header for big index file. Here are the Byte-by-byte details: -- -------------------- ---- ---- ------------------------------------------- -- Name Size Type Description -- -------------------- ---- ---- ------------------------------------------- -- magic 4 uint 0x888FFC26 for BigWig, 0x8789F2EB for -- BigBed. If byte-swapped, all -- numbers in file are byte-swapped. -- version 2 uint Currently 3. -- zoomLevels 2 uint Number of different zoom summary resolutions. -- chromosomeTreeOffset 8 uint Offset in file to chromosome B+ tree index. -- fullDataOffset 8 uint Offset to main (unzoomed) data. Points -- specifically to the dataCount. -- fullIndexOffset 8 uint Offset to R tree index of items. -- fieldCount 2 uint Number of fields in BED file. (0 for BigWig) -- definedFieldCount 2 uint Number of fields that are predefined BED fields. -- autoSqlOffset 8 uint Offset to zero-terminated string with .as spec. -- totalSummaryOffset 8 uint Offset to overall file summary data block. -- uncompressBufSize 4 uint Maximum size of decompression buffer -- needed (nonzero on compressed files). -- reserved 8 uint Reserved for future expansion. Currently 0. data BbiFileHeader = BbiFileHeader { _filetype :: !FileType , _endian :: !Endianness , _version :: !Int , _zoomlevels :: !Int , _chrTreeOffset :: !Int , _fullDataOffset :: !Int , _fullIndexOffset :: !Int , _filedCount :: !Int , _definedFiledCount :: !Int , _autoSqlOffset :: !Int , _totalSummaryOffset :: !Int , _uncompressBufSize :: !Int , _reserved :: !Int } deriving (Show) getBbiFileHeader :: Handle -> IO BbiFileHeader getBbiFileHeader h = do (ft, endi) <- getFileType h hSeek h AbsoluteSeek 4 BbiFileHeader <$> return ft <*> return endi <*> hReadInt16 endi h <*> hReadInt16 endi h <*> hReadInt64 endi h <*> hReadInt64 endi h <*> hReadInt64 endi h <*> hReadInt16 endi h <*> hReadInt16 endi h <*> hReadInt64 endi h <*> hReadInt64 endi h <*> hReadInt32 endi h <*> hReadInt64 endi h {-# INLINE getBbiFileHeader #-} -- | R tree index header. -- Name Size Type Description -- magic 4 uint 0x2468ACE0. If byte-swapped all numbers in index are byte-swapped. -- blockSize 4 uint Number of children per block (not byte size of block). -- itemCount 8 uint The number of chromosomes/contigs. -- startChromIx 4 uint ID of first chromosome in index. -- startBase 4 uint Position of first base in index. -- endChromIx 4 uint ID of last chromosome in index. -- endBase 4 uint Position of last base in index. -- endFileOffset 8 uint Position in file where data being indexed ends. -- itemsPerSlot 4 uint Number of items pointed to by leaves of index. -- Reserved 4 uint Reserved for future expansion. Currently 0. data RTreeHeader = RTreeHeader { _blockSize :: Int , _itemCount :: Int , _startChromIx :: Int , _startBase :: Int , _endChromIx :: Int , _endBase :: Int , _endFileOffset :: Int , _itemsPerSlot :: Int , _rTreeReserved :: Int } deriving (Show) getRTreeHeader :: Endianness -> Handle -> Integer -- ^ Location of R Tree header in the file. -> IO RTreeHeader getRTreeHeader e h i = do magic <- hSeek h AbsoluteSeek i >> hReadInt32 e h if magic /= 0x2468ACE0 then error "incorrect RTree magic" else RTreeHeader <$> hReadInt32 e h <*> hReadInt64 e h <*> hReadInt32 e h <*> hReadInt32 e h <*> hReadInt32 e h <*> hReadInt32 e h <*> hReadInt64 e h <*> hReadInt32 e h <*> hReadInt32 e h -- | Big index file format. Here are the Byte-by-byte details: -- -------------- ------ ----------------------------------------------------- -- Name Size Description -- -------------- ------ ----------------------------------------------------- -- bbiHeader 64 Contains high-level information about file and -- offsets to various parts of file. -- zoomHeaders N*24 One for each level of zoom built into file. -- autoSql Varies Zero-terminated string in autoSql format describing -- formats. Optional, not used in BigWig. -- totalSummary 40 Statistical summary of entire file. -- chromosomeTree Varies B+ tree-formatted index of chromosomes, their sizes, -- and a unique ID for each. -- dataCount 4 Number of records in data. For BigWig this -- corresponds to the number of sections, not the -- number of floating point values. -- data Varies Possibly compressed data in format specific for -- file type. -- index Varies R tree index of data. -- zoomInfo Varies One for each zoom level. data BbiFile = BbiFile { _filehandle :: Handle , _header :: !BbiFileHeader , _chromTree :: !(M.Map Chromosome (ChromID, ChromSize)) , _rtree :: !RTreeHeader } deriving (Show) type Chromosome = B.ByteString type ChromSize = Int type ChromID = Int openBbiFile :: FilePath -> IO BbiFile openBbiFile fl = do h <- openFile fl ReadMode header <- getBbiFileHeader h t <- getChromTreeAsList (_endian header) h (fromIntegral $ _chrTreeOffset header) rtree <- getRTreeHeader (_endian header) h (fromIntegral $ _fullIndexOffset header) return $ BbiFile h header (M.fromList t) rtree closeBbiFile :: BbiFile -> IO () closeBbiFile = hClose . _filehandle getChromTreeAsList :: Endianness -> Handle -> Integer -- ^ Location -> IO [(B.ByteString, (Int, Int))] getChromTreeAsList e h offset = do magic <- hSeek h AbsoluteSeek offset >> hReadInt32 e h if magic /= 0x78CA8C91 then error "wrong chromosome tree header" else do _ <- hReadInt32 e h -- blockSize, not used keySize <- hReadInt32 e h _ <- hReadInt32 e h -- valSize, not used _ <- hReadInt64 e h -- itemCount, not used _ <- hReadInt64 e h -- reserved, not used traverseTree keySize where traverseTree ks = go where go = readNode >>= \case (True, n) -> replicateM n $ readLeafItem ks (False, n) -> fmap concat $ replicateM n (readNonLeafItem ks) >>= mapM ( \(_, off) -> do hSeek h AbsoluteSeek $ fromIntegral off go ) readNode = do isLeaf <- hReadBool h _ <- hReadBool h count <- hReadInt16 e h return (isLeaf, count) readLeafItem n = do key <- B.filter (/=0) <$> B.hGet h n chromId <- hReadInt32 e h chromSize <- hReadInt32 e h return (key, (chromId, chromSize)) readNonLeafItem n = do key <- B.filter (/=0) <$> B.hGet h n childOffset <- hReadInt64 e h return (key, childOffset) overlap :: Int -> Int -> Int -> Int -> Int -> Int -> Int -> Bool overlap qChr qStart qEnd rStartChr rStart rEndChr rEnd = (qChr, qStart) < (rEndChr, rEnd) && (qChr, qEnd) > (rStartChr, rStart) {-# INLINE overlap #-} getChromId :: BbiFile -> B.ByteString -> Maybe ChromID getChromId fl chr = fmap fst . M.lookup chr . _chromTree $ fl {-# INLINE getChromId #-} overlappingBlocks :: BbiFile -> (ChromID, Int, Int) -> IO [Block] overlappingBlocks fl (cid, start, end) = if overlap cid start end (_startChromIx $ _rtree fl) (_startBase $ _rtree fl) (_endChromIx $ _rtree fl) (_endBase $ _rtree fl) then catMaybes <$> go ((fromIntegral . _fullIndexOffset . _header) fl + 48) else return [] where h = _filehandle fl e = _endian . _header $ fl go i = do hSeek h AbsoluteSeek i (isLeaf, n) <- readNode if isLeaf then replicateM n readLeafItem else do pos <- fromIntegral <$> hTell h results <- mapM (readNonLeafItem . (\x -> pos + x*24)) [0..n-1] return $ concat results readNode = do isLeaf <- hReadBool h _ <- hReadBool h count <- hReadInt16 e h return (isLeaf, count) readLeafItem = do stCIx <- hReadInt32 e h st <- hReadInt32 e h edCIx <- hReadInt32 e h ed <- hReadInt32 e h offset <- hReadInt64 e h size <- hReadInt64 e h return $ if overlap cid start end stCIx st edCIx ed then Just (offset, size) else Nothing readNonLeafItem p = do hSeek h AbsoluteSeek $ fromIntegral p stCIx <- hReadInt32 e h st <- hReadInt32 e h edCIx <- hReadInt32 e h ed <- hReadInt32 e h next <- hReadInt64 e h if overlap cid start end stCIx st edCIx ed then go $ fromIntegral next else return [] {-# INLINE overlappingBlocks #-} type Block = (Offset, BlockSize) type Offset = Int type BlockSize = Int -- | Read Blocks from the file. readBlocks :: BbiFile -> [Block] -> Source IO B.ByteString readBlocks fl blks = forM_ blks $ \(offset, size) -> do bs <- lift $ hSeek handle AbsoluteSeek (fromIntegral offset) >> BL.hGet handle size yield . BL.toStrict . decompress $ bs where handle = _filehandle fl {-# INLINE readBlocks #-} {- streamBbi :: BbiFile -> Source IO (B.ByteString, Int, Int, B.ByteString) streamBbi fl = mapM_ (query fl) allChroms where allChroms = map (\(chr, (_, size)) -> (chr, 0, size-1)) . M.toList . _chromTree $ fl -}