src/Data/BBI.hs

{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE Rank2Types #-}

module Data.BBI
    ( BbiFile(..)
    , BbiFileHeader(..)
    , FileType(..)
    , openBbiFile
    , closeBbiFile
    , getBbiFileHeader
    , getChromId
    , overlappingBlocks
    , readBlocks
    ) where

import Codec.Compression.Zlib (decompress)
import Control.Applicative ((<$>))
import Control.Monad.State
import qualified Data.ByteString as B
import qualified Data.ByteString.Lazy as BL
import Data.Conduit
import Data.Maybe
import qualified Data.Map.Strict as M
import System.IO
import Data.BBI.Utils

data FileType = BigBed
              | BigWig
    deriving (Show)

data BbiFile = BbiFile
    { _filehandle :: Handle
    , _header :: !BbiFileHeader
    , _chromTree :: !(M.Map B.ByteString (Int, Int))
    , _rtree :: !RTreeHeader
    } deriving (Show)

data BbiFileHeader = BbiFileHeader
    { _filetype :: !FileType
    , _endian :: !Endianness
    , _version :: !Int
    , _zoomlevels :: !Int
    , _chrTreeOffset :: !Int
    , _fullDataOffset :: !Int
    , _fullIndexOffset :: !Int
    , _filedCount :: !Int
    , _definedFiledCount :: !Int
    , _autoSqlOffset :: !Int
    , _totalSummaryOffset :: !Int
    , _uncompressBufSize :: !Int
    , _reserved :: !Int
    } deriving (Show)

openBbiFile :: FilePath -> IO BbiFile
openBbiFile fl = do
    h <- openFile fl ReadMode
    header <- fromRight <$> getBbiFileHeader h
    t <- fromRight <$> getChromTreeAsList (_endian header) h (fromIntegral $ _chrTreeOffset header)
    rtree <- fromRight <$> getRTreeHeader (_endian header) h (fromIntegral $ _fullIndexOffset header)
    return $ BbiFile h header (M.fromList t) rtree

closeBbiFile :: BbiFile -> IO ()
closeBbiFile = hClose . _filehandle

getBbiFileHeader :: Handle -> IO (Either String BbiFileHeader)
getBbiFileHeader h = do
    ft <- getFileType h
    case ft of
        Right (t, e) -> do
            hSeek h AbsoluteSeek 4
            v <- hReadInt16 e h
            zl <- hReadInt16 e h
            ct <- hReadInt64 e h
            fd <- hReadInt64 e h
            fi <- hReadInt64 e h
            fc <- hReadInt16 e h
            df <- hReadInt16 e h
            as <- hReadInt64 e h
            ts <- hReadInt64 e h
            ub <- hReadInt32 e h
            r <- hReadInt64 e h
            return . Right $ BbiFileHeader t e v zl ct fd fi fc df as ts ub r
        Left e -> return $ Left e

getFileType :: Handle -> IO (Either String (FileType, Endianness))
getFileType h = do
    hSeek h AbsoluteSeek 0
    magic <- B.hGet h 4
    let magicBE = readInt32 BE magic
        magicLE = readInt32 LE magic
    return $ case () of
        _ | magicBE == bigBedMagic -> Right (BigBed, BE)
          | magicBE == bigWigMagic -> Right (BigWig, BE)
          | magicLE == bigBedMagic -> Right (BigBed, LE)
          | magicLE == bigWigMagic -> Right (BigWig, LE)
          | otherwise -> Left "not a bigBed/bigWig file"
  where
    bigWigMagic = 0x888FFC26
    bigBedMagic = 0x8789F2EB
{-# INLINE getFileType #-}

getChromTreeAsList :: Endianness -> Handle -> Integer -> IO (Either String [(B.ByteString, (Int, Int))])
getChromTreeAsList e h offset = do
    hSeek h AbsoluteSeek offset
    magic <- hReadInt32 e h
    if magic /= chromTreeMagic
       then return $ Left "wrong chromosome tree header"
       else do
           _ <- hReadInt32 e h -- blockSize, not used
           keySize <- hReadInt32 e h
           _ <- hReadInt32 e h -- valSize, not used
           _ <- hReadInt64 e h -- itemCount, not used
           _ <- hReadInt64 e h -- reserved, not used
           Right <$> traverseTree keySize
  where
    traverseTree ks = go
      where
        go = do (isLeaf, n) <- readNode
                if isLeaf
                   then replicateM n $ readLeafItem ks
                   else do
                       xs <- replicateM n $ readNonLeafItem ks
                       rs <- forM xs $ \(_, off) -> do
                           hSeek h AbsoluteSeek $ fromIntegral off
                           go
                       return $ concat rs

    chromTreeMagic = 0x78CA8C91

    readNode = do isLeaf <- hReadBool h
                  _ <- hReadBool h
                  count <- hReadInt16 e h
                  return (isLeaf, count)

    readLeafItem n = do key <- B.filter (/=0) <$> B.hGet h n
                        chromId <- hReadInt32 e h
                        chromSize <- hReadInt32 e h
                        return (key, (chromId, chromSize))

    readNonLeafItem n = do key <- B.filter (/=0) <$> B.hGet h n
                           childOffset <- hReadInt64 e h
                           return (key, childOffset)

data RTreeHeader = RTreeHeader
    { _blockSize :: Int
    , _itemCount :: Int
    , _startChromIx :: Int
    , _startBase :: Int
    , _endChromIx :: Int
    , _endBase :: Int
    , _endFileOffset :: Int
    , _itemsPerSlot :: Int
    , _rTreeReserved :: Int
    } deriving (Show)

getRTreeHeader :: Endianness -> Handle -> Integer -> IO (Either String RTreeHeader)
getRTreeHeader e h i = do
    hSeek h AbsoluteSeek i
    magic <- hReadInt32 e h
    if magic /= rtmagic
       then return $ Left "incorrect RTree magic"
       else do
           bs <- hReadInt32 e h
           ic <- hReadInt64 e h
           sc <- hReadInt32 e h
           sb <- hReadInt32 e h
           ec <- hReadInt32 e h
           eb <- hReadInt32 e h
           eo <- hReadInt64 e h
           ips <- hReadInt32 e h
           r <- hReadInt32 e h
           return $ Right $ RTreeHeader bs ic sc sb ec eb eo ips r
  where
    rtmagic = 0x2468ACE0

overlap :: Int -> Int -> Int -> Int -> Int -> Int -> Int -> Bool
overlap qChr qStart qEnd rStartChr rStart rEndChr rEnd = 
    (qChr, qStart) < (rEndChr, rEnd) && (qChr, qEnd) > (rStartChr, rStart)
{-# INLINE overlap #-}

getChromId :: BbiFile -> B.ByteString -> Maybe Int
getChromId fl chr = fmap fst . M.lookup chr . _chromTree $ fl
{-# INLINE getChromId #-}

overlappingBlocks :: BbiFile -> (Int, Int, Int) -> IO [(Int, Int)]
overlappingBlocks fl (cid, start, end) =
    if overlap cid start end (_startChromIx $ _rtree fl)
                             (_startBase $ _rtree fl)
                             (_endChromIx $ _rtree fl)
                             (_endBase $ _rtree fl)
       then catMaybes <$> go ((fromIntegral . _fullIndexOffset . _header) fl + 48)
       else return []
  where
    h = _filehandle fl
    e = _endian . _header $ fl
    go i = do
        hSeek h AbsoluteSeek i
        (isLeaf, n) <- readNode 
        if isLeaf
           then replicateM n readLeafItem
           else do 
               pos <- fromIntegral <$> hTell h
               results <- mapM (readNonLeafItem . (\x -> pos + x*24)) [0..n-1]
               return $ concat results

    readNode = do isLeaf <- hReadBool h
                  _ <- hReadBool h
                  count <- hReadInt16 e h
                  return (isLeaf, count)

    readLeafItem = do
        stCIx <- hReadInt32 e h
        st <- hReadInt32 e h
        edCIx <- hReadInt32 e h
        ed <- hReadInt32 e h
        offset <- hReadInt64 e h
        size <- hReadInt64 e h
        return $ if overlap cid start end stCIx st edCIx ed
                    then Just (offset, size)
                    else Nothing

    readNonLeafItem p = do 
        hSeek h AbsoluteSeek $ fromIntegral p
        stCIx <- hReadInt32 e h
        st <- hReadInt32 e h
        edCIx <- hReadInt32 e h
        ed <- hReadInt32 e h
        next <- hReadInt64 e h
        if overlap cid start end stCIx st edCIx ed
           then go $ fromIntegral next
           else return []
{-# INLINE overlappingBlocks #-}

readBlocks :: BbiFile -> [(Int, Int)] -> Source IO B.ByteString
readBlocks fl blks = forM_ blks $ \(offset, size) -> do
    bs <- lift $ do hSeek handle AbsoluteSeek $ fromIntegral offset
                    BL.hGet handle size
    yield . BL.toStrict . decompress $ bs
  where
    handle = _filehandle fl
{-# INLINE readBlocks #-}

{-
streamBbi :: BbiFile -> Source IO (B.ByteString, Int, Int, B.ByteString)
streamBbi fl = mapM_ (query fl) allChroms
  where
    allChroms = map (\(chr, (_, size)) -> (chr, 0, size-1)) . M.toList . _chromTree $ fl
    -}