{-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE FlexibleContexts #-} -------------------------------------------------------------------------------- -- | -- Module : $Header$ -- Copyright : (c) 2014 Kai Zhang -- License : MIT -- Maintainer : kai@kzhang.org -- Stability : experimental -- Portability : portable -- functions for processing BED files -------------------------------------------------------------------------------- module Bio.Data.Bed ( BEDLike(..) , BEDTree , bedToTree , sortedBedToTree , splitBed , splitBedBySize , splitBedBySizeLeft , splitBedBySizeOverlap , Sorted(..) , sortBed , intersectBed , intersectBedWith , intersectSortedBed , intersectSortedBedWith , isOverlapped , mergeBed , mergeBedWith , mergeSortedBed , mergeSortedBedWith -- * BED6 format , BED(..) -- * BED3 format , BED3(..) , fetchSeq , fetchSeq' , compareBed , convert ) where import Control.Arrow ((***)) import Control.Monad.State.Strict import qualified Data.ByteString.Char8 as B import Data.Conduit import qualified Data.Conduit.List as CL import Data.Default.Class (Default(..)) import Data.Function (on) import qualified Data.Foldable as F import qualified Data.HashMap.Strict as M import qualified Data.IntervalMap.Strict as IM import Data.List (groupBy) import Data.Maybe (fromMaybe) import qualified Data.Vector as V import qualified Data.Vector.Algorithms.Intro as I import System.IO import Bio.Seq import Bio.Seq.IO import Bio.Utils.Misc ( binBySizeLeft, binBySize, binBySizeOverlap, bins , readInt, readDouble ) -- | a class representing BED-like data, e.g., BED3, BED6 and BED12. BED format -- uses 0-based index (see documentation). class BEDLike b where -- | construct bed record from chromsomoe, start location and end location asBed :: B.ByteString -> Int -> Int -> b -- | convert bytestring to bed format fromLine :: B.ByteString -> b -- | convert bed to bytestring toLine :: b -> B.ByteString -- | field accessor chrom :: b -> B.ByteString chromStart :: b -> Int chromEnd :: b -> Int toBed3 :: b -> BED3 toBed3 bed = BED3 (chrom bed) (chromStart bed) (chromEnd bed) -- | return the size of a bed region size :: b -> Int size bed = chromEnd bed - chromStart bed hReadBed :: MonadIO m => Handle -> Source m b hReadBed h = do eof <- liftIO $ hIsEOF h unless eof $ do line <- liftIO $ B.hGetLine h yield $ fromLine line hReadBed h {-# INLINE hReadBed #-} -- | non-streaming version hReadBed' :: MonadIO m => Handle -> m [b] hReadBed' h = hReadBed h $$ CL.consume {-# INLINE hReadBed' #-} readBed :: MonadIO m => FilePath -> Source m b readBed fl = do handle <- liftIO $ openFile fl ReadMode hReadBed handle liftIO $ hClose handle {-# INLINE readBed #-} -- | non-streaming version readBed' :: MonadIO m => FilePath -> m [b] readBed' fl = readBed fl $$ CL.consume {-# INLINE readBed' #-} hWriteBed :: MonadIO m => Handle -> Sink b m () hWriteBed handle = do x <- await case x of Nothing -> return () Just bed -> (liftIO . B.hPutStrLn handle . toLine) bed >> hWriteBed handle {-# INLINE hWriteBed #-} hWriteBed' :: MonadIO m => Handle -> [b] -> m () hWriteBed' handle beds = CL.sourceList beds $$ hWriteBed handle {-# INLINE hWriteBed' #-} writeBed :: MonadIO m => FilePath -> Sink b m () writeBed fl = do handle <- liftIO $ openFile fl WriteMode hWriteBed handle liftIO $ hClose handle {-# INLINE writeBed #-} writeBed' :: MonadIO m => FilePath -> [b] -> m () writeBed' fl beds = CL.sourceList beds $$ writeBed fl {-# INLINE writeBed' #-} {-# MINIMAL asBed, fromLine, toLine, chrom, chromStart, chromEnd #-} type BEDTree a = M.HashMap B.ByteString (IM.IntervalMap Int a) -- | convert a set of bed records to interval tree, with combining function for -- equal keys sortedBedToTree :: (BEDLike b, F.Foldable f) => (a -> a -> a) -> Sorted (f (b, a)) -> BEDTree a sortedBedToTree f (Sorted xs) = M.fromList . map ((head *** IM.fromAscListWith f) . unzip) . groupBy ((==) `on` fst) . map (\(bed, x) -> (chrom bed, (IM.IntervalCO (chromStart bed) (chromEnd bed), x))) . F.toList $ xs {-# INLINE sortedBedToTree #-} bedToTree :: BEDLike b => (a -> a -> a) -> [(b, a)] -> BEDTree a bedToTree f xs = M.fromList . map ((head *** IM.fromAscListWith f) . unzip) . groupBy ((==) `on` fst) . map (\(bed, x) -> (chrom bed, (IM.IntervalCO (chromStart bed) (chromEnd bed), x))) . V.toList $ xs' where xs' = V.create $ do v <- V.unsafeThaw . V.fromList $ xs I.sortBy (compareBed `on` fst) v return v {-# INLINE bedToTree #-} -- | split a bed region into k consecutive subregions, discarding leftovers splitBed :: BEDLike b => Int -> b -> [b] splitBed k bed = map (uncurry (asBed chr)) . bins k $ (s, e) where chr = chrom bed s = chromStart bed e = chromEnd bed {-# INLINE splitBed #-} -- | split a bed region into consecutive fixed size subregions, discarding leftovers splitBedBySize :: BEDLike b => Int -> b -> [b] splitBedBySize k bed = map (uncurry (asBed chr)) . binBySize k $ (s, e) where chr = chrom bed s = chromStart bed e = chromEnd bed {-# INLINE splitBedBySize #-} -- | split a bed region into consecutive fixed size subregions, including leftovers splitBedBySizeLeft :: BEDLike b => Int -> b -> [b] splitBedBySizeLeft k bed = map (uncurry (asBed chr)) . binBySizeLeft k $ (s, e) where chr = chrom bed s = chromStart bed e = chromEnd bed {-# INLINE splitBedBySizeLeft #-} splitBedBySizeOverlap :: BEDLike b => Int -- ^ bin size -> Int -- ^ overlap size -> b -> [b] splitBedBySizeOverlap k o bed = map (uncurry (asBed chr)) . binBySizeOverlap k o $ (s, e) where chr = chrom bed s = chromStart bed e = chromEnd bed {-# INLINE splitBedBySizeOverlap #-} -- | a type to imply that underlying data structure is sorted newtype Sorted b = Sorted {fromSorted :: b} deriving (Show, Read, Eq) compareBed :: (BEDLike b1, BEDLike b2) => b1 -> b2 -> Ordering compareBed x y = compare x' y' where x' = (chrom x, chromStart x, chromEnd x) y' = (chrom y, chromStart y, chromEnd y) {-# INLINE compareBed #-} -- | sort BED, first by chromosome (alphabetical order), then by chromStart, last by chromEnd sortBed :: BEDLike b => [b] -> Sorted (V.Vector b) sortBed beds = Sorted $ V.create $ do v <- V.unsafeThaw . V.fromList $ beds I.sortBy compareBed v return v {-# INLINE sortBed #-} -- | return records in A that are overlapped with records in B intersectBed :: (BEDLike b1, BEDLike b2) => [b1] -> [b2] -> [b1] intersectBed a b = intersectSortedBed a b' where b' = sortBed b {-# INLINE intersectBed #-} -- | return records in A that are overlapped with records in B intersectSortedBed :: (BEDLike b1, BEDLike b2) => [b1] -> Sorted (V.Vector b2) -> [b1] intersectSortedBed a (Sorted b) = filter (not . null . f) a where f bed = let chr = chrom bed interval = IM.IntervalCO (chromStart bed) $ chromEnd bed in IM.intersecting (M.lookupDefault IM.empty chr tree) interval tree = sortedBedToTree (\_ _ -> False) . Sorted . zip (V.toList b) . repeat $ False {-# INLINE intersectSortedBed #-} intersectBedWith :: (BEDLike b1, BEDLike b2) => ([b2] -> a) -> [b1] -> [b2] -> [(b1, a)] intersectBedWith fn a = intersectSortedBedWith fn a . sortBed {-# INLINE intersectBedWith #-} intersectSortedBedWith :: (BEDLike b1, BEDLike b2) => ([b2] -> a) -> [b1] -> Sorted (V.Vector b2) -> [(b1, a)] intersectSortedBedWith fn a (Sorted b) = map f a where f bed = let chr = chrom bed interval = IM.IntervalCO (chromStart bed) $ chromEnd bed in (bed, fn $ map snd $ IM.intersecting (M.lookupDefault IM.empty chr tree) interval) tree = sortedBedToTree const . Sorted . V.toList . V.zip b $ b {-# INLINE intersectSortedBedWith #-} isOverlapped :: (BEDLike b1, BEDLike b2) => b1 -> b2 -> Bool isOverlapped bed1 bed2 = chr == chr' && not (e <= s' || e' <= s) where chr = chrom bed1 s = chromStart bed1 e = chromEnd bed1 chr' = chrom bed2 s' = chromStart bed2 e' = chromEnd bed2 mergeBed :: (BEDLike b, Monad m) => [b] -> Source m b mergeBed = mergeSortedBed . sortBed {-# INLINE mergeBed #-} mergeBedWith :: (BEDLike b, Monad m) => ([b] -> b) -> [b] -> Source m b mergeBedWith f = mergeSortedBedWith f . sortBed {-# INLINE mergeBedWith #-} mergeSortedBed :: (BEDLike b, Monad m) => Sorted (V.Vector b) -> Source m b mergeSortedBed = mergeSortedBedWith f where f xs = asBed (chrom $ head xs) lo hi where lo = minimum . map chromStart $ xs hi = maximum . map chromEnd $ xs {-# INLINE mergeSortedBed #-} mergeSortedBedWith :: (BEDLike b, Monad m) => ([b] -> b) -> Sorted (V.Vector b) -> Source m b mergeSortedBedWith mergeFn (Sorted beds) = do (_, r) <- V.foldM' f acc0 . V.tail $ beds yield $ mergeFn r where x0 = V.head beds acc0 = ((chrom x0, chromStart x0, chromEnd x0), [x0]) f ((chr,lo,hi), acc) bed | chr /= chr' || s' > hi = yield (mergeFn acc) >> return ((chr',s',e'), [bed]) | e' > hi = return ((chr',lo,e'), bed:acc) | otherwise = return ((chr,lo,hi), bed:acc) where chr' = chrom bed s' = chromStart bed e' = chromEnd bed {-# INLINE mergeSortedBedWith #-} -- * BED6 format -- | BED6 format, as described in http://genome.ucsc.edu/FAQ/FAQformat.html#format1.7 data BED = BED { _chrom :: !B.ByteString , _chromStart :: {-# UNPACK #-} !Int , _chromEnd :: {-# UNPACK #-} !Int , _name :: !(Maybe B.ByteString) , _score :: !(Maybe Double) , _strand :: !(Maybe Bool) -- ^ True: "+", False: "-" } deriving (Eq, Show, Read) instance Default BED where def = BED { _chrom = "" , _chromStart = 0 , _chromEnd = 0 , _name = Nothing , _score = Nothing , _strand = Nothing } instance BEDLike BED where asBed chr s e = BED chr s e Nothing Nothing Nothing fromLine l = evalState (f (B.split '\t' l)) 1 where f :: [B.ByteString] -> State Int BED f [] = do i <- get if i <= 3 then error "Read BED fail: Incorrect number of fields" else return def f (x:xs) = do i <- get put (i+1) bed <- f xs case i of 1 -> return $ bed {_chrom = x} 2 -> return $ bed {_chromStart = readInt x} 3 -> return $ bed {_chromEnd = readInt x} 4 -> return $ bed {_name = guard' x} 5 -> return $ bed {_score = getScore x} 6 -> return $ bed {_strand = getStrand x} _ -> return def guard' x | x == "." = Nothing | otherwise = Just x getScore x | x == "." = Nothing | otherwise = Just . readDouble $ x getStrand str | str == "-" = Just False | str == "+" = Just True | otherwise = Nothing {-# INLINE fromLine #-} toLine (BED f1 f2 f3 f4 f5 f6) = B.intercalate "\t" [ f1 , (B.pack.show) f2 , (B.pack.show) f3 , fromMaybe "." f4 , score' , strand' ] where strand' | f6 == Just True = "+" | f6 == Just False = "-" | otherwise = "." score' = case f5 of Just x -> (B.pack.show) x _ -> "." {-# INLINE toLine #-} chrom = _chrom chromStart = _chromStart chromEnd = _chromEnd -- | retreive sequences fetchSeq :: (BioSeq DNA a, MonadIO m) => Genome -> Conduit BED m (Either String (DNA a)) fetchSeq g = do gH <- liftIO $ gHOpen g table <- liftIO $ readIndex gH conduitWith gH table liftIO $ gHClose gH where conduitWith h index' = do bed <- await case bed of Just (BED chr start end _ _ isForward) -> do dna <- liftIO $ getSeq h index' (chr, start, end) case isForward of Just False -> yield $ fmap rc dna _ -> yield dna conduitWith h index' _ -> return () {-# INLINE fetchSeq #-} fetchSeq' :: (BioSeq DNA a, MonadIO m) => Genome -> [BED] -> m [Either String (DNA a)] fetchSeq' g beds = CL.sourceList beds $= fetchSeq g $$ CL.consume {-# INLINE fetchSeq' #-} -- * BED3 format data BED3 = BED3 !B.ByteString !Int !Int deriving (Eq, Show, Read) instance Default BED3 where def = BED3 "" 0 0 instance BEDLike BED3 where asBed = BED3 fromLine l = case B.split '\t' l of (a:b:c:_) -> BED3 a (readInt b) $ readInt c _ -> error "Read BED fail: Incorrect number of fields" {-# INLINE fromLine #-} toLine (BED3 f1 f2 f3) = B.intercalate "\t" [f1, (B.pack.show) f2, (B.pack.show) f3] {-# INLINE toLine #-} chrom (BED3 f1 _ _) = f1 chromStart (BED3 _ f2 _) = f2 chromEnd (BED3 _ _ f3) = f3 convert :: (BEDLike b1, BEDLike b2) => b1 -> b2 convert b = asBed (chrom b) (chromStart b) (chromEnd b) {-# INLINE convert #-}