module Bio.ChIPSeq
( monoColonalize
, rpkmBed
, rpkmSortedBed
, countTagsBinBed
, countTagsBinBed'
, tagCountDistr
, peakCluster
) where
import Control.Monad (liftM, forM_, forM)
import Control.Monad.Primitive (PrimMonad)
import Conduit
import Data.Function (on)
import qualified Data.Foldable as F
import qualified Data.HashMap.Strict as M
import qualified Data.IntervalMap as IM
import Data.Maybe (fromJust)
import qualified Data.Vector as V
import qualified Data.Vector.Unboxed as U
import qualified Data.Vector.Algorithms.Intro as I
import qualified Data.Vector.Generic as G
import qualified Data.Vector.Generic.Mutable as GM
import Bio.Data.Bed
monoColonalize :: Monad m => Conduit BED m BED
monoColonalize = do
x <- await
F.forM_ x loop
where
loop prev = do
x <- await
case x of
Nothing -> yield prev
Just current ->
case () of
_ | compareBed prev current == GT ->
error $ "Bio.ChIPSeq.monoColonalize: Input is not sorted: " ++ show prev ++ " > " ++ show current
| chromStart prev == chromStart current &&
chromEnd prev == chromEnd current &&
chrom prev == chrom current &&
_strand prev == _strand current -> loop prev
| otherwise -> yield prev >> loop current
rpkmBed :: (PrimMonad m, BEDLike b, G.Vector v Double)
=> [b] -> Sink BED m (v Double)
rpkmBed regions = do
v <- lift $ do v' <- V.unsafeThaw . V.fromList . zip [0..] $ regions
I.sortBy (compareBed `on` snd) v'
V.unsafeFreeze v'
let (idx, sortedRegions) = V.unzip v
n = G.length idx
rc <- rpkmSortedBed $ Sorted sortedRegions
lift $ do
result <- GM.new n
G.sequence_ . G.imap (\x i -> GM.unsafeWrite result i (rc U.! x)) $ idx
G.unsafeFreeze result
rpkmSortedBed :: (PrimMonad m, BEDLike b, G.Vector v Double)
=> Sorted (V.Vector b) -> Sink BED m (v Double)
rpkmSortedBed (Sorted regions) = do
vec <- lift $ GM.replicate l 0
n <- foldMC (count vec) (0 :: Int)
let factor = fromIntegral n / 1e9
lift $ liftM (G.imap (\i x -> x / factor / (fromIntegral . bedSize) (regions V.! i)))
$ G.unsafeFreeze vec
where
count v nTags tag = do
let chr = chrom tag
p | _strand tag == Just True = chromStart tag
| _strand tag == Just False = chromEnd tag 1
| otherwise = error "Unkown strand"
xs = concat $ IM.elems $
IM.containing (M.lookupDefault IM.empty chr intervalMap) p
addOne v xs
return $ succ nTags
intervalMap = sortedBedToTree (++) . Sorted . G.toList . G.zip regions .
G.map return . G.enumFromN 0 $ l
addOne v' = mapM_ $ \x -> GM.unsafeRead v' x >>= GM.unsafeWrite v' x . (+1)
l = G.length regions
countTagsBinBed :: (Integral a, PrimMonad m, G.Vector v a, BEDLike b)
=> Int
-> [b]
-> Sink BED m ([v a], Int)
countTagsBinBed k beds = do
initRC <- lift $ forM beds $ \bed -> do
let start = chromStart bed
end = chromEnd bed
num = (end start) `div` k
index i = (i start) `div` k
v <- GM.replicate num 0
return (v, index)
sink 0 $ V.fromList initRC
where
sink !nTags vs = do
tag <- await
case tag of
Just (BED chr start end _ _ strand) -> do
let p | strand == Just True = start
| strand == Just False = end 1
| otherwise = error "profiling: unkown strand"
overlaps = concat $ IM.elems $
IM.containing (M.lookupDefault IM.empty chr intervalMap) p
lift $ forM_ overlaps $ \x -> do
let (v, idxFn) = vs `G.unsafeIndex` x
i = let i' = idxFn p
l = GM.length v
in if i' >= l then l 1 else i'
GM.unsafeRead v i >>= GM.unsafeWrite v i . (+1)
sink (nTags+1) vs
_ -> do rc <- lift $ mapM (G.unsafeFreeze . fst) $ G.toList vs
return (rc, nTags)
intervalMap = bedToTree (++) $ zip beds $ map return [0..]
countTagsBinBed' :: (Integral a, PrimMonad m, G.Vector v a, BEDLike b1, BEDLike b2)
=> Int
-> [b1]
-> Sink b2 m ([v a], Int)
countTagsBinBed' k beds = do
initRC <- lift $ forM beds $ \bed -> do
let start = chromStart bed
end = chromEnd bed
num = (end start) `div` k
index i = (i start) `div` k
v <- GM.replicate num 0
return (v, index)
sink 0 $ V.fromList initRC
where
sink !nTags vs = do
tag <- await
case tag of
Just bed -> do
let chr = chrom bed
start = chromStart bed
end = chromEnd bed
overlaps = concat $ IM.elems $ IM.intersecting
(M.lookupDefault IM.empty chr intervalMap) $ IM.IntervalCO start end
lift $ forM_ overlaps $ \x -> do
let (v, idxFn) = vs `G.unsafeIndex` x
lo = let i = idxFn start
in if i < 0 then 0 else i
hi = let i = idxFn end
l = GM.length v
in if i >= l then l 1 else i
forM_ [lo..hi] $ \i ->
GM.unsafeRead v i >>= GM.unsafeWrite v i . (+1)
sink (nTags+1) vs
_ -> do rc <- lift $ mapM (G.unsafeFreeze . fst) $ G.toList vs
return (rc, nTags)
intervalMap = bedToTree (++) $ zip beds $ map return [0..]
tagCountDistr :: PrimMonad m => G.Vector v Int => Sink BED m (v Int)
tagCountDistr = loop M.empty
where
loop m = do
x <- await
case x of
Just (BED chr s e _ _ (Just str)) -> do
let p | str = s
| otherwise = 1 e
case M.lookup chr m of
Just table -> loop $ M.insert chr (M.insertWith (+) p 1 table) m
_ -> loop $ M.insert chr (M.fromList [(p,1)]) m
_ -> lift $ do
vec <- GM.replicate 100 0
F.forM_ m $ \table ->
F.forM_ table $ \v -> do
let i = min 99 v
GM.unsafeRead vec i >>= GM.unsafeWrite vec i . (+1)
G.unsafeFreeze vec
peakCluster :: (BEDLike b, Monad m)
=> [b]
-> Int
-> Int
-> Source m BED
peakCluster peaks r th = mergeBedWith mergeFn peaks' $= filterC g
where
peaks' = map f peaks
f b = let chr = chrom b
c = (chromStart b + chromEnd b) `div` 2
in BED3 chr (cr) (c+r)
mergeFn xs = BED (chrom $ head xs) lo hi Nothing (Just $ fromIntegral $ length xs) Nothing
where
lo = minimum . map chromStart $ xs
hi = maximum . map chromEnd $ xs
g b = fromJust (_score b) >= fromIntegral th