module Bio.ChIPSeq
( monoColonalize
, rpkmBed
, rpkmSortedBed
, countTagsBinBed
, countTagsBinBed'
, tagCountDistr
, peakCluster
) where
import Conduit
import Control.Monad (forM, forM_, liftM)
import Control.Monad.Primitive (PrimMonad)
import qualified Data.Foldable as F
import Data.Function (on)
import qualified Data.HashMap.Strict as M
import qualified Data.IntervalMap as IM
import Control.Lens ((^.), (&), (.~))
import Data.Maybe (fromJust, fromMaybe)
import qualified Data.Vector as V
import qualified Data.Vector.Algorithms.Intro as I
import qualified Data.Vector.Generic as G
import qualified Data.Vector.Generic.Mutable as GM
import qualified Data.Vector.Unboxed as U
import Bio.Data.Bed
monoColonalize :: Monad m => ConduitT BED BED m ()
monoColonalize = do
x <- headC
case x of
Just b -> yield b >> concatMapAccumC f b
Nothing -> return ()
where
f cur prev = case compareBed prev cur of
GT -> error $
"Input is not sorted: " ++ show prev ++ " > " ++ show cur
LT -> (cur, [cur])
_ -> if prev^.strand == cur^.strand then (cur, []) else (cur, [cur])
rpkmBed :: (PrimMonad m, BEDLike b, G.Vector v Double)
=> [b] -> ConduitT BED o m (v Double)
rpkmBed regions = do
v <- lift $ do v' <- V.unsafeThaw . V.fromList . zip [0..] $ regions
I.sortBy (compareBed `on` snd) v'
V.unsafeFreeze v'
let (idx, sortedRegions) = V.unzip v
n = G.length idx
rc <- rpkmSortedBed $ Sorted sortedRegions
lift $ do
result <- GM.new n
G.sequence_ . G.imap (\x i -> GM.unsafeWrite result i (rc U.! x)) $ idx
G.unsafeFreeze result
rpkmSortedBed :: (PrimMonad m, BEDLike b, G.Vector v Double)
=> Sorted (V.Vector b) -> ConduitT BED o m (v Double)
rpkmSortedBed (Sorted regions) = do
vec <- lift $ GM.replicate l 0
n <- foldMC (count vec) (0 :: Int)
let factor = fromIntegral n / 1e9
lift $ liftM (G.imap (\i x -> x / factor / (fromIntegral . size) (regions V.! i)))
$ G.unsafeFreeze vec
where
count v nTags tag = do
let p | tag^.strand == Just True = tag^.chromStart
| tag^.strand == Just False = tag^.chromEnd 1
| otherwise = error "Unkown strand"
xs = concat $ IM.elems $
IM.containing (M.lookupDefault IM.empty (tag^.chrom) intervalMap) p
addOne v xs
return $ succ nTags
intervalMap = sortedBedToTree (++) . Sorted . G.toList . G.zip regions .
G.map return . G.enumFromN 0 $ l
addOne v' = mapM_ $ \x -> GM.unsafeRead v' x >>= GM.unsafeWrite v' x . (+1)
l = G.length regions
countTagsBinBed :: (Integral a, PrimMonad m, G.Vector v a, BEDLike b)
=> Int
-> [b]
-> ConduitT BED o m ([v a], Int)
countTagsBinBed k beds = do
initRC <- lift $ forM beds $ \bed -> do
let start = bed^.chromStart
end = bed^.chromEnd
num = (end start) `div` k
index i = (i start) `div` k
v <- GM.replicate num 0
return (v, index)
sink 0 $ V.fromList initRC
where
sink !nTags vs = do
tag <- await
case tag of
Just bed -> do
let p | bed^.strand == Just True = bed^.chromStart
| bed^.strand == Just False = bed^.chromEnd 1
| otherwise = error "profiling: unkown strand"
overlaps = concat $ IM.elems $
IM.containing (M.lookupDefault IM.empty (bed^.chrom) intervalMap) p
lift $ forM_ overlaps $ \x -> do
let (v, idxFn) = vs `G.unsafeIndex` x
i = let i' = idxFn p
l = GM.length v
in if i' >= l then l 1 else i'
GM.unsafeRead v i >>= GM.unsafeWrite v i . (+1)
sink (nTags+1) vs
_ -> do rc <- lift $ mapM (G.unsafeFreeze . fst) $ G.toList vs
return (rc, nTags)
intervalMap = bedToTree (++) $ zip beds $ map return [0..]
countTagsBinBed' :: (Integral a, PrimMonad m, G.Vector v a, BEDLike b1, BEDLike b2)
=> Int
-> [b1]
-> ConduitT b2 o m ([v a], Int)
countTagsBinBed' k beds = do
initRC <- lift $ forM beds $ \bed -> do
let start = bed^.chromStart
end = bed^.chromEnd
num = (end start) `div` k
index i = (i start) `div` k
v <- GM.replicate num 0
return (v, index)
sink 0 $ V.fromList initRC
where
sink !nTags vs = do
tag <- await
case tag of
Just bed -> do
let chr = bed^.chrom
start = bed^.chromStart
end = bed^.chromEnd
overlaps = concat $ IM.elems $ IM.intersecting
(M.lookupDefault IM.empty chr intervalMap) $ IM.IntervalCO start end
lift $ forM_ overlaps $ \x -> do
let (v, idxFn) = vs `G.unsafeIndex` x
lo = let i = idxFn start
in if i < 0 then 0 else i
hi = let i = idxFn end
l = GM.length v
in if i >= l then l 1 else i
forM_ [lo..hi] $ \i ->
GM.unsafeRead v i >>= GM.unsafeWrite v i . (+1)
sink (nTags+1) vs
_ -> do rc <- lift $ mapM (G.unsafeFreeze . fst) $ G.toList vs
return (rc, nTags)
intervalMap = bedToTree (++) $ zip beds $ map return [0..]
tagCountDistr :: PrimMonad m => G.Vector v Int => ConduitT BED o m (v Int)
tagCountDistr = loop M.empty
where
loop m = do
x <- await
case x of
Just bed -> do
let p | fromMaybe True (bed^.strand) = bed^.chromStart
| otherwise = 1 bed^.chromEnd
case M.lookup (bed^.chrom) m of
Just table -> loop $ M.insert (bed^.chrom) (M.insertWith (+) p 1 table) m
_ -> loop $ M.insert (bed^.chrom) (M.fromList [(p,1)]) m
_ -> lift $ do
vec <- GM.replicate 100 0
F.forM_ m $ \table ->
F.forM_ table $ \v -> do
let i = min 99 v
GM.unsafeRead vec i >>= GM.unsafeWrite vec i . (+1)
G.unsafeFreeze vec
peakCluster :: (BEDLike b, Monad m)
=> [b]
-> Int
-> Int
-> ConduitT o BED m ()
peakCluster peaks r th = mergeBedWith mergeFn peaks' .| filterC g
where
peaks' = map f peaks
f b = let c = (b^.chromStart + b^.chromEnd) `div` 2
in asBed (b^.chrom) (cr) (c+r) :: BED3
mergeFn xs = asBed (head xs ^. chrom) lo hi & score .~ Just (fromIntegral $ length xs)
where
lo = minimum $ map (^.chromStart) xs
hi = maximum $ map (^.chromEnd) xs
g b = fromJust (b^.score) >= fromIntegral th