{-# LANGUAGE BangPatterns     #-}
{-# LANGUAGE FlexibleContexts #-}

module Bio.Motif.Search
    ( findTFBS
    , findTFBSWith
    , findTFBSSlow
    , maxMatchSc
    , optimalScoresSuffix
    ) where

import           Conduit
import qualified Data.ByteString.Char8       as B
import qualified Data.Matrix.Unboxed         as M
import           Data.Ord                    (comparing)
import qualified Data.Vector.Unboxed         as U

import           Bio.Motif                   (Bkgd (..), PWM (..), scores', size)
import           Bio.Seq                     (DNA, toBS)
import qualified Bio.Seq                     as Seq (length)

-- | given a user defined threshold, look for TF binding sites on a DNA
-- sequence, using look ahead search. This function doesn't search for binding
-- sites on the reverse strand
findTFBS :: Monad m
         => Bkgd
         -> PWM
         -> DNA a
         -> Double
         -> Bool    -- ^ whether to skip ambiguous sequences. Recommend: True
                    -- in most cases
         -> ConduitT i (Int, Double) m ()
findTFBS :: Bkgd
-> PWM -> DNA a -> Double -> Bool -> ConduitT i (Int, Double) m ()
findTFBS Bkgd
bg PWM
pwm DNA a
dna Double
thres Bool
skip = Vector Double
-> Bkgd
-> PWM
-> DNA a
-> Double
-> Bool
-> ConduitT i (Int, Double) m ()
forall (m :: * -> *) a i.
Monad m =>
Vector Double
-> Bkgd
-> PWM
-> DNA a
-> Double
-> Bool
-> ConduitT i (Int, Double) m ()
findTFBSWith Vector Double
sigma Bkgd
bg PWM
pwm DNA a
dna Double
thres Bool
skip
  where
    sigma :: Vector Double
sigma = Bkgd -> PWM -> Vector Double
optimalScoresSuffix Bkgd
bg PWM
pwm
{-# INLINE findTFBS #-}

findTFBSWith :: Monad m
             => U.Vector Double   -- ^ best possible match score of suffixes
             -> Bkgd
             -> PWM
             -> DNA a
             -> Double
             -> Bool    -- ^ whether to skip ambiguous sequences. Recommend: True
                        -- in most cases
             -> ConduitT i (Int, Double) m ()
findTFBSWith :: Vector Double
-> Bkgd
-> PWM
-> DNA a
-> Double
-> Bool
-> ConduitT i (Int, Double) m ()
findTFBSWith Vector Double
sigma Bkgd
bg PWM
pwm DNA a
dna Double
thres Bool
skip = Int -> ConduitT i (Int, Double) m ()
forall (m :: * -> *) i.
Monad m =>
Int -> ConduitT i (Int, Double) m ()
loop Int
0
  where
    loop :: Int -> ConduitT i (Int, Double) m ()
loop !Int
i | Int
i Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
>= Int
l Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
n Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
1 = () -> ConduitT i (Int, Double) m ()
forall (m :: * -> *) a. Monad m => a -> m a
return ()
            | Bool
otherwise = do let (Int
d, Double
sc) = Bkgd
-> PWM -> Vector Double -> DNA a -> Int -> Double -> (Int, Double)
forall a.
Bkgd
-> PWM -> Vector Double -> DNA a -> Int -> Double -> (Int, Double)
searchFn Bkgd
bg PWM
pwm Vector Double
sigma DNA a
dna Int
i Double
thres
                             if Int
d Int -> Int -> Bool
forall a. Eq a => a -> a -> Bool
== Int
n Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
1
                                then (Int, Double) -> ConduitT i (Int, Double) m ()
forall (m :: * -> *) o i. Monad m => o -> ConduitT i o m ()
yield (Int
i, Double
sc) ConduitT i (Int, Double) m ()
-> ConduitT i (Int, Double) m () -> ConduitT i (Int, Double) m ()
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Int -> ConduitT i (Int, Double) m ()
loop (Int
iInt -> Int -> Int
forall a. Num a => a -> a -> a
+Int
1)
                                else Int -> ConduitT i (Int, Double) m ()
loop (Int
iInt -> Int -> Int
forall a. Num a => a -> a -> a
+Int
1)
    l :: Int
l = DNA a -> Int
forall (s :: * -> *) a. BioSeq' s => s a -> Int
Seq.length DNA a
dna
    n :: Int
n = PWM -> Int
size PWM
pwm
    searchFn :: Bkgd
-> PWM -> Vector Double -> DNA a -> Int -> Double -> (Int, Double)
searchFn | Bool
skip = Bkgd
-> PWM -> Vector Double -> DNA a -> Int -> Double -> (Int, Double)
forall a.
Bkgd
-> PWM -> Vector Double -> DNA a -> Int -> Double -> (Int, Double)
lookAheadSearch'
             | Bool
otherwise = Bkgd
-> PWM -> Vector Double -> DNA a -> Int -> Double -> (Int, Double)
forall a.
Bkgd
-> PWM -> Vector Double -> DNA a -> Int -> Double -> (Int, Double)
lookAheadSearch
{-# INLINE findTFBSWith #-}

{-
-- | a parallel version of findTFBS
findTFBS' :: Bkgd
          -> PWM
          -> DNA a
          -> Double
          -> Bool
          -> [Int]
findTFBS' bg pwm dna th skip = concat $ parMap rdeepseq f [0,step..l-n+1]
  where
    f x = loop x
      where
        loop i | i >= x+step || i >= l-n+1 = []
               | otherwise = let d = fst $ searchFn bg pwm sigma dna i th
                             in if d == n-1
                                   then i : loop (i+1)
                                   else loop (i+1)
    sigma = optimalScoresSuffix bg pwm
    l = Seq.length dna
    n = size pwm
    step = 500000
    searchFn | skip = lookAheadSearch'
             | otherwise = lookAheadSearch
{-# INLINE findTFBS' #-}
-}

-- | use naive search
findTFBSSlow :: Monad m => Bkgd -> PWM -> DNA a -> Double -> ConduitT i (Int, Double) m ()
findTFBSSlow :: Bkgd -> PWM -> DNA a -> Double -> ConduitT i (Int, Double) m ()
findTFBSSlow Bkgd
bg PWM
pwm DNA a
dna Double
thres = Bkgd -> PWM -> DNA a -> ConduitT i Double m ()
forall (m :: * -> *) a i.
Monad m =>
Bkgd -> PWM -> DNA a -> ConduitT i Double m ()
scores' Bkgd
bg PWM
pwm DNA a
dna ConduitT i Double m ()
-> ConduitM Double (Int, Double) m ()
-> ConduitT i (Int, Double) m ()
forall (m :: * -> *) a b c r.
Monad m =>
ConduitM a b m () -> ConduitM b c m r -> ConduitM a c m r
.| Int -> ConduitM Double (Int, Double) m ()
forall (m :: * -> *) a.
(Monad m, Num a) =>
a -> ConduitT Double (a, Double) m ()
loop Int
0
  where
    loop :: a -> ConduitT Double (a, Double) m ()
loop a
i = do Maybe Double
v <- ConduitT Double (a, Double) m (Maybe Double)
forall (m :: * -> *) i. Monad m => Consumer i m (Maybe i)
await
                case Maybe Double
v of
                    Just Double
v' ->  if Double
v' Double -> Double -> Bool
forall a. Ord a => a -> a -> Bool
>= Double
thres then (a, Double) -> ConduitT Double (a, Double) m ()
forall (m :: * -> *) o i. Monad m => o -> ConduitT i o m ()
yield (a
i, Double
v') ConduitT Double (a, Double) m ()
-> ConduitT Double (a, Double) m ()
-> ConduitT Double (a, Double) m ()
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> a -> ConduitT Double (a, Double) m ()
loop (a
ia -> a -> a
forall a. Num a => a -> a -> a
+a
1)
                                               else a -> ConduitT Double (a, Double) m ()
loop (a
ia -> a -> a
forall a. Num a => a -> a -> a
+a
1)
                    Maybe Double
_ -> () -> ConduitT Double (a, Double) m ()
forall (m :: * -> *) a. Monad m => a -> m a
return ()
{-# INLINE findTFBSSlow #-}

-- | the largest possible match scores starting from every position of a DNA sequence
maxMatchSc :: Bkgd -> PWM -> DNA a -> Double
maxMatchSc :: Bkgd -> PWM -> DNA a -> Double
maxMatchSc Bkgd
bg PWM
pwm DNA a
dna = Double -> Int -> Double
loop (-Double
1Double -> Double -> Double
forall a. Fractional a => a -> a -> a
/Double
0) Int
0
  where
    loop :: Double -> Int -> Double
loop !Double
max' !Int
i | Int
i Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
>= Int
l Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
n Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
1 = Double
max'
                  | Bool
otherwise = if Int
d Int -> Int -> Bool
forall a. Eq a => a -> a -> Bool
== Int
n Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
1 then Double -> Int -> Double
loop Double
sc (Int
iInt -> Int -> Int
forall a. Num a => a -> a -> a
+Int
1)
                                              else Double -> Int -> Double
loop Double
max' (Int
iInt -> Int -> Int
forall a. Num a => a -> a -> a
+Int
1)
      where
        (Int
d, Double
sc) = Bkgd
-> PWM -> Vector Double -> DNA a -> Int -> Double -> (Int, Double)
forall a.
Bkgd
-> PWM -> Vector Double -> DNA a -> Int -> Double -> (Int, Double)
lookAheadSearch Bkgd
bg PWM
pwm Vector Double
sigma DNA a
dna Int
i Double
max'
    sigma :: Vector Double
sigma = Bkgd -> PWM -> Vector Double
optimalScoresSuffix Bkgd
bg PWM
pwm
    l :: Int
l = DNA a -> Int
forall (s :: * -> *) a. BioSeq' s => s a -> Int
Seq.length DNA a
dna
    n :: Int
n = PWM -> Int
size PWM
pwm
{-# INLINE maxMatchSc #-}

optimalScoresSuffix :: Bkgd -> PWM -> U.Vector Double
optimalScoresSuffix :: Bkgd -> PWM -> Vector Double
optimalScoresSuffix (BG (Double
a, Double
c, Double
g, Double
t)) (PWM Maybe Int
_ Matrix Double
pwm) =
    [Double] -> Vector Double
forall a. Unbox a => [a] -> Vector a
U.fromList ([Double] -> Vector Double)
-> ([Double] -> [Double]) -> [Double] -> Vector Double
forall b c a. (b -> c) -> (a -> b) -> a -> c
. [Double] -> [Double]
forall a. [a] -> [a]
tail ([Double] -> [Double])
-> ([Double] -> [Double]) -> [Double] -> [Double]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Double -> Double) -> [Double] -> [Double]
forall a b. (a -> b) -> [a] -> [b]
map ([Double] -> Double
forall a. [a] -> a
last [Double]
sigma Double -> Double -> Double
forall a. Num a => a -> a -> a
-) ([Double] -> Vector Double) -> [Double] -> Vector Double
forall a b. (a -> b) -> a -> b
$ [Double]
sigma
  where
    sigma :: [Double]
sigma = (Double -> Vector Double -> Double)
-> Double -> [Vector Double] -> [Double]
forall b a. (b -> a -> b) -> b -> [a] -> [b]
scanl Double -> Vector Double -> Double
f Double
0 ([Vector Double] -> [Double]) -> [Vector Double] -> [Double]
forall a b. (a -> b) -> a -> b
$ Matrix Double -> [Vector Double]
forall a. Context a => Matrix a -> [Vector a]
M.toRows Matrix Double
pwm
    f :: Double -> Vector Double -> Double
f !Double
acc Vector Double
xs = let (Int
i, Double
s) = ((Int, Double) -> (Int, Double) -> Ordering)
-> Vector (Int, Double) -> (Int, Double)
forall a. Unbox a => (a -> a -> Ordering) -> Vector a -> a
U.maximumBy (((Int, Double) -> Double)
-> (Int, Double) -> (Int, Double) -> Ordering
forall a b. Ord a => (b -> a) -> b -> b -> Ordering
comparing (Int, Double) -> Double
forall a b. (a, b) -> b
snd) (Vector (Int, Double) -> (Int, Double))
-> (Vector Double -> Vector (Int, Double))
-> Vector Double
-> (Int, Double)
forall b c a. (b -> c) -> (a -> b) -> a -> c
.
                                Vector Int -> Vector Double -> Vector (Int, Double)
forall a b.
(Unbox a, Unbox b) =>
Vector a -> Vector b -> Vector (a, b)
U.zip ([Int] -> Vector Int
forall a. Unbox a => [a] -> Vector a
U.fromList ([Int
0..Int
3] :: [Int])) (Vector Double -> (Int, Double)) -> Vector Double -> (Int, Double)
forall a b. (a -> b) -> a -> b
$ Vector Double
xs
                in Double
acc Double -> Double -> Double
forall a. Num a => a -> a -> a
+ case Int
i of
                    Int
0 -> Double -> Double
forall a. Floating a => a -> a
log (Double -> Double) -> Double -> Double
forall a b. (a -> b) -> a -> b
$ Double
s Double -> Double -> Double
forall a. Fractional a => a -> a -> a
/ Double
a
                    Int
1 -> Double -> Double
forall a. Floating a => a -> a
log (Double -> Double) -> Double -> Double
forall a b. (a -> b) -> a -> b
$ Double
s Double -> Double -> Double
forall a. Fractional a => a -> a -> a
/ Double
c
                    Int
2 -> Double -> Double
forall a. Floating a => a -> a
log (Double -> Double) -> Double -> Double
forall a b. (a -> b) -> a -> b
$ Double
s Double -> Double -> Double
forall a. Fractional a => a -> a -> a
/ Double
g
                    Int
3 -> Double -> Double
forall a. Floating a => a -> a
log (Double -> Double) -> Double -> Double
forall a b. (a -> b) -> a -> b
$ Double
s Double -> Double -> Double
forall a. Fractional a => a -> a -> a
/ Double
t
                    Int
_ -> Double
forall a. HasCallStack => a
undefined
{-# INLINE optimalScoresSuffix #-}

lookAheadSearch :: Bkgd              -- ^ background nucleotide distribution
                -> PWM               -- ^ pwm
                -> U.Vector Double   -- ^ best possible match score of suffixes
                -> DNA a             -- ^ DNA sequence
                -> Int               -- ^ starting location on the DNA
                -> Double            -- ^ threshold
                -> (Int, Double)     -- ^ (d, sc_d), the largest d such that sc_d > th_d
lookAheadSearch :: Bkgd
-> PWM -> Vector Double -> DNA a -> Int -> Double -> (Int, Double)
lookAheadSearch (BG (Double
a, Double
c, Double
g, Double
t)) PWM
pwm Vector Double
sigma DNA a
dna Int
start Double
thres = (Double, Double) -> Int -> (Int, Double)
loop (Double
0, -Double
1) Int
0
  where
    loop :: (Double, Double) -> Int -> (Int, Double)
loop (!Double
acc, !Double
th_d) !Int
d
      | Double
acc Double -> Double -> Bool
forall a. Ord a => a -> a -> Bool
< Double
th_d = (Int
dInt -> Int -> Int
forall a. Num a => a -> a -> a
-Int
2, Double
acc)
      | Bool
otherwise = if Int
d Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
>= Int
n
                       then (Int
dInt -> Int -> Int
forall a. Num a => a -> a -> a
-Int
1, Double
acc)
                       else (Double, Double) -> Int -> (Int, Double)
loop (Double
acc Double -> Double -> Double
forall a. Num a => a -> a -> a
+ Double
sc, Double
thres Double -> Double -> Double
forall a. Num a => a -> a -> a
- Vector Double
sigma Vector Double -> Int -> Double
forall a. Unbox a => Vector a -> Int -> a
U.! Int
d) (Int
dInt -> Int -> Int
forall a. Num a => a -> a -> a
+Int
1)
      where
        sc :: Double
sc = case DNA a -> ByteString
forall (s :: * -> *) a. BioSeq' s => s a -> ByteString
toBS DNA a
dna ByteString -> Int -> Char
`B.index` (Int
start Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
d) of
              Char
'A' -> Double -> Double
forall a. Floating a => a -> a
log (Double -> Double) -> Double -> Double
forall a b. (a -> b) -> a -> b
$! Double
matchA Double -> Double -> Double
forall a. Fractional a => a -> a -> a
/ Double
a
              Char
'C' -> Double -> Double
forall a. Floating a => a -> a
log (Double -> Double) -> Double -> Double
forall a b. (a -> b) -> a -> b
$! Double
matchC Double -> Double -> Double
forall a. Fractional a => a -> a -> a
/ Double
c
              Char
'G' -> Double -> Double
forall a. Floating a => a -> a
log (Double -> Double) -> Double -> Double
forall a b. (a -> b) -> a -> b
$! Double
matchG Double -> Double -> Double
forall a. Fractional a => a -> a -> a
/ Double
g
              Char
'T' -> Double -> Double
forall a. Floating a => a -> a
log (Double -> Double) -> Double -> Double
forall a b. (a -> b) -> a -> b
$! Double
matchT Double -> Double -> Double
forall a. Fractional a => a -> a -> a
/ Double
t
              Char
'N' -> Double
0
              Char
'V' -> Double -> Double
forall a. Floating a => a -> a
log (Double -> Double) -> Double -> Double
forall a b. (a -> b) -> a -> b
$! (Double
matchA Double -> Double -> Double
forall a. Num a => a -> a -> a
+ Double
matchC Double -> Double -> Double
forall a. Num a => a -> a -> a
+ Double
matchG) Double -> Double -> Double
forall a. Fractional a => a -> a -> a
/ (Double
a Double -> Double -> Double
forall a. Num a => a -> a -> a
+ Double
c Double -> Double -> Double
forall a. Num a => a -> a -> a
+ Double
g)
              Char
'H' -> Double -> Double
forall a. Floating a => a -> a
log (Double -> Double) -> Double -> Double
forall a b. (a -> b) -> a -> b
$! (Double
matchA Double -> Double -> Double
forall a. Num a => a -> a -> a
+ Double
matchC Double -> Double -> Double
forall a. Num a => a -> a -> a
+ Double
matchT) Double -> Double -> Double
forall a. Fractional a => a -> a -> a
/ (Double
a Double -> Double -> Double
forall a. Num a => a -> a -> a
+ Double
c Double -> Double -> Double
forall a. Num a => a -> a -> a
+ Double
t)
              Char
'D' -> Double -> Double
forall a. Floating a => a -> a
log (Double -> Double) -> Double -> Double
forall a b. (a -> b) -> a -> b
$! (Double
matchA Double -> Double -> Double
forall a. Num a => a -> a -> a
+ Double
matchG Double -> Double -> Double
forall a. Num a => a -> a -> a
+ Double
matchT) Double -> Double -> Double
forall a. Fractional a => a -> a -> a
/ (Double
a Double -> Double -> Double
forall a. Num a => a -> a -> a
+ Double
g Double -> Double -> Double
forall a. Num a => a -> a -> a
+ Double
t)
              Char
'B' -> Double -> Double
forall a. Floating a => a -> a
log (Double -> Double) -> Double -> Double
forall a b. (a -> b) -> a -> b
$! (Double
matchC Double -> Double -> Double
forall a. Num a => a -> a -> a
+ Double
matchG Double -> Double -> Double
forall a. Num a => a -> a -> a
+ Double
matchT) Double -> Double -> Double
forall a. Fractional a => a -> a -> a
/ (Double
c Double -> Double -> Double
forall a. Num a => a -> a -> a
+ Double
g Double -> Double -> Double
forall a. Num a => a -> a -> a
+ Double
t)
              Char
'M' -> Double -> Double
forall a. Floating a => a -> a
log (Double -> Double) -> Double -> Double
forall a b. (a -> b) -> a -> b
$! (Double
matchA Double -> Double -> Double
forall a. Num a => a -> a -> a
+ Double
matchC) Double -> Double -> Double
forall a. Fractional a => a -> a -> a
/ (Double
a Double -> Double -> Double
forall a. Num a => a -> a -> a
+ Double
c)
              Char
'K' -> Double -> Double
forall a. Floating a => a -> a
log (Double -> Double) -> Double -> Double
forall a b. (a -> b) -> a -> b
$! (Double
matchG Double -> Double -> Double
forall a. Num a => a -> a -> a
+ Double
matchT) Double -> Double -> Double
forall a. Fractional a => a -> a -> a
/ (Double
g Double -> Double -> Double
forall a. Num a => a -> a -> a
+ Double
t)
              Char
'W' -> Double -> Double
forall a. Floating a => a -> a
log (Double -> Double) -> Double -> Double
forall a b. (a -> b) -> a -> b
$! (Double
matchA Double -> Double -> Double
forall a. Num a => a -> a -> a
+ Double
matchT) Double -> Double -> Double
forall a. Fractional a => a -> a -> a
/ (Double
a Double -> Double -> Double
forall a. Num a => a -> a -> a
+ Double
t)
              Char
'S' -> Double -> Double
forall a. Floating a => a -> a
log (Double -> Double) -> Double -> Double
forall a b. (a -> b) -> a -> b
$! (Double
matchC Double -> Double -> Double
forall a. Num a => a -> a -> a
+ Double
matchG) Double -> Double -> Double
forall a. Fractional a => a -> a -> a
/ (Double
c Double -> Double -> Double
forall a. Num a => a -> a -> a
+ Double
g)
              Char
'Y' -> Double -> Double
forall a. Floating a => a -> a
log (Double -> Double) -> Double -> Double
forall a b. (a -> b) -> a -> b
$! (Double
matchC Double -> Double -> Double
forall a. Num a => a -> a -> a
+ Double
matchT) Double -> Double -> Double
forall a. Fractional a => a -> a -> a
/ (Double
c Double -> Double -> Double
forall a. Num a => a -> a -> a
+ Double
t)
              Char
'R' -> Double -> Double
forall a. Floating a => a -> a
log (Double -> Double) -> Double -> Double
forall a b. (a -> b) -> a -> b
$! (Double
matchA Double -> Double -> Double
forall a. Num a => a -> a -> a
+ Double
matchG) Double -> Double -> Double
forall a. Fractional a => a -> a -> a
/ (Double
a Double -> Double -> Double
forall a. Num a => a -> a -> a
+ Double
g)
              Char
_   -> [Char] -> Double
forall a. HasCallStack => [Char] -> a
error [Char]
"Bio.Motif.Search.lookAheadSearch: invalid nucleotide"
        matchA :: Double
matchA = Double -> Double
addSome (Double -> Double) -> Double -> Double
forall a b. (a -> b) -> a -> b
$ Matrix Double -> (Int, Int) -> Double
forall a. Context a => Matrix a -> (Int, Int) -> a
M.unsafeIndex (PWM -> Matrix Double
_mat PWM
pwm) (Int
d,Int
0)
        matchC :: Double
matchC = Double -> Double
addSome (Double -> Double) -> Double -> Double
forall a b. (a -> b) -> a -> b
$ Matrix Double -> (Int, Int) -> Double
forall a. Context a => Matrix a -> (Int, Int) -> a
M.unsafeIndex (PWM -> Matrix Double
_mat PWM
pwm) (Int
d,Int
1)
        matchG :: Double
matchG = Double -> Double
addSome (Double -> Double) -> Double -> Double
forall a b. (a -> b) -> a -> b
$ Matrix Double -> (Int, Int) -> Double
forall a. Context a => Matrix a -> (Int, Int) -> a
M.unsafeIndex (PWM -> Matrix Double
_mat PWM
pwm) (Int
d,Int
2)
        matchT :: Double
matchT = Double -> Double
addSome (Double -> Double) -> Double -> Double
forall a b. (a -> b) -> a -> b
$ Matrix Double -> (Int, Int) -> Double
forall a. Context a => Matrix a -> (Int, Int) -> a
M.unsafeIndex (PWM -> Matrix Double
_mat PWM
pwm) (Int
d,Int
3)
        addSome :: Double -> Double
addSome !Double
x | Double
x Double -> Double -> Bool
forall a. Eq a => a -> a -> Bool
== Double
0 = Double
pseudoCount
                   | Bool
otherwise = Double
x
        pseudoCount :: Double
pseudoCount = Double
0.0001
    n :: Int
n = PWM -> Int
size PWM
pwm
{-# INLINE lookAheadSearch #-}

-- | this version skip sequences contain ambiguous bases, like "N"
lookAheadSearch' :: Bkgd              -- ^ background nucleotide distribution
                 -> PWM               -- ^ pwm
                 -> U.Vector Double   -- ^ best possible match score of suffixes
                 -> DNA a             -- ^ DNA sequence
                 -> Int               -- ^ starting location on the DNA
                 -> Double            -- ^ threshold
                 -> (Int, Double)     -- ^ (d, sc_d), the largest d such that sc_d > th_d
lookAheadSearch' :: Bkgd
-> PWM -> Vector Double -> DNA a -> Int -> Double -> (Int, Double)
lookAheadSearch' (BG (Double
a, Double
c, Double
g, Double
t)) PWM
pwm Vector Double
sigma DNA a
dna Int
start Double
thres = (Double, Double) -> Int -> (Int, Double)
loop (Double
0, -Double
1) Int
0
  where
    loop :: (Double, Double) -> Int -> (Int, Double)
loop (!Double
acc, !Double
th_d) !Int
d
      | Double
acc Double -> Double -> Bool
forall a. Ord a => a -> a -> Bool
< Double
th_d = (Int
dInt -> Int -> Int
forall a. Num a => a -> a -> a
-Int
2, Double
acc)
      | Bool
otherwise = if Int
d Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
>= Int
n
                       then (Int
dInt -> Int -> Int
forall a. Num a => a -> a -> a
-Int
1, Double
acc)
                       else (Double, Double) -> Int -> (Int, Double)
loop (Double
acc Double -> Double -> Double
forall a. Num a => a -> a -> a
+ Double
sc, Double
thres Double -> Double -> Double
forall a. Num a => a -> a -> a
- Vector Double
sigma Vector Double -> Int -> Double
forall a. Unbox a => Vector a -> Int -> a
U.! Int
d) (Int
dInt -> Int -> Int
forall a. Num a => a -> a -> a
+Int
1)
      where
        sc :: Double
sc = case DNA a -> ByteString
forall (s :: * -> *) a. BioSeq' s => s a -> ByteString
toBS DNA a
dna ByteString -> Int -> Char
`B.index` (Int
start Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
d) of
              Char
'A' -> Double -> Double
forall a. Floating a => a -> a
log (Double -> Double) -> Double -> Double
forall a b. (a -> b) -> a -> b
$! Double
matchA Double -> Double -> Double
forall a. Fractional a => a -> a -> a
/ Double
a
              Char
'C' -> Double -> Double
forall a. Floating a => a -> a
log (Double -> Double) -> Double -> Double
forall a b. (a -> b) -> a -> b
$! Double
matchC Double -> Double -> Double
forall a. Fractional a => a -> a -> a
/ Double
c
              Char
'G' -> Double -> Double
forall a. Floating a => a -> a
log (Double -> Double) -> Double -> Double
forall a b. (a -> b) -> a -> b
$! Double
matchG Double -> Double -> Double
forall a. Fractional a => a -> a -> a
/ Double
g
              Char
'T' -> Double -> Double
forall a. Floating a => a -> a
log (Double -> Double) -> Double -> Double
forall a b. (a -> b) -> a -> b
$! Double
matchT Double -> Double -> Double
forall a. Fractional a => a -> a -> a
/ Double
t
              Char
_   -> -Double
1 Double -> Double -> Double
forall a. Fractional a => a -> a -> a
/ Double
0
        matchA :: Double
matchA = Double -> Double
addSome (Double -> Double) -> Double -> Double
forall a b. (a -> b) -> a -> b
$ Matrix Double -> (Int, Int) -> Double
forall a. Context a => Matrix a -> (Int, Int) -> a
M.unsafeIndex (PWM -> Matrix Double
_mat PWM
pwm) (Int
d,Int
0)
        matchC :: Double
matchC = Double -> Double
addSome (Double -> Double) -> Double -> Double
forall a b. (a -> b) -> a -> b
$ Matrix Double -> (Int, Int) -> Double
forall a. Context a => Matrix a -> (Int, Int) -> a
M.unsafeIndex (PWM -> Matrix Double
_mat PWM
pwm) (Int
d,Int
1)
        matchG :: Double
matchG = Double -> Double
addSome (Double -> Double) -> Double -> Double
forall a b. (a -> b) -> a -> b
$ Matrix Double -> (Int, Int) -> Double
forall a. Context a => Matrix a -> (Int, Int) -> a
M.unsafeIndex (PWM -> Matrix Double
_mat PWM
pwm) (Int
d,Int
2)
        matchT :: Double
matchT = Double -> Double
addSome (Double -> Double) -> Double -> Double
forall a b. (a -> b) -> a -> b
$ Matrix Double -> (Int, Int) -> Double
forall a. Context a => Matrix a -> (Int, Int) -> a
M.unsafeIndex (PWM -> Matrix Double
_mat PWM
pwm) (Int
d,Int
3)
        addSome :: Double -> Double
addSome !Double
x | Double
x Double -> Double -> Bool
forall a. Eq a => a -> a -> Bool
== Double
0 = Double
pseudoCount
                   | Bool
otherwise = Double
x
        pseudoCount :: Double
pseudoCount = Double
0.0001
    n :: Int
n = PWM -> Int
size PWM
pwm
{-# INLINE lookAheadSearch' #-}

{-
data SpaceDistribution = SpaceDistribution
    { _motif1   :: Motif
    , _nSites1  :: (Int, Int)
    , _motif2   :: Motif
    , _nSites2  :: (Int, Int)
    , _same     :: [(Int, Int)]
    , _opposite :: [(Int, Int)]
    } deriving (Show, Read)

-- | search for spacing constraint between two TFs
spaceConstraint :: [(Motif, Motif)]   -- ^ motifs, names must be unique
                -> Bkgd    -- ^ backgroud nucleotide distribution
                -> Double  -- ^ p-Value for motif finding
                -> Int     -- ^ half window size, typical 5
                -> Int     -- ^ max distance to search, typical 300
                -> DNA a -> [SpaceDistribution]
spaceConstraint pairs bg th w k dna = flip map pairs $ \(a, b) ->
    let (m1, site1) = HM.lookupDefault undefined (_name a) sites
        (m2, site2) = HM.lookupDefault undefined (_name b) sites
        (same, opposite) = spaceConstraintHelper site1 site2 w k
    in SpaceDistribution m1 ((U.length *** U.length) site1) m2
        ((U.length *** U.length) site2) same opposite
  where
    motifs = nubBy ((==) `on` _name) $ concatMap (\(a,b) -> [a,b]) pairs
    findSites (Motif _ pwm) = (fwd, rev)
      where
        fwd = runST $ runConduit $ findTFBS bg pwm dna cutoff True .| sinkVector
        rev = runST $ runConduit $ findTFBS bg pwm' dna cutoff' True .|
              mapC (+ (size pwm - 1)) .| sinkVector
        cutoff = pValueToScore th bg pwm
        cutoff' = pValueToScore th bg pwm'
        pwm' = rcPWM pwm
    sites = HM.fromList $ map (\m -> (_name m, (m, findSites m))) motifs
{-# INLINE spaceConstraint #-}

spaceConstraintHelper :: (U.Vector Int, U.Vector Int)
                      -> (U.Vector Int, U.Vector Int)
                      -> Int
                      -> Int
                      -> ([(Int, Int)], [(Int, Int)])
spaceConstraintHelper (fw1, rv1) (fw2, rv2) w k = (same, oppose)
  where
    rs = let rs' = [-k, -k+2*w+1 .. 0]
         in rs' ++ map (*(-1)) (reverse rs')
    fw2' = S.fromList $ U.toList fw2
    rv2' = S.fromList $ U.toList rv2
    nOverlap :: U.Vector Int -> S.HashSet Int -> Int -> Int -> Int
    nOverlap xs ys w' i = U.foldl' f 0 xs
      where
        f acc x | any (`S.member` ys) [x + i - w' .. x + i + w'] = acc + 1
                | otherwise = acc
    same = zip rs $ zipWith (+) nFF nRR
      where
        nFF = map (nOverlap fw1 fw2' w) rs
        nRR = map (nOverlap rv1 rv2' w) $ reverse rs
    oppose = zip rs $ zipWith (+) nFR nRF
      where
        nFR = map (nOverlap fw1 rv2' w) rs
        nRF = map (nOverlap rv1 fw2' w) $ reverse rs
{-# INLINE spaceConstraintHelper #-}

computePValue :: Double -> [Int] -> [(Int, Double)]
computePValue p xs = zip xs $ map (pValue n p) xs
  where
    n = foldl' (+) 0 xs

pValue :: Int -> Double -> Int -> Double
pValue n p x | n > 2000 = complCumulative (poisson (fromIntegral n* p)) $ fromIntegral x
             | otherwise = complCumulative (binomial n p) $ fromIntegral x

-}