{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE RecordWildCards #-}

module Bio.RealWorld.GENCODE
    ( Gene(..)
    , Transcript(..)
    , TranscriptType(..)
    , readGenes
    , readGenesC
    , getPromoters
    , getDomains
    ) where

import           Conduit
import qualified Data.ByteString.Char8 as B
import           Data.CaseInsensitive  (CI, mk)
import qualified Data.HashMap.Strict   as M
import Data.List.Ordered (nubSort)
import           Data.Maybe            (fromMaybe, fromJust, isNothing)
import Lens.Micro
import Data.List (foldl')
import Data.Char (toLower)
import qualified Data.Vector as V

import Bio.Data.Bed
import Bio.Data.Bed.Types
import           Bio.Utils.Misc        (readInt)

data TranscriptType = Coding
                    | NonCoding
                    deriving (Int -> TranscriptType -> ShowS
[TranscriptType] -> ShowS
TranscriptType -> String
(Int -> TranscriptType -> ShowS)
-> (TranscriptType -> String)
-> ([TranscriptType] -> ShowS)
-> Show TranscriptType
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
showList :: [TranscriptType] -> ShowS
$cshowList :: [TranscriptType] -> ShowS
show :: TranscriptType -> String
$cshow :: TranscriptType -> String
showsPrec :: Int -> TranscriptType -> ShowS
$cshowsPrec :: Int -> TranscriptType -> ShowS
Show, TranscriptType -> TranscriptType -> Bool
(TranscriptType -> TranscriptType -> Bool)
-> (TranscriptType -> TranscriptType -> Bool) -> Eq TranscriptType
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
/= :: TranscriptType -> TranscriptType -> Bool
$c/= :: TranscriptType -> TranscriptType -> Bool
== :: TranscriptType -> TranscriptType -> Bool
$c== :: TranscriptType -> TranscriptType -> Bool
Eq, Eq TranscriptType
Eq TranscriptType
-> (TranscriptType -> TranscriptType -> Ordering)
-> (TranscriptType -> TranscriptType -> Bool)
-> (TranscriptType -> TranscriptType -> Bool)
-> (TranscriptType -> TranscriptType -> Bool)
-> (TranscriptType -> TranscriptType -> Bool)
-> (TranscriptType -> TranscriptType -> TranscriptType)
-> (TranscriptType -> TranscriptType -> TranscriptType)
-> Ord TranscriptType
TranscriptType -> TranscriptType -> Bool
TranscriptType -> TranscriptType -> Ordering
TranscriptType -> TranscriptType -> TranscriptType
forall a.
Eq a
-> (a -> a -> Ordering)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> a)
-> (a -> a -> a)
-> Ord a
min :: TranscriptType -> TranscriptType -> TranscriptType
$cmin :: TranscriptType -> TranscriptType -> TranscriptType
max :: TranscriptType -> TranscriptType -> TranscriptType
$cmax :: TranscriptType -> TranscriptType -> TranscriptType
>= :: TranscriptType -> TranscriptType -> Bool
$c>= :: TranscriptType -> TranscriptType -> Bool
> :: TranscriptType -> TranscriptType -> Bool
$c> :: TranscriptType -> TranscriptType -> Bool
<= :: TranscriptType -> TranscriptType -> Bool
$c<= :: TranscriptType -> TranscriptType -> Bool
< :: TranscriptType -> TranscriptType -> Bool
$c< :: TranscriptType -> TranscriptType -> Bool
compare :: TranscriptType -> TranscriptType -> Ordering
$ccompare :: TranscriptType -> TranscriptType -> Ordering
$cp1Ord :: Eq TranscriptType
Ord)

-- | GTF's position is 1-based, but here we convert it to 0-based indexing.
data Gene = Gene
    { Gene -> CI ByteString
geneName        :: !(CI B.ByteString)
    , Gene -> ByteString
geneId          :: !B.ByteString
    , Gene -> ByteString
geneChrom       :: !B.ByteString
    , Gene -> Int
geneLeft        :: !Int
    , Gene -> Int
geneRight       :: !Int
    , Gene -> Bool
geneStrand      :: !Bool
    , Gene -> [Transcript]
geneTranscripts :: ![Transcript]
    } deriving (Int -> Gene -> ShowS
[Gene] -> ShowS
Gene -> String
(Int -> Gene -> ShowS)
-> (Gene -> String) -> ([Gene] -> ShowS) -> Show Gene
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
showList :: [Gene] -> ShowS
$cshowList :: [Gene] -> ShowS
show :: Gene -> String
$cshow :: Gene -> String
showsPrec :: Int -> Gene -> ShowS
$cshowsPrec :: Int -> Gene -> ShowS
Show, Gene -> Gene -> Bool
(Gene -> Gene -> Bool) -> (Gene -> Gene -> Bool) -> Eq Gene
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
/= :: Gene -> Gene -> Bool
$c/= :: Gene -> Gene -> Bool
== :: Gene -> Gene -> Bool
$c== :: Gene -> Gene -> Bool
Eq, Eq Gene
Eq Gene
-> (Gene -> Gene -> Ordering)
-> (Gene -> Gene -> Bool)
-> (Gene -> Gene -> Bool)
-> (Gene -> Gene -> Bool)
-> (Gene -> Gene -> Bool)
-> (Gene -> Gene -> Gene)
-> (Gene -> Gene -> Gene)
-> Ord Gene
Gene -> Gene -> Bool
Gene -> Gene -> Ordering
Gene -> Gene -> Gene
forall a.
Eq a
-> (a -> a -> Ordering)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> a)
-> (a -> a -> a)
-> Ord a
min :: Gene -> Gene -> Gene
$cmin :: Gene -> Gene -> Gene
max :: Gene -> Gene -> Gene
$cmax :: Gene -> Gene -> Gene
>= :: Gene -> Gene -> Bool
$c>= :: Gene -> Gene -> Bool
> :: Gene -> Gene -> Bool
$c> :: Gene -> Gene -> Bool
<= :: Gene -> Gene -> Bool
$c<= :: Gene -> Gene -> Bool
< :: Gene -> Gene -> Bool
$c< :: Gene -> Gene -> Bool
compare :: Gene -> Gene -> Ordering
$ccompare :: Gene -> Gene -> Ordering
$cp1Ord :: Eq Gene
Ord)

data Transcript = Transcript
    { Transcript -> ByteString
transId          :: !B.ByteString
    , Transcript -> Int
transLeft        :: !Int
    , Transcript -> Int
transRight       :: !Int
    , Transcript -> Bool
transStrand      :: !Bool
    , Transcript -> [(Int, Int)]
transExon        :: ![(Int, Int)]
    , Transcript -> [(Int, Int)]
transUTR         :: ![(Int, Int)]
    , Transcript -> TranscriptType
transType        :: TranscriptType
    } deriving (Int -> Transcript -> ShowS
[Transcript] -> ShowS
Transcript -> String
(Int -> Transcript -> ShowS)
-> (Transcript -> String)
-> ([Transcript] -> ShowS)
-> Show Transcript
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
showList :: [Transcript] -> ShowS
$cshowList :: [Transcript] -> ShowS
show :: Transcript -> String
$cshow :: Transcript -> String
showsPrec :: Int -> Transcript -> ShowS
$cshowsPrec :: Int -> Transcript -> ShowS
Show, Transcript -> Transcript -> Bool
(Transcript -> Transcript -> Bool)
-> (Transcript -> Transcript -> Bool) -> Eq Transcript
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
/= :: Transcript -> Transcript -> Bool
$c/= :: Transcript -> Transcript -> Bool
== :: Transcript -> Transcript -> Bool
$c== :: Transcript -> Transcript -> Bool
Eq, Eq Transcript
Eq Transcript
-> (Transcript -> Transcript -> Ordering)
-> (Transcript -> Transcript -> Bool)
-> (Transcript -> Transcript -> Bool)
-> (Transcript -> Transcript -> Bool)
-> (Transcript -> Transcript -> Bool)
-> (Transcript -> Transcript -> Transcript)
-> (Transcript -> Transcript -> Transcript)
-> Ord Transcript
Transcript -> Transcript -> Bool
Transcript -> Transcript -> Ordering
Transcript -> Transcript -> Transcript
forall a.
Eq a
-> (a -> a -> Ordering)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> a)
-> (a -> a -> a)
-> Ord a
min :: Transcript -> Transcript -> Transcript
$cmin :: Transcript -> Transcript -> Transcript
max :: Transcript -> Transcript -> Transcript
$cmax :: Transcript -> Transcript -> Transcript
>= :: Transcript -> Transcript -> Bool
$c>= :: Transcript -> Transcript -> Bool
> :: Transcript -> Transcript -> Bool
$c> :: Transcript -> Transcript -> Bool
<= :: Transcript -> Transcript -> Bool
$c<= :: Transcript -> Transcript -> Bool
< :: Transcript -> Transcript -> Bool
$c< :: Transcript -> Transcript -> Bool
compare :: Transcript -> Transcript -> Ordering
$ccompare :: Transcript -> Transcript -> Ordering
$cp1Ord :: Eq Transcript
Ord)

-- | Read gene information from Gencode GTF file
readGenes :: FilePath -> IO [Gene]
readGenes :: String -> IO [Gene]
readGenes String
input = ResourceT IO [Gene] -> IO [Gene]
forall (m :: * -> *) a. MonadUnliftIO m => ResourceT m a -> m a
runResourceT (ResourceT IO [Gene] -> IO [Gene])
-> ResourceT IO [Gene] -> IO [Gene]
forall a b. (a -> b) -> a -> b
$ ConduitT () Void (ResourceT IO) [Gene] -> ResourceT IO [Gene]
forall (m :: * -> *) r. Monad m => ConduitT () Void m r -> m r
runConduit (ConduitT () Void (ResourceT IO) [Gene] -> ResourceT IO [Gene])
-> ConduitT () Void (ResourceT IO) [Gene] -> ResourceT IO [Gene]
forall a b. (a -> b) -> a -> b
$ String -> ConduitT () ByteString (ResourceT IO) ()
forall (m :: * -> *) i.
MonadResource m =>
String -> ConduitT i ByteString m ()
sourceFile String
input ConduitT () ByteString (ResourceT IO) ()
-> ConduitM ByteString Void (ResourceT IO) [Gene]
-> ConduitT () Void (ResourceT IO) [Gene]
forall (m :: * -> *) a b c r.
Monad m =>
ConduitM a b m () -> ConduitM b c m r -> ConduitM a c m r
.| ConduitM ByteString Void (ResourceT IO) [Gene]
forall (m :: * -> *) o. Monad m => ConduitT ByteString o m [Gene]
readGenesC

readGenesC :: Monad m => ConduitT B.ByteString o m [Gene]
readGenesC :: ConduitT ByteString o m [Gene]
readGenesC = do
    ([Gene]
genes, [(ByteString, Transcript)]
transcripts, [(ByteString, (Int, Int))]
exons, [(ByteString, (Int, Int))]
utrs) <- ConduitT
  ByteString
  o
  m
  ([Gene], [(ByteString, Transcript)], [(ByteString, (Int, Int))],
   [(ByteString, (Int, Int))])
forall (m :: * -> *) o.
Monad m =>
ConduitT
  ByteString
  o
  m
  ([Gene], [(ByteString, Transcript)], [(ByteString, (Int, Int))],
   [(ByteString, (Int, Int))])
readElements
    let t :: HashMap ByteString (ByteString, Transcript)
t = [(ByteString, (ByteString, Transcript))]
-> HashMap ByteString (ByteString, Transcript)
forall k v. (Eq k, Hashable k) => [(k, v)] -> HashMap k v
M.fromList ([(ByteString, (ByteString, Transcript))]
 -> HashMap ByteString (ByteString, Transcript))
-> [(ByteString, (ByteString, Transcript))]
-> HashMap ByteString (ByteString, Transcript)
forall a b. (a -> b) -> a -> b
$ ((ByteString, Transcript)
 -> (ByteString, (ByteString, Transcript)))
-> [(ByteString, Transcript)]
-> [(ByteString, (ByteString, Transcript))]
forall a b. (a -> b) -> [a] -> [b]
map (\(ByteString
a,Transcript
b) -> (Transcript -> ByteString
transId Transcript
b, (ByteString
a,Transcript
b))) [(ByteString, Transcript)]
transcripts
    [Gene] -> ConduitT ByteString o m [Gene]
forall (m :: * -> *) a. Monad m => a -> m a
return ([Gene] -> ConduitT ByteString o m [Gene])
-> [Gene] -> ConduitT ByteString o m [Gene]
forall a b. (a -> b) -> a -> b
$ [Gene] -> [Gene]
nubGene ([Gene] -> [Gene]) -> [Gene] -> [Gene]
forall a b. (a -> b) -> a -> b
$ HashMap ByteString Gene -> [Gene]
forall k v. HashMap k v -> [v]
M.elems (HashMap ByteString Gene -> [Gene])
-> HashMap ByteString Gene -> [Gene]
forall a b. (a -> b) -> a -> b
$ (HashMap ByteString Gene
 -> (ByteString, Transcript) -> HashMap ByteString Gene)
-> HashMap ByteString Gene
-> [(ByteString, Transcript)]
-> HashMap ByteString Gene
forall (t :: * -> *) b a.
Foldable t =>
(b -> a -> b) -> b -> t a -> b
foldl' HashMap ByteString Gene
-> (ByteString, Transcript) -> HashMap ByteString Gene
addTranscript
        ([(ByteString, Gene)] -> HashMap ByteString Gene
forall k v. (Eq k, Hashable k) => [(k, v)] -> HashMap k v
M.fromList ([(ByteString, Gene)] -> HashMap ByteString Gene)
-> [(ByteString, Gene)] -> HashMap ByteString Gene
forall a b. (a -> b) -> a -> b
$ (Gene -> (ByteString, Gene)) -> [Gene] -> [(ByteString, Gene)]
forall a b. (a -> b) -> [a] -> [b]
map (\Gene
x -> (Gene -> ByteString
geneId Gene
x, Gene
x)) [Gene]
genes) ([(ByteString, Transcript)] -> HashMap ByteString Gene)
-> [(ByteString, Transcript)] -> HashMap ByteString Gene
forall a b. (a -> b) -> a -> b
$
        HashMap ByteString (ByteString, Transcript)
-> [(ByteString, Transcript)]
forall k v. HashMap k v -> [v]
M.elems (HashMap ByteString (ByteString, Transcript)
 -> [(ByteString, Transcript)])
-> HashMap ByteString (ByteString, Transcript)
-> [(ByteString, Transcript)]
forall a b. (a -> b) -> a -> b
$ (HashMap ByteString (ByteString, Transcript)
 -> (ByteString, (Int, Int))
 -> HashMap ByteString (ByteString, Transcript))
-> HashMap ByteString (ByteString, Transcript)
-> [(ByteString, (Int, Int))]
-> HashMap ByteString (ByteString, Transcript)
forall (t :: * -> *) b a.
Foldable t =>
(b -> a -> b) -> b -> t a -> b
foldl' HashMap ByteString (ByteString, Transcript)
-> (ByteString, (Int, Int))
-> HashMap ByteString (ByteString, Transcript)
forall a.
HashMap ByteString (a, Transcript)
-> (ByteString, (Int, Int)) -> HashMap ByteString (a, Transcript)
addUTR ((HashMap ByteString (ByteString, Transcript)
 -> (ByteString, (Int, Int))
 -> HashMap ByteString (ByteString, Transcript))
-> HashMap ByteString (ByteString, Transcript)
-> [(ByteString, (Int, Int))]
-> HashMap ByteString (ByteString, Transcript)
forall (t :: * -> *) b a.
Foldable t =>
(b -> a -> b) -> b -> t a -> b
foldl' HashMap ByteString (ByteString, Transcript)
-> (ByteString, (Int, Int))
-> HashMap ByteString (ByteString, Transcript)
forall a.
HashMap ByteString (a, Transcript)
-> (ByteString, (Int, Int)) -> HashMap ByteString (a, Transcript)
addExon HashMap ByteString (ByteString, Transcript)
t [(ByteString, (Int, Int))]
exons) [(ByteString, (Int, Int))]
utrs
{-# INLINE readGenesC #-}

nubGene :: [Gene] -> [Gene]
nubGene :: [Gene] -> [Gene]
nubGene [Gene]
gs = [Gene] -> [Gene]
forall a. Ord a => [a] -> [a]
nubSort ([Gene] -> [Gene]) -> [Gene] -> [Gene]
forall a b. (a -> b) -> a -> b
$ (Gene -> Gene) -> [Gene] -> [Gene]
forall a b. (a -> b) -> [a] -> [b]
map Gene -> Gene
nubG [Gene]
gs
  where
    nubG :: Gene -> Gene
nubG Gene
g = Gene
g { geneTranscripts :: [Transcript]
geneTranscripts = [Transcript] -> [Transcript]
forall a. Ord a => [a] -> [a]
nubSort ([Transcript] -> [Transcript]) -> [Transcript] -> [Transcript]
forall a b. (a -> b) -> a -> b
$ (Transcript -> Transcript) -> [Transcript] -> [Transcript]
forall a b. (a -> b) -> [a] -> [b]
map Transcript -> Transcript
nubT ([Transcript] -> [Transcript]) -> [Transcript] -> [Transcript]
forall a b. (a -> b) -> a -> b
$ Gene -> [Transcript]
geneTranscripts Gene
g}
    nubT :: Transcript -> Transcript
nubT Transcript
t = Transcript
t { transExon :: [(Int, Int)]
transExon = [(Int, Int)] -> [(Int, Int)]
forall a. Ord a => [a] -> [a]
nubSort ([(Int, Int)] -> [(Int, Int)]) -> [(Int, Int)] -> [(Int, Int)]
forall a b. (a -> b) -> a -> b
$ Transcript -> [(Int, Int)]
transExon Transcript
t 
               , transUTR :: [(Int, Int)]
transUTR = [(Int, Int)] -> [(Int, Int)]
forall a. Ord a => [a] -> [a]
nubSort ([(Int, Int)] -> [(Int, Int)]) -> [(Int, Int)] -> [(Int, Int)]
forall a b. (a -> b) -> a -> b
$ Transcript -> [(Int, Int)]
transUTR Transcript
t  }
{-# INLINE nubGene #-}

readElements :: Monad m => ConduitT B.ByteString o m
    ( [Gene], [(B.ByteString, Transcript)]
    , [(B.ByteString, (Int, Int))], [(B.ByteString, (Int, Int))] )
readElements :: ConduitT
  ByteString
  o
  m
  ([Gene], [(ByteString, Transcript)], [(ByteString, (Int, Int))],
   [(ByteString, (Int, Int))])
readElements = ConduitT ByteString ByteString m ()
forall (m :: * -> *) seq.
(Monad m, IsSequence seq, Element seq ~ Word8) =>
ConduitT seq seq m ()
linesUnboundedAsciiC ConduitT ByteString ByteString m ()
-> ConduitT
     ByteString
     o
     m
     ([Gene], [(ByteString, Transcript)], [(ByteString, (Int, Int))],
      [(ByteString, (Int, Int))])
-> ConduitT
     ByteString
     o
     m
     ([Gene], [(ByteString, Transcript)], [(ByteString, (Int, Int))],
      [(ByteString, (Int, Int))])
forall (m :: * -> *) a b c r.
Monad m =>
ConduitM a b m () -> ConduitM b c m r -> ConduitM a c m r
.| (([Gene], [(ByteString, Transcript)], [(ByteString, (Int, Int))],
  [(ByteString, (Int, Int))])
 -> ByteString
 -> ([Gene], [(ByteString, Transcript)], [(ByteString, (Int, Int))],
     [(ByteString, (Int, Int))]))
-> ([Gene], [(ByteString, Transcript)], [(ByteString, (Int, Int))],
    [(ByteString, (Int, Int))])
-> ConduitT
     ByteString
     o
     m
     ([Gene], [(ByteString, Transcript)], [(ByteString, (Int, Int))],
      [(ByteString, (Int, Int))])
forall (m :: * -> *) a b o.
Monad m =>
(a -> b -> a) -> a -> ConduitT b o m a
foldlC ([Gene], [(ByteString, Transcript)], [(ByteString, (Int, Int))],
 [(ByteString, (Int, Int))])
-> ByteString
-> ([Gene], [(ByteString, Transcript)], [(ByteString, (Int, Int))],
    [(ByteString, (Int, Int))])
forall s.
(Field1 s s [Gene] [Gene],
 Field2 s s [(ByteString, Transcript)] [(ByteString, Transcript)],
 Field3 s s [(ByteString, (Int, Int))] [(ByteString, (Int, Int))],
 Field4
   s s [(ByteString, (Int, Int))] [(ByteString, (Int, Int))]) =>
s -> ByteString -> s
f ([], [], [], [])
  where
    f :: s -> ByteString -> s
f s
acc ByteString
l
        | ByteString -> Char
B.head ByteString
l Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
== Char
'#' = s
acc
        | ByteString
featType ByteString -> ByteString -> Bool
forall a. Eq a => a -> a -> Bool
== ByteString
"gene" = ([Gene] -> Identity [Gene]) -> s -> Identity s
forall s t a b. Field1 s t a b => Lens s t a b
_1 (([Gene] -> Identity [Gene]) -> s -> Identity s)
-> ([Gene] -> [Gene]) -> s -> s
forall s t a b. ASetter s t a b -> (a -> b) -> s -> t
%~ (Gene
geneGene -> [Gene] -> [Gene]
forall a. a -> [a] -> [a]
:) (s -> s) -> s -> s
forall a b. (a -> b) -> a -> b
$ s
acc
        | ByteString
featType ByteString -> ByteString -> Bool
forall a. Eq a => a -> a -> Bool
== ByteString
"transcript" = ([(ByteString, Transcript)] -> Identity [(ByteString, Transcript)])
-> s -> Identity s
forall s t a b. Field2 s t a b => Lens s t a b
_2 (([(ByteString, Transcript)]
  -> Identity [(ByteString, Transcript)])
 -> s -> Identity s)
-> ([(ByteString, Transcript)] -> [(ByteString, Transcript)])
-> s
-> s
forall s t a b. ASetter s t a b -> (a -> b) -> s -> t
%~ ((ByteString
gid, Transcript
transcript)(ByteString, Transcript)
-> [(ByteString, Transcript)] -> [(ByteString, Transcript)]
forall a. a -> [a] -> [a]
:) (s -> s) -> s -> s
forall a b. (a -> b) -> a -> b
$ s
acc
        | ByteString
featType ByteString -> ByteString -> Bool
forall a. Eq a => a -> a -> Bool
== ByteString
"exon" = ([(ByteString, (Int, Int))] -> Identity [(ByteString, (Int, Int))])
-> s -> Identity s
forall s t a b. Field3 s t a b => Lens s t a b
_3 (([(ByteString, (Int, Int))]
  -> Identity [(ByteString, (Int, Int))])
 -> s -> Identity s)
-> ([(ByteString, (Int, Int))] -> [(ByteString, (Int, Int))])
-> s
-> s
forall s t a b. ASetter s t a b -> (a -> b) -> s -> t
%~ ((ByteString
tid, (Int, Int)
exon)(ByteString, (Int, Int))
-> [(ByteString, (Int, Int))] -> [(ByteString, (Int, Int))]
forall a. a -> [a] -> [a]
:) (s -> s) -> s -> s
forall a b. (a -> b) -> a -> b
$ s
acc
        | ByteString
featType ByteString -> ByteString -> Bool
forall a. Eq a => a -> a -> Bool
== ByteString
"utr" = ([(ByteString, (Int, Int))] -> Identity [(ByteString, (Int, Int))])
-> s -> Identity s
forall s t a b. Field4 s t a b => Lens s t a b
_4 (([(ByteString, (Int, Int))]
  -> Identity [(ByteString, (Int, Int))])
 -> s -> Identity s)
-> ([(ByteString, (Int, Int))] -> [(ByteString, (Int, Int))])
-> s
-> s
forall s t a b. ASetter s t a b -> (a -> b) -> s -> t
%~ ((ByteString
tid, (Int, Int)
utr)(ByteString, (Int, Int))
-> [(ByteString, (Int, Int))] -> [(ByteString, (Int, Int))]
forall a. a -> [a] -> [a]
:) (s -> s) -> s -> s
forall a b. (a -> b) -> a -> b
$ s
acc
        | Bool
otherwise = s
acc
      where
        gene :: Gene
gene = CI ByteString
-> ByteString
-> ByteString
-> Int
-> Int
-> Bool
-> [Transcript]
-> Gene
Gene (ByteString -> CI ByteString
forall s. FoldCase s => s -> CI s
mk (ByteString -> CI ByteString) -> ByteString -> CI ByteString
forall a b. (a -> b) -> a -> b
$ ByteString -> Maybe ByteString -> ByteString
forall a. a -> Maybe a -> a
fromMaybe (String -> ByteString
forall a. HasCallStack => String -> a
error String
"could not find \"gene_name\"") (Maybe ByteString -> ByteString) -> Maybe ByteString -> ByteString
forall a b. (a -> b) -> a -> b
$
            ByteString -> Maybe ByteString
getField ByteString
"gene_name") ByteString
gid ByteString
chr Int
lPos Int
rPos (ByteString
f7ByteString -> ByteString -> Bool
forall a. Eq a => a -> a -> Bool
==ByteString
"+") []
        transcript :: Transcript
transcript = ByteString
-> Int
-> Int
-> Bool
-> [(Int, Int)]
-> [(Int, Int)]
-> TranscriptType
-> Transcript
Transcript ByteString
tid Int
lPos Int
rPos (ByteString
f7ByteString -> ByteString -> Bool
forall a. Eq a => a -> a -> Bool
==ByteString
"+") [] [] TranscriptType
tTy
        exon :: (Int, Int)
exon = (Int
lPos, Int
rPos)
        utr :: (Int, Int)
utr = (Int
lPos, Int
rPos)
        [ByteString
chr,ByteString
_,ByteString
f3,ByteString
f4,ByteString
f5,ByteString
_,ByteString
f7,ByteString
_,ByteString
f9] = Char -> ByteString -> [ByteString]
B.split Char
'\t' ByteString
l
        gid :: ByteString
gid = ByteString -> Maybe ByteString -> ByteString
forall a. a -> Maybe a -> a
fromMaybe (String -> ByteString
forall a. HasCallStack => String -> a
error String
"could not find \"gene_id\"") (Maybe ByteString -> ByteString) -> Maybe ByteString -> ByteString
forall a b. (a -> b) -> a -> b
$ ByteString -> Maybe ByteString
getField ByteString
"gene_id"
        tid :: ByteString
tid = ByteString -> Maybe ByteString -> ByteString
forall a. a -> Maybe a -> a
fromMaybe (String -> ByteString
forall a. HasCallStack => String -> a
error String
"could not find \"transcript_id\"") (Maybe ByteString -> ByteString) -> Maybe ByteString -> ByteString
forall a b. (a -> b) -> a -> b
$ ByteString -> Maybe ByteString
getField ByteString
"transcript_id"
        tTy :: TranscriptType
tTy = case ByteString -> Maybe ByteString
getField ByteString
"transcript_type" of
            Just ByteString
"protein_coding" -> TranscriptType
Coding
            Maybe ByteString
Nothing -> TranscriptType
Coding
            Maybe ByteString
_ -> TranscriptType
NonCoding
        lPos :: Int
lPos = ByteString -> Int
readInt ByteString
f4 Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
1
        rPos :: Int
rPos = ByteString -> Int
readInt ByteString
f5 Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
1
        featType :: ByteString
featType = (Char -> Char) -> ByteString -> ByteString
B.map Char -> Char
toLower ByteString
f3
        getField :: ByteString -> Maybe ByteString
getField ByteString
x = (ByteString -> ByteString) -> Maybe ByteString -> Maybe ByteString
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap (ByteString -> ByteString
B.init (ByteString -> ByteString)
-> (ByteString -> ByteString) -> ByteString -> ByteString
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Int -> ByteString -> ByteString
B.drop Int
2) (Maybe ByteString -> Maybe ByteString)
-> Maybe ByteString -> Maybe ByteString
forall a b. (a -> b) -> a -> b
$ ByteString -> [(ByteString, ByteString)] -> Maybe ByteString
forall a b. Eq a => a -> [(a, b)] -> Maybe b
lookup ByteString
x ([(ByteString, ByteString)] -> Maybe ByteString)
-> [(ByteString, ByteString)] -> Maybe ByteString
forall a b. (a -> b) -> a -> b
$
            (ByteString -> (ByteString, ByteString))
-> [ByteString] -> [(ByteString, ByteString)]
forall a b. (a -> b) -> [a] -> [b]
map ((Char -> Bool) -> ByteString -> (ByteString, ByteString)
B.break Char -> Bool
isSpace (ByteString -> (ByteString, ByteString))
-> (ByteString -> ByteString)
-> ByteString
-> (ByteString, ByteString)
forall b c a. (b -> c) -> (a -> b) -> a -> c
. ByteString -> ByteString
strip) ([ByteString] -> [(ByteString, ByteString)])
-> [ByteString] -> [(ByteString, ByteString)]
forall a b. (a -> b) -> a -> b
$ Char -> ByteString -> [ByteString]
B.split Char
';' ByteString
f9
    strip :: ByteString -> ByteString
strip = (ByteString, ByteString) -> ByteString
forall a b. (a, b) -> a
fst ((ByteString, ByteString) -> ByteString)
-> (ByteString -> (ByteString, ByteString))
-> ByteString
-> ByteString
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Char -> Bool) -> ByteString -> (ByteString, ByteString)
B.spanEnd Char -> Bool
isSpace (ByteString -> (ByteString, ByteString))
-> (ByteString -> ByteString)
-> ByteString
-> (ByteString, ByteString)
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Char -> Bool) -> ByteString -> ByteString
B.dropWhile Char -> Bool
isSpace
    isSpace :: Char -> Bool
isSpace = (Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
== Char
' ')
{-# INLINE readElements #-}

addExon :: M.HashMap B.ByteString (a, Transcript)
        -> (B.ByteString, (Int, Int))
        -> M.HashMap B.ByteString (a, Transcript)
addExon :: HashMap ByteString (a, Transcript)
-> (ByteString, (Int, Int)) -> HashMap ByteString (a, Transcript)
addExon HashMap ByteString (a, Transcript)
m (ByteString
key, (Int, Int)
val) = ((a, Transcript) -> (a, Transcript))
-> ByteString
-> HashMap ByteString (a, Transcript)
-> HashMap ByteString (a, Transcript)
forall k v.
(Eq k, Hashable k) =>
(v -> v) -> k -> HashMap k v -> HashMap k v
M.adjust (\(a
x, Transcript
trans) ->
    (a
x, Transcript
trans{transExon :: [(Int, Int)]
transExon = (Int, Int)
val (Int, Int) -> [(Int, Int)] -> [(Int, Int)]
forall a. a -> [a] -> [a]
: Transcript -> [(Int, Int)]
transExon Transcript
trans})) ByteString
key HashMap ByteString (a, Transcript)
m
{-# INLINE addExon #-}

addUTR :: M.HashMap B.ByteString (a, Transcript)
       -> (B.ByteString, (Int, Int))
       -> M.HashMap B.ByteString (a, Transcript)
addUTR :: HashMap ByteString (a, Transcript)
-> (ByteString, (Int, Int)) -> HashMap ByteString (a, Transcript)
addUTR HashMap ByteString (a, Transcript)
m (ByteString
key, (Int, Int)
val) = ((a, Transcript) -> (a, Transcript))
-> ByteString
-> HashMap ByteString (a, Transcript)
-> HashMap ByteString (a, Transcript)
forall k v.
(Eq k, Hashable k) =>
(v -> v) -> k -> HashMap k v -> HashMap k v
M.adjust (\(a
x, Transcript
trans) ->
    (a
x, Transcript
trans{transUTR :: [(Int, Int)]
transUTR = (Int, Int)
val (Int, Int) -> [(Int, Int)] -> [(Int, Int)]
forall a. a -> [a] -> [a]
: Transcript -> [(Int, Int)]
transUTR Transcript
trans})) ByteString
key HashMap ByteString (a, Transcript)
m
{-# INLINE addUTR #-}

addTranscript :: M.HashMap B.ByteString Gene
              -> (B.ByteString, Transcript)
              -> M.HashMap B.ByteString Gene
addTranscript :: HashMap ByteString Gene
-> (ByteString, Transcript) -> HashMap ByteString Gene
addTranscript HashMap ByteString Gene
m (ByteString
key, Transcript
val) = (Gene -> Gene)
-> ByteString -> HashMap ByteString Gene -> HashMap ByteString Gene
forall k v.
(Eq k, Hashable k) =>
(v -> v) -> k -> HashMap k v -> HashMap k v
M.adjust (\Gene
gene ->
    Gene
gene{geneTranscripts :: [Transcript]
geneTranscripts = Transcript
val Transcript -> [Transcript] -> [Transcript]
forall a. a -> [a] -> [a]
: Gene -> [Transcript]
geneTranscripts Gene
gene}) ByteString
key HashMap ByteString Gene
m
{-# INLINE addTranscript #-}

getPromoters :: Int   -- ^ upstream
             -> Int   -- ^ downstream
             -> Gene
             -> [BEDExt BED3 (Int, CI B.ByteString)]
getPromoters :: Int -> Int -> Gene -> [BEDExt BED3 (Int, CI ByteString)]
getPromoters Int
up Int
down Gene{Bool
Int
[Transcript]
ByteString
CI ByteString
geneTranscripts :: [Transcript]
geneStrand :: Bool
geneRight :: Int
geneLeft :: Int
geneChrom :: ByteString
geneId :: ByteString
geneName :: CI ByteString
geneTranscripts :: Gene -> [Transcript]
geneStrand :: Gene -> Bool
geneRight :: Gene -> Int
geneLeft :: Gene -> Int
geneChrom :: Gene -> ByteString
geneId :: Gene -> ByteString
geneName :: Gene -> CI ByteString
..} = (Int -> BEDExt BED3 (Int, CI ByteString))
-> [Int] -> [BEDExt BED3 (Int, CI ByteString)]
forall a b. (a -> b) -> [a] -> [b]
map Int -> BEDExt BED3 (Int, CI ByteString)
forall bed.
BEDConvert bed =>
Int -> BEDExt bed (Int, CI ByteString)
g ([Int] -> [BEDExt BED3 (Int, CI ByteString)])
-> [Int] -> [BEDExt BED3 (Int, CI ByteString)]
forall a b. (a -> b) -> a -> b
$ [Int] -> [Int]
forall a. Ord a => [a] -> [a]
nubSort [Int]
tss
  where
    g :: Int -> BEDExt bed (Int, CI ByteString)
g Int
x | Bool
geneStrand = bed -> (Int, CI ByteString) -> BEDExt bed (Int, CI ByteString)
forall bed a. bed -> a -> BEDExt bed a
BEDExt (ByteString -> Int -> Int -> bed
forall b. BEDConvert b => ByteString -> Int -> Int -> b
asBed ByteString
geneChrom (Int -> Int -> Int
forall a. Ord a => a -> a -> a
max Int
0 (Int -> Int) -> Int -> Int
forall a b. (a -> b) -> a -> b
$ Int
x Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
up) (Int
x Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
down)) (Int
x, CI ByteString
geneName)
        | Bool
otherwise = bed -> (Int, CI ByteString) -> BEDExt bed (Int, CI ByteString)
forall bed a. bed -> a -> BEDExt bed a
BEDExt (ByteString -> Int -> Int -> bed
forall b. BEDConvert b => ByteString -> Int -> Int -> b
asBed ByteString
geneChrom (Int -> Int -> Int
forall a. Ord a => a -> a -> a
max Int
0 (Int -> Int) -> Int -> Int
forall a b. (a -> b) -> a -> b
$ Int
x Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
down) (Int
x Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
up)) (Int
x, CI ByteString
geneName)
    tss :: [Int]
tss | Bool
geneStrand = Int
geneLeft Int -> [Int] -> [Int]
forall a. a -> [a] -> [a]
: (Transcript -> Int) -> [Transcript] -> [Int]
forall a b. (a -> b) -> [a] -> [b]
map Transcript -> Int
transLeft [Transcript]
geneTranscripts
        | Bool
otherwise = Int
geneRight Int -> [Int] -> [Int]
forall a. a -> [a] -> [a]
: (Transcript -> Int) -> [Transcript] -> [Int]
forall a b. (a -> b) -> [a] -> [b]
map Transcript -> Int
transRight [Transcript]
geneTranscripts
{-# INLINE getPromoters #-}

-- | Compute genes' regulatory domains using the algorithm described in GREAT.
-- NOTE: the result doesn't contain promoters
getDomains :: BEDLike b
           => Int             -- ^ Extension length. A good default is 1M.
           -> [b] -- ^ A list of promoters
           -> [b] -- ^ Regulatory domains
getDomains :: Int -> [b] -> [b]
getDomains Int
ext [b]
genes
    | [b] -> Bool
forall (t :: * -> *) a. Foldable t => t a -> Bool
null [b]
genes = String -> [b]
forall a. HasCallStack => String -> a
error String
"No gene available for domain assignment!"
    | Bool
otherwise = (b -> Bool) -> [b] -> [b]
forall a. (a -> Bool) -> [a] -> [a]
filter ((Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
>Int
0) (Int -> Bool) -> (b -> Int) -> b -> Bool
forall b c a. (b -> c) -> (a -> b) -> a -> c
. b -> Int
forall b. BEDLike b => b -> Int
size) ([b] -> [b]) -> [b] -> [b]
forall a b. (a -> b) -> a -> b
$ ((Maybe b, Maybe b, Maybe b) -> [b])
-> [(Maybe b, Maybe b, Maybe b)] -> [b]
forall (t :: * -> *) a b. Foldable t => (a -> [b]) -> t a -> [b]
concatMap (Maybe b, Maybe b, Maybe b) -> [b]
forall t s s.
(BEDLike t, BEDLike s, BEDLike s) =>
(Maybe s, Maybe t, Maybe s) -> [t]
f ([(Maybe b, Maybe b, Maybe b)] -> [b])
-> [(Maybe b, Maybe b, Maybe b)] -> [b]
forall a b. (a -> b) -> a -> b
$ [Maybe b] -> [(Maybe b, Maybe b, Maybe b)]
forall c. [c] -> [(c, c, c)]
triplet ([Maybe b] -> [(Maybe b, Maybe b, Maybe b)])
-> [Maybe b] -> [(Maybe b, Maybe b, Maybe b)]
forall a b. (a -> b) -> a -> b
$
        [Maybe b
forall a. Maybe a
Nothing] [Maybe b] -> [Maybe b] -> [Maybe b]
forall a. [a] -> [a] -> [a]
++ (b -> Maybe b) -> [b] -> [Maybe b]
forall a b. (a -> b) -> [a] -> [b]
map b -> Maybe b
forall a. a -> Maybe a
Just [b]
basal [Maybe b] -> [Maybe b] -> [Maybe b]
forall a. [a] -> [a] -> [a]
++ [Maybe b
forall a. Maybe a
Nothing]
  where
    f :: (Maybe s, Maybe t, Maybe s) -> [t]
f (Maybe s
left, Just t
bed, Maybe s
right) =
      [ (Int -> Identity Int) -> t -> Identity t
forall b. BEDLike b => Lens' b Int
chromStart ((Int -> Identity Int) -> t -> Identity t) -> Int -> t -> t
forall s t a b. ASetter s t a b -> b -> s -> t
.~ Int
leftPos (t -> t) -> t -> t
forall a b. (a -> b) -> a -> b
$ (Int -> Identity Int) -> t -> Identity t
forall b. BEDLike b => Lens' b Int
chromEnd ((Int -> Identity Int) -> t -> Identity t) -> Int -> t -> t
forall s t a b. ASetter s t a b -> b -> s -> t
.~ Int
s (t -> t) -> t -> t
forall a b. (a -> b) -> a -> b
$ t
bed
      , (Int -> Identity Int) -> t -> Identity t
forall b. BEDLike b => Lens' b Int
chromStart ((Int -> Identity Int) -> t -> Identity t) -> Int -> t -> t
forall s t a b. ASetter s t a b -> b -> s -> t
.~ Int
e (t -> t) -> t -> t
forall a b. (a -> b) -> a -> b
$ (Int -> Identity Int) -> t -> Identity t
forall b. BEDLike b => Lens' b Int
chromEnd ((Int -> Identity Int) -> t -> Identity t) -> Int -> t -> t
forall s t a b. ASetter s t a b -> b -> s -> t
.~ Int
rightPos (t -> t) -> t -> t
forall a b. (a -> b) -> a -> b
$ t
bed ]
      where
        chr :: ByteString
chr = t
bedt -> Getting ByteString t ByteString -> ByteString
forall s a. s -> Getting a s a -> a
^.Getting ByteString t ByteString
forall b. BEDLike b => Lens' b ByteString
chrom
        s :: Int
s = t
bedt -> Getting Int t Int -> Int
forall s a. s -> Getting a s a -> a
^.Getting Int t Int
forall b. BEDLike b => Lens' b Int
chromStart
        e :: Int
e = t
bedt -> Getting Int t Int -> Int
forall s a. s -> Getting a s a -> a
^.Getting Int t Int
forall b. BEDLike b => Lens' b Int
chromEnd
        leftPos :: Int
leftPos
            | Maybe s -> Bool
forall a. Maybe a -> Bool
isNothing Maybe s
left Bool -> Bool -> Bool
|| ByteString
chr ByteString -> ByteString -> Bool
forall a. Eq a => a -> a -> Bool
/= Maybe s -> s
forall a. HasCallStack => Maybe a -> a
fromJust Maybe s
left s -> Getting ByteString s ByteString -> ByteString
forall s a. s -> Getting a s a -> a
^. Getting ByteString s ByteString
forall b. BEDLike b => Lens' b ByteString
chrom = Int -> Int -> Int
forall a. Ord a => a -> a -> a
max (Int
s Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
ext) Int
0
            | Bool
otherwise = Int -> Int -> Int
forall a. Ord a => a -> a -> a
min Int
s (Int -> Int) -> Int -> Int
forall a b. (a -> b) -> a -> b
$ Int -> Int -> Int
forall a. Ord a => a -> a -> a
max (Int
s Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
ext) (Int -> Int) -> Int -> Int
forall a b. (a -> b) -> a -> b
$ Maybe s -> s
forall a. HasCallStack => Maybe a -> a
fromJust Maybe s
left s -> Getting Int s Int -> Int
forall s a. s -> Getting a s a -> a
^. Getting Int s Int
forall b. BEDLike b => Lens' b Int
chromEnd
        rightPos :: Int
rightPos
            | Maybe s -> Bool
forall a. Maybe a -> Bool
isNothing Maybe s
right Bool -> Bool -> Bool
|| ByteString
chr ByteString -> ByteString -> Bool
forall a. Eq a => a -> a -> Bool
/= Maybe s -> s
forall a. HasCallStack => Maybe a -> a
fromJust Maybe s
right s -> Getting ByteString s ByteString -> ByteString
forall s a. s -> Getting a s a -> a
^. Getting ByteString s ByteString
forall b. BEDLike b => Lens' b ByteString
chrom = Int
e Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
ext   -- TODO: bound check
            | Bool
otherwise = Int -> Int -> Int
forall a. Ord a => a -> a -> a
max Int
e (Int -> Int) -> Int -> Int
forall a b. (a -> b) -> a -> b
$ Int -> Int -> Int
forall a. Ord a => a -> a -> a
min (Int
e Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
ext) (Int -> Int) -> Int -> Int
forall a b. (a -> b) -> a -> b
$ Maybe s -> s
forall a. HasCallStack => Maybe a -> a
fromJust Maybe s
right s -> Getting Int s Int -> Int
forall s a. s -> Getting a s a -> a
^. Getting Int s Int
forall b. BEDLike b => Lens' b Int
chromStart
    f (Maybe s, Maybe t, Maybe s)
_ = [t]
forall a. HasCallStack => a
undefined
    triplet :: [c] -> [(c, c, c)]
triplet (c
x1:c
x2:c
x3:[c]
xs) = (c
x1,c
x2,c
x3) (c, c, c) -> [(c, c, c)] -> [(c, c, c)]
forall a. a -> [a] -> [a]
: [c] -> [(c, c, c)]
triplet [c]
xs
    triplet [c]
_ = []
    basal :: [b]
basal = Vector b -> [b]
forall a. Vector a -> [a]
V.toList (Vector b -> [b]) -> Vector b -> [b]
forall a b. (a -> b) -> a -> b
$ Sorted (Vector b) -> Vector b
forall b. Sorted b -> b
fromSorted (Sorted (Vector b) -> Vector b) -> Sorted (Vector b) -> Vector b
forall a b. (a -> b) -> a -> b
$ [b] -> Sorted (Vector b)
forall b. BEDLike b => [b] -> Sorted (Vector b)
sortBed [b]
genes
{-# INLINE getDomains #-}