{-# LANGUAGE OverloadedStrings #-} module Bio.RealWorld.GENCODE ( Gene(..) , readGenes ) where import Conduit import qualified Data.ByteString.Char8 as B import Data.CaseInsensitive (CI, mk) import qualified Data.HashMap.Strict as M import Data.Maybe (fromJust) import Bio.Utils.Misc (readInt) -- | GTF's position is 1-based, but here we convert it to 0-based indexing. data Gene = Gene { geneName :: !(CI B.ByteString) , geneId :: !B.ByteString , geneChrom :: !B.ByteString , geneLeft :: !Int , geneRight :: !Int , geneStrand :: !Bool , geneTranscripts :: [(Int, Int)] } deriving (Show) -- | Read gene information from Gencode GTF file readGenes :: FilePath -> IO [Gene] readGenes input = do (genes, transcripts) <- runResourceT $ runConduit $ sourceFile input .| linesUnboundedAsciiC .| foldlC f (M.empty, M.empty) return $ M.elems $ M.foldlWithKey' (\m k v -> M.adjust (\g -> g{geneTranscripts=v}) k m) genes transcripts where f (genes, transcripts) l | B.head l == '#' = (genes, transcripts) | f3 == "gene" = let g = Gene (mk $ getField "gene_name") i f1 (readInt f4 - 1) (readInt f5 - 1) (f7=="+") [] i = getField "gene_id" in (M.insert i g genes, transcripts) | f3 == "transcript" = let t = (readInt f4 - 1, readInt f5 - 1) i = getField "gene_id" updateFn Nothing = Just [t] updateFn (Just x) = Just $ t : x in (genes, M.alter updateFn i transcripts) | otherwise = (genes, transcripts) where [f1,_,f3,f4,f5,_,f7,_,f9] = B.split '\t' l fields = map (B.break (==' ') . strip) $ B.split ';' f9 getField x = B.init $ B.drop 2 $ fromJust $ lookup x fields strip = fst . B.spanEnd isSpace . B.dropWhile isSpace isSpace = (== ' ')