{-# LANGUAGE OverloadedStrings #-} module Bio.RealWorld.GENCODE ( Gene(..) , readGenes , readGenes' , parseGenes ) where import Conduit import qualified Data.ByteString.Char8 as B import Data.Maybe (fromJust) import Data.CaseInsensitive (CI, mk) import Bio.Utils.Misc (readInt) data Gene = Gene { geneName :: !(CI B.ByteString) , geneId :: !B.ByteString , geneChrom :: !B.ByteString , geneStart :: !Int , geneEnd :: !Int , geneStrand :: !Bool } deriving (Show) -- | Read gene information from Gencode GTF file readGenes :: MonadResource m => FilePath -> Source m Gene readGenes input = sourceFile input =$= parseGenes readGenes' :: FilePath -> IO [Gene] readGenes' input = runResourceT $ readGenes input $$ sinkList parseGenes :: Monad m => Conduit B.ByteString m Gene parseGenes = linesUnboundedAsciiC =$= concatMapC f where f l | B.head l == '#' || f3 /= "gene" = Nothing | otherwise = Just $ Gene (mk $ getField "gene_name") (getField "gene_id") f1 (readInt f4 - 1) (readInt f5) (f7=="+") where [f1,_,f3,f4,f5,_,f7,_,f9] = B.split '\t' l fields = map (B.break (==' ') . strip) $ B.split ';' f9 getField x = B.init $ B.drop 2 $ fromJust $ lookup x fields strip = fst . B.spanEnd isSpace . B.dropWhile isSpace isSpace = (== ' ') {-# INLINE parseGenes #-}