-- | Streaming reader for gene records stored in tab-separated annotation
-- files.
--
-- NOTE(review): judging by the module name and the nine-column layout the
-- parser expects, the input is presumably a GENCODE GTF file -- confirm.
module Bio.RealWorld.GENCODE
( Gene(..)
, readGenes
, readGenes'
, parseGenes
) where
import Conduit
import qualified Data.ByteString.Char8 as B
import Data.Maybe (fromJust)
import Data.CaseInsensitive (CI, mk)
import Bio.Utils.Misc (readInt)
-- | One gene record, as produced by 'parseGenes' from a single annotation
-- line. All fields are strict.
data Gene = Gene
    { geneName   :: !(CI B.ByteString)
      -- ^ Gene name (the @gene_name@ attribute); compared case-insensitively.
    , geneId     :: !B.ByteString
      -- ^ Gene identifier (the @gene_id@ attribute).
    , geneChrom  :: !B.ByteString
      -- ^ Contents of the first column (sequence / chromosome name).
    , geneStart  :: !Int
      -- ^ Start coordinate, derived from the fourth column.
    , geneEnd    :: !Int
      -- ^ End coordinate, derived from the fifth column.
    , geneStrand :: !Bool
      -- ^ 'True' when the strand column is @+@, 'False' otherwise.
    } deriving (Show)
-- | Stream the 'Gene' records contained in the file at the given path.
--
-- The file is read as raw bytes and piped through 'parseGenes', so genes are
-- produced incrementally as the file is consumed.
readGenes :: MonadResource m => FilePath -> Source m Gene
readGenes path = rawBytes =$= parseGenes
  where
    -- Byte chunks of the input file.
    rawBytes = sourceFile path
-- | Read every 'Gene' record of a file into a list.
--
-- Runs 'readGenes' to completion inside 'runResourceT' and collects the
-- whole stream with 'sinkList', so all genes are held in memory at once.
readGenes' :: FilePath -> IO [Gene]
readGenes' path = runResourceT (readGenes path $$ sinkList)
-- | Conduit turning raw annotation bytes into 'Gene' records.
--
-- The incoming bytes are split into lines and every line is split on tabs.
-- A 'Gene' is emitted only for lines that have exactly nine columns and whose
-- third column is @gene@. Blank lines, comment lines (starting with @#@) and
-- lines with a different number of columns are skipped instead of crashing
-- the stream.
--
-- The ninth column is read as a @;@-separated list of @key "value"@ pairs;
-- the @gene_name@ and @gene_id@ entries are required. If one of them is
-- missing from a gene line, 'error' is raised with a message naming the
-- attribute and quoting the offending attribute column.
--
-- NOTE(review): the start coordinate is stored as @column 4 - 1@ while the
-- end coordinate is stored as column 5 unchanged (presumably converting
-- 1-based inclusive coordinates to 0-based half-open ones) -- confirm this is
-- the intended convention. Assumes @readInt :: ByteString -> Int@.
parseGenes :: Monad m => Conduit B.ByteString m Gene
parseGenes = linesUnboundedAsciiC =$= concatMapC parseLine
  where
    -- Decide whether a single line describes a gene and, if so, decode it.
    parseLine l
      | B.null l || B.head l == '#' = Nothing
      | otherwise = case B.split '\t' l of
          [chrom, _, feature, start, end, _, strand, _, attrs]
            | feature == "gene" -> Just $ toGene chrom start end strand attrs
          _ -> Nothing

    -- Build the record from the already-separated columns.
    toGene chrom start end strand attrs = Gene
        (mk $ attribute "gene_name")
        (attribute "gene_id")
        chrom
        (readInt start - 1)
        (readInt end)
        (strand == "+")
      where
        -- Attribute column as (key, rest-of-entry) pairs; the rest still
        -- carries the separating space and the surrounding quotes.
        fields = map (B.break (== ' ') . trim) $ B.split ';' attrs

        attribute key = case lookup key fields of
          Just raw -> unquote $ B.dropWhile (== ' ') raw
          Nothing  -> error $
            "Bio.RealWorld.GENCODE.parseGenes: missing attribute "
              ++ B.unpack key ++ " in: " ++ B.unpack attrs

    -- Remove one pair of surrounding double quotes, if present.
    unquote v
      | B.length v >= 2 && B.head v == '"' && B.last v == '"' = B.init (B.tail v)
      | otherwise = v

    -- Drop leading and trailing spaces.
    trim = fst . B.spanEnd (== ' ') . B.dropWhile (== ' ')