module Bio.RealWorld.GENCODE
( Gene(..)
, readGenes
) where
import Conduit
import qualified Data.ByteString.Char8 as B
import Data.CaseInsensitive (CI, mk)
import qualified Data.HashMap.Strict as M
import Data.Maybe (fromJust)
import Bio.Utils.Misc (readInt)
data Gene = Gene
{ geneName :: !(CI B.ByteString)
, geneId :: !B.ByteString
, geneChrom :: !B.ByteString
, geneLeft :: !Int
, geneRight :: !Int
, geneStrand :: !Bool
, geneTranscripts :: [(Int, Int)]
} deriving (Show)
readGenes :: FilePath -> IO [Gene]
readGenes input = do
(genes, transcripts) <- runResourceT $ runConduit $ sourceFile input .|
linesUnboundedAsciiC .| foldlC f (M.empty, M.empty)
return $ M.elems $ M.foldlWithKey'
(\m k v -> M.adjust (\g -> g{geneTranscripts=v}) k m)
genes transcripts
where
f (genes, transcripts) l
| B.head l == '#' = (genes, transcripts)
| f3 == "gene" =
let g = Gene (mk $ getField "gene_name") i f1 (readInt f4 1)
(readInt f5 1) (f7=="+") []
i = getField "gene_id"
in (M.insert i g genes, transcripts)
| f3 == "transcript" =
let t = (readInt f4 1, readInt f5 1)
i = getField "gene_id"
updateFn Nothing = Just [t]
updateFn (Just x) = Just $ t : x
in (genes, M.alter updateFn i transcripts)
| otherwise = (genes, transcripts)
where
[f1,_,f3,f4,f5,_,f7,_,f9] = B.split '\t' l
fields = map (B.break (==' ') . strip) $ B.split ';' f9
getField x = B.init $ B.drop 2 $ fromJust $ lookup x fields
strip = fst . B.spanEnd isSpace . B.dropWhile isSpace
isSpace = (== ' ')