module Bio.GFF3.SGD ( chromosomes, genes, rRNAs , sortExons , geneSequence, geneSeqLoc, geneCDSes , noncodingSequence, noncodingSeqLoc, noncodingExons , namedSLM, geneCDS_SLM ) where import Control.Arrow ((&&&)) import Control.Monad.Error import qualified Data.ByteString.Lazy.Char8 as LBS import Data.List import Data.Maybe import Data.Ord import qualified Data.Set as S import Bio.Sequence.SeqData import qualified Bio.GFF3.Feature as F import Bio.GFF3.FeatureHierSequences -- import qualified Bio.Location.ContigLocation as CLoc import qualified Bio.Location.Location as Loc import Bio.Location.OnSeq import qualified Bio.Location.SeqLocation as SeqLoc import Bio.Location.Strand import qualified Bio.Location.SeqLocMap as SLM -- SGD feature types cdsType, chromosomeType, geneType, noncodingExonType :: LBS.ByteString chromosomeType = LBS.pack "chromosome" cdsType = LBS.pack "CDS" geneType = LBS.pack "gene" noncodingExonType = LBS.pack "noncoding_exon" chromosomes :: FeatureHierSequences -> [F.Feature] chromosomes = filter isChromosome . S.toList . features where isChromosome = (== chromosomeType) . F.ftype genes :: FeatureHierSequences -> [F.Feature] genes = filter isGene . S.toList . features where isGene = (== geneType) . F.ftype rRNAs :: FeatureHierSequences -> [F.Feature] rRNAs = filter isRRNA . S.toList . features where isRRNA = any (flip S.member $ S.fromList rRNAFeatureIDs) . F.ids rRNAFeatureIDs = map LBS.pack ["RDN25-1", "RDN58-1", "RDN18-1", "RDN5-1"] geneSequence :: (Error e, MonadError e m) => FeatureHierSequences -> F.Feature -> m (Sequence a) geneSequence fhs g = do sequ <- geneSeqLoc fhs g >>= SeqLoc.seqData (getSequence fhs) seqname <- maybe (throwError $ strMsg $ "No gene ID for " ++ show g) return $ listToMaybe $ F.ids g return $ Seq seqname sequ Nothing geneSeqLoc :: (Error e, MonadError e m) => FeatureHierSequences -> F.Feature -> m SeqLoc.SeqLoc geneSeqLoc fhs g = exonSeqLoc $ geneCDSes fhs g geneCDSes :: FeatureHierSequences -> F.Feature -> [F.Feature] geneCDSes fhs g = filter isCDS $ children fhs g where isCDS = (== cdsType) . F.ftype noncodingSequence :: (Error e, MonadError e m) => FeatureHierSequences -> F.Feature -> m (Sequence a) noncodingSequence fhs nc = do sequ <- noncodingSeqLoc fhs nc >>= SeqLoc.seqData (getSequence fhs) seqname <- maybe (throwError $ strMsg $ "No gene ID for " ++ show nc) return $ listToMaybe $ F.ids nc return $ Seq seqname sequ Nothing noncodingSeqLoc :: (Error e, MonadError e m) => FeatureHierSequences -> F.Feature -> m SeqLoc.SeqLoc noncodingSeqLoc fhs nc = exonSeqLoc $ noncodingExons fhs nc noncodingExons :: FeatureHierSequences -> F.Feature -> [F.Feature] noncodingExons fhs nc = filter isNoncodingExon $ children fhs nc where isNoncodingExon = (== noncodingExonType) . F.ftype sortExons :: (Error e, MonadError e m) => [F.Feature] -> m [F.Feature] sortExons exons | all fwdStrand exons = return $ sortBy (comparing F.start) exons | all rcStrand exons = return $ sortBy (comparing $ negate . F.start) exons | otherwise = throwError $ strMsg "sortExons: discordant strands" where fwdStrand = (== (Just Fwd)) . F.strand rcStrand = (== (Just RevCompl)) . F.strand exonSeqLoc :: (Error e, MonadError e m) => [F.Feature] -> m SeqLoc.SeqLoc exonSeqLoc [] = throwError $ strMsg "exonSeqLoc: empty list of exons" exonSeqLoc exons@(exon0:exonrest) = let seqid = F.seqid exon0 in if all ((== seqid) . F.seqid) exonrest then liftM (OnSeq seqid . Loc.Loc . map F.contigLoc) $ sortExons exons else throwError $ strMsg "exonSeqLoc: discordant seqid amongst exons" namedSLM :: FeatureHierSequences -> SLM.SeqLocMap F.Feature namedSLM = SLM.fromList . map (F.seqLoc &&& id) . filter isNamed . S.toList . features where isNamed = not . null . F.ids geneCDS_SLM :: (Error e, MonadError e m) => FeatureHierSequences -> m (SLM.SeqLocMap F.Feature) geneCDS_SLM fhs = liftM SLM.fromList $ mapM cdsLocAndGene $ genes fhs where cdsLocAndGene g = liftM (flip (,) g) $ geneSeqLoc fhs g