module Bio.GFF3.SGD ( chromosomes, genes, rRNAs
, sortExons
, geneSequence, geneSeqLoc, geneCDSes
, noncodingSequence, noncodingSeqLoc, noncodingExons
, namedSLM, geneCDS_SLM
)
where
import Control.Arrow ((&&&))
import Control.Monad.Error
import Control.Monad.Reader
import qualified Data.ByteString.Lazy.Char8 as LBS
import Data.List
import Data.Maybe
import Data.Ord
import qualified Data.Set as S
import Bio.Sequence.SeqData
import qualified Bio.GFF3.Feature as F
import Bio.GFF3.FeatureHierSequences
import qualified Bio.Location.Location as Loc
import Bio.Location.OnSeq
import qualified Bio.Location.SeqLocation as SeqLoc
import Bio.Location.Strand
import qualified Bio.Location.SeqLocMap as SLM
cdsType, chromosomeType, geneType, noncodingExonType :: LBS.ByteString
chromosomeType = LBS.pack "chromosome"
cdsType = LBS.pack "CDS"
geneType = LBS.pack "gene"
noncodingExonType = LBS.pack "noncoding_exon"
chromosomes :: FeatureHierSequences -> [F.Feature]
chromosomes = filter isChromosome . S.toList . features
where isChromosome = (== chromosomeType) . F.ftype
genes :: FeatureHierSequences -> [F.Feature]
genes = filter isGene . S.toList . features
where isGene = (== geneType) . F.ftype
rRNAs :: FeatureHierSequences -> [F.Feature]
rRNAs = filter isRRNA . S.toList . features
where isRRNA = any (flip S.member $ S.fromList rRNAFeatureIDs) . F.ids
rRNAFeatureIDs = map LBS.pack ["RDN25-1", "RDN58-1", "RDN18-1", "RDN5-1"]
geneSequence :: (Error e, MonadError e m) => FeatureHierSequences -> F.Feature -> m (Sequence a)
geneSequence fhs g = do sequ <- geneSeqLoc fhs g >>= SeqLoc.seqData (getSequence fhs)
seqname <- maybe (throwError $ strMsg $ "No gene ID for " ++ show g) return $ listToMaybe $ F.ids g
return $ Seq seqname sequ Nothing
geneSeqLoc :: (Error e, MonadError e m) => FeatureHierSequences -> F.Feature -> m SeqLoc.SeqLoc
geneSeqLoc fhs g = exonSeqLoc $ geneCDSes fhs g
geneCDSes :: FeatureHierSequences -> F.Feature -> [F.Feature]
geneCDSes fhs g = filter isCDS $ children fhs g
where isCDS = (== cdsType) . F.ftype
noncodingSequence :: (Error e, MonadError e m) => FeatureHierSequences -> F.Feature -> m (Sequence a)
noncodingSequence fhs nc = do sequ <- noncodingSeqLoc fhs nc >>= SeqLoc.seqData (getSequence fhs)
seqname <- maybe (throwError $ strMsg $ "No gene ID for " ++ show nc) return $ listToMaybe $ F.ids nc
return $ Seq seqname sequ Nothing
noncodingSeqLoc :: (Error e, MonadError e m) => FeatureHierSequences -> F.Feature -> m SeqLoc.SeqLoc
noncodingSeqLoc fhs nc = exonSeqLoc $ noncodingExons fhs nc
noncodingExons :: FeatureHierSequences -> F.Feature -> [F.Feature]
noncodingExons fhs nc = filter isNoncodingExon $ children fhs nc
where isNoncodingExon = (== noncodingExonType) . F.ftype
sortExons :: (Error e, MonadError e m) => [F.Feature] -> m [F.Feature]
sortExons exons | all fwdStrand exons = return $ sortBy (comparing F.start) exons
| all rcStrand exons = return $ sortBy (comparing $ negate . F.start) exons
| otherwise = throwError $ strMsg "sortExons: discordant strands"
where fwdStrand = (== (Just Fwd)) . F.strand
rcStrand = (== (Just RevCompl)) . F.strand
exonSeqLoc :: (Error e, MonadError e m) => [F.Feature] -> m SeqLoc.SeqLoc
exonSeqLoc [] = throwError $ strMsg "exonSeqLoc: empty list of exons"
exonSeqLoc exons@(exon0:exonrest)
= let seqid = F.seqid exon0
in if all ((== seqid) . F.seqid) exonrest
then liftM (OnSeq seqid . Loc.Loc . map F.contigLoc) $ sortExons exons
else throwError $ strMsg "exonSeqLoc: discordant seqid amongst exons"
namedSLM :: FeatureHierSequences -> SLM.SeqLocMap F.Feature
namedSLM = SLM.fromList . map (F.seqLoc &&& id) . filter isNamed . S.toList . features
where isNamed = not . null . F.ids
geneCDS_SLM :: (Error e, MonadError e m) => FeatureHierSequences -> m (SLM.SeqLocMap F.Feature)
geneCDS_SLM fhs = liftM SLM.fromList $ mapM cdsLocAndGene $ genes fhs
where cdsLocAndGene g = liftM (flip (,) g) $ geneSeqLoc fhs g