module Bio.SeqLoc.Transcript
       ( -- * Type for splice junctions
         Junction (..)
       , fromDonorAcceptor, donor, acceptor
       , junctions
         -- * Representation of transcript
       , Transcript(..), utr5, utr3
       , cdsLocation
       , sortContigs
       )
       where

import Control.Applicative
import Control.Monad
import qualified Data.ByteString.Char8 as BS
import Data.List
import Data.Ord

import qualified Data.Attoparsec.Zepto as ZP

import Bio.SeqLoc.LocRepr
import qualified Bio.SeqLoc.Location as Loc
import Bio.SeqLoc.OnSeq
import qualified Bio.SeqLoc.Position as Pos
import qualified Bio.SeqLoc.SpliceLocation as SpLoc
import Bio.SeqLoc.Strand

-- | Splice junctions, which are isomorphic to the introns they span,
-- but which support other biologically relevant constructors and
-- accessors.
newtype Junction = Junction { intron :: Loc.ContigLoc } deriving (Show)

-- | Separator written between the donor and the acceptor position in
-- the textual representation of a 'Junction'.
slash :: BS.ByteString
slash = BS.pack "/"

-- | A junction is represented as its donor position, a @/@, and its
-- acceptor position; parsing rebuilds the junction from those two
-- positions with 'fromDonorAcceptor'.
instance LocRepr Junction where
  repr j = BS.concat [ repr . donor $ j, slash, repr . acceptor $ j ]
  unrepr = fromDonorAcceptor <$> unrepr <*> (ZP.string slash *> unrepr)

-- | Create a splice junction from a donor position (the last position
-- in the 5' exon) and the acceptor position (the first position in
-- the 3' exon).
--
-- The strand of the junction is taken from the donor position.  The
-- intron starts one position past the donor (towards higher offsets
-- on the 'Plus' strand, towards lower offsets on the 'Minus' strand).
fromDonorAcceptor :: Pos.Pos -> Pos.Pos -> Junction
fromDonorAcceptor d a = case Pos.strand d of
  Plus  -> Junction $! Loc.fromPosLen (Pos.slide d 1) len
  Minus -> Junction $! Loc.fromPosLen (Pos.slide d (-1)) len
  where
    -- The donor and the acceptor are exon positions flanking the
    -- intron, so the intron holds the positions strictly between
    -- them: one fewer than the distance between the two offsets.
    -- This is what makes 'donor' and 'acceptor', which each step one
    -- position outward from the intron, give back @d@ and @a@.
    len = abs (Pos.offset a - Pos.offset d) - 1

-- | Donor position, i.e., the last position in the 5' exon around a
-- junction.  Obtained by extending the intron by one position with
-- @'Loc.extend' (1, 0)@ and taking the start of the result.
donor :: Junction -> Pos.Pos
donor = Loc.startPos . Loc.extend (1, 0) . intron

-- | Acceptor position, i.e., the first position in the 3' exon around
-- a junction.  Obtained by extending the intron by one position with
-- @'Loc.extend' (0, 1)@ and taking the end of the result.
acceptor :: Junction -> Pos.Pos
acceptor = Loc.endPos . Loc.extend (0, 1) . intron

-- | List of splice junctions from a spliced location, in order.
junctions :: SpLoc.SpliceLoc -> [Junction]
junctions sploc = zipWith between exons (drop 1 exons)
  where
    exons = Loc.toContigs sploc
    -- Junction separating one contig from the contig that follows it
    -- in the spliced location.
    between upstream downstream =
      Junction $ Loc.fromPosLen intronStart intronLen
      where
        -- End of the upstream contig once extended by (0, 1).
        intronStart = Loc.endPos (Loc.extend (0, 1) upstream)
        -- Start of the downstream contig once extended by (1, 0).
        intronEnd = Loc.startPos (Loc.extend (1, 0) downstream)
        -- Both bounds are counted, hence the extra 1.
        intronLen = 1 + abs (Pos.offset intronEnd - Pos.offset intronStart)

-- | Representation of a genomic transcript, with a gene and a
-- transcript identifier, along with the genomic location of the
-- processed transcript and an optional coding sequence on that
-- transcript.
data Transcript = Transcript
  { geneId :: !SeqLabel
    -- ^ Gene or locus name for a collection of transcripts
  , trxId :: !SeqLabel
    -- ^ Specific transcript identifier
  , location :: !SpliceSeqLoc
    -- ^ Sequence location of processed transcript
  , cds :: !(Maybe Loc.ContigLoc)
    -- ^ Location of CDS on the transcript
  }

-- | 'Just' the location of the 5' UTR on the transcript, or 'Nothing'
-- if there is no 'cds' on the transcript or if the 'cds' location
-- begins at the first nucleotide of the transcript--if a region is
-- returned it will have positive length.  'Nothing' is also returned
-- when the start of the 'cds' is not on the 'Plus' strand.
utr5 :: Transcript -> Maybe Loc.ContigLoc
utr5 trx = do
  cdsloc <- cds trx
  case Loc.startPos cdsloc of
    Pos.Pos cdsStart Plus
      -- Everything from offset 0 up to just before the CDS start.
      | cdsStart > 0 -> Just $! Loc.fromBoundsStrand 0 (cdsStart - 1) Plus
    _ -> Nothing

-- | 'Just' the location of the 3' UTR on the transcript, or 'Nothing'
-- if there is no 'cds' on the transcript or if the 'cds' location
-- ends at the last nucleotide of the transcript--if a region is
-- returned it will have positive length.  'Nothing' is also returned
-- when the end of the 'cds' is not on the 'Plus' strand.
utr3 :: Transcript -> Maybe Loc.ContigLoc
utr3 trx = do
  cdsloc <- cds trx
  case Loc.endPos cdsloc of
    Pos.Pos cdsEnd Plus
      -- Everything from just after the CDS end to the last offset.
      | cdsEnd < lastOff ->
          Just $! Loc.fromBoundsStrand (cdsEnd + 1) lastOff Plus
    _ -> Nothing
  where
    -- Offset of the last position of the transcript.
    lastOff = Loc.length (unOnSeq (location trx)) - 1

-- | Genomic location of CDS within the transcript.  The 'cds' is
-- mapped out of the transcript 'location' with 'Loc.clocOutof' and
-- placed on the same sequence name; 'Nothing' when there is no 'cds'
-- or when that mapping yields 'Nothing'.
cdsLocation :: Transcript -> Maybe SpliceSeqLoc
cdsLocation trx = do
  cdsloc <- cds trx
  OnSeq seqname <$> Loc.clocOutof cdsloc trxloc
  where
    OnSeq seqname trxloc = location trx

-- | 'Just' the input contigs sorted in stranded order, when all lie
-- on the same strand, or 'Nothing' if they are not all on the same
-- strand.  An empty input list also gives 'Nothing'.
sortContigs :: [Loc.ContigLoc] -> Maybe [Loc.ContigLoc]
sortContigs [] = Nothing
sortContigs cs@(c0:_)
  | all sameStrand cs = Just (sortBy stranded cs)
  | otherwise = Nothing
  where
    str0 = Loc.strand c0
    sameStrand c = Loc.strand c == str0
    -- Ascending 'Loc.offset5' on the plus strand, descending on the
    -- minus strand.
    stranded = case str0 of
      Plus -> comparing Loc.offset5
      Minus -> comparing (negate . Loc.offset5)