module Bio.SeqLoc.GTF
( readGtfTranscripts, readGtfTranscriptsErr
, transcriptToGtf
)
where
import Control.Applicative
import Control.Monad
import qualified Data.ByteString.Char8 as BS
import Data.ByteString.Internal (c2w)
import Data.List
import Data.Maybe
import qualified Data.Attoparsec.Char8 as AP (isSpace_w8)
import qualified Data.Attoparsec.Zepto as ZP
import qualified Data.HashMap.Strict as HM
import qualified Data.Iteratee as Iter
import qualified Data.Iteratee.Char as IterChar
import Bio.SeqLoc.LocRepr
import qualified Bio.SeqLoc.Location as Loc
import Bio.SeqLoc.OnSeq
import qualified Bio.SeqLoc.Position as Pos
import qualified Bio.SeqLoc.SpliceLocation as SpLoc
import Bio.SeqLoc.Strand
import Bio.SeqLoc.Transcript
import Bio.SeqLoc.ZeptoUtils
transcriptToGtf :: BS.ByteString -> Transcript -> BS.ByteString
transcriptToGtf src trx = BS.unlines $ exonLines ++ cdsLines
where exonLines = splocLines src trx "exon" (location trx)
cdsLines = maybe [] (splocLines src trx "CDS") $! cdsLocation trx
splocLines :: BS.ByteString -> Transcript -> BS.ByteString -> SpliceSeqLoc -> [BS.ByteString]
splocLines src trx ftype (OnSeq refname sploc) = map contigLines . Loc.toContigs $ sploc
where contigLines contig = let (start0, end0) = Loc.bounds contig
strchr = case Loc.strand contig of
Plus -> "+"
Minus -> "-"
in unfields [ unSeqLabel refname
, src
, ftype
, BS.pack . show . Pos.unOff . (+ 1) $ start0
, BS.pack . show . Pos.unOff . (+ 1) $ end0
, "0.0"
, strchr
, "."
, attrs
]
attrs = BS.concat [ "gene_id \"", unSeqLabel . geneId $ trx
, "\"; transcript_id \"", unSeqLabel . trxId $ trx
, "\"; " ]
unfields = BS.intercalate (BS.singleton '\t')
readGtfTranscripts :: FilePath -> IO [Transcript]
readGtfTranscripts = Iter.fileDriver gtfTrxsIter >=>
either (ioError . userError) return . mkTranscripts
where gtfTrxsIter = Iter.joinI . IterChar.enumLinesBS . Iter.joinI . gtflineIter $ Iter.foldl' insertGtfLine trxs0
trxs0 = GtfTrxs HM.empty HM.empty HM.empty
readGtfTranscriptsErr :: FilePath -> IO ([Transcript], [String])
readGtfTranscriptsErr = Iter.fileDriver gtfTrxsIter >=>
return . mkTranscriptsErr
where gtfTrxsIter = Iter.joinI . IterChar.enumLinesBS . Iter.joinI . gtflineIter $ Iter.foldl' insertGtfLine trxs0
trxs0 = GtfTrxs HM.empty HM.empty HM.empty
mkTranscripts :: GtfTrxs -> Either String [Transcript]
mkTranscripts trxs = go [] allTrxs
where allTrxs = HM.toList . gtfTogene $ trxs
go curr [] = Right curr
go curr (trxAndGene:rest) = case mkOne trxAndGene of
Left err -> Left err
Right t -> let next = t : curr in next `seq` go next rest
mkOne (trxname, genename) = mkTranscript trxname exons cdses genename
where exons = fromMaybe [] . HM.lookup trxname . gtfExonLocs $ trxs
cdses = fromMaybe [] . HM.lookup trxname . gtfCdsLocs $ trxs
mkTranscriptsErr :: GtfTrxs -> ([Transcript], [String])
mkTranscriptsErr trxs = go ([], []) allTrxs
where allTrxs = HM.toList . gtfTogene $ trxs
go curr [] = curr
go (currtrx, currerr) (trxAndGene:rest) = case mkOne trxAndGene of
Left err -> let nexterr = err : currerr
in nexterr `seq` go (currtrx, nexterr) rest
Right t -> let next = t : currtrx
in next `seq` go (next, currerr) rest
mkOne (trxname, genename) = mkTranscript trxname exons cdses genename
where exons = fromMaybe [] . HM.lookup trxname . gtfExonLocs $ trxs
cdses = fromMaybe [] . HM.lookup trxname . gtfCdsLocs $ trxs
mkTranscript :: BS.ByteString -> [ContigSeqLoc] -> [ContigSeqLoc] -> BS.ByteString -> Either String Transcript
mkTranscript trx exons cdses gene = moderr $ do loc <- exonLoc exons
cdsloc <- cdsLoc loc cdses
return $ Transcript (toSeqLabel gene) (toSeqLabel trx) loc cdsloc
where moderr = either (Left . (("Transcript " ++ show trx ++ ": ") ++)) Right
exonLoc :: [ContigSeqLoc] -> Either String SpliceSeqLoc
exonLoc exons = do
(seqname, rawcontigs) <- allSameName exons
contigs <- sortclocs rawcontigs
sploc <- maybe (badClocs contigs) Right $! SpLoc.fromContigs contigs
return $! OnSeq seqname sploc
where sortclocs clocs = maybe (badClocs clocs) Right . sortContigs $ clocs
badClocs clocs = Left . unwords $ [ "bad transcript exons" ] ++ map (BS.unpack . repr) clocs
cdsLoc :: SpliceSeqLoc -> [ContigSeqLoc] -> Either String (Maybe Loc.ContigLoc)
cdsLoc _ [] = return Nothing
cdsLoc (OnSeq trxname trxloc) cdses@(_:_) = do
(seqname, rawcontigs) <- allSameName cdses
when (trxname /= seqname) $ Left . unwords $ [ "CDS sequence name mismatch",
show $ unSeqLabel trxname
, show $ unSeqLabel seqname ]
contigs <- sortclocs rawcontigs
(contigIn0:contigInRest) <- mapM (cdsIntoTranscript trxloc) contigs
cloc <- foldM mergeCLocs contigIn0 contigInRest
return $! Just $! Loc.extend (0, 3) cloc
where sortclocs clocs = maybe badClocs Right . sortContigs $ clocs
where badClocs = Left . unwords $ [ "bad transcript CDSes" ] ++ map (BS.unpack . repr) clocs
cdsIntoTranscript :: SpLoc.SpliceLoc -> Loc.ContigLoc -> Either String Loc.ContigLoc
cdsIntoTranscript trxloc cdscontig = maybe badIn Right . Loc.clocInto cdscontig $ trxloc
where badIn = Left . unwords $ [ "Mapping CDS contig into transcript: "
, BS.unpack . repr $ cdscontig
, BS.unpack . repr $ trxloc
]
mergeCLocs :: Loc.ContigLoc -> Loc.ContigLoc -> Either String Loc.ContigLoc
mergeCLocs cloc0 clocnext
| (Loc.strand cloc0 == Plus) && (Loc.startPos clocnext == Loc.endPos cloc0 `Pos.slide` 1)
= return $! Loc.extend (0, Loc.length clocnext) cloc0
| otherwise = Left . unwords $ [ "Merging non-adjacent contigs: "
, BS.unpack . repr $ cloc0
, BS.unpack . repr $ clocnext
]
allSameName :: [OnSeq a] -> Either String (SeqLabel, [a])
allSameName s = case group . map onSeqLabel $ s of
[(name0:_)] -> return (name0, map unOnSeq s)
names -> Left $ "allSameName: names " ++ show (map (unSeqLabel . head) names)
data GtfLine = GtfLine { gtfGene, gtfTrx, gtfFtype :: !BS.ByteString, gtfLoc :: !ContigSeqLoc }
data GtfTrxs = GtfTrxs { gtfExonLocs, gtfCdsLocs :: !(HM.HashMap BS.ByteString [ContigSeqLoc])
, gtfTogene :: !(HM.HashMap BS.ByteString BS.ByteString)
}
ftypeCds :: BS.ByteString
ftypeCds = BS.pack "CDS"
ftypeExon :: BS.ByteString
ftypeExon = BS.pack "exon"
insertGtfLine :: GtfTrxs -> GtfLine -> GtfTrxs
insertGtfLine trxsin l
| gtfFtype l == ftypeCds = insertCds . insertGene $ trxsin
| gtfFtype l == ftypeExon = insertExon . insertGene $ trxsin
| otherwise = trxsin
where trx = BS.copy $ gtfTrx l
insertGene t0 = t0 { gtfTogene = HM.insert trx (BS.copy . gtfGene $ l) (gtfTogene t0) }
insertCds t0 = t0 { gtfCdsLocs = HM.insertWith (++) trx [gtfLoc l] (gtfCdsLocs t0) }
insertExon t0 = t0 { gtfExonLocs = HM.insertWith (++) trx [gtfLoc l] (gtfExonLocs t0) }
gtflineIter :: (Monad m) => Iter.Enumeratee [BS.ByteString] [GtfLine] m a
gtflineIter = Iter.convStream $ Iter.head >>= liftM (: []) . handleErr . ZP.parse gtfline
where handleErr = either (Iter.throwErr . Iter.iterStrExc) return
gtfline :: ZP.Parser GtfLine
gtfline = do seqname <- firstfield
_source <- dropField
ftype <- field
start <- decfield
end <- decfield
_score <- dropField
str <- strand
_frame <- dropField
gene <- attr "gene_id"
trx <- attr "transcript_id"
let name = toSeqLabel . BS.copy $ seqname
loc = Loc.fromBoundsStrand (start 1) (end 1) str
return $! GtfLine gene trx ftype (OnSeq name loc)
attr :: String -> ZP.Parser BS.ByteString
attr name = ZP.takeWhile AP.isSpace_w8 *>
ZP.string (BS.pack name) *>
ZP.takeWhile AP.isSpace_w8 *>
ZP.string "\"" *>
ZP.takeWhile (/= c2w '\"') <*
ZP.string "\";"