{-# LANGUAGE OverloadedStrings, FlexibleContexts #-} {-| Utilities for reading and writing BED format gene annotations -} module Bio.SeqLoc.Bed ( readBedTranscripts , bedZP, bedTranscriptEnum , bedConduit, unbedConduit , transcriptToBed, transcriptToBedStd ) where import Control.Applicative import qualified Control.Exception.Lifted as E import Control.Monad import Control.Monad.Base import Control.Monad.Trans.Resource import qualified Data.ByteString.Char8 as BS import Data.List import Data.Maybe import Data.Ord import qualified Data.Attoparsec.Zepto as ZP import qualified Data.Conduit as C import qualified Data.Conduit.Binary as CB import qualified Data.Conduit.List as C import qualified Data.Iteratee as Iter import qualified Data.Iteratee.Char as IterChar import Bio.SeqLoc.LocRepr import qualified Bio.SeqLoc.Location as Loc import Bio.SeqLoc.OnSeq import qualified Bio.SeqLoc.Position as Pos import qualified Bio.SeqLoc.SpliceLocation as SpLoc import Bio.SeqLoc.Strand import Bio.SeqLoc.Transcript import Bio.SeqLoc.ZeptoUtils -- | Convert a 'Transcript' to a BED annotation line. transcriptToBedStd :: Transcript -> BS.ByteString transcriptToBedStd = transcriptToBed "0" "0" -- | Convert a 'Transcript' to a BED annotation line, specifying the -- /score/ and /itemRGB/ fields. transcriptToBed :: BS.ByteString -- ^ score -> BS.ByteString -- ^ itemRGB -> Transcript -- ^ transcript -> BS.ByteString transcriptToBed score rgb trx = unfields fields where unfields = BS.intercalate (BS.singleton '\t') fields = [ unSeqLabel chrom , repr $ chromStart , repr $ chromEnd + 1 , unSeqLabel . trxId $ trx , score , strandchr , repr $ thickStart , repr $ thickEnd + 1 , rgb , BS.pack . show . length $ blockSizes , unCommaList blockSizes , unCommaList blockStarts ] (OnSeq chrom loc) = location trx (chromStart, chromEnd) = Loc.bounds loc strandchr = case Loc.strand loc of Plus -> "+"; Minus -> "-" (thickStart, thickEnd) = maybe noCds (Loc.bounds . unOnSeq) . cdsLocation $ trx noCds = (chromStart, chromStart - 1) contigs = sortBy (comparing Loc.offset5) . Loc.toContigs $ loc blockSizes = map Loc.length contigs blockStarts = map (subtract chromStart . Loc.offset5) contigs unCommaList = BS.concat . map (flip BS.append (BS.singleton ',') . repr) -- | Read all BED format annotations in a BED file readBedTranscripts :: FilePath -> IO [Transcript] readBedTranscripts bedfile = runResourceT $ C.runConduit $ CB.sourceFile bedfile C.$= bedConduit C.$= C.consume -- | Iteratee to convert an 'Iter.Iteratee' over a 'BS.ByteString', -- such as the standard 'Iter.fileDriver', into an iteratee over a -- list of 'Transcript' annotations from the file. bedTranscriptEnum :: (Monad m) => Iter.Iteratee [Transcript] m a -> Iter.Iteratee BS.ByteString m a bedTranscriptEnum = Iter.joinI . IterChar.enumLinesBS . Iter.joinI . bedLineEnum bedLineEnum :: (Monad m) => Iter.Enumeratee [BS.ByteString] [Transcript] m a bedLineEnum = Iter.convStream $ Iter.head >>= liftM (: []) . handleErr . ZP.parse bedZP where handleErr = either (Iter.throwErr . Iter.iterStrExc) return -- | Conduit from a 'BS.ByteString' source such as a BED file to a -- source of 'Transcript' annotations from the file. bedConduit :: (Monad m, MonadBase IO m) => C.Conduit BS.ByteString m Transcript bedConduit = CB.lines C.$= loop where loop = C.head >>= maybe (return ()) (\l -> case ZP.parse bedZP l of Left err -> E.ioError . userError $ err ++ "\n in BED line\n" ++ show l Right res -> C.yield res >> loop) unbedConduit :: (Monad m) => C.Conduit Transcript m BS.ByteString unbedConduit = C.head >>= maybe (return ()) go where go t = C.yield (transcriptToBedStd t `BS.append` "\n") >> unbedConduit -- | Minimalistic 'ZP.Parser'-style parser for a BED format line, not -- including the trailing newline. bedZP :: ZP.Parser Transcript bedZP = do chrom <- firstfield -- The name of the chromosome chromStart <- decfield -- The starting position of the -- feature in the chromosome or -- scaffold. The first base in a -- chromosome is numbered 0 chromEnd <- decfield -- The ending position of the feature -- in the chromosome or scaffold. The -- chromEnd base is not included in -- the display of the feature. For -- example, the first 100 bases of a -- chromosome are defined as -- chromStart=0, chromEnd=100, and -- span the bases numbered 0-99. name <- field -- Defines the name of the BED line. _score <- unlessAtEnd dropField -- A score between 0 and 1000. str <- fromMaybe Plus <$> unlessAtEnd strand -- Defines the strand thickStart <- unlessAtEnd decfield -- The starting position at which -- the feature is drawn thickly (for -- example, the start codon in gene -- displays). thickEnd <- unlessAtEnd decfield -- The ending position at which the -- feature is drawn thickly (for -- example, the stop codon in gene -- displays). _itemRGB <- unlessAtEnd dropField -- An RGB value of the form R,G,B -- (e.g. 255,0,0). blockCount <- unlessAtEnd decfield -- The number of blocks (exons) in -- the BED line. blockSizes <- case blockCount of Just ct -> unlessAtEnd $ commas ct decimal -- A comma-separated list of the block sizes. Nothing -> return Nothing blockStarts <- case blockCount of Just ct -> unlessAtEnd $ commas ct decimal -- A comma-separated list of block starts. Nothing -> return Nothing loc <- case liftM2 zip blockSizes blockStarts of Just blocks -> bedTrxLoc chromStart chromEnd str blocks Nothing -> maybe badContig return $ SpLoc.fromContigs [ Loc.fromBoundsStrand chromStart (chromEnd - 1) str ] where badContig = error "bedZP: Bad singleton sploc!" unless (Loc.bounds loc == (chromStart, chromEnd - 1)) $ fail $ "Bio.SeqLoc.Bed: bad sploc:" ++ (BS.unpack . BS.unwords $ [ repr loc, repr chromStart, repr chromEnd ]) cdsloc <- case liftM2 (,) thickStart thickEnd of Just (start, end) | end > start -> liftM Just $! bedCdsLoc loc start end _ -> return Nothing let n = toSeqLabel $ BS.copy name c = toSeqLabel $ BS.copy chrom return $! Transcript n n (OnSeq c loc) cdsloc bedTrxLoc :: (Monad m) => Pos.Offset -> Pos.Offset -> Strand -> [(Pos.Offset, Pos.Offset)] -> m SpLoc.SpliceLoc bedTrxLoc chromStart chromEnd str = maybe badContigs (return . stranded str) . SpLoc.fromContigs . map blockContig where blockContig (bsize, bstart) = Loc.fromPosLen (Pos.Pos (chromStart + bstart) Plus) bsize badContigs = fail $ "Bio.SeqLoc.Bed: bad blocks in " ++ show (chromStart, chromEnd) bedCdsLoc :: (Monad m) => SpLoc.SpliceLoc -> Pos.Offset -> Pos.Offset -> m Loc.ContigLoc bedCdsLoc loc thickStart thickEnd = maybe badCdsLoc return $ do relstart <- Loc.posInto (Pos.Pos thickStart Plus) loc relend <- Loc.posInto (Pos.Pos (thickEnd - 1) Plus) loc case Loc.strand loc of Plus -> if relstart <= relend then return $! Loc.fromBoundsStrand (Pos.offset relstart) (Pos.offset relend) Plus else Nothing Minus -> if relend <= relstart then return $! Loc.fromBoundsStrand (Pos.offset relend) (Pos.offset relstart) Plus else Nothing where badCdsLoc = fail $ "Bio.SeqLoc.Bed: bad cds in " ++ (BS.unpack . BS.unwords $ [ repr loc, repr thickStart, repr thickEnd ]) commas :: Int -> ZP.Parser a -> ZP.Parser [a] commas n p | n < 1 = return [] | otherwise = ZP.string "\t" *> ( (:) <$> p <*> replicateM (n - 1) (ZP.string "," *> p) ) <* (ZP.string "," <|> return ())