module Bio.SeqLoc.Bed
( readBedTranscripts
, bedZP, bedTranscriptEnum
, transcriptToBed, transcriptToBedStd
)
where
import Control.Applicative
import Control.Monad
import qualified Data.ByteString.Char8 as BS
import Data.List
import Data.Maybe
import Data.Ord
import qualified Data.Attoparsec.Zepto as ZP
import qualified Data.Iteratee as Iter
import qualified Data.Iteratee.Char as IterChar
import Bio.SeqLoc.LocRepr
import qualified Bio.SeqLoc.Location as Loc
import Bio.SeqLoc.OnSeq
import qualified Bio.SeqLoc.Position as Pos
import qualified Bio.SeqLoc.SpliceLocation as SpLoc
import Bio.SeqLoc.Strand
import Bio.SeqLoc.Transcript
import Bio.SeqLoc.ZeptoUtils
transcriptToBedStd :: Transcript -> BS.ByteString
transcriptToBedStd = transcriptToBed "0" "0"
transcriptToBed :: BS.ByteString
-> BS.ByteString
-> Transcript
-> BS.ByteString
transcriptToBed score rgb trx = unfields fields
where unfields = BS.intercalate (BS.singleton '\t')
fields = [ unSeqLabel chrom
, repr $ chromStart
, repr $ chromEnd + 1
, unSeqLabel . trxId $ trx
, score
, strandchr
, repr $ thickStart
, repr $ thickEnd + 1
, rgb
, BS.pack . show . length $ blockSizes
, unCommaList blockSizes
, unCommaList blockStarts
]
(OnSeq chrom loc) = location trx
(chromStart, chromEnd) = Loc.bounds loc
strandchr = case Loc.strand loc of Plus -> "+"; Minus -> "-"
(thickStart, thickEnd) = maybe noCds (Loc.bounds . unOnSeq) . cdsLocation $ trx
noCds = (chromStart, chromStart 1)
contigs = sortBy (comparing Loc.offset5) . Loc.toContigs $ loc
blockSizes = map Loc.length contigs
blockStarts = map (subtract chromStart . Loc.offset5) contigs
unCommaList = BS.concat . map (flip BS.append (BS.singleton ',') . repr)
readBedTranscripts :: FilePath -> IO [Transcript]
readBedTranscripts = Iter.fileDriver (bedTranscriptEnum Iter.stream2list)
bedTranscriptEnum :: (Monad m) => Iter.Iteratee [Transcript] m a -> Iter.Iteratee BS.ByteString m a
bedTranscriptEnum = Iter.joinI . IterChar.enumLinesBS . Iter.joinI . bedLineEnum
bedLineEnum :: (Monad m) => Iter.Enumeratee [BS.ByteString] [Transcript] m a
bedLineEnum = Iter.convStream $ Iter.head >>= liftM (: []) . handleErr . ZP.parse bedZP
where handleErr = either (Iter.throwErr . Iter.iterStrExc) return
bedZP :: ZP.Parser Transcript
bedZP = do chrom <- firstfield
chromStart <- decfield
chromEnd <- decfield
name <- field
_score <- unlessAtEnd dropField
str <- fromMaybe Plus <$> unlessAtEnd strand
thickStart <- unlessAtEnd decfield
thickEnd <- unlessAtEnd decfield
_itemRGB <- unlessAtEnd dropField
blockCount <- unlessAtEnd decfield
blockSizes <- case blockCount of
Just ct -> unlessAtEnd $ commas ct decimal
Nothing -> return Nothing
blockStarts <- case blockCount of
Just ct -> unlessAtEnd $ commas ct decimal
Nothing -> return Nothing
loc <- case liftM2 zip blockSizes blockStarts of
Just blocks -> bedTrxLoc chromStart chromEnd str blocks
Nothing -> maybe badContig return $
SpLoc.fromContigs [ Loc.fromBoundsStrand chromStart (chromEnd 1) str ]
where badContig = error "bedZP: Bad singleton sploc!"
unless (Loc.bounds loc == (chromStart, chromEnd 1)) $
fail $ "Bio.SeqLoc.Bed: bad sploc:" ++
(BS.unpack . BS.unwords $ [ repr loc, repr chromStart, repr chromEnd ])
cdsloc <- case liftM2 (,) thickStart thickEnd of
Just (start, end) | end > start -> liftM Just $! bedCdsLoc loc start end
_ -> return Nothing
let n = toSeqLabel $ BS.copy name
c = toSeqLabel $ BS.copy chrom
return $! Transcript n n (OnSeq c loc) cdsloc
bedTrxLoc :: (Monad m) => Pos.Offset -> Pos.Offset -> Strand -> [(Pos.Offset, Pos.Offset)] -> m SpLoc.SpliceLoc
bedTrxLoc chromStart chromEnd str = maybe badContigs (return . stranded str) .
SpLoc.fromContigs . map blockContig
where blockContig (bsize, bstart) = Loc.fromPosLen (Pos.Pos (chromStart + bstart) Plus) bsize
badContigs = fail $ "Bio.SeqLoc.Bed: bad blocks in " ++ show (chromStart, chromEnd)
bedCdsLoc :: (Monad m) => SpLoc.SpliceLoc -> Pos.Offset -> Pos.Offset -> m Loc.ContigLoc
bedCdsLoc loc thickStart thickEnd
= maybe badCdsLoc return $ do
relstart <- Loc.posInto (Pos.Pos thickStart Plus) loc
relend <- Loc.posInto (Pos.Pos (thickEnd 1) Plus) loc
return $! stranded (Loc.strand loc) $ Loc.fromStartEnd (Pos.offset relstart) (Pos.offset relend)
where badCdsLoc = fail $ "Bio.SeqLoc.Bed: bad cds in " ++
(BS.unpack . BS.unwords $ [ repr loc, repr thickStart, repr thickEnd ])
commas :: Int -> ZP.Parser a -> ZP.Parser [a]
commas n p | n < 1 = return []
| otherwise = ZP.string "\t" *>
( (:) <$> p <*>
replicateM (n 1) (ZP.string "," *> p) )
<* (ZP.string "," <|> return ())