module Bio.GFF3.Feature ( GFFAttr(..)
, Feature(..), length
, parse, unparse
, parseWithFasta
, attrByTag
, ids, parentIds
, contigLoc, loc, seqLoc
, name
)
where
import Prelude hiding (length)
import Control.Monad.Error
import qualified Data.ByteString.Lazy.Char8 as LBS
import Data.List hiding (length)
import qualified Data.List as List (length)
import Data.Maybe
import Bio.GFF3.Escape
import qualified Bio.Location.ContigLocation as CLoc
import qualified Bio.Location.Location as Loc
import Bio.Location.OnSeq
import qualified Bio.Location.SeqLocation as SeqLoc
import Bio.Location.Strand
import Bio.Sequence.SeqData
data GFFAttr = GFFAttr { attrTag :: !LBS.ByteString, attrValues :: ![LBS.ByteString] } deriving (Eq, Ord, Show)
data Feature = Feature { seqid :: !LBS.ByteString
, source :: !LBS.ByteString
, ftype :: !LBS.ByteString
, start, end :: !Offset
, score :: !(Maybe Double)
, strand :: !(Maybe Strand)
, phase :: !(Maybe Offset)
, attributes :: ![GFFAttr]
} deriving (Eq, Ord, Show)
length :: Feature -> Offset
length f = 1 + end f start f
dot :: LBS.ByteString
dot = LBS.singleton '.'
parse :: (Error e, MonadError e m) => LBS.ByteString -> m Feature
parse line = case LBS.split '\t' line of
[seqidStr,sourceStr,typeStr,startStr,endStr,scoreStr,strandStr,phaseStr,attrStr]
-> do fSeqid <- unEscapeByteString seqidStr
fSource <- unEscapeByteString sourceStr
fFtype <- unEscapeByteString typeStr
fStart <- unEscapeByteString startStr >>= parseInteger
fEnd <- unEscapeByteString endStr >>= parseInteger
fScore <- unEscapeByteString scoreStr >>= parseMaybe parseDouble
fStrand <- unEscapeByteString strandStr >>= parseMaybe parseStrand
fPhase <- unEscapeByteString phaseStr >>= parseMaybe parseInteger
fAttrs <- parseAttrs attrStr
return $ Feature fSeqid fSource fFtype fStart fEnd fScore fStrand fPhase fAttrs
fs -> throwError $ strMsg $ "Malformed GFF line with " ++ show (List.length fs) ++ " tab-delimited fields"
where parseInteger zstr = case LBS.readInteger zstr of
Just (z, rest) | LBS.null rest -> return $ fromIntegral z
_ -> throwError $ strMsg $ "Malformed integer " ++ show (LBS.unpack zstr)
parseMaybe subparse str = if str == dot then return Nothing else liftM Just $ subparse str
parseDouble xstr = case reads $ LBS.unpack xstr of
[(x, "")] -> return x
_ -> throwError $ strMsg $ "Malformed double " ++ show (LBS.unpack xstr)
parseStrand sstr | sstr == (LBS.singleton '+') = return Fwd
| sstr == (LBS.singleton '-') = return RevCompl
| otherwise = throwError $ strMsg $ "Malformed strand " ++ show (LBS.unpack sstr)
parseAttrs = mapM parseAttr . LBS.split ';'
parseAttr attrStr = case LBS.break (== '=') attrStr of
(tagStr,equalValStr)
-> case LBS.uncons equalValStr of
Just ('=', valStr) -> do tag <- unEscapeByteString tagStr
vals <- parseAttrVals valStr
return $ GFFAttr tag vals
_ -> throwError $ strMsg $ "Malformed attribute " ++ show (LBS.unpack attrStr)
parseAttrVals valStr | LBS.null valStr = return [LBS.empty]
| otherwise = mapM unEscapeByteString $ LBS.split ',' valStr
escapeField :: LBS.ByteString -> LBS.ByteString
escapeField = escapeAllOf "\t\r\n;=%&,"
escapeSeqid :: LBS.ByteString -> LBS.ByteString
escapeSeqid = escapeAllBut $ ".:^*$@!+_?-|" ++ ['a'..'z'] ++ ['A'..'Z'] ++ ['0'..'9']
unparse :: Feature -> LBS.ByteString
unparse feat = LBS.intercalate (LBS.singleton '\t') [ escapeSeqid $ seqid feat
, escapeField $ source feat
, escapeField $ ftype feat
, escapeField $ unparseInteger $ start feat
, escapeField $ unparseInteger $ end feat
, escapeField $ unparseMaybe (LBS.pack . show) $ score feat
, escapeField $ unparseMaybe unparseStrand $ strand feat
, escapeField $ unparseMaybe (LBS.pack . show) $ phase feat
, unparseAttributes $ attributes feat
]
where unparseMaybe unparseJust = maybe dot unparseJust
unparseStrand Fwd = LBS.singleton '+'
unparseStrand RevCompl = LBS.singleton '-'
unparseInteger = LBS.pack . show
unparseAttributes = LBS.intercalate (LBS.singleton ';') . map unparseAttribute
unparseAttribute (GFFAttr tag values) = LBS.concat [ escapeField tag, LBS.singleton '=',
LBS.intercalate (LBS.singleton ',') $ map escapeField values ]
gffFastaDirective :: LBS.ByteString
gffFastaDirective = LBS.pack "##FASTA"
parseWithFasta :: (Error e, MonadError e m) => LBS.ByteString -> m ([Feature], [LBS.ByteString])
parseWithFasta str = case break (== gffFastaDirective) $ LBS.lines str of
(featStrs, rest) -> do gffLines <- mapM parse $ filter notComment featStrs
return (gffLines, rest)
where notComment = maybe False ((/= '#') . fst) . LBS.uncons
attrByTag :: LBS.ByteString -> Feature -> [LBS.ByteString]
attrByTag tag = findByTag . attributes
where findByTag = maybe [] attrValues . find ((== tag) . attrTag)
idTag, parentTag :: LBS.ByteString
idTag = LBS.pack "ID"
parentTag = LBS.pack "Parent"
ids :: Feature -> [LBS.ByteString]
ids = attrByTag idTag
parentIds :: Feature -> [LBS.ByteString]
parentIds = attrByTag parentTag
contigLoc :: Feature -> CLoc.ContigLoc
contigLoc f = CLoc.ContigLoc (start f 1) (1 + end f start f) (fromMaybe Fwd $ strand f)
loc :: Feature -> Loc.Loc
loc f = Loc.Loc [ contigLoc f ]
seqLoc :: Feature -> SeqLoc.SeqLoc
seqLoc f = OnSeq (seqid f) (loc f)
name :: (Error e, MonadError e m) => Feature -> m SeqName
name f = maybe (throwError $ strMsg $ "No ID for feature " ++ show f) return $ listToMaybe $ ids f