module Bio.GFF3.Feature ( GFFAttr(..) , Feature(..), length , parse, unparse , parseWithFasta , attrByTag , ids, parentIds , contigLoc, loc, seqLoc , name ) where import Prelude hiding (length) import Control.Monad.Error import qualified Data.ByteString.Lazy.Char8 as LBS import Data.List hiding (length) import qualified Data.List as List (length) import Data.Maybe import Bio.GFF3.Escape import qualified Bio.Location.ContigLocation as CLoc import qualified Bio.Location.Location as Loc import Bio.Location.OnSeq import qualified Bio.Location.SeqLocation as SeqLoc import Bio.Location.Strand import Bio.Sequence.SeqData data GFFAttr = GFFAttr { attrTag :: !LBS.ByteString, attrValues :: ![LBS.ByteString] } deriving (Eq, Ord, Show) data Feature = Feature { seqid :: !LBS.ByteString , source :: !LBS.ByteString , ftype :: !LBS.ByteString , start, end :: !Offset , score :: !(Maybe Double) , strand :: !(Maybe Strand) , phase :: !(Maybe Offset) , attributes :: ![GFFAttr] } deriving (Eq, Ord, Show) length :: Feature -> Offset length f = 1 + end f - start f dot :: LBS.ByteString dot = LBS.singleton '.' -- for Nothing in optional fields parse :: (Error e, MonadError e m) => LBS.ByteString -> m Feature parse line = case LBS.split '\t' line of [seqidStr,sourceStr,typeStr,startStr,endStr,scoreStr,strandStr,phaseStr,attrStr] -> do fSeqid <- unEscapeByteString seqidStr fSource <- unEscapeByteString sourceStr fFtype <- unEscapeByteString typeStr fStart <- unEscapeByteString startStr >>= parseInteger fEnd <- unEscapeByteString endStr >>= parseInteger fScore <- unEscapeByteString scoreStr >>= parseMaybe parseDouble fStrand <- unEscapeByteString strandStr >>= parseMaybe parseStrand fPhase <- unEscapeByteString phaseStr >>= parseMaybe parseInteger fAttrs <- parseAttrs attrStr return $ Feature fSeqid fSource fFtype fStart fEnd fScore fStrand fPhase fAttrs fs -> throwError $ strMsg $ "Malformed GFF line with " ++ show (List.length fs) ++ " tab-delimited fields" where parseInteger zstr = case LBS.readInteger zstr of Just (z, rest) | LBS.null rest -> return $ fromIntegral z _ -> throwError $ strMsg $ "Malformed integer " ++ show (LBS.unpack zstr) parseMaybe subparse str = if str == dot then return Nothing else liftM Just $ subparse str parseDouble xstr = case reads $ LBS.unpack xstr of [(x, "")] -> return x _ -> throwError $ strMsg $ "Malformed double " ++ show (LBS.unpack xstr) parseStrand sstr | sstr == (LBS.singleton '+') = return Fwd | sstr == (LBS.singleton '-') = return RevCompl | otherwise = throwError $ strMsg $ "Malformed strand " ++ show (LBS.unpack sstr) parseAttrs = mapM parseAttr . LBS.split ';' parseAttr attrStr = case LBS.break (== '=') attrStr of (tagStr,equalValStr) -> case LBS.uncons equalValStr of Just ('=', valStr) -> do tag <- unEscapeByteString tagStr vals <- parseAttrVals valStr return $ GFFAttr tag vals _ -> throwError $ strMsg $ "Malformed attribute " ++ show (LBS.unpack attrStr) parseAttrVals valStr | LBS.null valStr = return [LBS.empty] -- Special case for single null value | otherwise = mapM unEscapeByteString $ LBS.split ',' valStr escapeField :: LBS.ByteString -> LBS.ByteString escapeField = escapeAllOf "\t\r\n;=%&," escapeSeqid :: LBS.ByteString -> LBS.ByteString escapeSeqid = escapeAllBut $ ".:^*$@!+_?-|" ++ ['a'..'z'] ++ ['A'..'Z'] ++ ['0'..'9'] unparse :: Feature -> LBS.ByteString unparse feat = LBS.intercalate (LBS.singleton '\t') [ escapeSeqid $ seqid feat , escapeField $ source feat , escapeField $ ftype feat , escapeField $ unparseInteger $ start feat , escapeField $ unparseInteger $ end feat , escapeField $ unparseMaybe (LBS.pack . show) $ score feat , escapeField $ unparseMaybe unparseStrand $ strand feat , escapeField $ unparseMaybe (LBS.pack . show) $ phase feat , unparseAttributes $ attributes feat ] where unparseMaybe unparseJust = maybe dot unparseJust unparseStrand Fwd = LBS.singleton '+' unparseStrand RevCompl = LBS.singleton '-' unparseInteger = LBS.pack . show unparseAttributes = LBS.intercalate (LBS.singleton ';') . map unparseAttribute unparseAttribute (GFFAttr tag values) = LBS.concat [ escapeField tag, LBS.singleton '=', LBS.intercalate (LBS.singleton ',') $ map escapeField values ] gffFastaDirective :: LBS.ByteString gffFastaDirective = LBS.pack "##FASTA" parseWithFasta :: (Error e, MonadError e m) => LBS.ByteString -> m ([Feature], [LBS.ByteString]) parseWithFasta str = case break (== gffFastaDirective) $ LBS.lines str of (featStrs, rest) -> do gffLines <- mapM parse $ filter notComment featStrs return (gffLines, rest) where notComment = maybe False ((/= '#') . fst) . LBS.uncons attrByTag :: LBS.ByteString -> Feature -> [LBS.ByteString] attrByTag tag = findByTag . attributes where findByTag = maybe [] attrValues . find ((== tag) . attrTag) idTag, parentTag :: LBS.ByteString idTag = LBS.pack "ID" parentTag = LBS.pack "Parent" ids :: Feature -> [LBS.ByteString] ids = attrByTag idTag parentIds :: Feature -> [LBS.ByteString] parentIds = attrByTag parentTag contigLoc :: Feature -> CLoc.ContigLoc contigLoc f = CLoc.ContigLoc (start f - 1) (1 + end f - start f) (fromMaybe Fwd $ strand f) loc :: Feature -> Loc.Loc loc f = Loc.Loc [ contigLoc f ] seqLoc :: Feature -> SeqLoc.SeqLoc seqLoc f = OnSeq (seqid f) (loc f) name :: (Error e, MonadError e m) => Feature -> m SeqName name f = maybe (throwError $ strMsg $ "No ID for feature " ++ show f) return $ listToMaybe $ ids f