module Bio.GFF3.Feature ( GFFAttr(..)
                        , Feature(..), length
                        , parse, unparse
                        , parseWithFasta
                        , attrByTag
                        , ids, parentIds
                        , contigLoc, loc, seqLoc
                        , name
                        )
    where

import Prelude hiding (length)
import Control.Monad.Error
import qualified Data.ByteString.Lazy.Char8 as LBS
import Data.List hiding (length)
import qualified Data.List as List (length)
import Data.Maybe

import Bio.GFF3.Escape

import qualified Bio.Location.ContigLocation as CLoc
import qualified Bio.Location.Location as Loc
import Bio.Location.OnSeq
import qualified Bio.Location.SeqLocation as SeqLoc
import Bio.Location.Strand
import Bio.Sequence.SeqData

data GFFAttr = GFFAttr { attrTag :: !LBS.ByteString, attrValues :: ![LBS.ByteString] } deriving (Eq, Ord, Show)

data Feature = Feature { seqid :: !LBS.ByteString
                       , source :: !LBS.ByteString
                       , ftype :: !LBS.ByteString
                       , start, end :: !Offset
                       , score :: !(Maybe Double)
                       , strand :: !(Maybe Strand)
                       , phase :: !(Maybe Offset)
                       , attributes :: ![GFFAttr]
                       } deriving (Eq, Ord, Show)

length :: Feature -> Offset
length f = 1 + end f - start f

dot :: LBS.ByteString
dot = LBS.singleton '.' -- for Nothing in optional fields

parse :: (Error e, MonadError e m) => LBS.ByteString -> m Feature
parse line = case LBS.split '\t' line of
               [seqidStr,sourceStr,typeStr,startStr,endStr,scoreStr,strandStr,phaseStr,attrStr]
                   -> do fSeqid <- unEscapeByteString seqidStr
                         fSource <- unEscapeByteString sourceStr
                         fFtype <- unEscapeByteString typeStr
                         fStart <- unEscapeByteString startStr >>= parseInteger
                         fEnd <- unEscapeByteString endStr >>= parseInteger
                         fScore <- unEscapeByteString scoreStr >>= parseMaybe parseDouble
                         fStrand <- unEscapeByteString strandStr >>= parseMaybe parseStrand
                         fPhase <- unEscapeByteString phaseStr >>= parseMaybe parseInteger
                         fAttrs <- parseAttrs attrStr
                         return $ Feature fSeqid fSource fFtype fStart fEnd fScore fStrand fPhase fAttrs
               fs -> throwError $ strMsg $ "Malformed GFF line with " ++ show (List.length fs) ++ " tab-delimited fields"
    where parseInteger zstr = case LBS.readInteger zstr of
                                Just (z, rest) | LBS.null rest -> return $ fromIntegral z
                                _ ->  throwError $ strMsg $ "Malformed integer " ++ show (LBS.unpack zstr)
          parseMaybe subparse str = if str == dot then return Nothing else liftM Just $ subparse str
          parseDouble xstr = case reads $ LBS.unpack xstr of
                               [(x, "")] -> return x
                               _ -> throwError $ strMsg $ "Malformed double " ++ show (LBS.unpack xstr)
          parseStrand sstr | sstr == (LBS.singleton '+') = return Fwd
                           | sstr == (LBS.singleton '-') = return RevCompl
                           | otherwise = throwError $ strMsg $ "Malformed strand " ++ show (LBS.unpack sstr)
          parseAttrs = mapM parseAttr . LBS.split ';'
          parseAttr attrStr = case LBS.break (== '=') attrStr of
                                (tagStr,equalValStr) 
                                    -> case LBS.uncons equalValStr of
                                         Just ('=', valStr) -> do tag <- unEscapeByteString tagStr
                                                                  vals <- parseAttrVals valStr
                                                                  return $ GFFAttr tag vals
                                         _ -> throwError $ strMsg $ "Malformed attribute " ++ show (LBS.unpack attrStr)
          parseAttrVals valStr | LBS.null valStr = return [LBS.empty] -- Special case for single null value
                               | otherwise = mapM unEscapeByteString $ LBS.split ',' valStr

escapeField :: LBS.ByteString -> LBS.ByteString
escapeField = escapeAllOf "\t\r\n;=%&,"

escapeSeqid :: LBS.ByteString -> LBS.ByteString
escapeSeqid = escapeAllBut $ ".:^*$@!+_?-|" ++ ['a'..'z'] ++ ['A'..'Z'] ++ ['0'..'9']

unparse :: Feature -> LBS.ByteString
unparse feat = LBS.intercalate (LBS.singleton '\t') [ escapeSeqid $ seqid feat
                                                    , escapeField $ source feat
                                                    , escapeField $ ftype feat
                                                    , escapeField $ unparseInteger $ start feat
                                                    , escapeField $ unparseInteger $ end feat
                                                    , escapeField $ unparseMaybe (LBS.pack . show) $ score feat
                                                    , escapeField $ unparseMaybe unparseStrand $ strand feat
                                                    , escapeField $ unparseMaybe (LBS.pack . show) $ phase feat
                                                    , unparseAttributes $ attributes feat
                                                    ]
    where unparseMaybe unparseJust = maybe dot unparseJust
          unparseStrand Fwd = LBS.singleton '+'
          unparseStrand RevCompl = LBS.singleton '-'
          unparseInteger = LBS.pack . show
          unparseAttributes = LBS.intercalate (LBS.singleton ';') . map unparseAttribute
          unparseAttribute (GFFAttr tag values) = LBS.concat [ escapeField tag, LBS.singleton '=', 
                                                               LBS.intercalate (LBS.singleton ',') $ map escapeField values ]

gffFastaDirective :: LBS.ByteString
gffFastaDirective = LBS.pack "##FASTA"

parseWithFasta :: (Error e, MonadError e m) => LBS.ByteString -> m ([Feature], [LBS.ByteString])
parseWithFasta str = case break (== gffFastaDirective) $ LBS.lines str of
                       (featStrs, rest) -> do gffLines <- mapM parse $ filter notComment featStrs
                                              return (gffLines, rest)
    where notComment = maybe False ((/= '#') . fst) . LBS.uncons 

attrByTag :: LBS.ByteString -> Feature -> [LBS.ByteString]
attrByTag tag = findByTag . attributes
    where findByTag = maybe [] attrValues . find ((== tag) . attrTag)

idTag, parentTag :: LBS.ByteString
idTag = LBS.pack "ID"
parentTag = LBS.pack "Parent"

ids :: Feature -> [LBS.ByteString]
ids = attrByTag idTag

parentIds :: Feature -> [LBS.ByteString]
parentIds = attrByTag parentTag

contigLoc :: Feature -> CLoc.ContigLoc
contigLoc f = CLoc.ContigLoc (start f - 1) (1 + end f - start f) (fromMaybe Fwd $ strand f)

loc :: Feature -> Loc.Loc
loc f = Loc.Loc [ contigLoc f ]

seqLoc :: Feature -> SeqLoc.SeqLoc
seqLoc f = OnSeq (seqid f) (loc f)

name :: (Error e, MonadError e m) => Feature -> m SeqName
name f = maybe (throwError $ strMsg $ "No ID for feature " ++ show f) return $ listToMaybe $ ids f