{-| Data type for a sequence location consiting of a contiguous range
of positions on the sequence.

Throughout, /sequence position/ refers to a 'Pos.Pos' which includes a
strand.  An index into a sequence is referred to as an /offset/, and
is generally of type 'Offset'.

 -}

module Bio.Location.ContigLocation ( 
  -- * Sequence locations
  ContigLoc(..), fromStartEnd, fromPosLen

  -- * Locations and positions
  , bounds, startPos, endPos, posInto, posOutof, isWithin, overlaps

  -- * Extracting subsequences
  , seqData, seqDataPadded

  -- * Transforming locations
  , slide, extend

  -- * Displaying locations
  , display
                                   ) where

import Prelude hiding (length)

import Control.Monad.Error
import qualified Data.ByteString.Lazy.Char8 as LBS

import Bio.Sequence.SeqData

import qualified Bio.Location.Position as Pos
import Bio.Location.Strand

-- | Contiguous sequence location defined by a span of sequence
-- positions, lying on a specific strand of the sequence.
data ContigLoc = ContigLoc { offset5 :: !Offset   -- ^ The offset of the 5\' end of the location, as a 0-based index
                           , length :: !Offset    -- ^ The length of the location
                           , strand :: !Strand    -- ^ The strand of the location
                           } deriving (Eq, Ord, Show)

instance Stranded ContigLoc where
    revCompl (ContigLoc seq5 len str) = ContigLoc seq5 len $ revCompl str

-- | Create a sequence location lying between 0-based starting and
-- ending offsets.  When @start < end@, the location
-- be on the forward strand, otherwise it will be on the
-- reverse complement strand.

fromStartEnd :: Offset -> Offset -> ContigLoc
fromStartEnd start end
    | start < end = ContigLoc start (1 + end - start) Fwd
    | otherwise   = ContigLoc end   (1 + start - end) RevCompl

-- | Create a sequence location from the sequence position of the
-- start of the location and the length of the position.  The strand
-- of the location, and the direction it extends from the starting
-- position, are determined by the strand of the starting position.
fromPosLen :: Pos.Pos -> Offset -> ContigLoc
fromPosLen (Pos.Pos off5 Fwd)      len = ContigLoc off5               len Fwd
fromPosLen (Pos.Pos off3 RevCompl) len = ContigLoc (off3 - (len - 1)) len RevCompl

-- | The bounds of a sequence location.  This is a pair consisting of
-- the lowest and highest sequence offsets covered by the region.  The
-- bounds ignore the strand of the sequence location, and the first
-- element of the pair will always be lower than the second.
bounds :: ContigLoc -> (Offset, Offset)
bounds (ContigLoc seq5 len _) = (seq5, seq5 + len - 1)

-- | Sequence position of the start of the location.  This is the 5'
-- end on the location strand, which will have a higher offset than
-- 'endPos' if the location is on the 'RevCompl' strand.
startPos :: ContigLoc -> Pos.Pos
startPos (ContigLoc seq5 len str) 
    = case str of
        Fwd      -> Pos.Pos seq5             str
        RevCompl -> Pos.Pos (seq5 + len - 1) str

-- | Sequence position of the end of the location, as described in 'startPos'.
endPos :: ContigLoc -> Pos.Pos
endPos (ContigLoc seq5 len str) 
    = case str of
        Fwd      -> Pos.Pos (seq5 + len - 1) str
        RevCompl -> Pos.Pos seq5             str

-- | Returns a location resulting from sliding the original location
-- along the sequence by a specified offset.  A positive offset will
-- move the location away from the 5\' end of the forward stand of the
-- sequence regardless of the strand of the location itself.  Thus,
-- 
-- > slide (revCompl cloc) off == revCompl (slide cloc off)
slide :: Offset -> ContigLoc -> ContigLoc
slide dpos (ContigLoc seq5 len str) = ContigLoc (seq5 + dpos) len str

-- | Extract the nucleotide 'SeqData' for the sequence location.  If
-- any part of the location lies outside the bounds of the sequence,
-- an error results.
seqData :: (Error e, MonadError e m) => SeqData -> ContigLoc -> m SeqData
seqData sequ (ContigLoc seq5 len str)
    | seq5 < 0 = outOfBounds
    | otherwise = case LBS.take len $ LBS.drop seq5 sequ of
                    fwdseq | LBS.length fwdseq == len -> return $ stranded str fwdseq
                           | otherwise -> outOfBounds
    where outOfBounds = throwError $ strMsg $ "contig seq loc " ++ show (seq5, seq5 + len - 1) ++ " out of SeqData bounds"

-- | As 'seqData', extract the nucleotide subsequence for the
-- location.  Any positions in the location lying outside the bounds
-- of the sequence are returned as @N@ rather than producing an error.
seqDataPadded :: SeqData -> ContigLoc -> SeqData
seqDataPadded sequ (ContigLoc seq5 len str) = stranded str fwdseq
    where fwdseq
              | seq5 + len <= 0         = LBS.replicate len 'N'
              | seq5 >= LBS.length sequ = LBS.replicate len 'N'
              | seq5 < 0  = LBS.replicate (negate seq5) 'N' `LBS.append` takePadded (len + seq5) sequ
              | otherwise = takePadded len $ LBS.drop seq5 sequ
          takePadded sublen subsequ
              | sublen <= LBS.length subsequ = LBS.take sublen subsequ
              | otherwise = subsequ `LBS.append` LBS.replicate (sublen - LBS.length subsequ) 'N'

-- | Given a sequence position and a sequence location relative to the
-- same sequence, compute a new position representing the original
-- position relative to the subsequence defined by the location.  If
-- the sequence position lies outside of the sequence location,
-- @Nothing@ is returned; thus, the offset of the new position will
-- always be in the range @[0, length cloc - 1]@.
posInto :: Pos.Pos -> ContigLoc -> Maybe Pos.Pos
posInto (Pos.Pos pos pStrand) (ContigLoc seq5 len cStrand)
    | pos < seq5 || pos >= seq5 + len = Nothing
    | otherwise = Just $ case cStrand of
                           Fwd      -> Pos.Pos (pos - seq5)              pStrand
                           RevCompl -> Pos.Pos (seq5 + len  - (pos + 1)) (revCompl pStrand)


-- | Given a sequence location and a sequence position within that
-- location, compute a new position representing the original position
-- relative to the outer sequence.  If the sequence position lies
-- outside the location, @Nothing@ is returned.
-- 
-- This function inverts 'posInto' when the sequence position lies
-- within the position is actually within the location.
posOutof :: Pos.Pos -> ContigLoc -> Maybe Pos.Pos
posOutof (Pos.Pos pos pStrand) (ContigLoc seq5 len cStrand)
    | pos < 0 || pos >= len = Nothing
    | otherwise = Just $ case cStrand of
                           Fwd      -> Pos.Pos (pos + seq5)             pStrand
                           RevCompl -> Pos.Pos (seq5 + len - (pos + 1)) (revCompl pStrand)

-- | Returns a sequence location produced by extending the original
-- location on each end, based on a pair of (/5\' extension/, /3\'
-- extension/).  The 5\' extension is applied to the 5\' end of the
-- location on the location strand; if the location is on the
-- 'RevCompl' strand, the 5\' end will have a higher offset than the
-- 3\' end and this offset will increase by the amount of the 5\'
-- extension.  Similarly, the 3\' extension is applied to the 3\' end
-- of the location.
extend :: (Offset, Offset) -> ContigLoc -> ContigLoc
extend (ext5, ext3) (ContigLoc seq5 len str)
    = case str of
        Fwd -> ContigLoc (seq5 - ext5) (len + ext5 + ext3) str
        RevCompl -> ContigLoc (seq5 - ext3) (len + ext5 + ext3) str

-- | Returns @True@ when  a sequence position lies within a sequence
-- location on the same sequence, and occupies the same strand.
isWithin :: Pos.Pos -> ContigLoc -> Bool
isWithin (Pos.Pos pos pStrand) (ContigLoc seq5 len cStrand)
    = (pos >= seq5) && (pos < seq5 + len) && (cStrand == pStrand)

-- | Returns @True@ when two sequence locations overlap at any
-- position.
overlaps :: ContigLoc -> ContigLoc -> Bool
overlaps contig1 contig2
    = case (bounds contig1, bounds contig2) of
        ((low1, high1),(low2, high2)) -> (strand contig1 == strand contig2)
                                         && (low1 <= high2) && (low2 <= high1)

-- | Display a human-friendly, zero-based representation of a sequence location.
display :: ContigLoc -> String
display cloc = show (Pos.offset $ startPos cloc) ++ "to" ++ show (Pos.offset $ endPos cloc)