-- |Data types for working with locations in a sequence.  Zero-based
--  'Int64' indices are used throughout, to facilitate direct use of
--  indexing functions on 'SeqData'.

module Bio.Location.ContigLocation ( ContigLoc(..), fromStartEnd, fromPosLen
                                   , bounds, startPos, endPos
                                   , slide, extend, posInto, posOutof
                                   , seqData, seqDataPadded, isWithin, overlaps
                                   , display
                                   )
    where 

import Prelude hiding (length)

import Control.Monad.Error
import qualified Data.ByteString.Lazy.Char8 as LBS

import Bio.Sequence.SeqData

import qualified Bio.Location.Position as Pos
import Bio.Location.Strand

-- | Contiguous set of positions in a sequence
data ContigLoc = ContigLoc { offset5 :: !Offset   -- ^ 5' end of region on target sequence, 0-based index
                           , length :: !Offset -- ^ length of region on target sequence
                           , strand :: !Strand -- ^ strand of region
                           } deriving (Eq, Ord, Show)

instance Stranded ContigLoc where
    revCompl (ContigLoc seq5 len str) = ContigLoc seq5 len $ revCompl str

-- | Create a 'ContigLoc' from 0-based starting and ending positions.
--   When 'start' is less than 'end' the position will be on the 'Fwd'
--   'Strand', otherwise it will be on the 'RevCompl' strand.
fromStartEnd :: Offset -> Offset -> ContigLoc
fromStartEnd start end
    | start < end = ContigLoc start (1 + end - start) Fwd
    | otherwise   = ContigLoc end   (1 + start - end) RevCompl

-- | Create a 'ContigLoc' from a Pos.Pos defining the start
-- ('ContigLoc' 5 prime end) position on the sequence and the length.
fromPosLen :: Pos.Pos -> Offset -> ContigLoc
fromPosLen (Pos.Pos off5 Fwd)      len = ContigLoc off5               len Fwd
fromPosLen (Pos.Pos off3 RevCompl) len = ContigLoc (off3 - (len - 1)) len RevCompl

-- | The bounds of a 'ContigLoc', a pair of the lowest and highest
--   sequence indices covered by the region, which ignores the strand
--   of the 'ContigLoc'.  The first element of the pair will always be
--   lower than the second.
bounds :: ContigLoc -> (Offset, Offset)
bounds (ContigLoc seq5 len _) = (seq5, seq5 + len - 1)

-- | 0-based starting (5' in the region orientation) position
startPos :: ContigLoc -> Pos.Pos
startPos (ContigLoc seq5 len str) 
    = case str of
        Fwd      -> Pos.Pos seq5             str
        RevCompl -> Pos.Pos (seq5 + len - 1) str

-- | 0-based ending (3' in the region orientation) position
endPos :: ContigLoc -> Pos.Pos
endPos (ContigLoc seq5 len str) 
    = case str of
        Fwd      -> Pos.Pos (seq5 + len - 1) str
        RevCompl -> Pos.Pos seq5             str

-- | Move a 'ContigLoc' region by a specified offset
slide :: Offset -> ContigLoc -> ContigLoc
slide dpos (ContigLoc seq5 len str) = ContigLoc (seq5 + dpos) len str

-- | Subsequence 'SeqData' for a 'ContigLoc', provided that the region
--   is entirely within the sequence.
seqData :: (Error e, MonadError e m) => SeqData -> ContigLoc -> m SeqData
seqData sequ (ContigLoc seq5 len str)
    | seq5 < 0 = outOfBounds
    | otherwise = case LBS.take len $ LBS.drop seq5 sequ of
                    fwdseq | LBS.length fwdseq == len -> return $ stranded str fwdseq
                           | otherwise -> outOfBounds
    where outOfBounds = throwError $ strMsg $ "contig seq loc " ++ show (seq5, seq5 + len - 1) ++ " out of SeqData bounds"

-- | Subsequence 'SeqData' for a 'ContigLoc', padded as needed with Ns
seqDataPadded :: SeqData -> ContigLoc -> SeqData
seqDataPadded sequ (ContigLoc seq5 len str) = stranded str fwdseq
    where fwdseq
              | seq5 + len <= 0         = LBS.replicate len 'N'
              | seq5 >= LBS.length sequ = LBS.replicate len 'N'
              | seq5 < 0  = LBS.replicate (negate seq5) 'N' `LBS.append` takePadded (len + seq5) sequ
              | otherwise = takePadded len $ LBS.drop seq5 sequ
          takePadded sublen subsequ
              | sublen <= LBS.length subsequ = LBS.take sublen subsequ
              | otherwise = subsequ `LBS.append` LBS.replicate (sublen - LBS.length subsequ) 'N'

-- | For a 'Pos' and a 'ContigLoc' on the same sequence, find the
--   corresponding 'Pos' relative to the 'ContigLoc', provided it is
--   within the 'ContigLoc'.
posInto :: Pos.Pos -> ContigLoc -> Maybe Pos.Pos
posInto (Pos.Pos pos pStrand) (ContigLoc seq5 len cStrand)
    | pos < seq5 || pos >= seq5 + len = Nothing
    | otherwise = Just $ case cStrand of
                           Fwd      -> Pos.Pos (pos - seq5)              pStrand
                           RevCompl -> Pos.Pos (seq5 + len  - (pos + 1)) (revCompl pStrand)

-- | For a 'Pos' specified relative to a 'ContigLoc', find the
--   corresponding 'Pos' relative to the outer sequence, provided that
--   the 'Pos' is within the bounds of the 'ContigLoc'.
posOutof :: Pos.Pos -> ContigLoc -> Maybe Pos.Pos
posOutof (Pos.Pos pos pStrand) (ContigLoc seq5 len cStrand)
    | pos < 0 || pos >= len = Nothing
    | otherwise = Just $ case cStrand of
                           Fwd      -> Pos.Pos (pos + seq5)             pStrand
                           RevCompl -> Pos.Pos (seq5 + len - (pos + 1)) (revCompl pStrand)

-- | 'ContigLoc' extended on the 5' and 3' ends.
extend :: (Offset, Offset) -> ContigLoc -> ContigLoc
extend (ext5, ext3) (ContigLoc seq5 len str)
    = case str of
        Fwd -> ContigLoc (seq5 - ext5) (len + ext5 + ext3) str
        RevCompl -> ContigLoc (seq5 - ext3) (len + ext5 + ext3) str

-- | For a 'Pos' and a 'ContigLoc' on the same sequence, is the 'Pos'
--   within the 'ContigLoc'.
isWithin :: Pos.Pos -> ContigLoc -> Bool
isWithin (Pos.Pos pos pStrand) (ContigLoc seq5 len cStrand)
    = (pos >= seq5) && (pos < seq5 + len) && (cStrand == pStrand)

-- | For a pair of 'ContigLoc' regions on the same sequence, indicates
--   if they overlap at all.
overlaps :: ContigLoc -> ContigLoc -> Bool
overlaps contig1 contig2
    = case (bounds contig1, bounds contig2) of
        ((low1, high1),(low2, high2)) -> (strand contig1 == strand contig2)
                                         && (low1 <= high2) && (low2 <= high1)

display :: ContigLoc -> String
display cloc = show (Pos.offset $ startPos cloc) ++ "to" ++ show (Pos.offset $ endPos cloc)