{-| Data type for a more general sequence location consiting of
potentially disjoint ranges of positions on the sequence.

Throughout, /sequence position/ refers to a 'Pos.Pos' which includes a
strand.  An index into a sequence is referred to as an /offset/, and
is generally of type 'Offset'.


module Bio.Location.Location ( 
  -- * Sequence locations

  -- * Locations and positions
  , bounds, length, startPos, endPos, posInto, posOutof, isWithin, overlaps

  -- * Extracting subsequences
  , seqData, seqDataPadded

  -- * Transforming locations
  , extend

  -- * Displaying locations
  , display
  ) where 

import Prelude hiding (length)

import Control.Arrow ((***))
import Control.Monad.Error
import qualified Data.ByteString.Lazy.Char8 as LBS
import Data.List (intercalate)

import qualified Bio.Location.ContigLocation as CLoc
import qualified Bio.Location.Position as Pos
import Bio.Location.Strand
import Bio.Sequence.SeqData

-- | General (disjoint) sequence region consisting of a concatenated
--   set of contiguous regions (see 'CLoc.ContigLoc').
newtype Loc = Loc [CLoc.ContigLoc] deriving (Eq, Ord, Show)

instance Stranded Loc where
    revCompl (Loc contigs) = Loc $ reverse $ map revCompl contigs

-- | Returns the length of the region
length :: Loc -> Offset
length (Loc contigs) = sum $ map CLoc.length contigs

-- | The bounds of a sequence location.  This is a pair consisting of
-- the lowest and highest sequence offsets covered by the region.  The
-- bounds ignore the strand of the sequence location, and the first
-- element of the pair will always be lower than the second.  Even if
-- the positions in the location do not run monotonically through the
-- location, the overall lowest and highest sequence offsets are returned.
bounds :: Loc -> (Offset, Offset)
bounds (Loc []) = error "locBounds on zero-contig Loc"
bounds (Loc contigs) = (minimum *** maximum) $ unzip $ map CLoc.bounds contigs

-- | Sequence position of the start of the location.  This is the 5'
-- end on the location strand, which will have a higher offset than
-- 'endPos' if the location is on the 'RevCompl' strand.
startPos :: Loc -> Pos.Pos
startPos (Loc [])      = error "startPos: zero-contig Loc"
startPos (Loc contigs) = CLoc.startPos $ head contigs

-- | Sequence position of the end of the location, as described in 'startPos'.
endPos :: Loc -> Pos.Pos
endPos (Loc [])      = error "endPos: zero-contig Loc"
endPos (Loc contigs) = CLoc.endPos $ last contigs

-- | Extract the nucleotide 'SeqData' for the sequence location.  If
-- any part of the location lies outside the bounds of the sequence,
-- an error results.
seqData :: (Error e, MonadError e m) => SeqData -> Loc -> m SeqData
seqData sequ (Loc contigs)
    = liftM LBS.concat $ mapM (CLoc.seqData sequ) contigs

-- | As 'seqData', extract the nucleotide subsequence for the
-- location.  Any positions in the location lying outside the bounds
-- of the sequence are returned as @N@ rather than producing an error.
seqDataPadded :: SeqData -> Loc -> SeqData
seqDataPadded sequ (Loc contigs)
    = LBS.concat $ map (CLoc.seqDataPadded sequ) contigs

-- | Given a sequence position and a sequence location relative to the
-- same sequence, compute a new position representing the original
-- position relative to the subsequence defined by the location.  If
-- the sequence position lies outside of the sequence location,
-- @Nothing@ is returned; thus, the offset of the new position will
-- always be in the range @[0, length cloc - 1]@.
-- When the sequence positions in the location are not monotonic,
-- there may be multiple possible posInto solutions.  That is, if the
-- same outer sequence position is covered by two different contiguous
-- blocks of the location, then it would have two possible sequence
-- positions relative to the location. In this case, the position
-- 5\'-most in the location orientation is returned.
posInto :: Pos.Pos -> Loc -> Maybe Pos.Pos
posInto seqpos (Loc contigs) = posIntoContigs seqpos contigs

posIntoContigs :: Pos.Pos -> [CLoc.ContigLoc] -> Maybe Pos.Pos
posIntoContigs _ [] = Nothing
posIntoContigs seqpos (contig@(CLoc.ContigLoc _ len _):rest)
    = case CLoc.posInto seqpos contig of
        just@(Just _) -> just
        Nothing -> liftM (flip Pos.slide len) $ posIntoContigs seqpos rest

-- | Given a sequence location and a sequence position within that
-- location, compute a new position representing the original position
-- relative to the outer sequence.  If the sequence position lies
-- outside the location, @Nothing@ is returned.  
-- This function inverts 'posInto' when the sequence position lies
-- within the position is actually within the location.  Due to the
-- possibility of redundant location-relative positions for a given
-- absolute position, 'posInto' does not necessary invert 'posOutof'
posOutof :: Pos.Pos -> Loc -> Maybe Pos.Pos
posOutof pos (Loc contigs) = posOutofContigs pos contigs

posOutofContigs :: Pos.Pos -> [CLoc.ContigLoc] -> Maybe Pos.Pos
posOutofContigs _ [] = Nothing
posOutofContigs seqpos (contig@(CLoc.ContigLoc _ len _):rest)
    = case CLoc.posOutof seqpos contig of
        just@(Just _) -> just
        Nothing -> posOutofContigs (Pos.slide seqpos $ negate len) rest

-- | Returns a sequence location produced by extending the original
-- location on each end, based on a pair of (/5\' extension/, /3\'
-- extension/).  These add contiguous positions to the 5\' and 3\'
-- ends of the original location.  The 5\' extension is applied to the
-- 5\' end of the location on the location strand; if the location is
-- on the 'RevCompl' strand, the 5\' end will have a higher offset
-- than the 3\' end and this offset will increase by the amount of the
-- 5\' extension.  Similarly, the 3\' extension is applied to the 3\'
-- end of the location.
extend :: (Offset, Offset) -- ^ (5' extension, 3' extension)
       -> Loc -> Loc
extend _ (Loc []) = error "extendLoc on zero-contig Loc"
extend (ext5, ext3) (Loc contigs) = Loc $ case extendContigs3 contigs of
                                            [] -> error "Empty contigs after extendContigs3"
                                            (cfirst:crest) -> (CLoc.extend (ext5, 0) cfirst):crest
    where extendContigs3 [] = error "Empty contigs in extendContigs3"
          extendContigs3 [clast] = [CLoc.extend (0, ext3) clast]
          extendContigs3 (contig:crest) = contig : extendContigs3 crest

-- | Returns @True@ when  a sequence position lies within a sequence
-- location on the same sequence, and occupies the same strand.
isWithin :: Pos.Pos -> Loc -> Bool
isWithin seqpos (Loc contigs) = or $ map (CLoc.isWithin seqpos) contigs

overlappingContigs :: Loc -> Loc -> [(CLoc.ContigLoc, CLoc.ContigLoc)]
overlappingContigs (Loc contigs1) (Loc contigs2) 
    = filter (uncurry CLoc.overlaps) [(c1, c2) | c1 <- contigs1, c2 <- contigs2 ]

-- | Returns @True@ when two sequence locations overlap at any
-- position.
overlaps :: Loc -> Loc -> Bool
overlaps l1 l2 = not $ null $ overlappingContigs l1 l2

-- | Display a human-friendly, zero-based representation of a sequence location.
display :: Loc -> String
display (Loc contigs) = intercalate ";" $ map CLoc.display contigs