{-| Data type for a more general sequence location consiting of potentially disjoint ranges of positions on the sequence. Throughout, /sequence position/ refers to a 'Pos.Pos' which includes a strand. An index into a sequence is referred to as an /offset/, and is generally of type 'Offset'. -} module Bio.Location.Location ( -- * Sequence locations Loc(..) -- * Locations and positions , bounds, length, startPos, endPos, posInto, posOutof, isWithin, overlaps -- * Extracting subsequences , seqData, seqDataPadded -- * Transforming locations , extend -- * Displaying locations , display ) where import Prelude hiding (length) import Control.Arrow ((***)) import Control.Monad.Error import qualified Data.ByteString.Lazy.Char8 as LBS import Data.List (intercalate) import qualified Bio.Location.ContigLocation as CLoc import qualified Bio.Location.Position as Pos import Bio.Location.Strand import Bio.Sequence.SeqData -- | General (disjoint) sequence region consisting of a concatenated -- set of contiguous regions (see 'CLoc.ContigLoc'). newtype Loc = Loc [CLoc.ContigLoc] deriving (Eq, Ord, Show) instance Stranded Loc where revCompl (Loc contigs) = Loc $ reverse $ map revCompl contigs -- | Returns the length of the region length :: Loc -> Offset length (Loc contigs) = sum $ map CLoc.length contigs -- | The bounds of a sequence location. This is a pair consisting of -- the lowest and highest sequence offsets covered by the region. The -- bounds ignore the strand of the sequence location, and the first -- element of the pair will always be lower than the second. Even if -- the positions in the location do not run monotonically through the -- location, the overall lowest and highest sequence offsets are returned. bounds :: Loc -> (Offset, Offset) bounds (Loc []) = error "locBounds on zero-contig Loc" bounds (Loc contigs) = (minimum *** maximum) $ unzip $ map CLoc.bounds contigs -- | Sequence position of the start of the location. This is the 5' -- end on the location strand, which will have a higher offset than -- 'endPos' if the location is on the 'RevCompl' strand. startPos :: Loc -> Pos.Pos startPos (Loc []) = error "startPos: zero-contig Loc" startPos (Loc contigs) = CLoc.startPos $ head contigs -- | Sequence position of the end of the location, as described in 'startPos'. endPos :: Loc -> Pos.Pos endPos (Loc []) = error "endPos: zero-contig Loc" endPos (Loc contigs) = CLoc.endPos $ last contigs -- | Extract the nucleotide 'SeqData' for the sequence location. If -- any part of the location lies outside the bounds of the sequence, -- an error results. seqData :: (Error e, MonadError e m) => SeqData -> Loc -> m SeqData seqData sequ (Loc contigs) = liftM LBS.concat $ mapM (CLoc.seqData sequ) contigs -- | As 'seqData', extract the nucleotide subsequence for the -- location. Any positions in the location lying outside the bounds -- of the sequence are returned as @N@ rather than producing an error. seqDataPadded :: SeqData -> Loc -> SeqData seqDataPadded sequ (Loc contigs) = LBS.concat $ map (CLoc.seqDataPadded sequ) contigs -- | Given a sequence position and a sequence location relative to the -- same sequence, compute a new position representing the original -- position relative to the subsequence defined by the location. If -- the sequence position lies outside of the sequence location, -- @Nothing@ is returned; thus, the offset of the new position will -- always be in the range @[0, length cloc - 1]@. -- -- When the sequence positions in the location are not monotonic, -- there may be multiple possible posInto solutions. That is, if the -- same outer sequence position is covered by two different contiguous -- blocks of the location, then it would have two possible sequence -- positions relative to the location. In this case, the position -- 5\'-most in the location orientation is returned. posInto :: Pos.Pos -> Loc -> Maybe Pos.Pos posInto seqpos (Loc contigs) = posIntoContigs seqpos contigs posIntoContigs :: Pos.Pos -> [CLoc.ContigLoc] -> Maybe Pos.Pos posIntoContigs _ [] = Nothing posIntoContigs seqpos (contig@(CLoc.ContigLoc _ len _):rest) = case CLoc.posInto seqpos contig of just@(Just _) -> just Nothing -> liftM (flip Pos.slide len) $ posIntoContigs seqpos rest -- | Given a sequence location and a sequence position within that -- location, compute a new position representing the original position -- relative to the outer sequence. If the sequence position lies -- outside the location, @Nothing@ is returned. -- -- This function inverts 'posInto' when the sequence position lies -- within the position is actually within the location. Due to the -- possibility of redundant location-relative positions for a given -- absolute position, 'posInto' does not necessary invert 'posOutof' posOutof :: Pos.Pos -> Loc -> Maybe Pos.Pos posOutof pos (Loc contigs) = posOutofContigs pos contigs posOutofContigs :: Pos.Pos -> [CLoc.ContigLoc] -> Maybe Pos.Pos posOutofContigs _ [] = Nothing posOutofContigs seqpos (contig@(CLoc.ContigLoc _ len _):rest) = case CLoc.posOutof seqpos contig of just@(Just _) -> just Nothing -> posOutofContigs (Pos.slide seqpos $ negate len) rest -- | Returns a sequence location produced by extending the original -- location on each end, based on a pair of (/5\' extension/, /3\' -- extension/). These add contiguous positions to the 5\' and 3\' -- ends of the original location. The 5\' extension is applied to the -- 5\' end of the location on the location strand; if the location is -- on the 'RevCompl' strand, the 5\' end will have a higher offset -- than the 3\' end and this offset will increase by the amount of the -- 5\' extension. Similarly, the 3\' extension is applied to the 3\' -- end of the location. extend :: (Offset, Offset) -- ^ (5' extension, 3' extension) -> Loc -> Loc extend _ (Loc []) = error "extendLoc on zero-contig Loc" extend (ext5, ext3) (Loc contigs) = Loc $ case extendContigs3 contigs of [] -> error "Empty contigs after extendContigs3" (cfirst:crest) -> (CLoc.extend (ext5, 0) cfirst):crest where extendContigs3 [] = error "Empty contigs in extendContigs3" extendContigs3 [clast] = [CLoc.extend (0, ext3) clast] extendContigs3 (contig:crest) = contig : extendContigs3 crest -- | Returns @True@ when a sequence position lies within a sequence -- location on the same sequence, and occupies the same strand. isWithin :: Pos.Pos -> Loc -> Bool isWithin seqpos (Loc contigs) = or $ map (CLoc.isWithin seqpos) contigs overlappingContigs :: Loc -> Loc -> [(CLoc.ContigLoc, CLoc.ContigLoc)] overlappingContigs (Loc contigs1) (Loc contigs2) = filter (uncurry CLoc.overlaps) [(c1, c2) | c1 <- contigs1, c2 <- contigs2 ] -- | Returns @True@ when two sequence locations overlap at any -- position. overlaps :: Loc -> Loc -> Bool overlaps l1 l2 = not $ null $ overlappingContigs l1 l2 -- | Display a human-friendly, zero-based representation of a sequence location. display :: Loc -> String display (Loc contigs) = intercalate ";" $ map CLoc.display contigs