{-| Data type for a sequence location consiting of a contiguous range of positions on the sequence. Throughout, /sequence position/ refers to a 'Pos.Pos' which includes a strand. An index into a sequence is referred to as an /offset/, and is generally of type 'Offset'. -} module Bio.Location.ContigLocation ( -- * Sequence locations ContigLoc(..), fromStartEnd, fromPosLen -- * Locations and positions , bounds, startPos, endPos, posInto, posOutof, isWithin, overlaps -- * Extracting subsequences , seqData, seqDataPadded -- * Transforming locations , slide, extend -- * Displaying locations , display ) where import Prelude hiding (length) import Control.Monad.Error import qualified Data.ByteString.Lazy.Char8 as LBS import Bio.Sequence.SeqData import qualified Bio.Location.Position as Pos import Bio.Location.Strand -- | Contiguous sequence location defined by a span of sequence -- positions, lying on a specific strand of the sequence. data ContigLoc = ContigLoc { offset5 :: !Offset -- ^ The offset of the 5\' end of the location, as a 0-based index , length :: !Offset -- ^ The length of the location , strand :: !Strand -- ^ The strand of the location } deriving (Eq, Ord, Show) instance Stranded ContigLoc where revCompl (ContigLoc seq5 len str) = ContigLoc seq5 len $ revCompl str -- | Create a sequence location lying between 0-based starting and -- ending offsets. When @start < end@, the location -- be on the forward strand, otherwise it will be on the -- reverse complement strand. fromStartEnd :: Offset -> Offset -> ContigLoc fromStartEnd start end | start < end = ContigLoc start (1 + end - start) Fwd | otherwise = ContigLoc end (1 + start - end) RevCompl -- | Create a sequence location from the sequence position of the -- start of the location and the length of the position. The strand -- of the location, and the direction it extends from the starting -- position, are determined by the strand of the starting position. fromPosLen :: Pos.Pos -> Offset -> ContigLoc fromPosLen (Pos.Pos off5 Fwd) len = ContigLoc off5 len Fwd fromPosLen (Pos.Pos off3 RevCompl) len = ContigLoc (off3 - (len - 1)) len RevCompl -- | The bounds of a sequence location. This is a pair consisting of -- the lowest and highest sequence offsets covered by the region. The -- bounds ignore the strand of the sequence location, and the first -- element of the pair will always be lower than the second. bounds :: ContigLoc -> (Offset, Offset) bounds (ContigLoc seq5 len _) = (seq5, seq5 + len - 1) -- | Sequence position of the start of the location. This is the 5' -- end on the location strand, which will have a higher offset than -- 'endPos' if the location is on the 'RevCompl' strand. startPos :: ContigLoc -> Pos.Pos startPos (ContigLoc seq5 len str) = case str of Fwd -> Pos.Pos seq5 str RevCompl -> Pos.Pos (seq5 + len - 1) str -- | Sequence position of the end of the location, as described in 'startPos'. endPos :: ContigLoc -> Pos.Pos endPos (ContigLoc seq5 len str) = case str of Fwd -> Pos.Pos (seq5 + len - 1) str RevCompl -> Pos.Pos seq5 str -- | Returns a location resulting from sliding the original location -- along the sequence by a specified offset. A positive offset will -- move the location away from the 5\' end of the forward stand of the -- sequence regardless of the strand of the location itself. Thus, -- -- > slide (revCompl cloc) off == revCompl (slide cloc off) slide :: Offset -> ContigLoc -> ContigLoc slide dpos (ContigLoc seq5 len str) = ContigLoc (seq5 + dpos) len str -- | Extract the nucleotide 'SeqData' for the sequence location. If -- any part of the location lies outside the bounds of the sequence, -- an error results. seqData :: (Error e, MonadError e m) => SeqData -> ContigLoc -> m SeqData seqData sequ (ContigLoc seq5 len str) | seq5 < 0 = outOfBounds | otherwise = case LBS.take len $ LBS.drop seq5 sequ of fwdseq | LBS.length fwdseq == len -> return $ stranded str fwdseq | otherwise -> outOfBounds where outOfBounds = throwError $ strMsg $ "contig seq loc " ++ show (seq5, seq5 + len - 1) ++ " out of SeqData bounds" -- | As 'seqData', extract the nucleotide subsequence for the -- location. Any positions in the location lying outside the bounds -- of the sequence are returned as @N@ rather than producing an error. seqDataPadded :: SeqData -> ContigLoc -> SeqData seqDataPadded sequ (ContigLoc seq5 len str) = stranded str fwdseq where fwdseq | seq5 + len <= 0 = LBS.replicate len 'N' | seq5 >= LBS.length sequ = LBS.replicate len 'N' | seq5 < 0 = LBS.replicate (negate seq5) 'N' `LBS.append` takePadded (len + seq5) sequ | otherwise = takePadded len $ LBS.drop seq5 sequ takePadded sublen subsequ | sublen <= LBS.length subsequ = LBS.take sublen subsequ | otherwise = subsequ `LBS.append` LBS.replicate (sublen - LBS.length subsequ) 'N' -- | Given a sequence position and a sequence location relative to the -- same sequence, compute a new position representing the original -- position relative to the subsequence defined by the location. If -- the sequence position lies outside of the sequence location, -- @Nothing@ is returned; thus, the offset of the new position will -- always be in the range @[0, length cloc - 1]@. posInto :: Pos.Pos -> ContigLoc -> Maybe Pos.Pos posInto (Pos.Pos pos pStrand) (ContigLoc seq5 len cStrand) | pos < seq5 || pos >= seq5 + len = Nothing | otherwise = Just $ case cStrand of Fwd -> Pos.Pos (pos - seq5) pStrand RevCompl -> Pos.Pos (seq5 + len - (pos + 1)) (revCompl pStrand) -- | Given a sequence location and a sequence position within that -- location, compute a new position representing the original position -- relative to the outer sequence. If the sequence position lies -- outside the location, @Nothing@ is returned. -- -- This function inverts 'posInto' when the sequence position lies -- within the position is actually within the location. posOutof :: Pos.Pos -> ContigLoc -> Maybe Pos.Pos posOutof (Pos.Pos pos pStrand) (ContigLoc seq5 len cStrand) | pos < 0 || pos >= len = Nothing | otherwise = Just $ case cStrand of Fwd -> Pos.Pos (pos + seq5) pStrand RevCompl -> Pos.Pos (seq5 + len - (pos + 1)) (revCompl pStrand) -- | Returns a sequence location produced by extending the original -- location on each end, based on a pair of (/5\' extension/, /3\' -- extension/). The 5\' extension is applied to the 5\' end of the -- location on the location strand; if the location is on the -- 'RevCompl' strand, the 5\' end will have a higher offset than the -- 3\' end and this offset will increase by the amount of the 5\' -- extension. Similarly, the 3\' extension is applied to the 3\' end -- of the location. extend :: (Offset, Offset) -> ContigLoc -> ContigLoc extend (ext5, ext3) (ContigLoc seq5 len str) = case str of Fwd -> ContigLoc (seq5 - ext5) (len + ext5 + ext3) str RevCompl -> ContigLoc (seq5 - ext3) (len + ext5 + ext3) str -- | Returns @True@ when a sequence position lies within a sequence -- location on the same sequence, and occupies the same strand. isWithin :: Pos.Pos -> ContigLoc -> Bool isWithin (Pos.Pos pos pStrand) (ContigLoc seq5 len cStrand) = (pos >= seq5) && (pos < seq5 + len) && (cStrand == pStrand) -- | Returns @True@ when two sequence locations overlap at any -- position. overlaps :: ContigLoc -> ContigLoc -> Bool overlaps contig1 contig2 = case (bounds contig1, bounds contig2) of ((low1, high1),(low2, high2)) -> (strand contig1 == strand contig2) && (low1 <= high2) && (low2 <= high1) -- | Display a human-friendly, zero-based representation of a sequence location. display :: ContigLoc -> String display cloc = show (Pos.offset $ startPos cloc) ++ "to" ++ show (Pos.offset $ endPos cloc)