{-# LANGUAGE GeneralizedNewtypeDeriving, TypeSynonymInstances, FlexibleInstances #-} {-| Efficient lookup based on potential location overlap Collection of objects with locations, potentially on named sequences, that can be queried to recover all objects whose location could potentially overlap a query location. |-} module Bio.SeqLoc.LocMap ( -- * Mapping objects to locations LocMap , emptyLM, insertLoc , queryLoc -- * Mapping objects to locations on named sequneces , SeqLocMap , emptySLM, insertSeqLoc , querySeqLoc -- * Mapping transcripts to their locations , transcriptSeqLocMap -- * Generalization of objects with locations , Locatable(..) , WithLocation(..) , locatableSeqLocMap , queryLocatable, queryLocCompatible, queryLocInto ) where import qualified Data.HashMap.Strict as HM import Data.List import Data.Maybe import qualified Bio.SeqLoc.Location as Loc import Bio.SeqLoc.OnSeq import qualified Bio.SeqLoc.Position as Pos import qualified Bio.SeqLoc.SpliceLocation as SpLoc import Bio.SeqLoc.Strand import Bio.SeqLoc.Transcript import qualified Bio.SeqLoc.ShiftedVector as ShV -- | Mapping objects to sequence locations in the sense of `Loc.Location`. data LocMap a = LocMap { binSize :: !Pos.Offset, bins :: !(ShV.ShiftedVector [a]) } deriving (Show) -- | Mapping objects to locations on named sequences data SeqLocMap a = SeqLocMap { slmBinSize :: !Pos.Offset, locmaps :: !(HM.HashMap SeqLabel (LocMap a)) } deriving (Show) -- | Object with a genomic location expressed by `SpliceSeqLoc`. class Locatable o where locate :: o -> SpliceSeqLoc instance Locatable ContigSeqLoc where locate (OnSeq ref loc) = OnSeq ref (SpLoc.singleton loc) instance Locatable SpliceSeqLoc where locate = id instance Locatable Transcript where locate = location -- | Simple representation of a `Locatable` object as an arbitrary -- object adjoined with a location. data WithLocation a = WithLocation { withoutLocation :: !a, withLocate :: !SpliceSeqLoc } deriving (Eq) instance Locatable (WithLocation a) where locate = withLocate -- | Create an empty object / location map. -- -- Specify a characteristic size for efficient queries, which depends -- on the underlying genome. Smaller sizes yield fewer false -- candidates but greater memory usage and potentially slower queries. emptyLM :: Pos.Offset -> LocMap a emptyLM bsz = LocMap { binSize = bsz, bins = ShV.empty } -- | Insert a new object / location pair and reutrn the updated map insertLoc :: (Loc.Location l) => l -> a -> LocMap a -> LocMap a insertLoc l x lm0 = lm0 { bins = ShV.modifySome (bins lm0) (locBins l lm0) (x :) } -- | Retrieve a list of objects that could potentially overlap the -- query location. -- -- Some objects may not actually overlap the query location. Some -- objects may appear more than once in the list. However, no object -- whose location truly overlaps the query will be missing from the -- list. -- -- Overlap is defined by one or more nucleotides in common between the -- bounds of the two locations, regardless of the relative strands of -- the query and the object location. queryLoc :: (Loc.Location l) => l -> LocMap a -> [a] queryLoc l lm = concat [ (bins lm) ShV.!? b | b <- locBins l lm ] locBins :: (Loc.Location l) => l -> LocMap a -> [Int] locBins l lm = let (start, end) = Loc.bounds l binlow = fromIntegral $ start `div` binSize lm binhigh = fromIntegral $ end `div` binSize lm in [binlow..binhigh] -- | Create an empty object / location map -- -- Specify a characteristic size as per `emptyLM`. emptySLM :: Pos.Offset -> SeqLocMap a emptySLM bsz = SeqLocMap { slmBinSize = bsz, locmaps = HM.empty } -- | Insert a new object / location pair and reutrn the updated map insertSeqLoc :: (Loc.Location l) => OnSeq l -> a -> SeqLocMap a -> SeqLocMap a insertSeqLoc sl x slm0 = let lm0 = HM.lookupDefault (emptyLM $ slmBinSize slm0) (onSeqLabel sl) (locmaps slm0) lm' = insertLoc (unOnSeq sl) x lm0 in slm0 { locmaps = HM.insert (onSeqLabel sl) lm' (locmaps slm0) } -- | Retrieve a list of objects that could potentially overlap the -- query location. -- -- Some objects may not actually overlap the query location. Some -- objects may appear more than once in the list. However, no object -- whose location truly overlaps the query will be missing from the -- list. -- -- Overlap is defined by one or more nucleotides in common between the -- bounds of the two locations, regardless of the relative strands of -- the query and the object location, as well as the same name for the -- underlying reference sequence. -- -- To retrieve objects and test for actual overlap see the `Locatable` -- interface and `queryLocatable` or `queryLocCompatible`. querySeqLoc :: (Loc.Location l) => OnSeq l -> SeqLocMap a -> [a] querySeqLoc sl slm = maybe [] (queryLoc (unOnSeq sl)) $ HM.lookup (onSeqLabel sl) (locmaps slm) -- | Construct a mapping from transcripts to their genomic locations. transcriptSeqLocMap :: Pos.Offset -> [Transcript] -> SeqLocMap Transcript transcriptSeqLocMap bsz = foldl' insertTrx (emptySLM bsz) where insertTrx slm0 t = insertSeqLoc (location t) t slm0 -- | Construct a mapping from a general `Locatable` object to its -- genomic location locatableSeqLocMap :: (Locatable l) => Pos.Offset -> [l] -> SeqLocMap l locatableSeqLocMap bsz = foldl' insertLocable (emptySLM bsz) where insertLocable slm0 t = insertSeqLoc (locate t) t slm0 -- | Retrieve a list of `Locatable` objects whose overall, contiguous -- genomic coordinates intersect at any position the genomic interval -- spanned by the specified `Loc.Location`. This does not require that -- the spliced structure of the query is a subset of the spliced -- structure of the `Locatable` nor that the query location lie -- entirely within the hit location (contrast with -- `queryLocCompatible`). -- -- When a strand argument is given, restrict matches to those lying on -- the same strand as the query location, for `Just Plus`, or the -- opposite strand, for `Just Minus`. queryLocatable :: (Locatable o, Loc.Location l) => Maybe Strand -> OnSeq l -> SeqLocMap o -> [o] queryLocatable mstr qyloc = filter realHit . querySeqLoc qyloc where realHit sb = let sbloc = locate sb (qylb, qyub) = Loc.bounds . unOnSeq $ qyloc (sblb, sbub) = Loc.bounds . unOnSeq $ sbloc qystr = Loc.strand . unOnSeq $ qyloc sbstr = Loc.strand . unOnSeq $ sbloc in (qylb <= sbub) && (qyub >= sblb) && (qystr `strandCompat` sbstr) strandCompat = case mstr of Nothing -> \_ _ -> True Just Plus -> (==) Just Minus -> (/=) -- | Retrieve a list of `Locatable` objects whose spliced structure -- contains the query location specifically. queryLocCompatible :: (Locatable o) => Maybe Strand -> SpliceSeqLoc -> SeqLocMap o -> [o] queryLocCompatible mstr qyloc = map fst . queryLocInto mstr qyloc -- | Retrieve a list of `Locatable` objects whose splice structure -- contains the query location, paired with the sub-location of the -- query within the `Locatable`. queryLocInto :: (Locatable o) => Maybe Strand -> SpliceSeqLoc -> SeqLocMap o -> [(o, SpLoc.SpliceLoc)] queryLocInto mstr qyloc = filter strandCompat . mapMaybe compatHit . querySeqLoc qyloc where compatHit sb = do into <- unOnSeq qyloc `SpLoc.locInto` (unOnSeq . locate $ sb) return (sb, into) strandCompat = case mstr of Nothing -> const True Just s -> (== s) . Loc.strand . snd