{-| Efficient lookup of sequence positions and locations in a large map of target locations. For example, target locations might represent a collection of genes annotated on a chromosome. The 'LocMap' would efficiently find which gene(s) overlapped a sequence position on that chromosome. Target locations are assigned to one or more zones based on 'Loc.bounds'. Query locations are then tested only against the target locations in the relevant zones. -} module Bio.Location.LocMap ( -- * Constructing location lookup maps LocMap, fromList -- * Searching for target locations , lookupWithin, lookupOverlaps -- * Modifying location lookup maps , delete, deleteBy, insert -- * Utilities , checkInvariants ) where import qualified Data.IntMap as IM import qualified Data.IntSet as IS import Data.List hiding (insert, delete, deleteBy) import Data.Maybe import Data.Monoid import qualified Bio.Location.Location as Loc import qualified Bio.Location.Position as Pos import Bio.Sequence.SeqData (Offset) defaultZonesize :: Offset defaultZonesize = 65536 -- | Data structure allowing efficient lookup of target sequence -- locations that overlap a query location. Target locations can be -- paired with an arbitrary object. data LocMap a = LocMap !Offset !(IM.IntMap (Loc.Loc, a)) !(IM.IntMap IS.IntSet) instance Monoid (LocMap a) where mempty = LocMap defaultZonesize IM.empty IM.empty mappend lm0 (LocMap _ keyToLoc1 _) = foldl' (\lm (l,x) -> insert l x lm) lm0 $ IM.elems keyToLoc1 -- | Create a 'LocMap' from an association list of target locations. fromList :: Offset -> [(Loc.Loc, a)] -> LocMap a fromList zonesize = foldl' (\lm0 (l,x) -> insert l x lm0) mempty -- | Insert a new target association into a target location map. insert :: Loc.Loc -> a -> LocMap a -> LocMap a insert l x (LocMap zonesize keyToLoc zoneKeys) = let !key = case IM.maxViewWithKey keyToLoc of Just ((k, _), _) -> k + 1 Nothing -> 1 !newKeyToLoc = IM.insertWithKey duplicateError key (l, x) keyToLoc !newZoneKeys = foldl' (insertIntoZone key) zoneKeys $ locZones zonesize l in LocMap zonesize newKeyToLoc newZoneKeys where insertIntoZone key currZoneKeys z = case IM.lookup z currZoneKeys of Nothing -> IM.insert z (IS.singleton key) currZoneKeys Just currZoneKeySet -> IM.insert z (IS.insert key currZoneKeySet) currZoneKeys duplicateError k (l1, _) (l2, _) = error $ "LocMap: key " ++ show k ++ " duplicated: " ++ show (l1, l2) -- | Find the (possibly empty) list of target locations and associated -- objects that contain a sequence position, in the sense of -- 'Loc.isWithin' lookupWithin :: Pos.Pos -> LocMap a -> [(Loc.Loc, a)] lookupWithin pos (LocMap zonesize keyToLoc zoneKeys) = let !zoneKeySet = IM.findWithDefault IS.empty (posZone zonesize pos) zoneKeys in filter ((Loc.isWithin pos) . fst) $ keySetLocs keyToLoc zoneKeySet -- | Find the (possibly empty) list of target locations and associated -- objects that overlap a sequence location, in the sense of 'Loc.overlaps' lookupOverlaps :: Loc.Loc -> LocMap a -> [(Loc.Loc, a)] lookupOverlaps loc (LocMap zonesize keyToLoc zoneKeys) = let !zonesKeySet = IS.unions $ map (\z -> IM.findWithDefault IS.empty z zoneKeys) $ locZones zonesize loc in filter ((Loc.overlaps loc) . fst) $ keySetLocs keyToLoc zonesKeySet -- | Remove a target location and object association from the map, if it is -- present. If it is present multiple times, only the first -- occurrence will be deleted. delete :: (Eq a) => (Loc.Loc, a) -> LocMap a -> LocMap a delete target = deleteBy (== target) -- | Generalized version of 'delete' that removes the first target -- location / object association that satisfies a predicate function. deleteBy :: ((Loc.Loc, a) -> Bool) -> LocMap a -> LocMap a deleteBy isTarget lm0@(LocMap zonesize keyToLoc zoneKeys) = case filter (isTarget . snd) $ IM.toList keyToLoc of [] -> lm0 ((key,(l,_)):_) -> let !newKeyToLoc = IM.delete key keyToLoc !newZoneKeys = foldl' (deleteFromZone key) zoneKeys $ locZones zonesize l in LocMap zonesize newKeyToLoc newZoneKeys where deleteFromZone key currZoneKeys z = let !currZoneKeySet = IM.findWithDefault missingZone z currZoneKeys in IM.insert z (IS.delete key currZoneKeySet) currZoneKeys where missingZone = error $ "LocMap deleteBy: empty keyset for zone " ++ show z posZone :: Offset -> Pos.Pos -> Int posZone zonesize pos = fromIntegral $ (Pos.offset pos) `div` zonesize locZones :: Offset -> Loc.Loc -> [Int] locZones zonesize loc = let !(pmin, pmax) = Loc.bounds loc !zmin = fromIntegral $ pmin `div` zonesize !zmax = fromIntegral $ pmax `div` zonesize in [zmin..zmax] keySetLocs :: (IM.IntMap (Loc.Loc, a)) -> IS.IntSet -> [(Loc.Loc, a)] keySetLocs keyToLoc = map keyLoc . IS.toList where keyLoc key = IM.findWithDefault unknownKey key keyToLoc where unknownKey = error $ "LocMap: unknown key " ++ show key checkInvariants :: LocMap a -> [String] checkInvariants (LocMap zonesize keyToLoc zoneKeys) = concat $ checkAllKeys : map checkKeyLoc (IM.toList keyToLoc) where checkAllKeys = let !allZoneKeys = IS.unions $ IM.elems zoneKeys !allLocKeys = IM.keysSet keyToLoc !missingKeys = IS.toAscList $ allZoneKeys `IS.difference` allLocKeys !extraKeys = IS.toAscList $ allLocKeys `IS.difference` allZoneKeys in concat [ map (\key -> "Missing key " ++ show key) missingKeys , map (\key -> "Extra key " ++ show key) extraKeys] checkKeyLoc (key, (loc, _)) = let !keyZoneSet = IM.keysSet $ IM.filter (\keyset -> key `IS.member` keyset) zoneKeys !locZoneSet = IS.fromList $ locZones zonesize loc !missingZones = IS.toAscList $ locZoneSet `IS.difference` keyZoneSet !extraZones = IS.toAscList $ keyZoneSet `IS.difference` locZoneSet in concat [ map (\zone -> "Missing zone " ++ show zone ++ " for " ++ show (key, loc)) missingZones , map (\zone -> "Extra zone " ++ show zone ++ " for " ++ show (key, loc)) extraZones]