{-| Efficient lookup of sequence positions and locations in a large
map of target locations.  For example, target locations might
represent a collection of genes annotated on a chromosome.  The
'LocMap' would efficiently find which gene(s) overlapped a sequence
position on that chromosome.

Target locations are assigned to one or more zones based on
'Loc.bounds'.  Query locations are then tested only against the target
locations in the relevant zones.
-}

module Bio.Location.LocMap (
  -- * Constructing location lookup maps
  LocMap, fromList

  -- * Searching for target locations
  , lookupWithin, lookupOverlaps

  -- * Modifying location lookup maps
  , delete, deleteBy, insert

  -- * Utilities
  , checkInvariants
  ) where 

import qualified Data.IntMap as IM
import qualified Data.IntSet as IS
import Data.List hiding (insert, delete, deleteBy)
import Data.Maybe
import Data.Monoid

import qualified Bio.Location.Location as Loc
import qualified Bio.Location.Position as Pos
import Bio.Sequence.SeqData (Offset)

defaultZonesize :: Offset
defaultZonesize = 65536

-- | Data structure allowing efficient lookup of target sequence
-- locations that overlap a query location.  Target locations can be
-- paired with an arbitrary object.
data LocMap a = LocMap !Offset !(IM.IntMap (Loc.Loc, a)) !(IM.IntMap IS.IntSet)

instance Monoid (LocMap a) where
    mempty = LocMap defaultZonesize IM.empty IM.empty
    mappend lm0 (LocMap _ keyToLoc1 _) = foldl' (\lm (l,x) -> insert l x lm) lm0 $ IM.elems keyToLoc1

-- | Create a 'LocMap' from an association list of target locations.
fromList :: Offset -> [(Loc.Loc, a)] -> LocMap a
fromList zonesize = foldl' (\lm0 (l,x) -> insert l x lm0) mempty

-- | Insert a new target association into a target location map.
insert :: Loc.Loc -> a -> LocMap a -> LocMap a
insert l x (LocMap zonesize keyToLoc zoneKeys) 
    = let !key = case IM.maxViewWithKey keyToLoc of
                   Just ((k, _), _) -> k + 1
                   Nothing -> 1
          !newKeyToLoc = IM.insertWithKey duplicateError key (l, x) keyToLoc
          !newZoneKeys = foldl' (insertIntoZone key) zoneKeys $ locZones zonesize l
      in LocMap zonesize newKeyToLoc newZoneKeys
    where insertIntoZone key currZoneKeys z = case IM.lookup z currZoneKeys of
                                                Nothing -> IM.insert z (IS.singleton key) currZoneKeys
                                                Just currZoneKeySet -> IM.insert z (IS.insert key currZoneKeySet) currZoneKeys
          duplicateError k (l1, _) (l2, _) = error $ "LocMap: key " ++ show k ++ " duplicated: " ++ show (l1, l2)

-- | Find the (possibly empty) list of target locations and associated
-- objects that contain a sequence position, in the sense of
-- 'Loc.isWithin'
lookupWithin :: Pos.Pos -> LocMap a -> [(Loc.Loc, a)]
lookupWithin pos (LocMap zonesize keyToLoc zoneKeys) 
    = let !zoneKeySet = IM.findWithDefault IS.empty (posZone zonesize pos) zoneKeys
      in filter ((Loc.isWithin pos) . fst) $ keySetLocs keyToLoc zoneKeySet

-- | Find the (possibly empty) list of target locations and associated
-- objects that overlap a sequence location, in the sense of 'Loc.overlaps'
lookupOverlaps :: Loc.Loc -> LocMap a -> [(Loc.Loc, a)]
lookupOverlaps loc (LocMap zonesize keyToLoc zoneKeys)
    = let !zonesKeySet = IS.unions $ map (\z -> IM.findWithDefault IS.empty z zoneKeys) $ locZones zonesize loc
      in filter ((Loc.overlaps loc) . fst) $ keySetLocs keyToLoc zonesKeySet

-- | Remove a target location and object association from the map, if it is
-- present.  If it is present multiple times, only the first
-- occurrence will be deleted.
delete :: (Eq a) => (Loc.Loc, a) -> LocMap a -> LocMap a
delete target = deleteBy (== target)

-- | Generalized version of 'delete' that removes the first target
-- location / object association that satisfies a predicate function.
deleteBy :: ((Loc.Loc, a) -> Bool) -> LocMap a -> LocMap a
deleteBy isTarget lm0@(LocMap zonesize keyToLoc zoneKeys) 
    = case filter (isTarget . snd) $ IM.toList keyToLoc of
        [] -> lm0
        ((key,(l,_)):_) -> let !newKeyToLoc = IM.delete key keyToLoc
                               !newZoneKeys = foldl' (deleteFromZone key) zoneKeys $ locZones zonesize l
                           in LocMap zonesize newKeyToLoc newZoneKeys
    where deleteFromZone key currZoneKeys z = let !currZoneKeySet = IM.findWithDefault missingZone z currZoneKeys 
                                              in IM.insert z (IS.delete key currZoneKeySet) currZoneKeys
              where missingZone = error $ "LocMap deleteBy: empty keyset for zone " ++ show z

posZone :: Offset -> Pos.Pos -> Int
posZone zonesize pos = fromIntegral $ (Pos.offset pos) `div` zonesize

locZones :: Offset -> Loc.Loc -> [Int]
locZones zonesize loc = let !(pmin, pmax) = Loc.bounds loc
                            !zmin = fromIntegral $ pmin `div` zonesize
                            !zmax = fromIntegral $ pmax `div` zonesize
                        in [zmin..zmax]

keySetLocs :: (IM.IntMap (Loc.Loc, a)) -> IS.IntSet -> [(Loc.Loc, a)]
keySetLocs keyToLoc = map keyLoc . IS.toList
    where keyLoc key = IM.findWithDefault unknownKey key keyToLoc
              where unknownKey = error $ "LocMap: unknown key " ++ show key

checkInvariants :: LocMap a -> [String]
checkInvariants (LocMap zonesize keyToLoc zoneKeys)
    = concat $ checkAllKeys : map checkKeyLoc (IM.toList keyToLoc)
    where checkAllKeys = let !allZoneKeys = IS.unions $ IM.elems zoneKeys 
                             !allLocKeys = IM.keysSet keyToLoc
                             !missingKeys = IS.toAscList $ allZoneKeys `IS.difference` allLocKeys
                             !extraKeys = IS.toAscList $ allLocKeys `IS.difference` allZoneKeys
                         in concat [ map (\key -> "Missing key " ++ show key) missingKeys
                                   , map (\key -> "Extra key " ++ show key) extraKeys]
          checkKeyLoc (key, (loc, _)) = let !keyZoneSet = IM.keysSet $ IM.filter (\keyset -> key `IS.member` keyset) zoneKeys
                                            !locZoneSet = IS.fromList $ locZones zonesize loc
                                            !missingZones = IS.toAscList $ locZoneSet `IS.difference` keyZoneSet
                                            !extraZones = IS.toAscList $ keyZoneSet `IS.difference` locZoneSet
                                        in concat [ map (\zone -> "Missing zone " ++ show zone ++ " for " ++ show (key, loc)) missingZones
                                                  , map (\zone -> "Extra zone " ++ show zone ++ " for " ++ show (key, loc)) extraZones]