{-# LANGUAGE GeneralizedNewtypeDeriving, TypeSynonymInstances, FlexibleInstances #-}
{-| Efficient lookup based on potential location overlap

Collection of objects with locations, potentially on named sequences,
that can be queried to recover all objects whose location could
potentially overlap a query location.

|-}

module Bio.SeqLoc.LocMap (
  -- * Mapping objects to locations
  LocMap
  , emptyLM, insertLoc
  , queryLoc

  -- * Mapping objects to locations on named sequneces
  , SeqLocMap
  , emptySLM, insertSeqLoc
  , querySeqLoc

    -- * Mapping transcripts to their locations
  , transcriptSeqLocMap
    
  -- * Generalization of objects with locations
  , Locatable(..)
  , WithLocation(..)
  , locatableSeqLocMap
  , queryLocatable, queryLocCompatible, queryLocInto
    
  )
       where

import qualified Data.HashMap.Strict as HM
import Data.List
import Data.Maybe

import qualified Bio.SeqLoc.Location as Loc
import Bio.SeqLoc.OnSeq
import qualified Bio.SeqLoc.Position as Pos
import qualified Bio.SeqLoc.SpliceLocation as SpLoc
import Bio.SeqLoc.Strand
import Bio.SeqLoc.Transcript

import qualified Bio.SeqLoc.ShiftedVector as ShV

-- | Mapping objects to sequence locations in the sense of `Loc.Location`.
data LocMap a = LocMap { binSize :: !Pos.Offset,
                         bins :: !(ShV.ShiftedVector [a]) } deriving (Show)

-- | Mapping objects to locations on named sequences 
data SeqLocMap a = SeqLocMap { slmBinSize :: !Pos.Offset, locmaps :: !(HM.HashMap SeqLabel (LocMap a)) } deriving (Show)

-- | Object with a genomic location expressed by `SpliceSeqLoc`.
class Locatable o where
  locate :: o -> SpliceSeqLoc

instance Locatable ContigSeqLoc where
  locate (OnSeq ref loc) = OnSeq ref (SpLoc.singleton loc)

instance Locatable SpliceSeqLoc where
  locate = id

instance Locatable Transcript where
  locate = location

-- | Simple representation of a `Locatable` object as an arbitrary
-- object adjoined with a location.
data WithLocation a = WithLocation { withoutLocation :: !a, withLocate :: !SpliceSeqLoc } deriving (Eq)

instance Locatable (WithLocation a) where
  locate = withLocate

-- | Create an empty object / location map.
--
-- Specify a characteristic size for efficient queries, which depends
-- on the underlying genome. Smaller sizes yield fewer false
-- candidates but greater memory usage and potentially slower queries.
emptyLM :: Pos.Offset -> LocMap a
emptyLM bsz = LocMap { binSize = bsz, bins = ShV.empty }

-- | Insert a new object / location pair and reutrn the updated map
insertLoc :: (Loc.Location l) => l -> a -> LocMap a -> LocMap a
insertLoc l x lm0 = lm0 { bins = ShV.modifySome (bins lm0) (locBins l lm0) (x :)  }

-- | Retrieve a list of objects that could potentially overlap the
-- query location.
--
-- Some objects may not actually overlap the query location. Some
-- objects may appear more than once in the list. However, no object
-- whose location truly overlaps the query will be missing from the
-- list.
--
-- Overlap is defined by one or more nucleotides in common between the
-- bounds of the two locations, regardless of the relative strands of
-- the query and the object location.
queryLoc :: (Loc.Location l) => l -> LocMap a -> [a]
queryLoc l lm = concat [ (bins lm) ShV.!? b | b <- locBins l lm ]

locBins :: (Loc.Location l) => l -> LocMap a -> [Int]
locBins l lm = let (start, end) = Loc.bounds l
                   binlow = fromIntegral $ start `div` binSize lm
                   binhigh = fromIntegral $ end `div` binSize lm
               in [binlow..binhigh]

-- | Create an empty object / location map
--
-- Specify a characteristic size as per `emptyLM`.
emptySLM :: Pos.Offset -> SeqLocMap a
emptySLM bsz = SeqLocMap { slmBinSize = bsz, locmaps = HM.empty }

-- | Insert a new object / location pair and reutrn the updated map
insertSeqLoc :: (Loc.Location l) => OnSeq l -> a -> SeqLocMap a -> SeqLocMap a
insertSeqLoc sl x slm0 = let lm0 = HM.lookupDefault (emptyLM $ slmBinSize slm0) (onSeqLabel sl) (locmaps slm0)
                             lm' = insertLoc (unOnSeq sl) x lm0
                         in slm0 { locmaps = HM.insert (onSeqLabel sl) lm' (locmaps slm0) }

-- | Retrieve a list of objects that could potentially overlap the
-- query location.
--
-- Some objects may not actually overlap the query location. Some
-- objects may appear more than once in the list. However, no object
-- whose location truly overlaps the query will be missing from the
-- list.
--
-- Overlap is defined by one or more nucleotides in common between the
-- bounds of the two locations, regardless of the relative strands of
-- the query and the object location, as well as the same name for the
-- underlying reference sequence.
--
-- To retrieve objects and test for actual overlap see the `Locatable`
-- interface and `queryLocatable` or `queryLocCompatible`.
querySeqLoc :: (Loc.Location l) => OnSeq l -> SeqLocMap a -> [a]
querySeqLoc sl slm = maybe [] (queryLoc (unOnSeq sl)) $
                     HM.lookup (onSeqLabel sl) (locmaps slm)

-- | Construct a mapping from transcripts to their genomic locations.
transcriptSeqLocMap :: Pos.Offset -> [Transcript] -> SeqLocMap Transcript
transcriptSeqLocMap bsz = foldl' insertTrx (emptySLM bsz)
  where insertTrx slm0 t = insertSeqLoc (location t) t slm0

-- | Construct a mapping from a general `Locatable` object to its
-- genomic location
locatableSeqLocMap :: (Locatable l) => Pos.Offset -> [l] -> SeqLocMap l
locatableSeqLocMap bsz = foldl' insertLocable (emptySLM bsz)
  where insertLocable slm0 t = insertSeqLoc (locate t) t slm0

-- | Retrieve a list of `Locatable` objects whose overall, contiguous
-- genomic coordinates intersect at any position the genomic interval
-- spanned by the specified `Loc.Location`. This does not require that
-- the spliced structure of the query is a subset of the spliced
-- structure of the `Locatable` nor that the query location lie
-- entirely within the hit location (contrast with
-- `queryLocCompatible`).
--
-- When a strand argument is given, restrict matches to those lying on
-- the same strand as the query location, for `Just Plus`, or the
-- opposite strand, for `Just Minus`.
queryLocatable :: (Locatable o, Loc.Location l) => Maybe Strand -> OnSeq l -> SeqLocMap o -> [o]
queryLocatable mstr qyloc = filter realHit . querySeqLoc qyloc
  where realHit sb = let sbloc = locate sb
                         (qylb, qyub) = Loc.bounds . unOnSeq $ qyloc
                         (sblb, sbub) = Loc.bounds . unOnSeq $ sbloc
                         qystr = Loc.strand . unOnSeq $ qyloc
                         sbstr = Loc.strand . unOnSeq $ sbloc
                     in (qylb <= sbub) && (qyub >= sblb) && (qystr `strandCompat` sbstr)
        strandCompat = case mstr of
          Nothing -> \_ _ -> True
          Just Plus -> (==)
          Just Minus -> (/=)

-- | Retrieve a list of `Locatable` objects whose spliced structure
-- contains the query location specifically.
queryLocCompatible :: (Locatable o) => Maybe Strand -> SpliceSeqLoc -> SeqLocMap o -> [o]
queryLocCompatible mstr qyloc = map fst . queryLocInto mstr qyloc

-- | Retrieve a list of `Locatable` objects whose splice structure
-- contains the query location, paired with the sub-location of the
-- query within the `Locatable`.
queryLocInto :: (Locatable o) => Maybe Strand -> SpliceSeqLoc -> SeqLocMap o -> [(o, SpLoc.SpliceLoc)]
queryLocInto mstr qyloc = filter strandCompat . mapMaybe compatHit . querySeqLoc qyloc
  where compatHit sb = do into <- unOnSeq qyloc `SpLoc.locInto` (unOnSeq . locate $ sb)
                          return (sb, into)
        strandCompat = case mstr of
          Nothing -> const True
          Just s -> (== s) . Loc.strand . snd