{-# LANGUAGE GeneralizedNewtypeDeriving #-}

-- | Fast hash functions for 'Primary' sequences.

module Biobase.RNA.Hashes where

import Control.DeepSeq
import Control.Exception.Base (assert)
import Data.Ix
import qualified Data.Vector.Unboxed as VU

import Biobase.RNA

newtype HashedPrimary = HashedPrimary Int
  deriving (Eq,Ord,Ix,NFData,Read,Show,Enum,Bounded)

-- | Given a piece of primary sequence information, reduce it to an index.
--
-- Will throw an assertion in debug code if 'ps' are not within bounds. Note
-- that "mkPrimary [minBound]" and "mkPrimary [minBound,minBound]" map to the
-- same index. Meaning that indices are only unique within the same length
-- group. Furthermore, indices with different (l,u)-bounds are not compatible
-- with each other. All indices start at 0.
--
-- The empty input produces an index of 0.

mkHashedPrimary :: (Nucleotide,Nucleotide) -> Primary -> HashedPrimary
mkHashedPrimary (l,u) ps = assert (VU.all (\p -> l<=p && p<=u) ps) $ HashedPrimary idx where
  idx   = VU.sum $ VU.zipWith f ps (VU.enumFromStepN (VU.length ps -1) (-1) (VU.length ps))
  f p c = (fromEnum p - fromEnum l) * (cnst^c)
  cnst = fromEnum u - fromEnum l + 1
{-# INLINE mkHashedPrimary #-}