Biobase/Primary/Hashed.hs

{-# LANGUAGE MultiParamTypeClasses #-}
{-# LANGUAGE StandaloneDeriving #-}
{-# LANGUAGE GeneralizedNewtypeDeriving #-}

-- | Fast hash functions for 'Primary' sequences. A hash is just an 'Int', so
-- use these only for short sequences.

module Biobase.Primary.Hashed where

import Control.Exception.Base (assert)
import Data.Ix
import Data.Primitive.Types
import qualified Data.Vector.Generic as VG
import qualified Data.Vector.Generic.Mutable as VGM
import qualified Data.Vector.Unboxed as VU

import Biobase.Primary



newtype HashedPrimary = HashedPrimary Int
  deriving (Eq,Ord,Ix,Read,Show,Enum,Bounded)

deriving instance Prim HashedPrimary
deriving instance VGM.MVector VU.MVector HashedPrimary
deriving instance VG.Vector VU.Vector HashedPrimary
deriving instance VU.Unbox HashedPrimary

-- | Given a piece of primary sequence information, reduce it to an index.
--
-- Will throw an assertion in debug code if 'ps' are not within bounds. Note
-- that "mkPrimary [minBound]" and "mkPrimary [minBound,minBound]" map to the
-- same index. Meaning that indices are only unique within the same length
-- group. Furthermore, indices with different (l,u)-bounds are not compatible
-- with each other. All indices start at 0.
--
-- The empty input produces an index of 0.

mkHashedPrimary :: (Nuc,Nuc) -> Primary -> HashedPrimary
mkHashedPrimary (l,u) ps = assert (VU.all (\p -> l<=p && p<=u) ps) $ HashedPrimary idx where
  idx   = VU.sum $ VU.zipWith f ps (VU.enumFromStepN (VU.length ps -1) (-1) (VU.length ps))
  f p c = (unNuc p - unNuc l) * (cnst^c)
  cnst = unNuc u - unNuc l + 1
{-# INLINE mkHashedPrimary #-}