{-# LANGUAGE MultiParamTypeClasses #-} {-# LANGUAGE StandaloneDeriving #-} {-# LANGUAGE GeneralizedNewtypeDeriving #-} -- | Fast hash functions for 'Primary' sequences. A hash is just an 'Int', so -- use these only for short sequences. module Biobase.RNA.Hashes where import Control.DeepSeq import Control.Exception.Base (assert) import Data.Ix import Data.Primitive.Types import qualified Data.Vector.Generic as VG import qualified Data.Vector.Generic.Mutable as VGM import qualified Data.Vector.Unboxed as VU import Biobase.RNA newtype HashedPrimary = HashedPrimary Int deriving (Eq,Ord,Ix,NFData,Read,Show,Enum,Bounded) deriving instance Prim HashedPrimary deriving instance VGM.MVector VU.MVector HashedPrimary deriving instance VG.Vector VU.Vector HashedPrimary deriving instance VU.Unbox HashedPrimary -- | Given a piece of primary sequence information, reduce it to an index. -- -- Will throw an assertion in debug code if 'ps' are not within bounds. Note -- that "mkPrimary [minBound]" and "mkPrimary [minBound,minBound]" map to the -- same index. Meaning that indices are only unique within the same length -- group. Furthermore, indices with different (l,u)-bounds are not compatible -- with each other. All indices start at 0. -- -- The empty input produces an index of 0. mkHashedPrimary :: (Nucleotide,Nucleotide) -> Primary -> HashedPrimary mkHashedPrimary (l,u) ps = assert (VU.all (\p -> l<=p && p<=u) ps) $ HashedPrimary idx where idx = VU.sum $ VU.zipWith f ps (VU.enumFromStepN (VU.length ps -1) (-1) (VU.length ps)) f p c = (fromEnum p - fromEnum l) * (cnst^c) cnst = fromEnum u - fromEnum l + 1 {-# INLINE mkHashedPrimary #-}