{-# LANGUAGE GeneralizedNewtypeDeriving #-} -- | Fast hash functions for 'Primary' sequences. module Biobase.RNA.Hashes where import Control.DeepSeq import Control.Exception.Base (assert) import Data.Ix import qualified Data.Vector.Unboxed as VU import Biobase.RNA newtype HashedPrimary = HashedPrimary Int deriving (Eq,Ord,Ix,NFData,Read,Show,Enum,Bounded) -- | Given a piece of primary sequence information, reduce it to an index. -- -- Will throw an assertion in debug code if 'ps' are not within bounds. Note -- that "mkPrimary [minBound]" and "mkPrimary [minBound,minBound]" map to the -- same index. Meaning that indices are only unique within the same length -- group. Furthermore, indices with different (l,u)-bounds are not compatible -- with each other. All indices start at 0. -- -- The empty input produces an index of 0. mkHashedPrimary :: (Nucleotide,Nucleotide) -> Primary -> HashedPrimary mkHashedPrimary (l,u) ps = assert (VU.all (\p -> l<=p && p<=u) ps) $ HashedPrimary idx where idx = VU.sum $ VU.zipWith f ps (VU.enumFromStepN (VU.length ps -1) (-1) (VU.length ps)) f p c = (fromEnum p - fromEnum l) * (cnst^c) cnst = fromEnum u - fromEnum l + 1 {-# INLINE mkHashedPrimary #-}