{-# LANGUAGE MultiParamTypeClasses #-}
{-# LANGUAGE StandaloneDeriving #-}
{-# LANGUAGE GeneralizedNewtypeDeriving #-}

-- | Fast hash functions for 'Primary' sequences. A hash is just an 'Int', so
-- use these only for short sequences.

module Biobase.Primary.Hashed where

import Control.Exception.Base (assert)
import Data.Ix
import Data.Primitive.Types
import qualified Data.Vector.Generic as VG
import qualified Data.Vector.Generic.Mutable as VGM
import qualified Data.Vector.Unboxed as VU

import Data.PrimitiveArray

import Biobase.Primary



-- | The hash (index) of a piece of primary sequence, as produced by
-- 'mkHashedPrimary'. This is a plain 'Int' wrapped in a newtype so that it
-- cannot be confused with other integers; the instances below are all derived
-- from the underlying 'Int'.

newtype HashedPrimary = HashedPrimary Int
  deriving (Eq,Ord,Ix,Read,Show,Enum,Bounded)

deriving instance Prim HashedPrimary
deriving instance VGM.MVector VU.MVector HashedPrimary
deriving instance VG.Vector   VU.Vector  HashedPrimary
deriving instance VU.Unbox HashedPrimary

-- | Given a piece of primary sequence information, reduce it to an index.
--
-- The sequence is read as a number in base @unNuc u - unNuc l + 1@, where
-- each element @p@ contributes the digit @unNuc p - unNuc l@ and the first
-- element is the most significant digit.
--
-- Will trigger an assertion in debug code if any element of @ps'@ is not
-- within the bounds @(l,u)@. Note that "mkPrimary [minBound]" and
-- "mkPrimary [minBound,minBound]" map to the same index. This means that
-- indices are only unique within the same length group. Furthermore, indices
-- computed with different (l,u)-bounds are not compatible with each other.
-- All indices start at 0.
--
-- The empty input produces an index of 0.
--
-- The result is accumulated in an 'Int'; no overflow check is performed, so
-- use this only for short sequences.
--
-- TODO still goes the inefficient way of creating a temporary vector from
-- @ps'@. We could in O(1) create a vector from a Primary ...

mkHashedPrimary :: (Nuc,Nuc) -> Primary -> HashedPrimary
mkHashedPrimary (l,u) ps' = assert (VU.all inBounds ps) $ HashedPrimary idx
  where
    -- Horner's rule: one multiplication and one addition per element. This
    -- yields the same value as summing @digit * cnst^position@, without
    -- building an exponent vector or exponentiating per element.
    idx = VU.foldl' step 0 ps
    step acc p = acc * cnst + (unNuc p - unNuc l)
    -- bounds check, only evaluated when assertions are enabled
    inBounds p = l <= p && p <= u
    -- number of distinct symbols in the inclusive range (l,u); the base
    cnst = unNuc u - unNuc l + 1
    ps = VU.fromList $ toList ps'
{-# INLINE mkHashedPrimary #-}