Biobase/Primary/Hashed.hs

{-# LANGUAGE MultiParamTypeClasses #-}
{-# LANGUAGE StandaloneDeriving #-}
{-# LANGUAGE GeneralizedNewtypeDeriving #-}
{-# LANGUAGE TemplateHaskell #-}
{-# LANGUAGE TypeFamilies #-}

-- | Fast hash functions for 'Primary' sequences. A hash is just an 'Int', so
-- use these only for short sequences.
--
-- TODO replace with standard hashing functions used by Haskell libs?

module Biobase.Primary.Hashed where

import           Control.Exception.Base (assert)
import           Data.Ix
import           Data.Primitive.Types
import           Data.Vector.Unboxed.Deriving
import qualified Data.Vector.Generic as VG
import qualified Data.Vector.Generic.Mutable as VGM
import qualified Data.Vector.Unboxed as VU

import           Biobase.Primary



-- | The hash of a piece of primary sequence: a plain 'Int' behind a newtype,
-- so that hashes are not mixed up with ordinary integers. Use
-- 'unHashedPrimary' to get the underlying 'Int' back.
newtype HashedPrimary = HashedPrimary
  { unHashedPrimary :: Int
  }
  deriving (Bounded,Enum,Eq,Ix,Ord,Read,Show)

-- Template Haskell splice using 'derivingUnbox' (imported from
-- Data.Vector.Unboxed.Deriving). The arguments are: a name tag, the mapping
-- from 'HashedPrimary' to its representation type 'Int', the function going
-- to the representation ('unHashedPrimary'), and the function coming back
-- (the 'HashedPrimary' constructor).
--
-- NOTE(review): presumably this generates the unboxed-vector instances, so
-- that 'HashedPrimary' values can be stored in "Data.Vector.Unboxed" vectors
-- exactly like the underlying 'Int's -- confirm against the vector-th-unbox
-- documentation.
derivingUnbox "HashedPrimary"
  [t| HashedPrimary -> Int |] [| unHashedPrimary |] [| HashedPrimary |]


-- | Given a piece of primary sequence information, reduce it to an index.
--
-- The sequence is read as a number written in base
-- @unNuc u - unNuc l + 1@: the first element is the most significant digit
-- and each element @p@ contributes the digit @unNuc p - unNuc l@. The number
-- is evaluated with Horner's rule in one strict left fold over 'ps', so
-- neither explicit powers of the base nor a vector of exponents are needed.
--
-- The first argument is the pair @(l,u)@ of inclusive lower and upper bounds
-- for the elements of 'ps'.
--
-- Will fail an assertion in debug code if 'ps' are not within bounds. Note
-- that @mkPrimary [minBound]@ and @mkPrimary [minBound,minBound]@ map to the
-- same index. Meaning that indices are only unique within the same length
-- group. Furthermore, indices with different (l,u)-bounds are not compatible
-- with each other. All indices start at 0.
--
-- The empty input produces an index of 0.
--
-- No overflow check is performed on the resulting 'Int', so use this only
-- for short sequences.

mkHashedPrimary :: (Nuc,Nuc) -> Primary -> HashedPrimary
mkHashedPrimary (l,u) ps = assert (VU.all inBounds ps) $ HashedPrimary idx where
  -- every element has to lie in the closed interval [l,u]
  inBounds p = l <= p && p <= u
  -- Horner's rule: shift what we have by one digit, then add the next digit
  idx        = VU.foldl' step 0 ps
  step acc p = acc * base + (unNuc p - lo)
  lo         = unNuc l
  -- number of distinct digits, i.e. the size of the alphabet [l,u]
  base       = unNuc u - lo + 1
{-# INLINE mkHashedPrimary #-}