{-# OPTIONS_HADDOCK prune #-}

-- | Constructs a minimal perfect hash from a map of key-value pairs.
--
-- = Overview of algorithm
-- A two-input hash function @F(nonce, key)@ is used.
--
-- 1. Keys are hashed into buckets for the first round with a nonce of @0@.
-- 1. Iterating over each bucket of size @>= 2@ in order of decreasing size, keep
--    testing different nonce values until all members
--    of the bucket fall into open slots in the final array.
--    When a successful nonce is found, write it to the \"intermediate\" array
--    at the bucket's position.
-- 1. For each bucket of size @1@, select an arbitrary open slot in the final
--    array, and write the slot's
--    index (after negation and subtracting @1@) to the intermediate array.
--
-- According to <http://cmph.sourceforge.net/papers/esa09.pdf this paper>,
-- the algorithm is assured to run in linear time.
--
-- = Provenance
-- This implementation was adapted from
-- <http://stevehanov.ca/blog/index.php?id=119 Steve Hanov's Blog>.
-- A refactoring of that Python implementation may be found
-- <https://github.com/kostmo/perfect-hash-generator/blob/master/python/perfect-hash.py here>.
-- This Haskell implementation was transliterated and evolved from that refactoring.
--
module Data.PerfectHash.Construction (
    createMinimalPerfectHash
  ) where

import Control.Arrow (first)
import Data.Tuple (swap)
import           Data.Default             (Default, def)
import           Control.Monad            (join)
import Data.SortedList (SortedList, toSortedList, fromSortedList)
import           Data.Foldable            (foldl')
import qualified Data.IntSet              as IntSet
import           Data.IntSet              (IntSet)
import qualified Data.IntMap              as IntMap
import           Data.IntMap              (IntMap)
import qualified Data.Map as Map
import           Data.Map                 (Map)
import Data.Function (on)
import           Data.Ord                 (Down (Down))
import qualified Data.Vector      as Vector
import qualified Data.Maybe               as Maybe

import qualified Data.PerfectHash.Hashing as Hashing
import Data.PerfectHash.Hashing (Hash, ArraySize)
import qualified Data.PerfectHash.Lookup as Lookup
import Data.PerfectHash.Types.Nonces (Nonce)
import qualified Data.PerfectHash.Types.Nonces as Nonces


data AlgorithmParams = AlgorithmParams {
    AlgorithmParams -> Nonce -> Nonce
getNextNonceCandidate :: Nonce -> Nonce
  , AlgorithmParams -> Nonce
startingNonce :: Nonce
  }


data NonceOrDirect =
    WrappedNonce Nonce
  | DirectEntry Hashing.SlotIndex

instance Default NonceOrDirect where
  def :: NonceOrDirect
def = Nonce -> NonceOrDirect
WrappedNonce Nonce
forall a. Default a => a
def


-- | NOTE: Vector might perform better for these structures, but
-- the code may not be as clean.
data LookupTable a = NewLookupTable {
    LookupTable a -> IntMap NonceOrDirect
nonces :: IntMap NonceOrDirect
  , LookupTable a -> IntMap a
vals   :: IntMap a
  }


data SingletonBucket a = SingletonBucket Hash a
  deriving SingletonBucket a -> SingletonBucket a -> Bool
(SingletonBucket a -> SingletonBucket a -> Bool)
-> (SingletonBucket a -> SingletonBucket a -> Bool)
-> Eq (SingletonBucket a)
forall a. Eq a => SingletonBucket a -> SingletonBucket a -> Bool
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
/= :: SingletonBucket a -> SingletonBucket a -> Bool
$c/= :: forall a. Eq a => SingletonBucket a -> SingletonBucket a -> Bool
== :: SingletonBucket a -> SingletonBucket a -> Bool
$c== :: forall a. Eq a => SingletonBucket a -> SingletonBucket a -> Bool
Eq


data HashBucket a = HashBucket {
    HashBucket a -> Hash
_hashVal :: Hash
  , HashBucket a -> [a]
bucketMembers :: [a]
  }

instance Eq (HashBucket a) where 
  == :: HashBucket a -> HashBucket a -> Bool
(==) = Down Int -> Down Int -> Bool
forall a. Eq a => a -> a -> Bool
(==) (Down Int -> Down Int -> Bool)
-> (HashBucket a -> Down Int)
-> HashBucket a
-> HashBucket a
-> Bool
forall b c a. (b -> b -> c) -> (a -> b) -> a -> a -> c
`on` (Int -> Down Int
forall a. a -> Down a
Down (Int -> Down Int)
-> (HashBucket a -> Int) -> HashBucket a -> Down Int
forall b c a. (b -> c) -> (a -> b) -> a -> c
. [a] -> Int
forall (t :: * -> *) a. Foldable t => t a -> Int
length ([a] -> Int) -> (HashBucket a -> [a]) -> HashBucket a -> Int
forall b c a. (b -> c) -> (a -> b) -> a -> c
. HashBucket a -> [a]
forall a. HashBucket a -> [a]
bucketMembers)

instance Ord (HashBucket a) where 
  compare :: HashBucket a -> HashBucket a -> Ordering
compare = Down Int -> Down Int -> Ordering
forall a. Ord a => a -> a -> Ordering
compare (Down Int -> Down Int -> Ordering)
-> (HashBucket a -> Down Int)
-> HashBucket a
-> HashBucket a
-> Ordering
forall b c a. (b -> b -> c) -> (a -> b) -> a -> a -> c
`on` (Int -> Down Int
forall a. a -> Down a
Down (Int -> Down Int)
-> (HashBucket a -> Int) -> HashBucket a -> Down Int
forall b c a. (b -> c) -> (a -> b) -> a -> c
. [a] -> Int
forall (t :: * -> *) a. Foldable t => t a -> Int
length ([a] -> Int) -> (HashBucket a -> [a]) -> HashBucket a -> Int
forall b c a. (b -> c) -> (a -> b) -> a -> c
. HashBucket a -> [a]
forall a. HashBucket a -> [a]
bucketMembers)


data SizedList a = SizedList [a] ArraySize

data IntMapAndSize a = IntMapAndSize (IntMap a) ArraySize


-- | slots for each bucket with the current nonce attempt
data PlacementAttempt a = PlacementAttempt Nonce [SingletonBucket a]


data PartialSolution a b = PartialSolution (LookupTable b) [SingletonBucket (a, b)]


-- * Constants

emptyLookupTable :: LookupTable a
emptyLookupTable :: LookupTable a
emptyLookupTable = IntMap NonceOrDirect -> IntMap a -> LookupTable a
forall a. IntMap NonceOrDirect -> IntMap a -> LookupTable a
NewLookupTable IntMap NonceOrDirect
forall a. Monoid a => a
mempty IntMap a
forall a. Monoid a => a
mempty


defaultAlgorithmParams :: AlgorithmParams
defaultAlgorithmParams :: AlgorithmParams
defaultAlgorithmParams = (Nonce -> Nonce) -> Nonce -> AlgorithmParams
AlgorithmParams
  ((Int -> Int) -> Nonce -> Nonce
Nonces.mapNonce (Int -> Int -> Int
forall a. Num a => a -> a -> a
+Int
1))
  (Int -> Nonce
Nonces.Nonce Int
1)


-- * Functions

toRedirector :: NonceOrDirect -> Int
toRedirector :: NonceOrDirect -> Int
toRedirector (WrappedNonce (Nonces.Nonce Int
x)) = Int
x
toRedirector (DirectEntry SlotIndex
free_slot_index) =
  SlotIndex -> Int
Lookup.encodeDirectEntry SlotIndex
free_slot_index


convertToVector
  :: (Default a)
  => LookupTable a
  -> Lookup.LookupTable a
convertToVector :: LookupTable a -> LookupTable a
convertToVector LookupTable a
x = Vector Int -> Vector a -> LookupTable a
forall a. Vector Int -> Vector a -> LookupTable a
Lookup.LookupTable Vector Int
a1 Vector a
a2
  where
    size :: Int
size = IntMap a -> Int
forall (t :: * -> *) a. Foldable t => t a -> Int
length (IntMap a -> Int) -> IntMap a -> Int
forall a b. (a -> b) -> a -> b
$ LookupTable a -> IntMap a
forall a. LookupTable a -> IntMap a
vals LookupTable a
x
    
    vectorizeNonces :: IntMap NonceOrDirect -> Vector Int
vectorizeNonces IntMap NonceOrDirect
input = Int -> (Int -> Int) -> Vector Int
forall a. Int -> (Int -> a) -> Vector a
Vector.generate Int
size ((Int -> Int) -> Vector Int) -> (Int -> Int) -> Vector Int
forall a b. (a -> b) -> a -> b
$
      NonceOrDirect -> Int
toRedirector (NonceOrDirect -> Int) -> (Int -> NonceOrDirect) -> Int -> Int
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Int -> IntMap NonceOrDirect -> NonceOrDirect)
-> IntMap NonceOrDirect -> Int -> NonceOrDirect
forall a b c. (a -> b -> c) -> b -> a -> c
flip (NonceOrDirect -> Int -> IntMap NonceOrDirect -> NonceOrDirect
forall a. a -> Int -> IntMap a -> a
IntMap.findWithDefault NonceOrDirect
forall a. Default a => a
def) IntMap NonceOrDirect
input

    a1 :: Vector Int
a1 = IntMap NonceOrDirect -> Vector Int
vectorizeNonces (IntMap NonceOrDirect -> Vector Int)
-> IntMap NonceOrDirect -> Vector Int
forall a b. (a -> b) -> a -> b
$ LookupTable a -> IntMap NonceOrDirect
forall a. LookupTable a -> IntMap NonceOrDirect
nonces LookupTable a
x


    vectorizeVals :: IntMap a -> Vector a
vectorizeVals IntMap a
input = Int -> (Int -> a) -> Vector a
forall a. Int -> (Int -> a) -> Vector a
Vector.generate Int
size ((Int -> a) -> Vector a) -> (Int -> a) -> Vector a
forall a b. (a -> b) -> a -> b
$
      (Int -> IntMap a -> a) -> IntMap a -> Int -> a
forall a b c. (a -> b -> c) -> b -> a -> c
flip (a -> Int -> IntMap a -> a
forall a. a -> Int -> IntMap a -> a
IntMap.findWithDefault a
forall a. Default a => a
def) IntMap a
input

    a2 :: Vector a
a2 = IntMap a -> Vector a
forall a. Default a => IntMap a -> Vector a
vectorizeVals (IntMap a -> Vector a) -> IntMap a -> Vector a
forall a b. (a -> b) -> a -> b
$ LookupTable a -> IntMap a
forall a. LookupTable a -> IntMap a
vals LookupTable a
x


-- | Computes a slot in the destination array (Data.PerfectHash.Lookup.values)
-- for every element in this multi-entry bucket, for the given nonce.
--
-- Return a Nothing for a slot if it collides.
--
-- This function is able to fail fast if one of the elements of the bucket
-- yields a collision when using the new nonce.
attemptNonceRecursive
  :: Hashing.ToHashableChunks a
  => IntMapAndSize b
  -> Nonce
  -> IntSet -- ^ occupied slots
  -> [(a, b)] -- ^ keys
  -> [Maybe Hashing.SlotIndex]
attemptNonceRecursive :: IntMapAndSize b -> Nonce -> IntSet -> [(a, b)] -> [Maybe SlotIndex]
attemptNonceRecursive IntMapAndSize b
_ Nonce
_ IntSet
_ [] = []
attemptNonceRecursive
    IntMapAndSize b
values_and_size
    Nonce
nonce
    IntSet
occupied_slots
    ((a
current_key, b
_):[(a, b)]
remaining_bucket_keys) =

  if Bool
cannot_use_slot
    then Maybe SlotIndex -> [Maybe SlotIndex]
forall (f :: * -> *) a. Applicative f => a -> f a
pure Maybe SlotIndex
forall a. Maybe a
Nothing
    else SlotIndex -> Maybe SlotIndex
forall a. a -> Maybe a
Just SlotIndex
slot Maybe SlotIndex -> [Maybe SlotIndex] -> [Maybe SlotIndex]
forall a. a -> [a] -> [a]
: [Maybe SlotIndex]
recursive_result

  where
    IntMapAndSize IntMap b
values ArraySize
size = IntMapAndSize b
values_and_size
    slot :: SlotIndex
slot = Nonce -> ArraySize -> a -> SlotIndex
forall a.
ToHashableChunks a =>
Nonce -> ArraySize -> a -> SlotIndex
Hashing.hashToSlot Nonce
nonce ArraySize
size a
current_key

    Hashing.SlotIndex Int
slotval = SlotIndex
slot

    -- TODO: Create a record "SlotOccupation" to encapsulate the IntSet implementation
    cannot_use_slot :: Bool
cannot_use_slot = Int -> IntSet -> Bool
IntSet.member Int
slotval IntSet
occupied_slots Bool -> Bool -> Bool
|| Int -> IntMap b -> Bool
forall a. Int -> IntMap a -> Bool
IntMap.member Int
slotval IntMap b
values

    recursive_result :: [Maybe SlotIndex]
recursive_result = IntMapAndSize b -> Nonce -> IntSet -> [(a, b)] -> [Maybe SlotIndex]
forall a b.
ToHashableChunks a =>
IntMapAndSize b -> Nonce -> IntSet -> [(a, b)] -> [Maybe SlotIndex]
attemptNonceRecursive
      IntMapAndSize b
values_and_size
      Nonce
nonce
      (Int -> IntSet -> IntSet
IntSet.insert Int
slotval IntSet
occupied_slots)
      [(a, b)]
remaining_bucket_keys


-- | Repeatedly try different values of the nonce until we find a hash function
-- that places all items in the bucket into free slots.
--
-- Increment the candidate nonce by @1@ each time.
-- Theoretically we're guaranteed to eventually find a solution.
findNonceForBucketRecursive
  :: (Hashing.ToHashableChunks a)
  => AlgorithmParams
  -> Nonce -- ^ nonce to attempt
  -> IntMapAndSize b
  -> [(a, b)] -- ^ colliding keys for this bucket
  -> PlacementAttempt (a, b)
findNonceForBucketRecursive :: AlgorithmParams
-> Nonce -> IntMapAndSize b -> [(a, b)] -> PlacementAttempt (a, b)
findNonceForBucketRecursive AlgorithmParams
algorithm_params Nonce
nonce_attempt IntMapAndSize b
values_and_size [(a, b)]
bucket =

  -- This is a "lazy" (and awkward) way to specify recursion:
  -- If the result ("result_for_this_iteration") at this iteration of the recursion
  -- is not "Nothing", then, wrap it in a "PlacementAttempt" record.
  -- Otherwise, descend one layer deeper by computing "recursive_result".
  PlacementAttempt (a, b)
-> ([SlotIndex] -> PlacementAttempt (a, b))
-> Maybe [SlotIndex]
-> PlacementAttempt (a, b)
forall b a. b -> (a -> b) -> Maybe a -> b
maybe
    PlacementAttempt (a, b)
recursive_result
    [SlotIndex] -> PlacementAttempt (a, b)
wrapSlotIndicesAsAttempt
    Maybe [SlotIndex]
maybe_final_result

  where
    wrapSlotIndicesAsAttempt :: [SlotIndex] -> PlacementAttempt (a, b)
wrapSlotIndicesAsAttempt = Nonce -> [SingletonBucket (a, b)] -> PlacementAttempt (a, b)
forall a. Nonce -> [SingletonBucket a] -> PlacementAttempt a
PlacementAttempt Nonce
nonce_attempt ([SingletonBucket (a, b)] -> PlacementAttempt (a, b))
-> ([SlotIndex] -> [SingletonBucket (a, b)])
-> [SlotIndex]
-> PlacementAttempt (a, b)
forall b c a. (b -> c) -> (a -> b) -> a -> c
.
      ([Hash] -> [(a, b)] -> [SingletonBucket (a, b)])
-> [(a, b)] -> [Hash] -> [SingletonBucket (a, b)]
forall a b c. (a -> b -> c) -> b -> a -> c
flip ((Hash -> (a, b) -> SingletonBucket (a, b))
-> [Hash] -> [(a, b)] -> [SingletonBucket (a, b)]
forall a b c. (a -> b -> c) -> [a] -> [b] -> [c]
zipWith Hash -> (a, b) -> SingletonBucket (a, b)
forall a. Hash -> a -> SingletonBucket a
SingletonBucket) [(a, b)]
bucket ([Hash] -> [SingletonBucket (a, b)])
-> ([SlotIndex] -> [Hash])
-> [SlotIndex]
-> [SingletonBucket (a, b)]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (SlotIndex -> Hash) -> [SlotIndex] -> [Hash]
forall a b. (a -> b) -> [a] -> [b]
map (Int -> Hash
Hashing.Hash (Int -> Hash) -> (SlotIndex -> Int) -> SlotIndex -> Hash
forall b c a. (b -> c) -> (a -> b) -> a -> c
. SlotIndex -> Int
Hashing.getIndex)

    -- NOTE: attemptNonceRecursive returns a list of "Maybe SlotIndex"
    -- records. If *any* of those elements are Nothing (that is, at
    -- least one of the slots were not successfully placed), then applying
    -- sequenceA to that list will yield Nothing.
    maybe_final_result :: Maybe [SlotIndex]
maybe_final_result = [Maybe SlotIndex] -> Maybe [SlotIndex]
forall (t :: * -> *) (f :: * -> *) a.
(Traversable t, Applicative f) =>
t (f a) -> f (t a)
sequenceA ([Maybe SlotIndex] -> Maybe [SlotIndex])
-> [Maybe SlotIndex] -> Maybe [SlotIndex]
forall a b. (a -> b) -> a -> b
$ IntMapAndSize b -> Nonce -> IntSet -> [(a, b)] -> [Maybe SlotIndex]
forall a b.
ToHashableChunks a =>
IntMapAndSize b -> Nonce -> IntSet -> [(a, b)] -> [Maybe SlotIndex]
attemptNonceRecursive
      IntMapAndSize b
values_and_size
      Nonce
nonce_attempt
      IntSet
forall a. Monoid a => a
mempty
      [(a, b)]
bucket

    recursive_result :: PlacementAttempt (a, b)
recursive_result = AlgorithmParams
-> Nonce -> IntMapAndSize b -> [(a, b)] -> PlacementAttempt (a, b)
forall a b.
ToHashableChunks a =>
AlgorithmParams
-> Nonce -> IntMapAndSize b -> [(a, b)] -> PlacementAttempt (a, b)
findNonceForBucketRecursive
      AlgorithmParams
algorithm_params
      (AlgorithmParams -> Nonce -> Nonce
getNextNonceCandidate AlgorithmParams
algorithm_params Nonce
nonce_attempt)
      IntMapAndSize b
values_and_size
      [(a, b)]
bucket


-- | Searches for a nonce for this bucket, starting with the value @1@,
-- until one is found that results in no collisions for both this bucket
-- and all previously placed buckets.
processMultiEntryBuckets
  :: (Hashing.ToHashableChunks a)
  => AlgorithmParams
  -> ArraySize
  -> LookupTable b
  -> HashBucket (a, b)
  -> LookupTable b
processMultiEntryBuckets :: AlgorithmParams
-> ArraySize -> LookupTable b -> HashBucket (a, b) -> LookupTable b
processMultiEntryBuckets
    AlgorithmParams
algorithm_params
    ArraySize
size
    LookupTable b
old_lookup_table
    (HashBucket Hash
computed_hash [(a, b)]
bucket_members) =

  IntMap NonceOrDirect -> IntMap b -> LookupTable b
forall a. IntMap NonceOrDirect -> IntMap a -> LookupTable a
NewLookupTable IntMap NonceOrDirect
new_nonces IntMap b
new_values_dict
  where
    NewLookupTable IntMap NonceOrDirect
old_nonces IntMap b
old_values_dict = LookupTable b
old_lookup_table

    sized_vals_dict :: IntMapAndSize b
sized_vals_dict = IntMap b -> ArraySize -> IntMapAndSize b
forall a. IntMap a -> ArraySize -> IntMapAndSize a
IntMapAndSize IntMap b
old_values_dict ArraySize
size

    -- This is assured to succeed; it starts with a nonce of 1
    -- but keeps incrementing it until all of the keys in this
    -- bucket are placeable.
    PlacementAttempt Nonce
nonce [SingletonBucket (a, b)]
slots_for_bucket =
      AlgorithmParams
-> Nonce -> IntMapAndSize b -> [(a, b)] -> PlacementAttempt (a, b)
forall a b.
ToHashableChunks a =>
AlgorithmParams
-> Nonce -> IntMapAndSize b -> [(a, b)] -> PlacementAttempt (a, b)
findNonceForBucketRecursive
        AlgorithmParams
algorithm_params
        (AlgorithmParams -> Nonce
startingNonce AlgorithmParams
algorithm_params)
        IntMapAndSize b
sized_vals_dict
        [(a, b)]
bucket_members

    new_nonces :: IntMap NonceOrDirect
new_nonces = Int
-> NonceOrDirect -> IntMap NonceOrDirect -> IntMap NonceOrDirect
forall a. Int -> a -> IntMap a -> IntMap a
IntMap.insert
      (Hash -> Int
Hashing.getHash Hash
computed_hash)
      (Nonce -> NonceOrDirect
WrappedNonce Nonce
nonce)
      IntMap NonceOrDirect
old_nonces

    new_values_dict :: IntMap b
new_values_dict = (SingletonBucket (a, b) -> IntMap b -> IntMap b)
-> IntMap b -> [SingletonBucket (a, b)] -> IntMap b
forall (t :: * -> *) a b.
Foldable t =>
(a -> b -> b) -> b -> t a -> b
foldr SingletonBucket (a, b) -> IntMap b -> IntMap b
forall a a. SingletonBucket (a, a) -> IntMap a -> IntMap a
place_values IntMap b
old_values_dict [SingletonBucket (a, b)]
slots_for_bucket

    place_values :: SingletonBucket (a, a) -> IntMap a -> IntMap a
place_values (SingletonBucket Hash
slot_val (a
_, a
value)) =
      Int -> a -> IntMap a -> IntMap a
forall a. Int -> a -> IntMap a -> IntMap a
IntMap.insert (Hash -> Int
Hashing.getHash Hash
slot_val) a
value


-- | This function exploits the sorted structure of the list
-- by skimming the multi-entry buckets from the front of the
-- list. Then we filter the single-entry buckets by dropping
-- the empty buckets.
--
-- The partial solution produced by this function entails
-- all of the colliding nonces as fully placed.
handleCollidingNonces
  :: (Hashing.ToHashableChunks a)
  => AlgorithmParams
  -> ArraySize
  -> SortedList (HashBucket (a, b))
  -> PartialSolution a b
handleCollidingNonces :: AlgorithmParams
-> ArraySize
-> SortedList (HashBucket (a, b))
-> PartialSolution a b
handleCollidingNonces AlgorithmParams
algorithm_params ArraySize
size SortedList (HashBucket (a, b))
sorted_bucket_hash_tuples =

  LookupTable b -> [SingletonBucket (a, b)] -> PartialSolution a b
forall a b.
LookupTable b -> [SingletonBucket (a, b)] -> PartialSolution a b
PartialSolution LookupTable b
lookup_table [SingletonBucket (a, b)]
non_colliding_buckets
  where

    -- Since the buckets have been sorted by descending size,
    -- once we get to the bucket with 1 or fewer elements,
    -- we know there are no more collision buckets.
    ([HashBucket (a, b)]
multi_entry_buckets, [HashBucket (a, b)]
single_or_fewer_buckets) =
      (HashBucket (a, b) -> Bool)
-> [HashBucket (a, b)]
-> ([HashBucket (a, b)], [HashBucket (a, b)])
forall a. (a -> Bool) -> [a] -> ([a], [a])
span ((Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
> Int
1) (Int -> Bool)
-> (HashBucket (a, b) -> Int) -> HashBucket (a, b) -> Bool
forall b c a. (b -> c) -> (a -> b) -> a -> c
. [(a, b)] -> Int
forall (t :: * -> *) a. Foldable t => t a -> Int
length ([(a, b)] -> Int)
-> (HashBucket (a, b) -> [(a, b)]) -> HashBucket (a, b) -> Int
forall b c a. (b -> c) -> (a -> b) -> a -> c
. HashBucket (a, b) -> [(a, b)]
forall a. HashBucket a -> [a]
bucketMembers) ([HashBucket (a, b)] -> ([HashBucket (a, b)], [HashBucket (a, b)]))
-> [HashBucket (a, b)]
-> ([HashBucket (a, b)], [HashBucket (a, b)])
forall a b. (a -> b) -> a -> b
$
        SortedList (HashBucket (a, b)) -> [HashBucket (a, b)]
forall a. SortedList a -> [a]
fromSortedList SortedList (HashBucket (a, b))
sorted_bucket_hash_tuples

    -- XXX Using 'foldl' rather than 'foldr' is crucial here, given the order
    -- of the buckets. 'foldr' would actually try to place the smallest buckets
    -- first, making it improbable that the large buckets will be placeable,
    -- and potentially resulting in an infinite loop.
    lookup_table :: LookupTable b
lookup_table = (LookupTable b -> HashBucket (a, b) -> LookupTable b)
-> LookupTable b -> [HashBucket (a, b)] -> LookupTable b
forall (t :: * -> *) b a.
Foldable t =>
(b -> a -> b) -> b -> t a -> b
foldl'
      (AlgorithmParams
-> ArraySize -> LookupTable b -> HashBucket (a, b) -> LookupTable b
forall a b.
ToHashableChunks a =>
AlgorithmParams
-> ArraySize -> LookupTable b -> HashBucket (a, b) -> LookupTable b
processMultiEntryBuckets AlgorithmParams
algorithm_params ArraySize
size)
      LookupTable b
forall a. LookupTable a
emptyLookupTable
      [HashBucket (a, b)]
multi_entry_buckets

    non_colliding_buckets :: [SingletonBucket (a, b)]
non_colliding_buckets = (HashBucket (a, b) -> Maybe (SingletonBucket (a, b)))
-> [HashBucket (a, b)] -> [SingletonBucket (a, b)]
forall a b. (a -> Maybe b) -> [a] -> [b]
Maybe.mapMaybe
      HashBucket (a, b) -> Maybe (SingletonBucket (a, b))
forall a. HashBucket a -> Maybe (SingletonBucket a)
convertToSingletonBucket
      [HashBucket (a, b)]
single_or_fewer_buckets

    convertToSingletonBucket :: HashBucket a -> Maybe (SingletonBucket a)
convertToSingletonBucket (HashBucket Hash
hashVal [a]
elements) =
      Hash -> a -> SingletonBucket a
forall a. Hash -> a -> SingletonBucket a
SingletonBucket Hash
hashVal (a -> SingletonBucket a) -> Maybe a -> Maybe (SingletonBucket a)
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> [a] -> Maybe a
forall a. [a] -> Maybe a
Maybe.listToMaybe [a]
elements


-- | Hash the keys into buckets and sort them by descending size
preliminaryBucketPlacement
  :: (Hashing.ToHashableChunks a)
  => SizedList (a, b)
  -> SortedList (HashBucket (a, b))
preliminaryBucketPlacement :: SizedList (a, b) -> SortedList (HashBucket (a, b))
preliminaryBucketPlacement SizedList (a, b)
sized_list =
  [HashBucket (a, b)] -> SortedList (HashBucket (a, b))
forall a. Ord a => [a] -> SortedList a
toSortedList [HashBucket (a, b)]
bucket_hash_tuples
  where
    SizedList [(a, b)]
tuplified_words_dict ArraySize
size = SizedList (a, b)
sized_list

    f :: (a, b) -> Int
f = SlotIndex -> Int
Hashing.getIndex (SlotIndex -> Int) -> ((a, b) -> SlotIndex) -> (a, b) -> Int
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Nonce -> ArraySize -> a -> SlotIndex
forall a.
ToHashableChunks a =>
Nonce -> ArraySize -> a -> SlotIndex
Hashing.hashToSlot (Int -> Nonce
Nonces.Nonce Int
0) ArraySize
size (a -> SlotIndex) -> ((a, b) -> a) -> (a, b) -> SlotIndex
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (a, b) -> a
forall a b. (a, b) -> a
fst

    slot_key_pairs :: [((a, b), Int)]
slot_key_pairs = ((a, b) -> Int) -> [(a, b)] -> [((a, b), Int)]
forall (t :: * -> *) a b. Functor t => (a -> b) -> t a -> t (a, b)
deriveTuples (a, b) -> Int
forall b. (a, b) -> Int
f [(a, b)]
tuplified_words_dict

    bucket_hash_tuples :: [HashBucket (a, b)]
bucket_hash_tuples = ((Int, [(a, b)]) -> HashBucket (a, b))
-> [(Int, [(a, b)])] -> [HashBucket (a, b)]
forall a b. (a -> b) -> [a] -> [b]
map ((Hash -> [(a, b)] -> HashBucket (a, b))
-> (Hash, [(a, b)]) -> HashBucket (a, b)
forall a b c. (a -> b -> c) -> (a, b) -> c
uncurry Hash -> [(a, b)] -> HashBucket (a, b)
forall a. Hash -> [a] -> HashBucket a
HashBucket ((Hash, [(a, b)]) -> HashBucket (a, b))
-> ((Int, [(a, b)]) -> (Hash, [(a, b)]))
-> (Int, [(a, b)])
-> HashBucket (a, b)
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Int -> Hash) -> (Int, [(a, b)]) -> (Hash, [(a, b)])
forall (a :: * -> * -> *) b c d.
Arrow a =>
a b c -> a (b, d) (c, d)
first Int -> Hash
Hashing.Hash) ([(Int, [(a, b)])] -> [HashBucket (a, b)])
-> [(Int, [(a, b)])] -> [HashBucket (a, b)]
forall a b. (a -> b) -> a -> b
$
      IntMap [(a, b)] -> [(Int, [(a, b)])]
forall a. IntMap a -> [(Int, a)]
IntMap.toList (IntMap [(a, b)] -> [(Int, [(a, b)])])
-> IntMap [(a, b)] -> [(Int, [(a, b)])]
forall a b. (a -> b) -> a -> b
$ [((a, b), Int)] -> IntMap [(a, b)]
forall (t :: * -> *) a. Foldable t => t (a, Int) -> IntMap [a]
binTuplesBySecond [((a, b), Int)]
slot_key_pairs


-- | Arbitrarily pair the non-colliding buckets with free slots.
--
-- At this point, all of the "colliding" hashes have been resolved
-- to their own slots, so we just take the leftovers.
assignDirectSlots
  :: ArraySize
  -> PartialSolution a b
  -> LookupTable b
assignDirectSlots :: ArraySize -> PartialSolution a b -> LookupTable b
assignDirectSlots ArraySize
size (PartialSolution LookupTable b
intermediate_lookup_table [SingletonBucket (a, b)]
non_colliding_buckets) =
  IntMap NonceOrDirect -> IntMap b -> LookupTable b
forall a. IntMap NonceOrDirect -> IntMap a -> LookupTable a
NewLookupTable IntMap NonceOrDirect
final_nonces IntMap b
final_values
  where
    isUnusedSlot :: SlotIndex -> Bool
isUnusedSlot (Hashing.SlotIndex Int
s) =
      Bool -> Bool
not (Bool -> Bool) -> Bool -> Bool
forall a b. (a -> b) -> a -> b
$ Int -> IntMap b -> Bool
forall a. Int -> IntMap a -> Bool
IntMap.member Int
s (IntMap b -> Bool) -> IntMap b -> Bool
forall a b. (a -> b) -> a -> b
$ LookupTable b -> IntMap b
forall a. LookupTable a -> IntMap a
vals LookupTable b
intermediate_lookup_table

    unused_slots :: [SlotIndex]
unused_slots = (SlotIndex -> Bool) -> [SlotIndex] -> [SlotIndex]
forall a. (a -> Bool) -> [a] -> [a]
filter SlotIndex -> Bool
isUnusedSlot ([SlotIndex] -> [SlotIndex]) -> [SlotIndex] -> [SlotIndex]
forall a b. (a -> b) -> a -> b
$ ArraySize -> [SlotIndex]
Hashing.generateArrayIndices ArraySize
size

    zipped_remaining_with_unused_slots :: [(SingletonBucket (a, b), SlotIndex)]
zipped_remaining_with_unused_slots =
      [SingletonBucket (a, b)]
-> [SlotIndex] -> [(SingletonBucket (a, b), SlotIndex)]
forall a b. [a] -> [b] -> [(a, b)]
zip [SingletonBucket (a, b)]
non_colliding_buckets [SlotIndex]
unused_slots

    insertDirectEntry :: (SingletonBucket a, SlotIndex)
-> IntMap NonceOrDirect -> IntMap NonceOrDirect
insertDirectEntry (SingletonBucket Hash
computed_hash a
_, SlotIndex
free_slot_index) =
      -- Observe here that both the output and input
      -- are nonces:
      Int
-> NonceOrDirect -> IntMap NonceOrDirect -> IntMap NonceOrDirect
forall a. Int -> a -> IntMap a -> IntMap a
IntMap.insert (Hash -> Int
Hashing.getHash Hash
computed_hash) (NonceOrDirect -> IntMap NonceOrDirect -> IntMap NonceOrDirect)
-> NonceOrDirect -> IntMap NonceOrDirect -> IntMap NonceOrDirect
forall a b. (a -> b) -> a -> b
$ SlotIndex -> NonceOrDirect
DirectEntry SlotIndex
free_slot_index

    final_nonces :: IntMap NonceOrDirect
final_nonces = ((SingletonBucket (a, b), SlotIndex)
 -> IntMap NonceOrDirect -> IntMap NonceOrDirect)
-> IntMap NonceOrDirect
-> [(SingletonBucket (a, b), SlotIndex)]
-> IntMap NonceOrDirect
forall (t :: * -> *) a b.
Foldable t =>
(a -> b -> b) -> b -> t a -> b
foldr
      (SingletonBucket (a, b), SlotIndex)
-> IntMap NonceOrDirect -> IntMap NonceOrDirect
forall a.
(SingletonBucket a, SlotIndex)
-> IntMap NonceOrDirect -> IntMap NonceOrDirect
insertDirectEntry
      (LookupTable b -> IntMap NonceOrDirect
forall a. LookupTable a -> IntMap NonceOrDirect
nonces LookupTable b
intermediate_lookup_table)
      [(SingletonBucket (a, b), SlotIndex)]
zipped_remaining_with_unused_slots

    f2 :: (SingletonBucket (a, a), SlotIndex) -> IntMap a -> IntMap a
f2 (SingletonBucket Hash
_ (a
_, a
map_value), Hashing.SlotIndex Int
free_slot_index) =
      Int -> a -> IntMap a -> IntMap a
forall a. Int -> a -> IntMap a -> IntMap a
IntMap.insert Int
free_slot_index a
map_value

    final_values :: IntMap b
final_values = ((SingletonBucket (a, b), SlotIndex) -> IntMap b -> IntMap b)
-> IntMap b -> [(SingletonBucket (a, b), SlotIndex)] -> IntMap b
forall (t :: * -> *) a b.
Foldable t =>
(a -> b -> b) -> b -> t a -> b
foldr
      (SingletonBucket (a, b), SlotIndex) -> IntMap b -> IntMap b
forall a a.
(SingletonBucket (a, a), SlotIndex) -> IntMap a -> IntMap a
f2
      (LookupTable b -> IntMap b
forall a. LookupTable a -> IntMap a
vals LookupTable b
intermediate_lookup_table)
      [(SingletonBucket (a, b), SlotIndex)]
zipped_remaining_with_unused_slots


-- | Generates a minimal perfect hash for a set of key-value pairs.
--
-- The keys must be instances of 'Hashing.ToHashableChunks'.
-- The values may be of arbitrary type.
--
-- A 'Map' is required as input to guarantee that there are
-- no duplicate keys.
createMinimalPerfectHash
  :: (Hashing.ToHashableChunks k, Default v)
  => Map k v -- ^ key-value pairs
  -> Lookup.LookupTable v
     -- ^ output for use by 'Lookup.lookup' or a custom code generator
createMinimalPerfectHash :: Map k v -> LookupTable v
createMinimalPerfectHash Map k v
original_words_dict =
  LookupTable v -> LookupTable v
forall a. Default a => LookupTable a -> LookupTable a
convertToVector LookupTable v
final_solution
  where
    tuplified_words_dict :: [(k, v)]
tuplified_words_dict = Map k v -> [(k, v)]
forall k a. Map k a -> [(k, a)]
Map.toList Map k v
original_words_dict
    size :: ArraySize
size = Int -> ArraySize
Hashing.ArraySize (Int -> ArraySize) -> Int -> ArraySize
forall a b. (a -> b) -> a -> b
$ [(k, v)] -> Int
forall (t :: * -> *) a. Foldable t => t a -> Int
length [(k, v)]
tuplified_words_dict
    sized_list :: SizedList (k, v)
sized_list = [(k, v)] -> ArraySize -> SizedList (k, v)
forall a. [a] -> ArraySize -> SizedList a
SizedList [(k, v)]
tuplified_words_dict ArraySize
size

    sorted_bucket_hash_tuples :: SortedList (HashBucket (k, v))
sorted_bucket_hash_tuples = SizedList (k, v) -> SortedList (HashBucket (k, v))
forall a b.
ToHashableChunks a =>
SizedList (a, b) -> SortedList (HashBucket (a, b))
preliminaryBucketPlacement SizedList (k, v)
sized_list

    partial_solution :: PartialSolution k v
partial_solution = AlgorithmParams
-> ArraySize
-> SortedList (HashBucket (k, v))
-> PartialSolution k v
forall a b.
ToHashableChunks a =>
AlgorithmParams
-> ArraySize
-> SortedList (HashBucket (a, b))
-> PartialSolution a b
handleCollidingNonces
      AlgorithmParams
defaultAlgorithmParams
      ArraySize
size
      SortedList (HashBucket (k, v))
sorted_bucket_hash_tuples

    final_solution :: LookupTable v
final_solution = ArraySize -> PartialSolution k v -> LookupTable v
forall a b. ArraySize -> PartialSolution a b -> LookupTable b
assignDirectSlots ArraySize
size PartialSolution k v
partial_solution


-- * Utility functions

-- | Place the first elements of the tuples into bins according to the second
-- element.
binTuplesBySecond
  :: (Foldable t)
  => t (a, Int)
  -> IntMap [a]
binTuplesBySecond :: t (a, Int) -> IntMap [a]
binTuplesBySecond = ((a, Int) -> IntMap [a] -> IntMap [a])
-> IntMap [a] -> t (a, Int) -> IntMap [a]
forall (t :: * -> *) a b.
Foldable t =>
(a -> b -> b) -> b -> t a -> b
foldr (a, Int) -> IntMap [a] -> IntMap [a]
f IntMap [a]
forall a. Monoid a => a
mempty
  where
    f :: (a, Int) -> IntMap [a] -> IntMap [a]
f = (Int -> [a] -> IntMap [a] -> IntMap [a])
-> (Int, [a]) -> IntMap [a] -> IntMap [a]
forall a b c. (a -> b -> c) -> (a, b) -> c
uncurry (([a] -> [a] -> [a]) -> Int -> [a] -> IntMap [a] -> IntMap [a]
forall a. (a -> a -> a) -> Int -> a -> IntMap a -> IntMap a
IntMap.insertWith [a] -> [a] -> [a]
forall a. Monoid a => a -> a -> a
mappend) ((Int, [a]) -> IntMap [a] -> IntMap [a])
-> ((a, Int) -> (Int, [a])) -> (a, Int) -> IntMap [a] -> IntMap [a]
forall b c a. (b -> c) -> (a -> b) -> a -> c
.
      (a -> [a]) -> (Int, a) -> (Int, [a])
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap a -> [a]
forall (f :: * -> *) a. Applicative f => a -> f a
pure ((Int, a) -> (Int, [a]))
-> ((a, Int) -> (Int, a)) -> (a, Int) -> (Int, [a])
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (a, Int) -> (Int, a)
forall a b. (a, b) -> (b, a)
swap


-- | duplicates the argument into both members of the tuple
duple :: a -> (a, a)
duple :: a -> (a, a)
duple = (a -> a -> (a, a)) -> a -> (a, a)
forall (m :: * -> *) a. Monad m => m (m a) -> m a
join (,)


-- | Given a function and a value, create a pair
-- where the first element is the value, and the
-- second element is the function applied to the value
derivePair :: (a -> b) -> a -> (a, b)
derivePair :: (a -> b) -> a -> (a, b)
derivePair a -> b
g = (a -> b) -> (a, a) -> (a, b)
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap a -> b
g ((a, a) -> (a, b)) -> (a -> (a, a)) -> a -> (a, b)
forall b c a. (b -> c) -> (a -> b) -> a -> c
. a -> (a, a)
forall a. a -> (a, a)
duple


deriveTuples :: Functor t => (a -> b) -> t a -> t (a, b)
deriveTuples :: (a -> b) -> t a -> t (a, b)
deriveTuples = (a -> (a, b)) -> t a -> t (a, b)
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap ((a -> (a, b)) -> t a -> t (a, b))
-> ((a -> b) -> a -> (a, b)) -> (a -> b) -> t a -> t (a, b)
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (a -> b) -> a -> (a, b)
forall a b. (a -> b) -> a -> (a, b)
derivePair