{-# OPTIONS_HADDOCK prune #-}

-- | Constructs a minimal perfect hash from a map of key-value pairs.
--
-- = Overview of algorithm
-- A two-input hash function @F(nonce, key)@ is used.
--
-- 1. Keys are hashed into buckets for the first round with a nonce of @0@.
-- 1. Iterating over each bucket of size @>= 2@ in order of decreasing size,
--    keep testing different nonce values until all members of the bucket
--    fall into open slots in the final array.
--    When a successful nonce is found, write it to the \"intermediate\" array
--    at the bucket's position.
-- 1. For each bucket of size @1@, select an arbitrary open slot in the
--    final array, and write the slot's index (after negation and
--    subtracting @1@) to the intermediate array.
--
-- According to ,
-- the algorithm is assured to run in linear time.
--
-- = Provenance
-- This implementation was adapted from
-- .
-- A refactoring of that Python implementation may be found
-- .
-- This Haskell implementation was transliterated and evolved from that
-- refactoring.
module Data.PerfectHash.Construction (
    createMinimalPerfectHash
  ) where

import Control.Arrow (first)
import Data.Tuple (swap)
import Data.Default (Default, def)
import Control.Monad (join)
import Data.SortedList (SortedList, toSortedList, fromSortedList)
import Data.Foldable (foldl')
import qualified Data.IntSet as IntSet
import Data.IntSet (IntSet)
import qualified Data.IntMap as IntMap
import Data.IntMap (IntMap)
import qualified Data.Map as Map
import Data.Map (Map)
import Data.Function (on)
import Data.Ord (Down (Down))
import qualified Data.Vector as Vector
import qualified Data.Maybe as Maybe

import qualified Data.PerfectHash.Hashing as Hashing
import Data.PerfectHash.Hashing (Hash, ArraySize)
import qualified Data.PerfectHash.Lookup as Lookup
import Data.PerfectHash.Types.Nonces (Nonce)
import qualified Data.PerfectHash.Types.Nonces as Nonces


data AlgorithmParams = AlgorithmParams {
    getNextNonceCandidate :: Nonce -> Nonce
  , startingNonce :: Nonce
  }

data NonceOrDirect =
    WrappedNonce Nonce
  | DirectEntry Hashing.SlotIndex

instance Default NonceOrDirect where
  def = WrappedNonce def

-- | NOTE: Vector might perform better for these structures, but
-- the code may not be as clean.
data LookupTable a = NewLookupTable {
    nonces :: IntMap NonceOrDirect
  , vals :: IntMap a
  }

data SingletonBucket a = SingletonBucket Hash a
  deriving Eq

data HashBucket a = HashBucket {
    _hashVal :: Hash
  , bucketMembers :: [a]
  }

instance Eq (HashBucket a) where
  (==) = (==) `on` (Down . length . bucketMembers)

instance Ord (HashBucket a) where
  compare = compare `on` (Down . length . bucketMembers)

data SizedList a = SizedList [a] ArraySize

data IntMapAndSize a = IntMapAndSize (IntMap a) ArraySize

-- | slots for each bucket with the current nonce attempt
data PlacementAttempt a = PlacementAttempt Nonce [SingletonBucket a]

data PartialSolution a b =
  PartialSolution (LookupTable b) [SingletonBucket (a, b)]


-- * Constants

emptyLookupTable :: LookupTable a
emptyLookupTable = NewLookupTable mempty mempty

defaultAlgorithmParams :: AlgorithmParams
defaultAlgorithmParams = AlgorithmParams
  (Nonces.mapNonce (+1))
  (Nonces.Nonce 1)


-- * Functions

toRedirector :: NonceOrDirect -> Int
toRedirector (WrappedNonce (Nonces.Nonce x)) = x
toRedirector (DirectEntry free_slot_index) =
  Lookup.encodeDirectEntry free_slot_index

convertToVector
  :: (Default a)
  => LookupTable a
  -> Lookup.LookupTable a
convertToVector x = Lookup.LookupTable a1 a2
  where
    size = length $ vals x
    vectorizeNonces input = Vector.generate size $
      toRedirector . flip (IntMap.findWithDefault def) input
    a1 = vectorizeNonces $ nonces x
    vectorizeVals input = Vector.generate size $
      flip (IntMap.findWithDefault def) input
    a2 = vectorizeVals $ vals x
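-- The two constructors of 'NonceOrDirect' share a single 'Int' in the
-- final vector. As a small illustration (assuming that
-- 'Lookup.encodeDirectEntry' implements the negate-then-subtract-one
-- encoding described in the module overview above):
--
-- > toRedirector (WrappedNonce (Nonces.Nonce 42))    == 42
-- > toRedirector (DirectEntry (Hashing.SlotIndex 3)) == -4
--
-- Nonces chosen by the algorithm start at @1@ while direct entries are
-- always negative, so the lookup side can presumably distinguish the two
-- cases by sign.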
-- | Computes a slot in the destination array
-- (Data.PerfectHash.Lookup.values) for every element in this multi-entry
-- bucket, for the given nonce.
--
-- Returns a 'Nothing' for a slot if it collides.
--
-- This function is able to fail fast if one of the elements of the bucket
-- yields a collision when using the new nonce.
attemptNonceRecursive
  :: Hashing.ToHashableChunks a
  => IntMapAndSize b
  -> Nonce
  -> IntSet -- ^ occupied slots
  -> [(a, b)] -- ^ keys
  -> [Maybe Hashing.SlotIndex]
attemptNonceRecursive _ _ _ [] = []
attemptNonceRecursive
    values_and_size
    nonce
    occupied_slots
    ((current_key, _):remaining_bucket_keys) =

  if cannot_use_slot
    then pure Nothing
    else Just slot : recursive_result

  where
    IntMapAndSize values size = values_and_size
    slot = Hashing.hashToSlot nonce size current_key
    Hashing.SlotIndex slotval = slot

    -- TODO: Create a record "SlotOccupation" to encapsulate the
    -- IntSet implementation.
    cannot_use_slot =
      IntSet.member slotval occupied_slots || IntMap.member slotval values

    recursive_result = attemptNonceRecursive
      values_and_size
      nonce
      (IntSet.insert slotval occupied_slots)
      remaining_bucket_keys


-- | Repeatedly try different values of the nonce until we find a hash
-- function that places all items in the bucket into free slots.
--
-- Increment the candidate nonce by @1@ each time.
-- Theoretically we're guaranteed to eventually find a solution.
findNonceForBucketRecursive
  :: (Hashing.ToHashableChunks a)
  => AlgorithmParams
  -> Nonce -- ^ nonce to attempt
  -> IntMapAndSize b
  -> [(a, b)] -- ^ colliding keys for this bucket
  -> PlacementAttempt (a, b)
findNonceForBucketRecursive algorithm_params nonce_attempt values_and_size bucket =
  -- This is a "lazy" (and awkward) way to specify recursion:
  -- if the result at this iteration ("maybe_final_result") is not
  -- "Nothing", wrap it in a "PlacementAttempt" record.
  -- Otherwise, descend one layer deeper by computing "recursive_result".
  maybe recursive_result wrapSlotIndicesAsAttempt maybe_final_result

  where
    wrapSlotIndicesAsAttempt = PlacementAttempt nonce_attempt .
      flip (zipWith SingletonBucket) bucket .
        map (Hashing.Hash . Hashing.getIndex)

    -- NOTE: attemptNonceRecursive returns a list of "Maybe SlotIndex"
    -- records. If *any* of those elements is Nothing (that is, at least
    -- one of the slots was not successfully placed), then applying
    -- sequenceA to that list will yield Nothing.
    maybe_final_result = sequenceA $ attemptNonceRecursive
      values_and_size
      nonce_attempt
      mempty
      bucket

    recursive_result = findNonceForBucketRecursive
      algorithm_params
      (getNextNonceCandidate algorithm_params nonce_attempt)
      values_and_size
      bucket
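-- The fail-fast behavior described above comes from standard 'Traversable'
-- semantics for lists of 'Maybe'; for illustration:
--
-- > sequenceA [Just 1, Just 2, Just 3]  == Just [1, 2, 3]
-- > sequenceA [Just 1, Nothing, Just 3] == Nothing
--
-- Moreover, attemptNonceRecursive terminates the list at the first
-- collision (its @pure Nothing@ branch), so a failed nonce candidate is
-- abandoned without hashing the remaining keys of the bucket.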
-- | Searches for a nonce for this bucket, starting with the value @1@,
-- until one is found that results in no collisions for both this bucket
-- and all previously placed buckets.
processMultiEntryBuckets
  :: (Hashing.ToHashableChunks a)
  => AlgorithmParams
  -> ArraySize
  -> LookupTable b
  -> HashBucket (a, b)
  -> LookupTable b
processMultiEntryBuckets
    algorithm_params
    size
    old_lookup_table
    (HashBucket computed_hash bucket_members) =

  NewLookupTable new_nonces new_values_dict

  where
    NewLookupTable old_nonces old_values_dict = old_lookup_table
    sized_vals_dict = IntMapAndSize old_values_dict size

    -- This is assured to succeed; it starts with a nonce of 1
    -- but keeps incrementing it until all of the keys in this
    -- bucket are placeable.
    PlacementAttempt nonce slots_for_bucket = findNonceForBucketRecursive
      algorithm_params
      (startingNonce algorithm_params)
      sized_vals_dict
      bucket_members

    new_nonces = IntMap.insert
      (Hashing.getHash computed_hash)
      (WrappedNonce nonce)
      old_nonces

    new_values_dict = foldr place_values old_values_dict slots_for_bucket

    place_values (SingletonBucket slot_val (_, value)) =
      IntMap.insert (Hashing.getHash slot_val) value


-- | This function exploits the sorted structure of the list
-- by skimming the multi-entry buckets from the front of the
-- list. Then we filter the single-entry buckets by dropping
-- the empty buckets.
--
-- The partial solution produced by this function entails
-- all of the colliding nonces as fully placed.
handleCollidingNonces
  :: (Hashing.ToHashableChunks a)
  => AlgorithmParams
  -> ArraySize
  -> SortedList (HashBucket (a, b))
  -> PartialSolution a b
handleCollidingNonces algorithm_params size sorted_bucket_hash_tuples =
  PartialSolution lookup_table non_colliding_buckets

  where
    -- Since the buckets have been sorted by descending size,
    -- once we reach a bucket with 1 or fewer elements,
    -- we know there are no more collision buckets.
    (multi_entry_buckets, single_or_fewer_buckets) =
      span ((> 1) . length . bucketMembers) $
        fromSortedList sorted_bucket_hash_tuples

    -- XXX Using 'foldl' rather than 'foldr' is crucial here, given the
    -- order of the buckets. 'foldr' would actually try to place the
    -- smallest buckets first, making it improbable that the large buckets
    -- will be placeable, and potentially resulting in an infinite loop.
    lookup_table = foldl'
      (processMultiEntryBuckets algorithm_params size)
      emptyLookupTable
      multi_entry_buckets

    non_colliding_buckets =
      Maybe.mapMaybe convertToSingletonBucket single_or_fewer_buckets

    convertToSingletonBucket (HashBucket hashVal elements) =
      SingletonBucket hashVal <$> Maybe.listToMaybe elements


-- | Hash the keys into buckets and sort them by descending size.
preliminaryBucketPlacement
  :: (Hashing.ToHashableChunks a)
  => SizedList (a, b)
  -> SortedList (HashBucket (a, b))
preliminaryBucketPlacement sized_list =
  toSortedList bucket_hash_tuples
  where
    SizedList tuplified_words_dict size = sized_list
    f = Hashing.getIndex . Hashing.hashToSlot (Nonces.Nonce 0) size . fst
    slot_key_pairs = deriveTuples f tuplified_words_dict

    bucket_hash_tuples =
      map (uncurry HashBucket . first Hashing.Hash) $
        IntMap.toList $ binTuplesBySecond slot_key_pairs
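-- A small illustration of the ordering that the 'Ord' instance of
-- 'HashBucket' provides (the buckets and 'Hashing.Hash' values here are
-- hypothetical):
--
-- > map (length . bucketMembers) $ fromSortedList $ toSortedList
-- >   [ HashBucket (Hashing.Hash 0) "a"
-- >   , HashBucket (Hashing.Hash 1) "abc"
-- >   , HashBucket (Hashing.Hash 2) "ab"
-- >   ]
--
-- evaluates to @[3, 2, 1]@: comparing on 'Down' of the member count puts
-- the largest buckets first, which is exactly the order that
-- 'handleCollidingNonces' relies on when it 'span's off the multi-entry
-- buckets.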
-- | Arbitrarily pair the non-colliding buckets with free slots.
--
-- At this point, all of the \"colliding\" hashes have been resolved
-- to their own slots, so we just take the leftovers.
assignDirectSlots
  :: ArraySize
  -> PartialSolution a b
  -> LookupTable b
assignDirectSlots size (PartialSolution intermediate_lookup_table non_colliding_buckets) =
  NewLookupTable final_nonces final_values

  where
    isUnusedSlot (Hashing.SlotIndex s) =
      not $ IntMap.member s $ vals intermediate_lookup_table

    unused_slots = filter isUnusedSlot $ Hashing.generateArrayIndices size

    zipped_remaining_with_unused_slots =
      zip non_colliding_buckets unused_slots

    insertDirectEntry (SingletonBucket computed_hash _, free_slot_index) =
      -- Note that this entry is written into the same intermediate
      -- array as the nonces, but it encodes a free slot index
      -- (see 'toRedirector') rather than a nonce.
      IntMap.insert (Hashing.getHash computed_hash) $
        DirectEntry free_slot_index

    final_nonces = foldr
      insertDirectEntry
      (nonces intermediate_lookup_table)
      zipped_remaining_with_unused_slots

    f2 (SingletonBucket _ (_, map_value), Hashing.SlotIndex free_slot_index) =
      IntMap.insert free_slot_index map_value

    final_values = foldr
      f2
      (vals intermediate_lookup_table)
      zipped_remaining_with_unused_slots


-- | Generates a minimal perfect hash for a set of key-value pairs.
--
-- The keys must be instances of 'Hashing.ToHashableChunks'.
-- The values may be of arbitrary type.
--
-- A 'Map' is required as input to guarantee that there are
-- no duplicate keys.
createMinimalPerfectHash
  :: (Hashing.ToHashableChunks k, Default v)
  => Map k v -- ^ key-value pairs
  -> Lookup.LookupTable v
     -- ^ output for use by 'Lookup.lookup' or a custom code generator
createMinimalPerfectHash original_words_dict =
  convertToVector final_solution
  where
    tuplified_words_dict = Map.toList original_words_dict
    size = Hashing.ArraySize $ length tuplified_words_dict
    sized_list = SizedList tuplified_words_dict size

    sorted_bucket_hash_tuples = preliminaryBucketPlacement sized_list

    partial_solution = handleCollidingNonces
      defaultAlgorithmParams
      size
      sorted_bucket_hash_tuples

    final_solution = assignDirectSlots size partial_solution


-- * Utility functions

-- | Place the first elements of the tuples into bins according to the
-- second element.
binTuplesBySecond
  :: (Foldable t)
  => t (a, Int)
  -> IntMap [a]
binTuplesBySecond = foldr f mempty
  where
    f = uncurry (IntMap.insertWith mappend) . fmap pure . swap

-- | Duplicates the argument into both members of the tuple.
duple :: a -> (a, a)
duple = join (,)

-- | Given a function and a value, create a pair
-- where the first element is the value, and the
-- second element is the function applied to the value.
derivePair :: (a -> b) -> a -> (a, b)
derivePair g = fmap g . duple

deriveTuples :: Functor t => (a -> b) -> t a -> t (a, b)
deriveTuples = fmap . derivePair
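-- An end-to-end usage sketch (hypothetical; it assumes a
-- 'Hashing.ToHashableChunks' instance exists for the chosen key type,
-- e.g. 'String', and that retrieval goes through 'Lookup.lookup' as the
-- documentation of 'createMinimalPerfectHash' suggests):
--
-- > import qualified Data.Map as Map
-- >
-- > table :: Lookup.LookupTable Int
-- > table = createMinimalPerfectHash $ Map.fromList
-- >   [("apple", 1), ("banana", 2), ("cherry", 3)]
--
-- Because the hash is minimal and perfect, the three values exactly fill
-- the three slots of the final array, and looking up @\"banana\"@ with
-- 'Lookup.lookup' should yield @2@.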