{-# LANGUAGE BangPatterns #-} {-# LANGUAGE ConstraintKinds #-} {-# LANGUAGE GADTs #-} {-# LANGUAGE GeneralizedNewtypeDeriving #-} {-# LANGUAGE InstanceSigs #-} {-# LANGUAGE LambdaCase #-} {-# LANGUAGE LinearTypes #-} {-# LANGUAGE MagicHash #-} {-# LANGUAGE ScopedTypeVariables #-} {-# LANGUAGE TypeApplications #-} {-# LANGUAGE UnboxedTuples #-} {-# LANGUAGE UnliftedNewtypes #-} {-# OPTIONS_GHC -Wno-incomplete-patterns #-} {-# OPTIONS_GHC -Wno-name-shadowing #-} {-# OPTIONS_HADDOCK hide #-} module Data.HashMap.Mutable.Linear.Internal where import qualified Control.Functor.Linear as Control import Data.Array.Mutable.Linear (Array) import qualified Data.Array.Mutable.Linear as Array import qualified Data.Function as NonLinear import Data.Functor.Identity hiding (runIdentity) import qualified Data.Functor.Linear as Data import Data.Hashable import qualified Data.Maybe as NonLinear import Data.Unrestricted.Linear import Prelude.Linear hiding (filter, insert, lookup, mapMaybe, read, (+)) import Unsafe.Coerce (unsafeCoerce) import qualified Unsafe.Linear as Unsafe import Prelude ((+)) import qualified Prelude -- # Implementation Notes -- This is a simple implementatation of robin hood hashing. -- -- See these links: -- -- * https://programming.guide/robin-hood-hashing.html -- * https://andre.arko.net/2017/08/24/robin-hood-hashing/ -- * https://cs.uwaterloo.ca/research/tr/1986/CS-86-14.pdf -- -- # Constants -------------------------------------------------- -- | When to trigger a resize. -- -- A high load factor usually is not desirable because it makes operations -- do more probes. A very low one is also not desirable since there're some -- operations which take time relative to the 'capacity'. -- -- This should be between (0, 1) -- -- The value 0.75 is what Java uses: -- https://docs.oracle.com/javase/10/docs/api/java/util/HashMap.html constMaxLoadFactor :: Float constMaxLoadFactor = 0.75 -- | When resizing, the capacity will be multiplied by this amount. -- -- This should be greater than one. constGrowthFactor :: Int constGrowthFactor = 2 -- # Core Data Types -------------------------------------------------- -- | A mutable hashmap with a linear interface. data HashMap k v where -- | -- @loadFactor m = size m / cap m@ -- -- Invariants: -- - array is non-empty -- - (count / capacity) <= constMaxLoadFactor. HashMap :: -- | The number of stored (key, value) pairs. !Int -> -- | Capacity of the underlying array (cached here) !Int -> -- | Underlying array. !(RobinArr k v) %1 -> HashMap k v -- | An array of Robin values -- -- Each cell is Nothing if empty and is a RobinVal with the correct -- PSL otherwise. type RobinArr k v = Array (Maybe (RobinVal k v)) -- | Robin values are triples of the key, value and PSL -- (the probe sequence length). data RobinVal k v = RobinVal !PSL !k v deriving (Show) incRobinValPSL :: RobinVal k v -> RobinVal k v incRobinValPSL (RobinVal (PSL p) k v) = RobinVal (PSL (p + 1)) k v decRobinValPSL :: RobinVal k v -> RobinVal k v decRobinValPSL (RobinVal (PSL p) k v) = RobinVal (PSL (p - 1)) k v -- | A probe sequence length newtype PSL = PSL Int deriving (Prelude.Eq, Prelude.Ord, Prelude.Num, Prelude.Show) -- | At minimum, we need to store hashable -- and identifiable keys type Keyed k = (Prelude.Eq k, Hashable k) -- | The results of searching for where to insert a key. -- -- PSL's on the constructors are the probes spent from the query, this -- might be different than PSL's of the cell at the returned index -- (in case of `IndexToSwap` constructor). data ProbeResult k v where -- | An empty cell at index to insert a new element with PSL. IndexToInsert :: !PSL -> !Int -> ProbeResult k v -- | A matching cell at index with a PSL and a value to update. IndexToUpdate :: v -> !PSL -> !Int -> ProbeResult k v -- | An occupied, richer, cell which should be evicted when inserting -- the new element. The swapped-out cell will then need to be inserted -- with a higher PSL. IndexToSwap :: RobinVal k v -> !PSL -> !Int -> ProbeResult k v -- # Construction and Modification -------------------------------------------------- -- | Run a computation with an empty 'HashMap' with given capacity. empty :: forall k v b. Keyed k => Int -> (HashMap k v %1 -> Ur b) %1 -> Ur b empty size scope = let cap = max 1 size in Array.alloc cap Nothing (\arr -> scope (HashMap 0 cap arr)) -- | Create an empty HashMap, using another as a uniqueness proof. allocBeside :: Keyed k => Int -> HashMap k' v' %1 -> (HashMap k v, HashMap k' v') allocBeside size (HashMap s' c' arr) = let cap = max 1 size in Array.allocBeside cap Nothing arr & \(arr', arr'') -> (HashMap size cap arr', HashMap s' c' arr'') -- | Run a computation with an 'HashMap' containing given key-value pairs. fromList :: forall k v b. Keyed k => [(k, v)] -> (HashMap k v %1 -> Ur b) %1 -> Ur b fromList xs scope = let cap = max 1 (ceiling @Float @Int (fromIntegral (Prelude.length xs) / constMaxLoadFactor)) in Array.alloc cap Nothing (\arr -> scope (insertAll xs (HashMap 0 cap arr))) -- | The most general modification function; which can insert, update or delete -- a value of the key, while collecting an effect in the form of an arbitrary -- 'Control.Functor'. alterF :: (Keyed k, Control.Functor f) => (Maybe v -> f (Ur (Maybe v))) -> k -> HashMap k v %1 -> f (HashMap k v) alterF f key hm = idealIndexForKey key hm & \(Ur idx, hm') -> probeFrom key 0 idx hm' `chainU` \case -- The key does not exist, and there is an empty cell to insert. (# HashMap count cap arr, IndexToInsert psl ix #) -> f Nothing Control.<&> \case -- We don't need to insert anything. Ur Nothing -> HashMap count cap arr -- We need to insert a new key. Ur (Just v) -> HashMap (count + 1) cap (Array.unsafeWrite arr ix (Just (RobinVal psl key v))) & growMapIfNecessary -- The key exists. (# HashMap count cap arr, IndexToUpdate v psl ix #) -> f (Just v) Control.<&> \case -- We need to delete it. Ur Nothing -> Array.unsafeWrite arr ix Nothing & \arr' -> shiftSegmentBackward 1 cap arr' ((ix + 1) `mod` cap) & \arr'' -> HashMap (count - 1) cap arr'' -- We need to update it. Ur (Just new) -> HashMap count cap (Array.unsafeWrite arr ix (Just (RobinVal psl key new))) -- The key does not exist, but there is a key to evict. (# HashMap count cap arr, IndexToSwap evicted psl ix #) -> f Nothing Control.<&> \case -- We don't need to insert anything. Ur Nothing -> HashMap count cap arr -- We need to insert a new key. Ur (Just v) -> tryInsertAtIndex ( HashMap count cap (Array.unsafeWrite arr ix (Just (RobinVal psl key v))) ) ((ix + 1) `mod` cap) (incRobinValPSL evicted) & growMapIfNecessary {-# INLINE alterF #-} -- aspiwack: I'm implementing `alter` in terms of `alterF`, because, at this -- point, we may have some bug fixes and so on and so forth. And maintaining two -- functions this size is quite a bit unpleasant. Nevertheless, the extra boxing -- required by the intermediate `Ur` call, there, makes it so that the -- specialisation of `alterF` to `Identity` doesn't quite yield the code that we -- would like, it's a bit costlier than it should. So in an ideal word, we would -- implement both manually. In the future probably. -- | A general modification function; which can insert, update or delete -- a value of the key. See 'alterF', for an even more general function. alter :: Keyed k => (Maybe v -> Maybe v) -> k -> HashMap k v %1 -> HashMap k v alter f key hm = runIdentity $ alterF (\v -> Identity (Ur (f v))) key hm where runIdentity :: Identity a %1 -> a runIdentity (Identity x) = x {-# INLINE alter #-} -- | Insert a key value pair to a 'HashMap'. It overwrites the previous -- value if it exists. insert :: Keyed k => k -> v -> HashMap k v %1 -> HashMap k v insert k v = alter (\_ -> Just v) k -- | Delete a key from a 'HashMap'. Does nothing if the key does not -- exist. delete :: Keyed k => k -> HashMap k v %1 -> HashMap k v delete = alter (\_ -> Nothing) -- | 'insert' (in the provided order) the given key-value pairs to -- the hashmap. insertAll :: Keyed k => [(k, v)] -> HashMap k v %1 -> HashMap k v insertAll [] hmap = hmap insertAll ((k, v) : xs) hmap = insertAll xs (insert k v hmap) -- TODO: Do a resize first on the length of the input. -- | A version of 'fmap' which can throw out the elements. -- -- Complexity: O(capacity hm) mapMaybe :: Keyed k => (v -> Maybe v') -> HashMap k v %1 -> HashMap k v' mapMaybe f = mapMaybeWithKey (\_k v -> f v) -- | Same as 'mapMaybe', but also has access to the keys. mapMaybeWithKey :: forall k v v'. Keyed k => (k -> v -> Maybe v') -> HashMap k v %1 -> HashMap k v' mapMaybeWithKey _ (HashMap 0 cap arr) = HashMap 0 cap (Unsafe.coerce arr) mapMaybeWithKey f (HashMap _ cap arr) = Array.size arr & \(Ur size, arr1) -> mapAndPushBack 0 (size - 1) (False, 0) 0 arr1 & \(Ur c, arr2) -> HashMap c cap (Unsafe.coerce arr2) where f' :: k -> v -> Maybe v f' k v = unsafeCoerce (f k v) -- Going from arr[0] to arr[size-1] map each element while -- simultaneously pushing elements back if some earlier element(s) -- were deleted in a contiguous segment and if the current -- element has PSL > 0. Maintain a counter of how -- far to push elements back. At arr[size-1] if needed, call -- shiftSegmentBackward with the counter at arr[0]. mapAndPushBack :: Int -> -- Current index Int -> -- Last index of array which is (size-1) (Bool, Int) -> -- (b,n) s.t. b iff open space n cells before current cell Int -> -- Count of present key-value pairs RobinArr k v %1 -> (Ur Int, RobinArr k v) -- The new count and fully mapped array mapAndPushBack ix end (shift, dec) count arr | (ix > end) = if shift then (Ur count, shiftSegmentBackward dec (end + 1) arr 0) else (Ur count, arr) | otherwise = Array.unsafeRead arr ix & \case (Ur Nothing, arr1) -> mapAndPushBack (ix + 1) end (False, 0) count arr1 (Ur (Just (RobinVal (PSL p) k v)), arr1) -> case f' k v of Nothing -> Array.unsafeWrite arr1 ix Nothing & \arr2 -> mapAndPushBack (ix + 1) end (True, dec + 1) count arr2 Just v' -> case shift of False -> Array.unsafeWrite arr1 ix (Just (RobinVal (PSL p) k v')) & \arr2 -> mapAndPushBack (ix + 1) end (False, 0) (count + 1) arr2 True -> case dec <= p of False -> Array.unsafeWrite arr1 (ix - p) (Just (RobinVal 0 k v')) & \arr2 -> case p == 0 of False -> Array.unsafeWrite arr2 ix Nothing & \arr3 -> mapAndPushBack (ix + 1) end (True, p) (count + 1) arr3 True -> mapAndPushBack (ix + 1) end (False, 0) (count + 1) arr2 True -> Array.unsafeWrite arr1 (ix - dec) (Just (RobinVal (PSL (p - dec)) k v')) & \arr2 -> Array.unsafeWrite arr2 ix Nothing & \arr3 -> mapAndPushBack (ix + 1) end (True, dec) (count + 1) arr3 -- | Complexity: O(capacity hm) filterWithKey :: Keyed k => (k -> v -> Bool) -> HashMap k v %1 -> HashMap k v filterWithKey f = mapMaybeWithKey (\k v -> if f k v then Just v else Nothing) -- | Complexity: O(capacity hm) filter :: Keyed k => (v -> Bool) -> HashMap k v %1 -> HashMap k v filter f = filterWithKey (\_k v -> f v) -- | Union of two maps using the provided function on conflicts. -- -- Complexity: O(min(capacity hm1, capacity hm2) unionWith :: Keyed k => (v -> v -> v) -> HashMap k v %1 -> HashMap k v %1 -> HashMap k v unionWith onConflict (hm1 :: HashMap k v) hm2 = -- To insert the elements in smaller map to the larger map, we -- compare their capacities, and flip the arguments if necessary. capacity hm1 & \(Ur cap1, hm1') -> capacity hm2 & \(Ur cap2, hm2') -> if cap1 > cap2 then go onConflict hm1' (toList hm2') else go (\v2 v1 -> onConflict v1 v2) hm2' (toList hm1') where go :: (v -> v -> v) -> HashMap k v %1 -> -- larger map Ur [(k, v)] %1 -> -- contents of the smaller map HashMap k v go _ hm (Ur []) = hm go f hm (Ur ((k, vr) : xs)) = alter ( \case Nothing -> Just vr Just vl -> Just (f vl vr) ) k hm & \hm -> go f hm (Ur xs) -- | A right-biased union. -- -- Complexity: O(min(capacity hm1, capacity hm2) union :: Keyed k => HashMap k v %1 -> HashMap k v %1 -> HashMap k v union hm1 hm2 = unionWith (\_v1 v2 -> v2) hm1 hm2 -- | Intersection of two maps with the provided combine function. -- -- Complexity: O(min(capacity hm1, capacity hm2) intersectionWith :: Keyed k => (a -> b -> c) -> HashMap k a %1 -> HashMap k b %1 -> HashMap k c intersectionWith combine (hm1 :: HashMap k a') hm2 = allocBeside 0 hm1 & \(hmNew, hm1') -> capacity hm1' & \(Ur cap1, hm1'') -> capacity hm2 & \(Ur cap2, hm2') -> if cap1 > cap2 then go combine hm1'' (toList hm2') hmNew else go (\v2 v1 -> combine v1 v2) hm2' (toList hm1'') hmNew where -- Iterate over the smaller map, while checking for the matches -- on the bigger map; and accumulate results on a third map. go :: (a -> b -> c) -> HashMap k a %1 -> Ur [(k, b)] %1 -> HashMap k c %1 -> HashMap k c go _ hm (Ur []) acc = hm `lseq` acc go f hm (Ur ((k, b) : xs)) acc = lookup k hm & \case (Ur Nothing, hm') -> go f hm' (Ur xs) acc (Ur (Just a), hm') -> go f hm' (Ur xs) (insert k (f a b) acc) -- | -- Reduce the 'HashMap' 'capacity' to decrease wasted memory. Returns -- a semantically identical 'HashMap'. -- -- This is only useful after a lot of deletes. -- -- Complexity: O(capacity hm) shrinkToFit :: Keyed k => HashMap k a %1 -> HashMap k a shrinkToFit hm = size hm & \(Ur size, hm') -> let targetSize = ceiling (Prelude.max 1 (fromIntegral size Prelude./ constMaxLoadFactor)) in resize targetSize hm' -- # Querying -------------------------------------------------- -- | Number of key-value pairs inside the 'HashMap' size :: HashMap k v %1 -> (Ur Int, HashMap k v) size (HashMap ct cap arr) = (Ur ct, HashMap ct cap arr) -- | Maximum number of elements the HashMap can store without -- resizing. However, for performance reasons, the 'HashMap' might be -- before full. -- -- Use 'shrinkToFit' to reduce the wasted space. capacity :: HashMap k v %1 -> (Ur Int, HashMap k v) capacity (HashMap ct cap arr) = (Ur cap, HashMap ct cap arr) -- | Look up a value from a 'HashMap'. lookup :: Keyed k => k -> HashMap k v %1 -> (Ur (Maybe v), HashMap k v) lookup k hm = idealIndexForKey k hm & \(Ur idx, hm') -> probeFrom k 0 idx hm' `chainU` \case (# h, IndexToUpdate v _ _ #) -> (Ur (Just v), h) (# h, IndexToInsert _ _ #) -> (Ur Nothing, h) (# h, IndexToSwap _ _ _ #) -> (Ur Nothing, h) -- | Check if the given key exists. member :: Keyed k => k -> HashMap k v %1 -> (Ur Bool, HashMap k v) member k hm = lookup k hm & \case (Ur Nothing, hm') -> (Ur False, hm') (Ur (Just _), hm') -> (Ur True, hm') -- | Converts a HashMap to a lazy list. toList :: HashMap k v %1 -> Ur [(k, v)] toList (HashMap _ _ arr) = Array.toList arr & \(Ur elems) -> elems NonLinear.& NonLinear.catMaybes NonLinear.& Prelude.map (\(RobinVal _ k v) -> (k, v)) NonLinear.& Ur -- # Instances -------------------------------------------------- instance Consumable (HashMap k v) where consume :: HashMap k v %1 -> () consume (HashMap _ _ arr) = consume arr instance Dupable (HashMap k v) where dup2 (HashMap i c arr) = dup2 arr & \(a1, a2) -> (HashMap i c a1, HashMap i c a2) instance Data.Functor (HashMap k) where fmap f (HashMap s c arr) = HashMap s c $ Data.fmap ( \case Nothing -> Nothing Just (RobinVal p k v) -> Just (RobinVal p k (f v)) ) arr instance Prelude.Semigroup (HashMap k v) where (<>) = error "Prelude.<>: invariant violation, unrestricted HashMap" instance Keyed k => Semigroup (HashMap k v) where (<>) = union -- # Internal library -------------------------------------------------- _debugShow :: (Show k, Show v) => HashMap k v %1 -> String _debugShow (HashMap _ _ robinArr) = Array.toList robinArr & \(Ur xs) -> show xs idealIndexForKey :: Keyed k => k -> HashMap k v %1 -> (Ur Int, HashMap k v) idealIndexForKey k (HashMap sz cap arr) = (Ur (mod (hash k) cap), HashMap sz cap arr) -- | Given a key, psl of the probe so far, current unread index, and -- a full hashmap, return a probe result: the place the key already -- exists, a place to swap from, or an unfilled cell to write over. probeFrom :: Keyed k => k -> PSL -> Int -> HashMap k v %1 -> (# HashMap k v, ProbeResult k v #) probeFrom k p ix (HashMap ct cap arr) = Array.unsafeRead arr ix `chainU'` \case (Ur Nothing, arr') -> (# HashMap ct cap arr', IndexToInsert p ix #) (Ur (Just robinVal'@(RobinVal psl k' v')), arr') -> case k Prelude.== k' of -- Note: in the True case, we must have p == psl True -> (# HashMap ct cap arr', IndexToUpdate v' psl ix #) False -> case psl Prelude.< p of True -> (# HashMap ct cap arr', IndexToSwap robinVal' p ix #) False -> probeFrom k (p + 1) ((ix + 1) `mod` cap) (HashMap ct cap arr') -- | Try to insert at a given index with a given PSL. So the -- probing starts from the given index (with the given PSL). tryInsertAtIndex :: Keyed k => HashMap k v %1 -> Int -> RobinVal k v -> HashMap k v tryInsertAtIndex hmap ix (RobinVal psl key val) = probeFrom key psl ix hmap `chainU` \case (# HashMap ct cap arr, IndexToUpdate _ psl' ix' #) -> Array.unsafeWrite arr ix' (Just $ RobinVal psl' key val) & HashMap ct cap (# HashMap ct cap arr, IndexToInsert psl' ix' #) -> Array.unsafeWrite arr ix' (Just $ RobinVal psl' key val) & HashMap (ct + 1) cap (# HashMap ct cap arr, IndexToSwap oldVal psl' ix' #) -> Array.unsafeWrite arr ix' (Just $ RobinVal psl' key val) & HashMap ct cap & \hm -> tryInsertAtIndex hm ((ix' + 1) `mod` cap) (incRobinValPSL oldVal) -- | Shift all cells with PSLs > 0 in a continuous segment -- following the deleted cell, backwards by one and decrement -- their PSLs. shiftSegmentBackward :: Keyed k => Int -> Int -> RobinArr k v %1 -> Int -> RobinArr k v shiftSegmentBackward dec s arr ix = Array.unsafeRead arr ix & \case (Ur Nothing, arr') -> arr' (Ur (Just (RobinVal 0 _ _)), arr') -> arr' (Ur (Just val), arr') -> Array.unsafeWrite arr' ix Nothing & \arr'' -> shiftSegmentBackward dec s (Array.unsafeWrite arr'' ((ix - dec + s) `mod` s) (Just $ decRobinValPSL val)) ((ix + 1) `mod` s) -- TODO: This does twice as much writes than necessary, it first empties -- the cell, just to update it again at the next call. We can save some -- writes by only emptying the last cell. -- | Makes sure that the map is not exceeding its utilization threshold -- (constMaxUtilization), resizes (constGrowthFactor) if necessary. growMapIfNecessary :: Keyed k => HashMap k v %1 -> HashMap k v growMapIfNecessary (HashMap sz cap arr) = let load = fromIntegral sz / fromIntegral cap in if load Prelude.< constMaxLoadFactor then HashMap sz cap arr else let newCap = max 1 (cap * constGrowthFactor) in resize newCap (HashMap sz cap arr) -- | Resizes the HashMap to given capacity. -- -- Invariant: Given capacity should be greater than the size, this is not -- checked. resize :: Keyed k => Int -> HashMap k v %1 -> HashMap k v resize targetSize (HashMap _ _ arr) = Array.allocBeside targetSize Nothing arr & \(newArr, oldArr) -> Array.toList oldArr & \(Ur elems) -> let xs = elems NonLinear.& NonLinear.catMaybes NonLinear.& Prelude.map (\(RobinVal _ k v) -> (k, v)) in insertAll xs (HashMap 0 targetSize newArr) -- TODO: 'insertAll' keeps checking capacity on each insert. We should -- replace it with a faster unsafe variant. -- TODO: Remove the below workarounds once we are on GHC 9.2. -- -- We have to use these functions below because: -- -- * GHC <9.2 does not allow linear `case` statements. -- * LambdaCase workaround does not work, because (&) does not work with -- unlifted types. chainU :: (# a, b #) %1 -> ((# a, b #) %1 -> c) %1 -> c chainU x f = f x chainU' :: a %1 -> (a %1 -> (# b, c #)) %1 -> (# b, c #) chainU' x f = f x