Ticket #4193: HashTable.hs

File HashTable.hs, 18.3 KB (added by japple, 3 years ago)

A full copy of Data.HashTable? with the patch applied and the module name changed (for ease of testing)

Line 
1{-# OPTIONS_GHC -XNoImplicitPrelude -funbox-strict-fields -fno-warn-name-shadowing #-}
2
3-----------------------------------------------------------------------------
4-- |
5-- Module      :  Data.HashTable
6-- Copyright   :  (c) The University of Glasgow 2003
7-- License     :  BSD-style (see the file libraries/base/LICENSE)
8--
9-- Maintainer  :  libraries@haskell.org
10-- Stability   :  provisional
11-- Portability :  portable
12--
13-- An implementation of extensible hash tables, as described in
14-- Per-Ake Larson, /Dynamic Hash Tables/, CACM 31(4), April 1988,
15-- pp. 446--457.  The implementation is also derived from the one
16-- in GHC's runtime system (@ghc\/rts\/Hash.{c,h}@).
17--
18-----------------------------------------------------------------------------
19
20module HashTable (
21        -- * Basic hash table operations
22        HashTable, new, newHint, insert, delete, lookup, update,
23        -- * Converting to and from lists
24        fromList, toList,
25        -- * Hash functions
26        -- $hash_functions
27        hashInt, hashString,
28        prime,
29        -- * Diagnostics
30        longestChain
31 ) where
32
33-- This module is imported by Data.Dynamic, which is pretty low down in the
34-- module hierarchy, so don't import "high-level" modules
35
36#ifdef __GLASGOW_HASKELL__
37import GHC.Base
38#else
39import Prelude  hiding  ( lookup )
40#endif
41import Data.Tuple       ( fst )
42import Data.Bits
43import Data.Maybe
44import Data.List        ( maximumBy, length, concat, foldl', partition )
45import Data.Int         ( Int32 )
46
47#if defined(__GLASGOW_HASKELL__)
48import GHC.Num
49import GHC.Real         ( fromIntegral )
50import GHC.Show         ( Show(..) )
51import GHC.Int          ( Int64 )
52
53import GHC.IO
54import GHC.IOArray
55import GHC.IORef
56#else
57import Data.Char        ( ord )
58import Data.IORef       ( IORef, newIORef, readIORef, writeIORef )
59import System.IO.Unsafe ( unsafePerformIO )
60import Data.Int         ( Int64 )
61#  if defined(__HUGS__)
62import Hugs.IOArray     ( IOArray, newIOArray,
63                          unsafeReadIOArray, unsafeWriteIOArray )
64#  elif defined(__NHC__)
65import NHC.IOExtras     ( IOArray, newIOArray, readIOArray, writeIOArray )
66#  endif
67#endif
68import Control.Monad    ( mapM, mapM_, sequence_ )
69
70
71-----------------------------------------------------------------------
72
73iNSTRUMENTED :: Bool
74iNSTRUMENTED = False
75
76-----------------------------------------------------------------------
77
78readHTArray  :: HTArray a -> Int32 -> IO a
79writeMutArray :: MutArray a -> Int32 -> a -> IO ()
80newMutArray   :: (Int32, Int32) -> a -> IO (MutArray a)
81newMutArray = newIOArray
82type MutArray a = IOArray Int32 a
83type HTArray a = MutArray a
84#if defined(DEBUG) || defined(__NHC__)
85readHTArray  = readIOArray
86writeMutArray = writeIOArray
87#else
88readHTArray arr i = unsafeReadIOArray arr (fromIntegral i)
89writeMutArray arr i x = unsafeWriteIOArray arr (fromIntegral i) x
90#endif
91
92data HashTable key val = HashTable {
93                                     cmp     :: !(key -> key -> Bool),
94                                     hash_fn :: !(key -> Int32),
95                                     tab     :: !(IORef (HT key val))
96                                   }
97-- TODO: the IORef should really be an MVar.
98
99data HT key val
100  = HT {
101        kcount  :: !Int32,              -- Total number of keys.
102        bmask   :: !Int32,
103        buckets :: !(HTArray [(key,val)])
104       }
105
106-- ------------------------------------------------------------
107-- Instrumentation for performance tuning
108
109-- This ought to be roundly ignored after optimization when
110-- iNSTRUMENTED=False.
111
112-- STRICT version of modifyIORef!
113modifyIORef :: IORef a -> (a -> a) -> IO ()
114modifyIORef r f = do
115  v <- readIORef r
116  let z = f v in z `seq` writeIORef r z
117
118data HashData = HD {
119  tables :: !Integer,
120  insertions :: !Integer,
121  lookups :: !Integer,
122  totBuckets :: !Integer,
123  maxEntries :: !Int32,
124  maxChain :: !Int,
125  maxBuckets :: !Int32
126} deriving (Eq, Show)
127
128{-# NOINLINE hashData #-}
129hashData :: IORef HashData
130hashData =  unsafePerformIO (newIORef (HD { tables=0, insertions=0, lookups=0,
131                                            totBuckets=0, maxEntries=0,
132                                            maxChain=0, maxBuckets=tABLE_MIN } ))
133
134instrument :: (HashData -> HashData) -> IO ()
135instrument i | iNSTRUMENTED = modifyIORef hashData i
136             | otherwise    = return ()
137
138recordNew :: IO ()
139recordNew = instrument rec
140  where rec hd@HD{ tables=t, totBuckets=b } =
141               hd{ tables=t+1, totBuckets=b+fromIntegral tABLE_MIN }
142
143recordIns :: Int32 -> Int32 -> [a] -> IO ()
144recordIns i sz bkt = instrument rec
145  where rec hd@HD{ insertions=ins, maxEntries=mx, maxChain=mc } =
146               hd{ insertions=ins+fromIntegral i, maxEntries=mx `max` sz,
147                   maxChain=mc `max` length bkt }
148
149recordResize :: Int32 -> Int32 -> IO ()
150recordResize older newer = instrument rec
151  where rec hd@HD{ totBuckets=b, maxBuckets=mx } =
152               hd{ totBuckets=b+fromIntegral (newer-older),
153                   maxBuckets=mx `max` newer }
154
155recordLookup :: IO ()
156recordLookup = instrument lkup
157  where lkup hd@HD{ lookups=l } = hd{ lookups=l+1 }
158
159-- stats :: IO String
160-- stats =  fmap show $ readIORef hashData
161
162-- ----------------------------------------------------------------------------
163-- Sample hash functions
164
165-- $hash_functions
166--
167-- This implementation of hash tables uses the low-order /n/ bits of the hash
168-- value for a key, where /n/ varies as the hash table grows.  A good hash
169-- function therefore will give an even distribution regardless of /n/.
170--
171-- If your keyspace is integrals such that the low-order bits between
172-- keys are highly variable, then you could get away with using 'fromIntegral'
173-- as the hash function.
174--
175-- We provide some sample hash functions for 'Int' and 'String' below.
176
177golden :: Int32
178golden = 1013904242 -- = round ((sqrt 5 - 1) * 2^32) :: Int32
179-- was -1640531527 = round ((sqrt 5 - 1) * 2^31) :: Int32
180-- but that has bad mulHi properties (even adding 2^32 to get its inverse)
181-- Whereas the above works well and contains no hash duplications for
182-- [-32767..65536]
183
184hashInt32 :: Int32 -> Int32
185hashInt32 x = mulHi x golden + x
186
187-- | A sample (and useful) hash function for Int and Int32,
188-- implemented by extracting the uppermost 32 bits of the 64-bit
189-- result of multiplying by a 33-bit constant.  The constant is from
190-- Knuth, derived from the golden ratio:
191--
192-- > golden = round ((sqrt 5 - 1) * 2^32)
193--
194-- We get good key uniqueness on small inputs
195-- (a problem with previous versions):
196--  (length $ group $ sort $ map hashInt [-32767..65536]) == 65536 + 32768
197--
198hashInt :: Int -> Int32
199hashInt x = hashInt32 (fromIntegral x)
200
201-- hi 32 bits of a x-bit * 32 bit -> 64-bit multiply
202mulHi :: Int32 -> Int32 -> Int32
203mulHi a b = fromIntegral (r `shiftR` 32)
204   where r :: Int64
205         r = fromIntegral a * fromIntegral b
206
207-- | A sample hash function for Strings.  We keep multiplying by the
208-- golden ratio and adding.  The implementation is:
209--
210-- > hashString = foldl' f golden
211-- >   where f m c = fromIntegral (ord c) * magic + hashInt32 m
212-- >         magic = 0xdeadbeef
213--
214-- Where hashInt32 works just as hashInt shown above.
215--
216-- Knuth argues that repeated multiplication by the golden ratio
217-- will minimize gaps in the hash space, and thus it's a good choice
218-- for combining together multiple keys to form one.
219--
220-- Here we know that individual characters c are often small, and this
221-- produces frequent collisions if we use ord c alone.  A
222-- particular problem are the shorter low ASCII and ISO-8859-1
223-- character strings.  We pre-multiply by a magic twiddle factor to
224-- obtain a good distribution.  In fact, given the following test:
225--
226-- > testp :: Int32 -> Int
227-- > testp k = (n - ) . length . group . sort . map hs . take n $ ls
228-- >   where ls = [] : [c : l | l <- ls, c <- ['\0'..'\xff']]
229-- >         hs = foldl' f golden
230-- >         f m c = fromIntegral (ord c) * k + hashInt32 m
231-- >         n = 100000
232--
233-- We discover that testp magic = 0.
234
235hashString :: String -> Int32
236hashString = foldl' f golden
237   where f m c = fromIntegral (ord c) * magic + hashInt32 m
238         magic = 0xdeadbeef
239
240-- | A prime larger than the maximum hash table size
241prime :: Int32
242prime = 33554467
243
244-- -----------------------------------------------------------------------------
245-- Parameters
246
247tABLE_MAX :: Int32
248tABLE_MAX  = 32 * 1024 * 1024   -- Maximum size of hash table
249tABLE_MIN :: Int32
250tABLE_MIN  = 8
251
252hLOAD :: Int32
253hLOAD = 7                       -- Maximum average load of a single hash bucket
254
255hYSTERESIS :: Int32
256hYSTERESIS = 64                 -- entries to ignore in load computation
257
258{- Hysteresis favors long association-list-like behavior for small tables. -}
259
260-- -----------------------------------------------------------------------------
261-- Creating a new hash table
262
263-- | Creates a new hash table.  The following property should hold for the @eq@
264-- and @hash@ functions passed to 'new':
265--
266-- >   eq A B  =>  hash A == hash B
267--
268new
269  :: (key -> key -> Bool)    -- ^ @eq@: An equality comparison on keys
270  -> (key -> Int32)          -- ^ @hash@: A hash function on keys
271  -> IO (HashTable key val)  -- ^ Returns: an empty hash table
272
273new cmpr hash = do
274  recordNew
275  -- make a new hash table with a single, empty, segment
276  let mask = tABLE_MIN-1
277  bkts <- newMutArray (0,mask) []
278
279  let
280    kcnt = 0
281    ht = HT {  buckets=bkts, kcount=kcnt, bmask=mask }
282
283  table <- newIORef ht
284  return (HashTable { tab=table, hash_fn=hash, cmp=cmpr })
285
286{-
287   sameAs takes as arguments positive Int32s less than maxBound/2 and
288   returns the smallest power of 2 that is greater than or equal to the
289   argument
290-}
291sameAs :: Int32 -> Int32
292sameAs 1 = 1
293sameAs n = if 1 == n .&. 1 -- If the number is odd and not 1, it is not a power of 2
294           then shiftL (upAs   (shiftR n 1)) 1
295           else shiftL (sameAs (shiftR n 1)) 1
296
297{-
298   upAs takes as arguments positive Int32s less than maxBound/2 and
299   returns the smallest power of 2 that is greater than the argument
300-}
301upAs :: Int32 -> Int32
302upAs 1 = 2
303upAs n = shiftL (upAs (shiftR n 1)) 1
304
305{-
306  powerOver takes as arguments Int32s and returns the smallest power of 2
307  that is greater than or equal to the argument if that power of 2 is
308  within [tABLE_MIN,tABLE_MAX]
309-}
310powerOver :: Int32 -> Int32
311powerOver n = 
312    if n <= tABLE_MIN
313    then tABLE_MIN
314    else if n >= tABLE_MAX
315         then tABLE_MAX
316         else sameAs n
317
318-- | Creates a new hash table with the given minimum size.
319newHint
320  :: (key -> key -> Bool)    -- ^ @eq@: An equality comparison on keys
321  -> (key -> Int32)          -- ^ @hash@: A hash function on keys
322  -> Int                     -- ^ @minSize@: initial table size
323  -> IO (HashTable key val)  -- ^ Returns: an empty hash table
324
325newHint cmpr hash minSize = do
326  recordNew
327  -- make a new hash table with a single, empty, segment
328  let mask = powerOver $ fromIntegral minSize
329  bkts <- newMutArray (0,mask) []
330
331  let
332    kcnt = 0
333    ht = HT {  buckets=bkts, kcount=kcnt, bmask=mask }
334
335  table <- newIORef ht
336  return (HashTable { tab=table, hash_fn=hash, cmp=cmpr })
337
338-- -----------------------------------------------------------------------------
339-- Inserting a key\/value pair into the hash table
340
341-- | Inserts a key\/value mapping into the hash table.
342--
343-- Note that 'insert' doesn't remove the old entry from the table -
344-- the behaviour is like an association list, where 'lookup' returns
345-- the most-recently-inserted mapping for a key in the table.  The
346-- reason for this is to keep 'insert' as efficient as possible.  If
347-- you need to update a mapping, then we provide 'update'.
348--
349insert :: HashTable key val -> key -> val -> IO ()
350
351insert ht key val =
352  updatingBucket CanInsert (\bucket -> ((key,val):bucket, 1, ())) ht key
353
354
355-- ------------------------------------------------------------
356-- The core of the implementation is lurking down here, in findBucket,
357-- updatingBucket, and expandHashTable.
358
359tooBig :: Int32 -> Int32 -> Bool
360tooBig k b = k-hYSTERESIS > hLOAD * b
361
362-- index of bucket within table.
363bucketIndex :: Int32 -> Int32 -> Int32
364bucketIndex mask h = h .&. mask
365
366-- find the bucket in which the key belongs.
367-- returns (key equality, bucket index, bucket)
368--
369-- This rather grab-bag approach gives enough power to do pretty much
370-- any bucket-finding thing you might want to do.  We rely on inlining
371-- to throw away the stuff we don't want.  I'm proud to say that this
372-- plus updatingBucket below reduce most of the other definitions to a
373-- few lines of code, while actually speeding up the hashtable
374-- implementation when compared with a version which does everything
375-- from scratch.
376{-# INLINE findBucket #-}
377findBucket :: HashTable key val -> key -> IO (HT key val, Int32, [(key,val)])
378findBucket HashTable{ tab=ref, hash_fn=hash} key = do
379  table@HT{ buckets=bkts, bmask=b } <- readIORef ref
380  let indx = bucketIndex b (hash key)
381  bucket <- readHTArray bkts indx
382  return (table, indx, bucket)
383
384data Inserts = CanInsert
385             | Can'tInsert
386             deriving (Eq)
387
388-- updatingBucket is the real workhorse of all single-element table
389-- updates.  It takes a hashtable and a key, along with a function
390-- describing what to do with the bucket in which that key belongs.  A
391-- flag indicates whether this function may perform table insertions.
392-- The function returns the new contents of the bucket, the number of
393-- bucket entries inserted (negative if entries were deleted), and a
394-- value which becomes the return value for the function as a whole.
395-- The table sizing is enforced here, calling out to expandSubTable as
396-- necessary.
397
398-- This function is intended to be inlined and specialized for every
399-- calling context (eg every provided bucketFn).
400{-# INLINE updatingBucket #-}
401
402updatingBucket :: Inserts -> ([(key,val)] -> ([(key,val)], Int32, a)) ->
403                  HashTable key val -> key ->
404                  IO a
405updatingBucket canEnlarge bucketFn
406               ht@HashTable{ tab=ref, hash_fn=hash } key = do
407  (table@HT{ kcount=k, buckets=bkts, bmask=b },
408   indx, bckt) <- findBucket ht key
409  (bckt', inserts, result) <- return $ bucketFn bckt
410  let k' = k + inserts
411      table1 = table { kcount=k' }
412  writeMutArray bkts indx bckt'
413  table2 <- if canEnlarge == CanInsert && inserts > 0 then do
414               recordIns inserts k' bckt'
415               if tooBig k' b
416                  then expandHashTable hash table1
417                  else return table1
418            else return table1
419  writeIORef ref table2
420  return result
421
422expandHashTable :: (key -> Int32) -> HT key val -> IO (HT key val)
423expandHashTable hash table@HT{ buckets=bkts, bmask=mask } = do
424   let
425      oldsize = mask + 1
426      newmask = mask + mask + 1
427   recordResize oldsize (newmask+1)
428   --
429   if newmask > tABLE_MAX-1
430      then return table
431      else do
432   --
433    newbkts <- newMutArray (0,newmask) []
434
435    let
436     splitBucket oldindex = do
437       bucket <- readHTArray bkts oldindex
438       let (oldb,newb) =
439              partition ((oldindex==). bucketIndex newmask . hash . fst) bucket
440       writeMutArray newbkts oldindex oldb
441       writeMutArray newbkts (oldindex + oldsize) newb
442    mapM_ splitBucket [0..mask]
443
444    return ( table{ buckets=newbkts, bmask=newmask } )
445
446-- -----------------------------------------------------------------------------
447-- Deleting a mapping from the hash table
448
449-- Remove a key from a bucket
450deleteBucket :: (key -> Bool) -> [(key,val)] -> ([(key, val)], Int32, ())
451deleteBucket _   [] = ([],0,())
452deleteBucket del (pair@(k,_):bucket) =
453  case deleteBucket del bucket of
454    (bucket', dels, _) | del k     -> dels' `seq` (bucket', dels', ())
455                       | otherwise -> (pair:bucket', dels, ())
456      where dels' = dels - 1
457
458-- | Remove an entry from the hash table.
459delete :: HashTable key val -> key -> IO ()
460
461delete ht@HashTable{ cmp=eq } key =
462  updatingBucket Can'tInsert (deleteBucket (eq key)) ht key
463
464-- -----------------------------------------------------------------------------
465-- Updating a mapping in the hash table
466
467-- | Updates an entry in the hash table, returning 'True' if there was
468-- already an entry for this key, or 'False' otherwise.  After 'update'
469-- there will always be exactly one entry for the given key in the table.
470--
471-- 'insert' is more efficient than 'update' if you don't care about
472-- multiple entries, or you know for sure that multiple entries can't
473-- occur.  However, 'update' is more efficient than 'delete' followed
474-- by 'insert'.
475update :: HashTable key val -> key -> val -> IO Bool
476
477update ht@HashTable{ cmp=eq } key val =
478  updatingBucket CanInsert
479    (\bucket -> let (bucket', dels, _) = deleteBucket (eq key) bucket
480                in  ((key,val):bucket', 1+dels, dels/=0))
481    ht key
482
483-- -----------------------------------------------------------------------------
484-- Looking up an entry in the hash table
485
486-- | Looks up the value of a key in the hash table.
487lookup :: HashTable key val -> key -> IO (Maybe val)
488
489lookup ht@HashTable{ cmp=eq } key = do
490  recordLookup
491  (_, _, bucket) <- findBucket ht key
492  let firstHit (k,v) r | eq key k  = Just v
493                       | otherwise = r
494  return (foldr firstHit Nothing bucket)
495
496-- -----------------------------------------------------------------------------
497-- Converting to/from lists
498
499-- | Convert a list of key\/value pairs into a hash table.  Equality on keys
500-- is taken from the Eq instance for the key type.
501--
502fromList :: (Eq key) => (key -> Int32) -> [(key,val)] -> IO (HashTable key val)
503fromList hash list = do
504  table <- new (==) hash
505  sequence_ [ insert table k v | (k,v) <- list ]
506  return table
507
508-- | Converts a hash table to a list of key\/value pairs.
509--
510toList :: HashTable key val -> IO [(key,val)]
511toList = mapReduce id concat
512
513{-# INLINE mapReduce #-}
514mapReduce :: ([(key,val)] -> r) -> ([r] -> r) -> HashTable key val -> IO r
515mapReduce m r HashTable{ tab=ref } = do
516  HT{ buckets=bckts, bmask=b } <- readIORef ref
517  fmap r (mapM (fmap m . readHTArray bckts) [0..b])
518
519-- -----------------------------------------------------------------------------
520-- Diagnostics
521
522-- | This function is useful for determining whether your hash
523-- function is working well for your data set.  It returns the longest
524-- chain of key\/value pairs in the hash table for which all the keys
525-- hash to the same bucket.  If this chain is particularly long (say,
526-- longer than 14 elements or so), then it might be a good idea to try
527-- a different hash function.
528--
529longestChain :: HashTable key val -> IO [(key,val)]
530longestChain = mapReduce id (maximumBy lengthCmp)
531  where lengthCmp (_:x)(_:y) = lengthCmp x y
532        lengthCmp []   []    = EQ
533        lengthCmp []   _     = LT
534        lengthCmp _    []    = GT