{-# LANGUAGE BangPatterns, CPP #-} {-# OPTIONS_GHC -funbox-strict-fields #-} {-| Module : Data.Atom.UF Copyright : (c) Thomas Schilling 2010 License : BSD-style Maintainer : nominolo@gmail.com Stability : experimental Portability : portable Symbols without a central symbol table. Symbols provide the following efficient operations: - /O(1)/ equality comparison (in practise) - /O(1)/ ordering comparison (in practise) - /O(n)/ creation where /N/ is the size of the symbol descriptor. Many implementations often have the additional property that each symbol descriptor only exists once in memory. This implementation slightly relaxes this property: - A symbol descriptor is guaranteed to exists only once in memory if it has been created using the same symbol table. Furthermore, if two symbols created from different symbol tables are compared and their descriptors turn out to be equal, the symbols will share the descriptor after the comparison. This allows the following additional properties not present in conventional implementations: - No space leak. The symbol table can be discarded at any time. - Symbols created using different symbol tables can be compared reliably. - No global lock. (TODO: Well we might need one in the case of hash-collisions, but a lock-free implementation might be possible.) Inspired by Richard O'Keefe's message to Erlang's eeps mailing list , which in turn was inspired by the Logix implementation of Flat Concurrent Prolog. -} module Data.Atom.UF ( -- * Symbols -- Symbol, intern, internInto, SymTab(..), symbolHash -- * Implementation -- -- $impl -- ) where import Data.Word ( Word64 ) import Data.IORef import System.IO.Unsafe import Control.Monad ( unless ) import Control.DeepSeq #ifndef NDEBUG -- For testing: import Control.Monad ( liftM2 ) import System.Mem.Weak import System.Mem import Data.Digest.Murmur64 import Data.Maybe ( isJust ) #endif -- ------------------------------------------------------------------- -- Public API: -- | A symbol. -- -- Note that the ordering on @a@ is /not/ preserved on @Symbol a@. -- Symbols are ordered by their hashes, and only if the hashes are -- equal will the ordering on @a@ be used. We have: -- -- @ -- x == y ==> intern x == intern y -- -- let sx = intern x -- sy = intern y -- in -- (sx < sy) == ((symbolHash sy < symbolHash sx) || -- symbolHash sy == symbolHash sx && x < y) -- @ data Symbol a = Symbol {-# UNPACK #-} !Word64 -- hash {-# UNPACK #-} !(IORef (SymbolInfo a)) -- | Returns the hash of the symbol. symbolHash :: Symbol a -> Word64 symbolHash (Symbol h _) = h instance Ord a => Eq (Symbol a) where x == y = cmpSymbol x y == EQ instance Ord a => Ord (Symbol a) where compare = cmpSymbol instance Show a => Show (Symbol a) where show = show . symInfo -- | Create a new local symbol. For best performance use -- 'internInto' together with a symbol table / map. intern :: (a -> Word64) -> a -> Symbol a class SymTab s where lookupSymbol :: s a -> a -> Maybe (Symbol a) insertSymbol :: a -> (Symbol a) -> s a -> s a -- | Insert a symbol into an existing table. internInto :: (SymTab s) => (a -> Word64) -> s a -> a -> (s a, Symbol a) -- ------------------------------------------------------------------- -- Internals newtype SymbolInfo a = SymInfo (IORef (Link a)) type Link a = Either a (SymbolInfo a) internInto hash_fn st str = case lookupSymbol st str of Just sym -> (st, sym) _ -> let sym = intern hash_fn str in (insertSymbol str sym st, sym) intern hash_fn s = unsafePerformIO $ do info <- newIORef (Left s) info' <- newIORef (SymInfo info) return (Symbol (hash_fn s) info') cmpSymbol :: Ord a => Symbol a -> Symbol a -> Ordering cmpSymbol (Symbol h1 i1) (Symbol h2 i2) | i1 == i2 = EQ | otherwise = case h1 `compare` h2 of EQ -> uncommon_case -- not identical, but same hash ans -> ans where {-# NOINLINE uncommon_case #-} uncommon_case = unsafePerformIO $ do -- get representative element (performs path shortening) (rep1@(SymInfo rr1), s1) <- repr' i1 (rep2@(SymInfo rr2), s2) <- repr' i2 if rep1 === rep2 then return EQ else case s1 `compare` s2 of EQ -> do -- they should be equal! writeIORef rr2 (Right rep1) writeIORef i2 rep1 return EQ ans -> return ans -- We abuse the fact that IORefs give us an identity (i.e., observable -- sharing) and that we need the IORef anyway. sameSym :: SymbolInfo a -> SymbolInfo a -> Bool sameSym (SymInfo r1) (SymInfo r2) = r1 == r2 (===) = sameSym symInfo :: Symbol a -> a symInfo (Symbol _ r) = unsafePerformIO $ do fmap snd (repr' r) repr' :: IORef (SymbolInfo a) -> IO (SymbolInfo a, a) repr' r = do info <- readIORef r (root_info, str) <- go info unless (root_info === info) $ writeIORef r root_info return (root_info, str) where go si@(SymInfo ir) = do i <- readIORef ir case i of Left str -> return (si, str) Right si' -> do (root_info, str) <- go si' unless (si' === root_info) $ writeIORef ir (Right root_info) -- is Left possible here? return (root_info, str) ---------------------------------------------------------- -- Tests #ifndef NDEBUG -- requires import Data.Digest.Murmur32 test1 = do let h = asWord64 . hash64 s1@(Symbol _ r1) = intern h "foo" s2@(Symbol _ r2) = intern h "foo" print $ r1 == r2 -- should be False -- create a weak reference to the second symbol, so we can observe -- when it is garbage collected w <- mk_weak =<< readIORef r2 print $ s1 == s2 -- should print True print =<< liftM2 sameSym (readIORef r1) (readIORef r2) -- should print True putStrLn "GCing" performGC -- this should print goodbye, representing the -- fact that the second symbol has been garbage -- collected. print . isJust =<< deRefWeak w -- should print False (object has been collected) where mk_weak o = mkWeakPtr o (Just (putStrLn "goodbye")) #endif {- $doc1 test 1 -} {- $impl Each symbol is represented a mutable pointer to the symbol info and a hash. The symbol info might itself be a pointer to another (equal) symbol info. When creating a new symbol (without looking it up in a symbol table), we compute its hash and create a new symbol info. > +----+---+ +-------+ > A: | 42 | *-----> | "foo" | > +----+---+ +-------+ We now know the following: 1. If two symbols have the same reference, they are equal. (The 'Eq' instance on 'IORef's implements observable sharing.) 2. If two symbols have a different hash, they are different. If neither of the above is true we either have a hash collision or the two objects are equal but were created using different symbol tables. Let's consider the latter case: > +----+---+ +-------+ > A: | 42 | *-----> | "foo" | > +----+---+ +-------+ > > +----+---+ +-------+ > B: | 42 | *-----> | "foo" | > +----+---+ +-------+ We follow the symbol pointers and realise that the symbol descriptors are equal. We thus decide for one of them to be the canonical symbol descriptor and update the pointers: > +----+---+ +-------+ > A: | 42 | *-----> | "foo" | > +----+---+ .-> +-------+ > | ^ > +----+---+ | +---|---+ > B: | 42 | *---' | * | > +----+---+ +-------+ We change the other symbol descriptor to be a pointer to the canonical descriptor, because there may be other pointers to this symbol descriptor. Otherwise, the old symbol descriptor becomes garbage. We now have only one @\"foo\"@ object left. We can add a third rule for equality: - If, following all pointers, two symbol descriptors are the same, then the two symbols are equal. If this is not the case (e.g., in the case of a hash collision) we call the 'compare' function of the symbol descriptor. A good hash function is therefore important since in the case of a hash collision we will always have to call the 'compare' function of the symbol descriptor. ** Hash Function Assuming a good hash function (i.e., the hash is indistinguishable from a randomly generated number) we can use the birthday paradox to calculate the probability of a hash collision: @ collision_prob :: Integer -> Integer -> Double collision_prob key_bits items = 1 - exp (fromIntegral (-items * (items - 1)) / fromIntegral (2 * key_space)) where key_space = 2 ^ key_bits :: Integer @ E.g., @collision_prob 32 50000 == 0.2525...@ means that with 32 bit hashes and 50000 symbols, there is a 25 percent chance of a hash collision. ** Path Shortening If symbols from several symbol tables are joined repeatedly, its symbol infos may develop into long chains. For this reason we update all pointers while following them. That is, given we have the following state: > X+-sym+---+ A+-nfo-+ B+-------+ > | 42 | *------->| *------> | "foo" | > +----+---' +-----' '-------' > > Y+-sym+---+ C+-nfo-+ D+-------+ > | 42 | *------->| *------> | "foo" | > +----+---' +-----' '-------' after @x \`compare\` y@ we have. > X+-sym+---+ B+-------+ A+-nfo-+ > | 42 | *-------> | "foo" |<-------* | > `----+---' +--> '-------' `-----' > | ^ ^------+ > Y+-sym+---+ | C+-nfo-+| D+---|---+ > | 42 | *----' | *---' | * | > `----+---' `-----' '-------' These references can be updated concurrently and without a lock since their information content does not change. That is, the state > X.-sym+---. A.-nfo-. B.-------. > | 42 | *------->| *------> | "foo" | > `----+---' `-----' '-------' Is semantically equivalent to the state: > X.-sym+---. A.-nfo-. B.-------. > | 42 | *----. | *------> | "foo" | > +----+---+ | +-----+ .-> +-------+ > '-----------' * TODO - verify thread-safety - make sure the pointer update code is correct and has no bad cases - implement IntMap variant\/wrapper that respects that two different objects may have the same hash (however unlikely). -}