Data/Atom/UF.hs

{-# LANGUAGE BangPatterns #-}
{-# OPTIONS_GHC -funbox-strict-fields #-}
-- | 
-- Module      : Data.Atom.UF
-- Copyright   : (c) Thomas Schilling 2010
-- License     : BSD-style
--
-- Maintainer  : nominolo@gmail.com
-- Stability   : experimental
-- Portability : portable
--
-- Symbols without a central symbol table.
--
-- Symbols provide the following efficient operations:
--
--  - /O(1)/ equality comparison (in practise)
--  - /O(1)/ ordering comparison (in practise)
--  - /O(n)/ creation
--
-- This can be implemented by using a global variable mapping strings
-- to symbols and a counter assigning ids to symbols.  However, this
-- has two problems:
--
--  1. It has a space leak.  No symbols can ever be removed from this
--     table.  For example, if we add the symbol @\"foo\"@ the first
--     time it might get assigned id 1, if we then delete it and
--     insert it again it might get assigned id 42.  However, there
--     may still be symbols in memory which got assigned id 1.
--     Instead, symbols should be garbage collected like other data.
--     Using weak pointers has bad effects on performance due to
--     garbage collector overhead.
--
--  2. It is not reliable to compare symbols created using different
--     symbol tables.  They would most likely get assigned different
--     ids.
--
-- This implementation of symbols allows *optional* use of a symbol
-- table.  If a symbol table is used, this implementation will tend to
-- use less memory and its operations will be a little bit faster at
-- the beginning.  For longer runs, it won't make a big difference
-- though, since the representation is self-optimising.
--
-- Inspired by Richard O'Keefe's message to Erlang's eeps mailing list
-- <http://www.erlang.org/cgi-bin/ezmlm-cgi/5/057>, which in turn was
-- inspired by the Logix implementation of Flat Concurrent Prolog.
--
--
-- * Implementation
--
-- Each symbol is represented a pointer to the symbol info, which
-- consists of:
--
--   * a 'String'
--   * a 'Hash'
--   * a null-able parent pointer to an equivalent symbol info
--
-- Creating the same symbol twice will at first be represented as two
-- different entities.
--
-- @
--            .----+-------+-----.
--   A -----> | 42 | "foo" | nil |
--            '----+-------+-----'
--   B --.
--       '--> .----+-------+-----.
--   C -----> | 42 | "foo" | nil |
--            '----+-------+-----'
-- @
--
-- (Note that @A@, @B@ and @C@ are @IORefs@.)
--
-- When comparing @A@ and @B@ we use the following properties:
--
--  1. If @A@ and @B@ are identical then they must be equal.
--  
--  2. If they point to the same object, they must equal.
--  
--  3. If they have different hashes, they are different.
--
-- Unless there is a hash collision, we can decide equality and
-- ordering for all symbols that have been built with the same hash
-- table.
--
-- If the two objects have no parent, have the same hash, and the same
-- string, we now make one the first the parent of the other and
-- update the pointer of @B@ accordingly.  If there are no references
-- to the second object left it can now be garbage collected.
--
-- If an object already has a parent pointer we follow each object's
-- parents to the roots and compare the roots.  This process might
-- again result in updates to @A@ or @B@ and various parent pointers.
--
-- In the example above, after @A == B@ we have:
--
-- @
--            .----+-------+-----.
--   A -----> | 42 | "foo" | nil |
--       .--> '----+-------+-----'
--   B --'                    ^
--            .----+-------+--|--.
--   C -----> | 42 | "foo" |  *  |
--            '----+-------+-----'
-- @
--
-- After @C == A@ or @C == B@ we have.
--
-- @
--   A -----> .----+-------+-----.
--       .--> | 42 | "foo" | nil |
--   B --'.-> '----+-------+-----'
--        |                   ^
--        |   .----+-------+--|--.
--   C ---'   | 42 | "foo" |  *  |
--            '----+-------+-----'
-- @
--
-- The second object will now be garbage collected.
--
-- In fact, after the first @A == B@, the remaining updates could use
-- some help from the garbage collector.  This could be done by
-- somehow forcibly (and unsafely) replacing the second object by an
-- update frame and then rely on the GC's indirection shortening
-- feature.  This is /very/ unsafe, since some code may rely \"know\"
-- that the object is already evaluated.  E.g., C's pointer could be
-- tagged (c.f. \"Faster Laziness Using Dynamic Pointer Tagging\").
-- It /might/ work if we can match the physical layout of both
-- structures, but it's equally likely that hell freezes over, so I'll
-- leave that as an exercise for more braver hackers.
--
-- * TODO
--
--  - generalise to arbitrary hashable objects.  need not be
--    restricted to 'String'.
--
--  - make thread-safe.  (we only need a lock for the uncommon cases)
--
--  - make sure the pointer update code is correct and has no bad
--    cases
--
--  - implement IntMap variant\/wrapper that respects that two
--    different objects may have the same key (however unlikely).
-- 
module Data.Atom.UF 
  ( Symbol, intern, internInto, SymTab(..) )
where

import Data.Word ( Word32 )
import Data.Char ( ord )
import Data.Bits ( xor )
import Data.IORef
import System.IO.Unsafe
import Control.Monad -- ( unless )
import System.Mem.Weak
import System.Mem
import Data.Maybe

-- -------------------------------------------------------------------
-- Public API:

-- | A symbol.
newtype Symbol = Symbol (IORef SymbolInfo)
instance Eq Symbol where x == y = cmpSymbol x y == EQ
instance Ord Symbol where compare = cmpSymbol
instance Show Symbol where show = showSym

-- | Create a new local symbol.  For best performance use
-- 'internInto' together with a symbol table / map.
intern :: String -> Symbol

class SymTab s where
  lookupSymbol :: s -> String -> Maybe Symbol
  insertSymbol :: String -> Symbol -> s -> s

-- | Insert a symbol into an existing table.
internInto :: SymTab s => s -> String -> (s, Symbol)

-- -------------------------------------------------------------------
-- Internals

data SymbolInfo =
  SymInfo {-# UNPACK #-} !Word32  -- hash
          {-# UNPACK #-} !(IORef Link) -- parent [really unpack]?
          String

type Link = Maybe SymbolInfo


internInto st str =
  case lookupSymbol st str of
    Just sym -> (st, sym)
    _        -> let sym = intern str in
                (insertSymbol str sym st, sym)

showSym :: Symbol -> String
showSym (Symbol r) = unsafePerformIO $ do
  -- dupable/inline is fine, too, since the string never changes
  (SymInfo _ _ str) <- readIORef r
  return str

intern s = unsafePerformIO $ do
  lnk <- newIORef Nothing
  r <- newIORef $ SymInfo (hash s) lnk s
  return (Symbol r)

mkSymbolInfo :: String -> SymbolInfo
mkSymbolInfo s = unsafePerformIO $ do
  lnk <- newIORef Nothing
  return $ SymInfo (hash s) lnk s

cmpSymbol :: Symbol -> Symbol -> Ordering
cmpSymbol (Symbol r1) (Symbol r2)
  | r1 == r2 = EQ
  | otherwise = unsafePerformIO $ do
      -- We only read.  It should be safe to use unsafeInlineIO for
      -- the two reads.
      sym1@(SymInfo h1 l1 s1) <- readIORef r1
      sym2@(SymInfo h2 l2 s2) <- readIORef r2
      case h1 `compare` h2 of
        -- If the hashes are different they cannot be the same symbol
        LT -> return LT
        GT -> return GT
        EQ
         | sameSym sym1 sym2 ->
          -- The two references are not the same, but they point to
          -- the same object.  That's fine, we can't optimise any
          -- further.
           return EQ

        -- END OF COMMON CASE
        -- 
        -- If the symbols have been built using the same symbol table
        -- we will only reach this case if we have a hash collision or
        -- the symbols were built from different symbol tables.
        --
        -- TODO: Extract into NOINLINE function, wrap unsafePerformIO,
        -- and use an MVar-based lock.

         | otherwise -> do
          -- The hashes are the same.  It could be a collision, or the
          -- symbol was created using a different symbol table.
          --
          -- Case 1: The symbols have already be joined, but this
          -- Symbol's IORef still points to the old version.  We can
          -- determine this by following the union/find structure.
          rep1 <- repr sym1
          rep2 <- repr sym2
          let string_cmp = s1 `compare` s2  -- lazy!
          if sameSym rep1 rep2 || string_cmp == EQ then do
             -- They should in fact be the same symbol.  Update the
             -- atoms and the symbol infos if necessary.
             -- TODO: Use MVar / lock.
             unless (sameSym sym1 rep1) $ do
               writeIORef r1 rep1
               writeIORef l1 (Just rep1)  -- path shortening
             unless (sameSym sym2 rep1) $ do
               writeIORef r2 rep1
               writeIORef l2 (Just rep1)
             return EQ
            else do
              -- They are not the same, and they shouldn't
              return string_cmp
{-# NOINLINE cmpSymbol #-}

-- We abuse the fact that IORefs give us an identity (i.e., observable
-- sharing) and that we need the IORef anyway.
sameSym :: SymbolInfo -> SymbolInfo -> Bool
sameSym (SymInfo _ r1 _) (SymInfo _ r2 _) = r1 == r2

repr :: SymbolInfo -> IO SymbolInfo
repr sym@(SymInfo _ r _) = do
  parent <- readIORef r   -- TODO: perform path shortening.
  case parent of
    Nothing -> return sym
    Just sym' -> repr sym'

test1 = do
  let s1@(Symbol r1) = intern "foo"
      s2@(Symbol r2) = intern "foo"
  print $ r1 == r2   -- should be False
                     
  -- create a weak reference to the second symbol, so we can observe
  -- when it is garbage collected
  w <- mk_weak =<< readIORef r2

  print $ s1 == s2   -- should print True
  print =<< liftM2 sameSym (readIORef r1) (readIORef r2) -- should print True
  putStrLn "GCing"
  performGC          -- this should print goodbye, representing the
                     -- fact that the second symbol has been garbage
                     -- collected.
  print . isJust =<< deRefWeak w  -- should print False (object has been collected)
 where
   mk_weak o = mkWeakPtr o (Just (putStrLn "goodbye"))

-- -------------------------------------------------------------------

-- Fowler / Noll / Vo (FNV) hash.  Original code expected 'unsigned
-- char' input.  Don't know whether it behaves worse for unicode
-- chars.
hash :: String -> Word32
hash str = go magic_start (map ord str)
  where
    magic_start = 2166136261 :: Word32
    go :: Word32 -> [Int] -> Word32
    go !h [] = h
    go !h (c:cs) =
        go ((h * 16777619) `xor` fromIntegral c) cs