{-# LANGUAGE BangPatterns, ForeignFunctionInterface #-}
module Main (main) where
-- some simple helpers
import Control.Monad (when)
import Data.List (isPrefixOf)
-- concurrent renderer with capability-specific scheduling
import Control.Concurrent (killThread)
import GHC.Conc (forkOnIO, numCapabilities)
-- the dependency on mtl is just for this!
import Control.Monad.Trans (liftIO)
-- each worker uses a mutable unboxed array of Bool to know which pixels
-- it has already started to render, to avoid pointless work duplication
import Data.Array.IO (IOUArray, newArray, readArray, writeArray, inRange)
-- the main program thread needs to store some thread-local state
import Data.IORef (newIORef, readIORef, writeIORef)
-- each worker thread keeps a queue of pixels that it needs to render or
-- to continue rendering later
import Data.PriorityQueue (PriorityQueue, newPriorityQueue, enqueue, enqueueBatch, dequeue)
-- poking bytes into memory is dirty, but it's quick and allows use of
-- other fast functions like memset and easy integration with OpenGL
import Foreign (castPtr, mallocBytes, nullPtr, plusPtr, pokeByteOff, Ptr, Word8)
import Foreign.C (CDouble, CInt, CSize)
-- build the interface with GTK to allow more fancy controls later
import Graphics.UI.Gtk
-- use OpenGL to display frequently update images on a textured quad
import Graphics.UI.Gtk.OpenGL
import qualified Graphics.Rendering.OpenGL as GL
import Graphics.Rendering.OpenGL (($=), GLfloat)
-- higher precision arithmetic using libqd
import Numeric.QD.DoubleDouble (DoubleDouble(DoubleDouble))
import Numeric.QD.QuadDouble (QuadDouble(QuadDouble))
import qualified Numeric.QD.DoubleDouble as DD
import qualified Numeric.QD.QuadDouble as QD
import Numeric.QD.FPU.Raw (fpu_fix_start)
-- ugly! but the default realToFrac :: (C)Double -> (C)Double is slooow
import Unsafe.Coerce (unsafeCoerce)
-- some type aliases to shorten things
type B = Word8
type N = Int
type R = Double
-- don't look! this is really really ugly, and should be benchmarked
-- to see how really necessary it is, or at least made into a type class
convert :: (Real a, Fractional b) => a -> b
convert = realToFrac
convertDouble2CDouble :: Double -> CDouble
convertDouble2CDouble !x = unsafeCoerce x
convertCDouble2Double :: CDouble -> Double
convertCDouble2Double !x = unsafeCoerce x
convertDouble2DoubleDouble :: Double -> DoubleDouble
convertDouble2DoubleDouble !x = convertCDouble2DoubleDouble . convertDouble2CDouble $ x
convertCDouble2DoubleDouble :: CDouble -> DoubleDouble
convertCDouble2DoubleDouble !x = DoubleDouble x 0
convertDoubleDouble2Double :: DoubleDouble -> Double
convertDoubleDouble2Double !(DoubleDouble x _) = convertCDouble2Double x
convertDoubleDouble2CDouble :: DoubleDouble -> CDouble
convertDoubleDouble2CDouble !(DoubleDouble x _) = x
{-# RULES "convert/Double2CDouble" convert = convertDouble2CDouble #-}
{-# RULES "convert/CDouble2Double" convert = convertCDouble2Double #-}
{-# RULES "convert/Double2DoubleDouble" convert = convertDouble2DoubleDouble #-}
{-# RULES "convert/CDouble2DoubleDouble" convert = convertCDouble2DoubleDouble #-}
{-# RULES "convert/DoubleDouble2Double" convert = convertDoubleDouble2Double #-}
{-# RULES "convert/DoubleDouble2CDouble" convert = convertDoubleDouble2CDouble #-}
-- this is ugly too: can't use Data.Complex because the qd bindings do
-- not implement some low-level functions properly, leading to obscure
-- crashes inside various Data.Complex functions...
data Complex c = {-# UNPACK #-} !c :+ {-# UNPACK #-} !c deriving (Read, Show, Eq)
-- complex number arithmetic, with extra strictness and cost-centres
instance Num c => Num (Complex c) where
(!(a :+ b)) + (!(c :+ d)) = {-# SCC "C+" #-} ((a + c) :+ (b + d))
(!(a :+ b)) - (!(c :+ d)) = {-# SCC "C-" #-} ((a - c) :+ (b - d))
(!(a :+ b)) * (!(c :+ d)) = {-# SCC "C*" #-} ((a * c - b * d) :+ (a * d + b * c))
negate !(a :+ b) = (-a) :+ (-b)
abs x = error $ "Complex.abs: " ++ show x
signum x = error $ "Complex.signum: " ++ show x
fromInteger !x = fromInteger x :+ 0
-- an extra class for some operations that can be made faster for things
-- like DoubleDouble: probably should have given this a better name
class Num c => Turbo c where
sqr :: c -> c
sqr !x = x * x
twice :: c -> c
twice !x = x + x
-- the default methods are fine for simple primitive types...
instance Turbo Float where
instance Turbo Double where
instance Turbo CDouble where
-- ...and complex numbers
instance Turbo c => Turbo (Complex c) where
sqr !(r :+ i) = (sqr r - sqr i) :+ (twice (r * i))
twice !(r :+ i) = (twice r) :+ (twice i)
-- use the specific implementations for the higher precision types
instance Turbo DoubleDouble where
sqr !x = DD.sqr x
twice !(DoubleDouble a b) = DoubleDouble (twice a) (twice b)
instance Turbo QuadDouble where
sqr !x = QD.sqr x
twice !(QuadDouble a b c d) = QuadDouble (twice a) (twice b) (twice c) (twice d)
-- colour space conversion from HSV [0..1] to RGB [0..1]
-- HSV looks quite 'chemical' to my eyes, need to investigate something
-- better to make it feel more 'natural'
hsv2rgb :: R -> R -> R -> (R, R, R)
hsv2rgb !h !s !v
| s == 0 = (v, v, v)
| h == 1 = hsv2rgb 0 s v
| otherwise =
let !i = floor (h * 6) `mod` 6 :: N
!f = (h * 6) - fromIntegral i
!p = v * (1 - s)
!q = v * (1 - s * f)
!t = v * (1 - s * (1 - f))
in case i of
0 -> (v, t, p)
1 -> (q, v, p)
2 -> (p, v, t)
3 -> (p, q, v)
4 -> (t, p, v)
5 -> (v, p, q)
_ -> (0, 0, 0)
-- compute RGB [0..255] bytes from the results of the complex iterations
-- don't need very high precision for this, as spatial aliasing will be
-- much more of a problem in intricate regions of the fractal
colour :: Complex Double -> Complex Double -> N -> (B, B, B)
colour !(zr:+zi) !(dzr:+dzi) !n =
let -- micro-optimization - there is no log2 function
!il2 = 1 / log 2
!zd2 = sqr zr + sqr zi
!dzd2 = sqr dzr + sqr dzi
-- normalized escape time
!d = (fromIntegral n :: R) - log (log zd2 / log escapeR2) * il2
!dwell = fromIntegral (floor d :: N)
-- final angle of the iterate
!finala = atan2 zi zr
-- distance estimate
!de = (log zd2 * il2) * sqrt (zd2 / dzd2)
!dscale = log de * il2 + 32
-- HSV is based on escape time, distance estimate, and angle
!hue = log d * il2 / 3
!saturation = 0 `max` (log d * il2 / 8) `min` 1
!value = 0 `max` (1 - dscale / 48) `min` 1
!h = hue - fromIntegral (floor hue :: N)
-- adjust saturation to give concentric striped pattern
!k = dwell / 2
!satf = if k - fromIntegral (floor k :: N) >= (0.5 :: R) then 0.9 else 1
-- adjust value to give tiled pattern
!valf = if finala < 0 then 0.9 else 1
-- convert to RGB
(!r, !g, !b) = hsv2rgb h (satf * saturation) (valf * value)
-- convert to bytes
!rr = floor $ 0 `max` 255 * r `min` 255
!gg = floor $ 0 `max` 255 * g `min` 255
!bb = floor $ 0 `max` 255 * b `min` 255
in (rr, gg, bb)
-- a Job stores a pixel undergoing iterations
data Job c = Job !N !N !(Complex c) !(Complex c) !(Complex c) !N
-- the priority of a Job is how many iterations have been computed:
-- so 'fresher' pixels drop to the front of the queue in the hope of
-- avoiding too much work iterating pixels that will never escape
priority :: Job c -> N
priority !(Job _ _ _ _ _ n) = n
-- add a job to a work queue, taking care not to duplicate work
-- there is no race condition here as each worker has its own queue
addJob :: RealFloat c => N -> N -> Complex c -> N -> PriorityQueue IO (Job c) -> IOUArray (N,N) Bool -> N -> N -> IO ()
addJob !w !h !c !zoom todo sync !i !j = do
already <- readArray sync (j, i)
when (not already) $ do
writeArray sync (j, i) True
enqueue todo $! Job i j (coords w h c zoom i j) 0 0 0
-- spawns a new batch of workers to render an image
-- returns an action that stops the rendering
renderer :: (Turbo c, RealFloat c) => ((N,N),(N,N)) -> (N -> N -> B -> B -> B -> IO ()) -> Complex c -> N -> IO (IO ())
renderer rng output !c !zoom = do
workerts <- mapM (\w -> forkOnIO w $ worker rng c zoom output w) [ 0 .. workers - 1 ]
return $ do
mapM_ killThread workerts
-- compute the Complex 'c' coordinate for a pixel in the image
coords :: RealFloat c => N -> N -> Complex c -> N -> N -> N -> Complex c
coords !w !h !c !zoom !i !j = c + ( (fromIntegral (i - w`div`2) * k)
:+(fromIntegral (h`div`2 - j) * k))
where !k = convert (1/2^^zoom :: Double)
-- start rendering pixels from the edge of the image
-- the Mandelbrot Set and its complement are both simply-connected
-- discounting spatial aliasing any point inside the boundary that is
-- in the complement is 'reachable' from a point on the boundary that
-- is also in the complement - probably some heavy math involved to
-- prove this though
-- note: this implicitly depends on the spread values below - it's
-- necessary for each interlaced subimage (one per worker) to have
-- at least a one pixel deep border
border :: N -> N -> [(N, N)]
border !w !h = concat $
[ [ (j, i) | i <- [ 0 .. w - 1 ], j <- [ 0 .. workers - 1 ] ]
, [ (j, i) | j <- [ 0 .. h - 1 ], i <- [ 0 .. workers - 1 ] ]
, [ (j, i) | j <- [ 0 .. h - 1 ], i <- [ w - workers .. w - 1 ] ]
, [ (j, i) | i <- [ 0 .. w - 1 ], j <- [ h - workers .. h - 1 ] ]
]
-- the worker thread enqueues its border and starts computing iterations
worker :: (Turbo c, RealFloat c) => ((N,N),(N,N)) -> Complex c -> N -> (N -> N -> B -> B -> B -> IO ()) -> N -> IO ()
worker rng@((y0,x0),(y1,x1)) !c !zoom output !me = do
sync <- newArray rng False
queue <- newPriorityQueue priority
let addJ = addJob w h c zoom queue sync
js = filter mine (border w h)
w = x1 - x0 + 1
h = y1 - y0 + 1
mapM_ (flip (writeArray sync) True) js
enqueueBatch queue (map (\(j,i) -> Job i j (coords w h c zoom i j) 0 0 0) js)
compute rng addJ output queue
where mine (j, _) = j `mod` workers == me -- another dependency on spread
-- the compute engine pulls pixels from the queue until there are no
-- more, and calculates a batch of iterations for each
compute :: (Turbo c, RealFloat c) => ((N,N),(N,N)) -> (N -> N -> IO ()) -> (N -> N -> B -> B -> B -> IO ()) -> PriorityQueue IO (Job c) -> IO ()
compute rng addJ output queue = do
mjob <- dequeue queue
case mjob of
Just (Job i j c z dz n) -> do
let -- called when the pixel escapes
done' !(zr:+zi) !(dzr:+dzi) !n' = {-# SCC "done" #-} do
let (r, g, b) = colour (convert zr :+ convert zi) (convert dzr :+ convert dzi) n'
output i j r g b
-- a wavefront of computation spreads to neighbouring pixels
sequence_
[ addJ x y
| u <- spreadX
, v <- spreadY
, let x = i + u
, let y = j + v
, inRange rng (y, x)
]
-- called when the pixel doesn't escape yet
todo' !z' !dz' !n' = {-# SCC "todo" #-} enqueue queue $! Job i j c z' dz' n'
calculate c limit z dz n done' todo'
compute rng addJ output queue
Nothing -> return () -- no pixels left to render, so finish quietly
-- the raw z->z^2+c calculation engine
-- also computes the derivative for distance estimation calculations
-- this function is crucial for speed, too much allocation will slooow
-- everything down severely
calculate :: (Turbo c, RealFloat c) => Complex c -> N -> Complex c -> Complex c -> N -> (Complex c -> Complex c -> N -> IO ()) -> (Complex c -> Complex c -> N -> IO ()) -> IO ()
calculate !c !m0 !z0 !dz0 !n0 done todo = go m0 z0 dz0 n0
where
go !m !z@(zr:+zi) !dz !n
| not (sqr zr + sqr zi < er2) = done z dz n
| m <= 0 = todo z dz n
| otherwise = go (m - 1) (sqr z + c) (let !zdz = z * dz in twice zdz + 1) (n + 1)
!er2 = convert escapeR2
-- dispatch to different instances of renderer depending on required precision
-- if zoom is low, single precision Float is ok, but as soon as pixel spacing
-- gets really small, it's necessary to increase it
-- it's probably not even worth using Float - worth benchmarking this and
-- also the DD and QD types (which cause a massively noticeable slowdown)
renderer' :: Real c => ((N,N),(N,N)) -> (N -> N -> B -> B -> B -> IO ()) -> Complex c -> N -> IO (IO ())
renderer' rng output !c !zoom
| zoom < 20 = {-# SCC "rF" #-} renderer rng output (f c :: Complex Float ) zoom
| zoom < 50 = {-# SCC "rD" #-} renderer rng output (f c :: Complex Double ) zoom
| zoom < 100 = {-# SCC "rDD" #-} renderer rng output (f c :: Complex DoubleDouble) zoom
| otherwise = {-# SCC "rQD" #-} renderer rng output (f c :: Complex QuadDouble ) zoom
where f !(cr :+ ci) = convert cr :+ convert ci
-- command line arguments: currently only initial window dimensions
data Args = Args{ aWidth :: N, aHeight :: N }
-- and the defaults are suitable for PAL DVD rendering, if that should
-- come to pass in the future
defaultArgs :: Args
defaultArgs = Args{ aWidth = 788, aHeight = 576 }
-- braindead argument parser: latest argument takes priority
-- probably should use Monoid instances for this stuff
combineArgs :: Args -> String -> Args
combineArgs a0 s
| "--width=" `isPrefixOf` s = a0{ aWidth = read $ "--width=" `dropPrefix` s }
| "--height=" `isPrefixOf` s = a0{ aHeight = read $ "--height=" `dropPrefix` s }
| "-w=" `isPrefixOf` s = a0{ aWidth = read $ "-w=" `dropPrefix` s }
| "-h=" `isPrefixOf` s = a0{ aHeight = read $ "-h=" `dropPrefix` s }
| otherwise = a0
-- this is a bit silly, especially with the duplicated string literals..
dropPrefix :: String -> String -> String
dropPrefix p s = drop (length p) s
-- round up to nearest power of two
-- this will probably explode when n gets large, but it's only used
-- for OpenGL texture dimensions so you'll run out of memory first
roundUp2 :: N -> N
roundUp2 n = head . dropWhile (< n) . iterate (2*) $ 1
-- the main program!
main :: IO ()
main = do
args <- foldl combineArgs defaultArgs `fmap` unsafeInitGUIForThreadedRTS
let width = aWidth args
height = aHeight args
size = roundUp2 (width `max` height)
rng = ((0, 0), (height - 1, width - 1))
_ <- initGL
glconfig <- glConfigNew [ GLModeRGBA, GLModeDouble ]
canvas <- glDrawingAreaNew glconfig
widgetSetSizeRequest canvas width height
-- allocate some image bytes and clear with white
imgdata <- mallocBytes $ width * height * 3
_ <- memset (castPtr imgdata) 255 (fromIntegral $ height * width * 3)
let output x y r g b = do
let p = imgdata `plusPtr` ((y * width + x) * 3)
pokeByteOff p 0 r
pokeByteOff p 1 g
pokeByteOff p 2 b
window <- windowNew
eventb <- eventBoxNew
vbox <- vBoxNew False 0
hbox <- hBoxNew False 0
statusRe <- labelNew Nothing
statusIm <- labelNew Nothing
statusZo <- labelNew Nothing
boxPackStart vbox eventb PackGrow 0
boxPackStart vbox hbox PackGrow 0
boxPackStart hbox statusRe PackGrow 0
boxPackStart hbox statusIm PackGrow 0
boxPackStart hbox statusZo PackGrow 0
let updateStatus re im zo = do -- this updates the status bar
labelSetText statusRe (reshow $ show re)
labelSetText statusIm (reshow $ show im)
labelSetText statusZo (show zo)
set window [ containerBorderWidth := 0, containerChild := vbox, windowResizable := False ]
set eventb [ containerBorderWidth := 0, containerChild := canvas ]
-- mouse motion events not sent by default as there can a flood
widgetAddEvents eventb [PointerMotionMask]
-- dirty hack to set FPU control words as recommended by libqd docs
-- because it relies on 64bit doubles and some FPU use 80bits inside
mapM_ (flip forkOnIO $ fpu_fix_start nullPtr) [ 0 .. numCapabilities - 1 ]
-- start the renderer for the first time
stop0 <- renderer' rng output c0 zoom0
-- save the initial state
sR <- newIORef (c0, zoom0, stop0)
-- when the mouse moves, update the status bar with the current coords
_ <- eventb `on` motionNotifyEvent $ {-# SCC "cbMo" #-} tryEvent $ do
(x, y) <- eventCoordinates
liftIO $ do
(c, zoom, _stop) <- readIORef sR
let _c'@(re' :+ im') = c + ((convert x :+ convert (-y)) - (fromIntegral width :+ fromIntegral (-height)) * (0.5 :+ 0)) * ((1/2^^zoom) :+ 0)
updateStatus re' im' zoom
-- when the mouse button is pressed, center and zoom in
_ <- eventb `on` buttonPressEvent $ {-# SCC "cbEv" #-} tryEvent $ do
LeftButton <- eventButton
(x, y) <- eventCoordinates
liftIO $ do
(c, zoom, stop) <- readIORef sR
stop
_ <- memset (castPtr imgdata) 255 (fromIntegral $ height * width * 3)
let c'@(re' :+ im') = c + ((convert x :+ convert (-y)) - (fromIntegral width :+ fromIntegral (-height)) * (0.5 :+ 0)) * ((1/2^^zoom) :+ 0)
zoom' = zoom + 1
stop' <- renderer' rng output c' zoom'
writeIORef sR (c', zoom', stop')
updateStatus re' im' zoom'
-- need to set up OpenGL stuff in callback just because that's how...
_ <- onRealize canvas $ {-# SCC "cbRz" #-} withGLDrawingArea canvas $ \_ -> do
GL.clearColor $= (GL.Color4 0.0 0.0 0.0 0.0)
GL.matrixMode $= GL.Projection
GL.loadIdentity
GL.ortho 0.0 1.0 0.0 1.0 (-1.0) 1.0
GL.drawBuffer $= GL.BackBuffers
-- create a new texture and pre-allocate it to a square 2^n size for speed
[tex] <- GL.genObjectNames 1
GL.texture GL.Texture2D $= GL.Enabled
GL.textureBinding GL.Texture2D $= Just tex
GL.texImage2D Nothing GL.NoProxy 0 GL.RGB' (GL.TextureSize2D (fromIntegral size) (fromIntegral size)) 0 (GL.PixelData GL.RGB GL.UnsignedByte nullPtr)
GL.textureFilter GL.Texture2D $= ((GL.Nearest, Nothing), GL.Nearest)
GL.textureWrapMode GL.Texture2D GL.S $= (GL.Repeated, GL.ClampToEdge)
GL.textureWrapMode GL.Texture2D GL.T $= (GL.Repeated, GL.ClampToEdge)
-- time to draw the image: upload to the texture and draw a quad
_ <- onExpose canvas $ {-# SCC "cbEx" #-} \_ -> do
withGLDrawingArea canvas $ \glwindow -> do
let v :: GLfloat -> GLfloat -> GLfloat -> GLfloat -> IO ()
v tx ty vx vy = GL.texCoord (GL.TexCoord2 tx ty) >> GL.vertex (GL.Vertex2 vx vy)
w = fromIntegral width
h = fromIntegral height
sx = fromIntegral width / fromIntegral size
sy = fromIntegral height / fromIntegral size
GL.clear [ GL.ColorBuffer ]
GL.texSubImage2D Nothing 0 (GL.TexturePosition2D 0 0) (GL.TextureSize2D w h) (GL.PixelData GL.RGB GL.UnsignedByte imgdata)
GL.renderPrimitive GL.Quads $ do
v 0 sy 0 0 >> v 0 0 0 1 >> v sx 0 1 1 >> v sx sy 1 0
glDrawableSwapBuffers glwindow
return True
-- need an exit strategy
_ <- onDestroy window mainQuit
-- make sure the expose callback gets called regularly (5fps)
_ <- timeoutAdd (widgetQueueDraw canvas >> return True) 200
-- and we're off!
widgetShowAll window
mainGUI
-- which neighbours to activate once a pixel has escaped
-- there are essentially two choices, with x<->y swapped
-- choose greater X spread because images are often wider than tall
-- other schemes wherein the spread is split in both directions
-- might benefit appearance with large worker count, but too complicated
spreadX, spreadY :: [ N ]
spreadX = [ -workers, 0, workers ]
spreadY = [ -1, 0, 1 ]
-- number of worker threads
-- use as many worker threads as capabilities, with the workers
-- distributed 1-1 onto capabilities to maximize CPU utilization
workers :: N
workers = numCapabilities
-- iteration limit per pixel
-- at most this many iterations are performed on each pixel before it
-- is shunted to the back of the work queue
-- this should be tuneable to balance display updates against overheads
limit :: N
limit = (2^(11::N)-1)
-- initial center coordinates
-- using the maximum precision available from the start for this makes
-- sure that nothing weird happens when precision gets close to the edge
c0 :: Complex QuadDouble
c0 = 0
-- initial zoom level
-- neighbouring pixel are 2^(-zoom) units apart
-- the initial zoom level should probably depend on initial image size
zoom0 :: N
zoom0 = 6
-- escape radius for fractal iteration calculations
-- once the complex iterate exceeds this, it's never coming back
-- theoretically escapeR = 2 would work
-- but higher values like this give a significantly smoother picture
escapeR, escapeR2 :: R
escapeR = 65536
escapeR2 = escapeR * escapeR
-- import standard C library memset for clearing images efficiently
-- previous implementation used pokeArray ... (replicate ...) ...
-- which had a nasty habit of keeping the list around in memory
foreign import ccall unsafe "string.h memset" c_memset :: Ptr Word8 -> CInt -> CSize -> IO (Ptr Word8)
memset :: Ptr Word8 -> Word8 -> CSize -> IO (Ptr Word8)
memset p w s = c_memset p (fromIntegral w) s
-- convert scientific notation like "-4.12345600000000e-03"
-- into human-readable numbers like "-0.004123456"
-- do it by string manipulation as QuadDouble has a lot of precision
-- and the show instance for QuadDouble gives the problematic form...
reshow :: String -> String
reshow s =
let (front, 'e':rear) = break (=='e') s
(sign, mantissa) = case front of
'-':m -> (True, m)
m -> (False, m)
(big, '.':small) = break (=='.') mantissa
expo = read (dropWhile (=='+') rear) + length big
digits = if expo < 0 then "0." ++ replicate (-expo) '0' ++ big ++ small
else let (x,y) = splitAt expo (big ++ tail small)
in case x of
[] -> "0." ++ y
x' -> x' ++ "." ++ y
in reverse . dropWhile (=='.') . dropWhile (=='0') . reverse . (if sign then ('-':) else id) $ digits