{-# LANGUAGE CPP #-} -------------------------------------------------------------------------------- -- -- Module : MatrixMul -- Copyright : (c) 2009 Trevor L. McDonell -- License : BSD -- -- Matrix multiplication using runtime interface and execution control instead -- of calling C functions via the FFI. -- -------------------------------------------------------------------------------- module Main where #include "matrix_mul.h" -- Friends import Time import RandomVector -- System import Data.Array import System.IO import Foreign import qualified Foreign.CUDA as CUDA -------------------------------------------------------------------------------- -- Reference -------------------------------------------------------------------------------- matMult :: (Num e, Storable e) => Matrix e -> Matrix e -> IO (Matrix e) matMult mx my = do x <- unsafeFreeze mx y <- unsafeFreeze my let ((li, lj), (ui, uj)) = bounds x ((li',lj'),(ui',uj')) = bounds y resBnds | (lj,uj) == (li',ui') = ((li,lj'),(ui,uj')) | otherwise = error "matrix dimensions must agree" newListArray resBnds [sum [x!(i,k) * y!(k,j) | k <- range (lj,uj)] | i <- range (li,ui) , j <- range (lj',uj') ] -------------------------------------------------------------------------------- -- CUDA -------------------------------------------------------------------------------- matMultCUDA :: (Num e, Storable e) => Matrix e -> Matrix e -> IO (Matrix e) matMultCUDA xs' ys' = doMult undefined xs' ys' where doMult :: (Num e', Storable e') => e' -> Matrix e' -> Matrix e' -> IO (Matrix e') doMult dummy xs ys = do -- Setup matrix parameters -- ((li, lj), (ui, uj)) <- getBounds xs ((li',lj'),(ui',uj')) <- getBounds ys let wx = rangeSize (lj,uj) hx = rangeSize (li,ui) wy = rangeSize (lj',uj') hy = rangeSize (li',ui') resBnds | wx == hy = ((li,lj'),(ui,uj')) | otherwise = error "matrix dimensions must agree" -- Allocate memory and copy test data -- CUDA.allocaArray (wx*hx) $ \d_xs -> do CUDA.allocaArray (wy*hy) $ \d_ys -> do CUDA.allocaArray (wy*hx) $ \d_zs -> do withMatrix xs $ \p -> CUDA.pokeArray (wx*hx) p d_xs withMatrix ys $ \p -> CUDA.pokeArray (wy*hy) p d_ys -- Launch the kernel -- let gridDim = (wy`div`BLOCK_SIZE, hx`div`BLOCK_SIZE) blockDim = (BLOCK_SIZE,BLOCK_SIZE,1) sharedMem = 2 * BLOCK_SIZE * BLOCK_SIZE * fromIntegral (sizeOf dummy) CUDA.setConfig gridDim blockDim sharedMem Nothing CUDA.setParams [CUDA.VArg d_xs, CUDA.VArg d_ys, CUDA.VArg d_zs, CUDA.IArg wx, CUDA.IArg wy] CUDA.launch "matrixMul" -- Copy back result zs <- newArray_ resBnds withMatrix zs $ \p -> CUDA.peekArray (wy*hx) d_zs p return zs -------------------------------------------------------------------------------- -- Main -------------------------------------------------------------------------------- main :: IO () main = do dev <- CUDA.get props <- CUDA.props dev putStrLn $ "Using device " ++ show dev ++ ": " ++ CUDA.deviceName props xs <- randomArr ((1,1),(8*BLOCK_SIZE, 4*BLOCK_SIZE)) :: IO (Matrix Float) ys <- randomArr ((1,1),(4*BLOCK_SIZE,12*BLOCK_SIZE)) :: IO (Matrix Float) putStr "== Reference: " >> hFlush stdout (tr,ref) <- benchmark 100 (matMult xs ys) (return ()) putStrLn $ shows (fromInteger (timeIn millisecond tr) / 100::Float) " ms" putStr "== CUDA: " >> hFlush stdout (tc,mat) <- benchmark 100 (matMultCUDA xs ys) (CUDA.sync) putStrLn $ shows (fromInteger (timeIn millisecond tc) / 100::Float) " ms" putStr "== Validating: " verify ref mat >>= \rv -> putStrLn $ if rv then "Ok!" else "INVALID!"