-- Simple computation of the dot product in Haskell (using various array -- implementations) -- -- Compile and run with -- -- ghc -ffi -O2 -fliberate-case-threshold100 -o dotprod DotProd.hs dotprod.o\ -- && ./dotprod +RTS -K10M -- standard libraries import CPUTime import Random -- FFI import Foreign import Foreign.C -- GHC libraries import Data.Array import Data.Array.Unboxed (UArray) import qualified Data.Array.Unboxed as U import Control.Exception (evaluate) import System.Mem (performGC) -- arrays types -- type Vector = Array Int Float type UVector = UArray Int Float type CVector = Ptr Float -- generates a random vector of the given length in NF -- generateVector :: Int -> IO Vector generateVector n = do rg <- newStdGen let fs = take n $ randomRs (-100, 100) rg arr = listArray (0, n - 1) fs evaluate $ sum (elems arr) -- make sure it is brought in NF return arr -- convert a vector into an UVector in NF -- vectorToUVector :: Vector -> IO UVector vectorToUVector v = do let uv = U.listArray (bounds v) . elems $ v evaluate $ sum (U.elems uv) return uv -- convert a vector into a CVector in NF -- vectorToCVector :: Vector -> IO CVector vectorToCVector v = newArray (elems v) -- compute the dot product -- -- vanilla vectorDP1a :: Vector -> Vector -> IO Float {-# NOINLINE vectorDP1a #-} vectorDP1a v1 v2 = do let r = sum [x * y | x <- elems v1 | y <- elems v2] evaluate r -- vanilla vectorDP1b :: Vector -> Vector -> IO Float {-# NOINLINE vectorDP1b #-} vectorDP1b v1 v2 = do let r = sum [v1!i * v2!i | i <- indices v1] evaluate r -- array combinators vectorDP2 :: Vector -> Vector -> IO Float {-# NOINLINE vectorDP2 #-} vectorDP2 v1 v2 = do let r = sumA (zipWithA (*) v1 v2) evaluate r where zipWithA f v1 v2 = listArray (0, n1) (loop 0) where n1 = snd (U.bounds v1) loop i | i > n1 = [] | otherwise = f (v1!i) (v2!i) : loop (i + 1) -- sumA v = loop 0 where n1 = snd (U.bounds v) loop i | i > n1 = 0 | otherwise = v!i + loop (i + 1) -- explicit loop vectorDP3 :: Vector -> Vector -> IO Float {-# NOINLINE vectorDP3 #-} vectorDP3 v1 v2 = do let n1 = snd (U.bounds v1) r = loop 0 where loop i | i > n1 = 0 | otherwise = v1!i * v2!i + loop (i + 1) evaluate r -- explicit loop w/ acc vectorDP4 :: Vector -> Vector -> IO Float {-# NOINLINE vectorDP4 #-} vectorDP4 v1 v2 = do let n1 = snd (U.bounds v1) r = loop 0 0 where loop i a | i > n1 = a | otherwise = loop (i + 1) (v1!i * v2!i + a) evaluate r -- vanilla uvectorDP1a :: UVector -> UVector -> IO Float {-# NOINLINE uvectorDP1a #-} uvectorDP1a v1 v2 = do let r = sum $ zipWith (*) (U.elems v1) (U.elems v2) evaluate r -- vanilla uvectorDP1b :: UVector -> UVector -> IO Float {-# NOINLINE uvectorDP1b #-} uvectorDP1b v1 v2 = do let r = sum [v1 U.!i * v2 U.!i | i <- U.indices v1] evaluate r -- array combinators uvectorDP2 :: UVector -> UVector -> IO Float {-# NOINLINE uvectorDP2 #-} uvectorDP2 v1 v2 = do let r = sumA (zipWithA (*) v1 v2) evaluate r where zipWithA :: (Float -> Float -> Float) -> UVector -> UVector -> UVector zipWithA f v1 v2 = U.listArray (0, n1) (loop 0) where n1 = snd (U.bounds v1) loop i | i > n1 = [] | otherwise = f (v1 U.!i) (v2 U.!i) : loop (i + 1) -- sumA v = loop 0 where n1 = snd (U.bounds v) loop i | i > n1 = 0 | otherwise = v U.!i + loop (i + 1) -- explicit loop uvectorDP3 :: UVector -> UVector -> IO Float {-# NOINLINE uvectorDP3 #-} uvectorDP3 v1 v2 = do let n1 = snd (U.bounds v1) r = loop 0 where loop i | i > n1 = 0 | otherwise = v1 U.!i * v2 U.!i + loop (i + 1) evaluate r -- NB: main difference in Core to vectorDP3 is that here the compiler -- decided to first go into the recursion and then do the indexing of v1 -- and v2, whereas in vectorDP3 it's the other way around -- explicit loop w/ acc uvectorDP4 :: UVector -> UVector -> IO Float {-# NOINLINE uvectorDP4 #-} uvectorDP4 v1 v2 = do let n1 = snd (U.bounds v1) r = loop 0 0 where loop i a | i > n1 = a | otherwise = loop (i + 1) (v1 U.!i * v2 U.!i + a) evaluate r -- NB: this generates perfect code -- merciless C code foreign import ccall "dotprod.h" cvectorDP :: CVector -> CVector -> Int -> IO Float -- execute a function and print the result and execution time -- execAndTime :: String -- description -> IO Float -- benchmarked computation -> IO () execAndTime desc comp = do putStrLn $ "\n*** " ++ desc performGC start <- getCPUTime result <- comp end <- getCPUTime let duration = (end - start) `div` 1000000000 putStrLn $ "Result : " ++ show result putStrLn $ "Running time: " ++ show duration ++ "ms" main :: IO () main = do putStrLn "Dot product benchmark" putStrLn "=====================" putStrLn $ "[time resolution: " ++ show (cpuTimePrecision `div` 1000000000)++ "ms]" -- v1 <- generateVector 10000 v2 <- generateVector 10000 execAndTime "H98 arrays (ind'd compr) [n = 10000]" (vectorDP1b v1 v2) -- v1 <- generateVector 20000 v2 <- generateVector 20000 execAndTime "H98 arrays (ind'd compr) [n = 20000]" (vectorDP1b v1 v2) -- v1 <- generateVector 50000 v2 <- generateVector 50000 execAndTime "H98 arrays (par compr) [n = 50000]" (vectorDP1a v1 v2) execAndTime "H98 arrays (ind'd compr) [n = 50000]" (vectorDP1b v1 v2) execAndTime "H98 arrays (combinator-based) [n = 50000]" (vectorDP2 v1 v2) execAndTime "H98 arrays (explicit loop) [n = 50000]" (vectorDP3 v1 v2) execAndTime "H98 arrays (explicit loop w/ acc) [n = 50000]" (vectorDP4 v1 v2) uv1 <- vectorToUVector v1 uv2 <- vectorToUVector v2 execAndTime "UArray (par compr) [n = 50000]" (uvectorDP1a uv1 uv2) execAndTime "UArray (ind'd compr) [n = 50000]" (uvectorDP1b uv1 uv2) execAndTime "UArray (combinator-based) [n = 50000]" (uvectorDP2 uv1 uv2) execAndTime "UArray (explicit loop) [n = 50000]" (uvectorDP3 uv1 uv2) execAndTime "UArray (explicit loop w/ acc) [n = 50000]" (uvectorDP4 uv1 uv2) -- v1 <- generateVector 100000 v2 <- generateVector 100000 execAndTime "H98 arrays (par compr) [n = 100000]" (vectorDP1a v1 v2) execAndTime "H98 arrays (ind'd compr) [n = 100000]" (vectorDP1b v1 v2) execAndTime "H98 arrays (combinator-based) [n = 100000]" (vectorDP2 v1 v2) execAndTime "H98 arrays (explicit loop) [n = 100000]" (vectorDP3 v1 v2) execAndTime "H98 arrays (explicit loop w/ acc) [n = 100000]"(vectorDP4 v1 v2) uv1 <- vectorToUVector v1 uv2 <- vectorToUVector v2 execAndTime "UArray (par compr) [n = 100000]" (uvectorDP1a uv1 uv2) execAndTime "UArray (ind'd compr) [n = 100000]" (uvectorDP1b uv1 uv2) execAndTime "UArray (combinator-based) [n = 100000]" (uvectorDP2 uv1 uv2) execAndTime "UArray (explicit loop) [n = 100000]" (uvectorDP3 uv1 uv2) execAndTime "UArray (explicit loop w/ acc) [n = 100000]" (uvectorDP4 uv1 uv2) cv1 <- vectorToCVector v1 cv2 <- vectorToCVector v2 execAndTime "C [n = 100000]" (cvectorDP cv1 cv2 100000) -- v1 <- generateVector 500000 v2 <- generateVector 500000 uv1 <- vectorToUVector v1 uv2 <- vectorToUVector v2 execAndTime "UArray (explicit loop w/ acc) [n = 500000]" (uvectorDP4 uv1 uv2) cv1 <- vectorToCVector v1 cv2 <- vectorToCVector v2 execAndTime "C [n = 500000]" (cvectorDP cv1 cv2 500000)