{-# LANGUAGE TemplateHaskell, FlexibleContexts #-} module NoSlow.Micro.Kernels ( kernels ) where import qualified NoSlow.Backend.Interface as I import NoSlow.Util.Base import NoSlow.Util.Computation ( Len(..), WithSqrtLen(..), Sorted(..), ParenTree(..), Edges(..) ) import Data.Bits kernels = [d| -- --------------------------- -- map, zipWith, replicate -- --------------------------- -- Scale a vector by a constant -- -- NOTE: we do not want to measure LiberateCase here so we explicitly unbox -- x scale :: (Num a, I.Vector v a) => Ty (v a) -> v a -> a -> v a scale _ as x = seq x $ I.map (x*) as -- Scale a vector by a constant (version 2) -- -- No intermediate array should be created, performance should be -- similar to splus. scale_r :: (Num a, I.Vector v a) => Ty (v a) -> v a -> a -> v a scale_r _ as x = I.zipWith (+) as (I.replicate (I.length as) x) -- Do these maps get fused? splus4 :: (Num a, I.Vector v a) => Ty (v a) -> v a -> a -> v a splus4 _ as x = seq x $ I.map (+x) $ I.map (+x) $ I.map (+x) $ I.map (+x) as -- Elementwise addition -- -- Lower bound on the execution time of the following benchmarks plus :: (Num a, I.Vector v a) => Ty (v a) -> v a -> v a -> v a plus _ as bs = I.zipWith (+) as bs -- Checks speed of map/zip compared to zipWith plus_zip :: (Num a, I.Vector v a) => Ty (v a) -> v a -> v a -> v a plus_zip _ as bs = I.map (\p -> I.fst p + I.snd p) (I.zip as bs) -- Standard axpy benchmark axpy :: (Num a, I.Vector v a) => Ty (v a) -> a -> v a -> v a -> v a axpy _ x as bs = seq x $ I.zipWith (+) (I.map (x*) as) bs -- Lots of zips can be inefficient with stream fusion. mpp :: (Num a, I.Vector v a) => Ty (v a) -> v a -> v a -> v a -> v a -> v a mpp _ as bs cs ds = I.zipWith (*) (I.zipWith (+) as bs) (I.zipWith (+) cs ds) -- How does this compare to mpp? mpp_zip :: (Num a, I.Vector v a) => Ty (v a) -> v a -> v a -> v a -> v a -> v a mpp_zip _ as bs cs ds = I.map (\p -> (I.fst (I.fst p) + I.snd (I.fst p)) * (I.fst (I.snd p) + I.snd (I.snd p))) $ I.zip (I.zip as bs) (I.zip cs ds) mspsp :: (Num a, I.Vector v a) => Ty (v a) -> a -> v a -> a -> v a -> v a mspsp _ x as y bs = seq x $ seq y $ I.zipWith (*) (I.map (x+) as) (I.map (y+) bs) -- Do we get rid of the replicates? mspsp_r :: (Num a, I.Vector v a) => Ty (v a) -> a -> v a -> a -> v a -> v a mspsp_r _ x as y bs = I.zipWith (*) (I.zipWith (+) (I.replicate (I.length as) x) as) (I.zipWith (+) (I.replicate (I.length bs) y) bs) -- ----------- -- Filters -- ----------- -- Removes all elements from the list, basic test of filter fusion filterout :: (Num a, Eq a, I.Vector v a) => Ty (v a) -> v a -> v a filterout _ as = I.filter (/=0) $ I.map (\x -> x-x) as -- Only do the test once, no loop should be executed filterout_r :: (Num a, Eq a, I.Vector v a) => Ty (v a) -> Len -> v a filterout_r _ (Len n) = I.filter (/=0) $ I.replicate n 0 -- Retain all elements in a list filterin :: (Num a, Eq a, I.Vector v a) => Ty (v a) -> v a -> v a filterin _ as = I.filter (==0) $ I.map (\x -> x-x) as -- Only do the test once, should be faster than filterin filterin_r :: (Num a, Eq a, I.Vector v a) => Ty (v a) -> Len -> v a filterin_r _ (Len n) = I.filter (==0) $ I.replicate n 0 filter_zip :: (Num a, Ord a, I.Vector v a) => Ty (v a) -> a -> v a -> v a -> v a filter_zip _ x as bs = seq x $ I.filter ( Ty (v a) -> Len -> v a -> v a filter_evens _ (Len n) as = I.map I.snd $ I.filter (even . I.fst) $ I.zip (I.enumFromTo_Int 0 (n-1)) as find_indices :: (Eq a, I.Vector v a) => Ty (v a) -> a -> v a -> v Int find_indices _ x as = seq x $ I.map I.fst $ I.filter (\t -> x == I.snd t) $ I.zip (I.enumFromTo_Int 0 (I.length as - 1)) as -- -------- -- Sums -- -------- -- Dot product dotp :: (Num a, I.Vector v a) => Ty (v a) -> v a -> v a -> a dotp _ as bs = I.sum (I.zipWith (*) as bs) dotp_zip :: (Num a, I.Vector v a) => Ty (v a) -> v a -> v a -> a dotp_zip _ as bs = I.sum (I.map (\p -> I.fst p * I.snd p) (I.zip as bs)) -- sum [1 .. n] sumn :: (Num a, I.Vector v a) => Ty (v a) -> Len -> a sumn ty (Len n) = I.sum $ I.map fromIntegral (I.enumFromTo_Int 1 n) `ofType` ty sumsq_map :: (Num a, I.Vector v a) => Ty (v a) -> Len -> a sumsq_map ty (Len n) = I.sum $ I.map (\x -> x*x) $ I.map fromIntegral (I.enumFromTo_Int 1 n) `ofType` ty sumsq_zip :: (Num a, I.Vector v a) => Ty (v a) -> Len -> a sumsq_zip ty (Len n) = let as = I.map fromIntegral (I.enumFromTo_Int 1 n) `ofType` ty in I.sum $ I.zipWith (*) as as sum_evens :: (Num a, I.Vector v a) => Ty (v a) -> Len -> v a -> a sum_evens _ (Len n) as = I.sum $ I.map I.snd $ I.filter (even . I.fst) $ I.zip (I.enumFromTo_Int 0 (n-1)) as count_sum :: (Eq a, I.Vector v a) => Ty (v a) -> a -> v a -> Int count_sum _ x as = seq x $ I.sum $ I.map (\y -> if x == y then 1 else 0) as count_filter :: (Eq a, I.Vector v a) => Ty (v a) -> a -> v a -> Int count_filter _ x as = seq x $ I.length $ I.filter (x==) as {- -- ----- -- Folds -- ----- min_index :: (Ord a, I.Vector v a) => Ty (v a) -> v a -> Int min_index _ as = I.foldl1' (\(i,x) (j,y) -> if i <= j then (i,x) else (j,y)) $ I.zip (I.enumFromTo 0 (I.length as - 1)) as -} -- ----- -- Scans -- ----- scan_replicate :: (Num a, I.Vector v a) => Ty (v a) -> Len -> v a scan_replicate _ (Len n) = I.prescanl' (+) 0 (I.replicate n 1) sumn_scan :: (Num a, I.Vector v a) => Ty (v a) -> Len -> a sumn_scan ty (Len n) = I.sum (I.prescanl' (+) 0 (I.replicate n 1 `ofType` ty)) -- -------- -- Permutes -- -------- -- Compare to odds_backpermute; the conditional in n might interfere -- with fusion evens_backpermute :: I.Vector v a => Ty (v a) -> v a -> v a evens_backpermute _ as = I.backpermute as $ I.map (*2) $ I.enumFromTo_Int 0 (n-1) where m = I.length as n = if odd m then m `div` 2 + 1 else m `div` 2 -- Compare to evens_backpermute odds_backpermute :: I.Vector v a => Ty (v a) -> v a -> v a odds_backpermute _ as = I.backpermute as $ I.map (\i -> i*2+1) $ I.enumFromTo_Int 0 (I.length as `div` 2 - 1) interleave :: I.Vector v a => Ty (v a) -> v a -> v a -> v a interleave _ as bs = I.backpermute (I.append as bs) $ I.map (\i -> if even i then i `div` 2 else i `div` 2 + m) $ I.enumFromTo_Int 0 (m+n-1) where m = I.length as n = I.length bs -- ---------- -- Algorithms -- ---------- sorted_rank :: (Ord a, I.Vector v a) => Ty (v a) -> a -> Sorted v a -> Int sorted_rank _ x (Sorted xs) = go 0 xs where go k xs | n == 0 = k | x <= I.index xs i = go k (I.take i xs) | otherwise = go (k+i+1) (I.drop (i+1) xs) where n = I.length xs i = n `div` 2 qsort :: (Ord a, I.Vector v a) => Ty (v a) -> v a -> v a qsort ty xs | n < 2 = xs | otherwise = I.append (qsort ty lts) (I.cons x (qsort ty geqs)) where n = I.length xs x = I.head xs lts = x `seq` I.filter (=x) (I.tail xs) -- (lts, geqs) = x `seq` I.unstablePartition (