{-# LANGUAGE FlexibleContexts #-} {-# LANGUAGE TupleSections #-} -- | Code generation for Python with OpenCL. module Futhark.CodeGen.Backends.PyOpenCL ( compileProg, ) where import Control.Monad import qualified Data.Map as M import qualified Futhark.CodeGen.Backends.GenericPython as Py import Futhark.CodeGen.Backends.GenericPython.AST import Futhark.CodeGen.Backends.GenericPython.Options import Futhark.CodeGen.Backends.PyOpenCL.Boilerplate import qualified Futhark.CodeGen.ImpCode.OpenCL as Imp import qualified Futhark.CodeGen.ImpGen.OpenCL as ImpGen import Futhark.IR.KernelsMem (KernelsMem, Prog) import Futhark.MonadFreshNames import Futhark.Util (zEncodeString) -- | Compile the program to Python with calls to OpenCL. compileProg :: MonadFreshNames m => Py.CompilerMode -> String -> Prog KernelsMem -> m (ImpGen.Warnings, String) compileProg mode class_name prog = do ( ws, Imp.Program opencl_code opencl_prelude kernels types sizes failures prog' ) <- ImpGen.compileProg prog --prepare the strings for assigning the kernels and set them as global let assign = unlines $ map ( \x -> pretty $ Assign (Var ("self." ++ zEncodeString (nameToString x) ++ "_var")) (Var $ "program." ++ zEncodeString (nameToString x)) ) $ M.keys kernels let defines = [ Assign (Var "synchronous") $ Bool False, Assign (Var "preferred_platform") None, Assign (Var "preferred_device") None, Assign (Var "default_threshold") None, Assign (Var "default_group_size") None, Assign (Var "default_num_groups") None, Assign (Var "default_tile_size") None, Assign (Var "default_reg_tile_size") None, Assign (Var "fut_opencl_src") $ RawStringLiteral $ opencl_prelude ++ opencl_code ] let imports = [ Import "sys" Nothing, Import "numpy" $ Just "np", Import "ctypes" $ Just "ct", Escape openClPrelude, Import "pyopencl.array" Nothing, Import "time" Nothing ] let constructor = Py.Constructor [ "self", "command_queue=None", "interactive=False", "platform_pref=preferred_platform", "device_pref=preferred_device", "default_group_size=default_group_size", "default_num_groups=default_num_groups", "default_tile_size=default_tile_size", "default_reg_tile_size=default_reg_tile_size", "default_threshold=default_threshold", "sizes=sizes" ] [Escape $ openClInit types assign sizes failures] options = [ Option { optionLongName = "platform", optionShortName = Just 'p', optionArgument = RequiredArgument "str", optionAction = [Assign (Var "preferred_platform") $ Var "optarg"] }, Option { optionLongName = "device", optionShortName = Just 'd', optionArgument = RequiredArgument "str", optionAction = [Assign (Var "preferred_device") $ Var "optarg"] }, Option { optionLongName = "default-threshold", optionShortName = Nothing, optionArgument = RequiredArgument "int", optionAction = [Assign (Var "default_threshold") $ Var "optarg"] }, Option { optionLongName = "default-group-size", optionShortName = Nothing, optionArgument = RequiredArgument "int", optionAction = [Assign (Var "default_group_size") $ Var "optarg"] }, Option { optionLongName = "default-num-groups", optionShortName = Nothing, optionArgument = RequiredArgument "int", optionAction = [Assign (Var "default_num_groups") $ Var "optarg"] }, Option { optionLongName = "default-tile-size", optionShortName = Nothing, optionArgument = RequiredArgument "int", optionAction = [Assign (Var "default_tile_size") $ Var "optarg"] }, Option { optionLongName = "default-reg-tile-size", optionShortName = Nothing, optionArgument = RequiredArgument "int", optionAction = [Assign (Var "default_reg_tile_size") $ Var "optarg"] }, Option { optionLongName = "size", optionShortName = Nothing, optionArgument = RequiredArgument "size_assignment", optionAction = [ Assign ( Index (Var "sizes") ( IdxExp ( Index (Var "optarg") (IdxExp (Integer 0)) ) ) ) (Index (Var "optarg") (IdxExp (Integer 1))) ] } ] (ws,) <$> Py.compileProg mode class_name constructor imports defines operations () [Exp $ Py.simpleCall "sync" [Var "self"]] options prog' where operations :: Py.Operations Imp.OpenCL () operations = Py.Operations { Py.opsCompiler = callKernel, Py.opsWriteScalar = writeOpenCLScalar, Py.opsReadScalar = readOpenCLScalar, Py.opsAllocate = allocateOpenCLBuffer, Py.opsCopy = copyOpenCLMemory, Py.opsStaticArray = staticOpenCLArray, Py.opsEntryOutput = packArrayOutput, Py.opsEntryInput = unpackArrayInput } -- We have many casts to 'long', because PyOpenCL may get confused at -- the 32-bit numbers that ImpCode uses for offsets and the like. asLong :: PyExp -> PyExp asLong x = Py.simpleCall "np.int64" [x] callKernel :: Py.OpCompiler Imp.OpenCL () callKernel (Imp.GetSize v key) = do v' <- Py.compileVar v Py.stm $ Assign v' $ Index (Var "self.sizes") (IdxExp $ String $ pretty key) callKernel (Imp.CmpSizeLe v key x) = do v' <- Py.compileVar v x' <- Py.compileExp x Py.stm $ Assign v' $ BinOp "<=" (Index (Var "self.sizes") (IdxExp $ String $ pretty key)) x' callKernel (Imp.GetSizeMax v size_class) = do v' <- Py.compileVar v Py.stm $ Assign v' $ Var $ "self.max_" ++ pretty size_class callKernel (Imp.LaunchKernel safety name args num_workgroups workgroup_size) = do num_workgroups' <- mapM (fmap asLong . Py.compileExp) num_workgroups workgroup_size' <- mapM (fmap asLong . Py.compileExp) workgroup_size let kernel_size = zipWith mult_exp num_workgroups' workgroup_size' total_elements = foldl mult_exp (Integer 1) kernel_size cond = BinOp "!=" total_elements (Integer 0) body <- Py.collect $ launchKernel name safety kernel_size workgroup_size' args Py.stm $ If cond body [] when (safety >= Imp.SafetyFull) $ Py.stm $ Assign (Var "self.failure_is_an_option") $ Py.compilePrimValue (Imp.IntValue (Imp.Int32Value 1)) where mult_exp = BinOp "*" launchKernel :: Imp.KernelName -> Imp.KernelSafety -> [PyExp] -> [PyExp] -> [Imp.KernelArg] -> Py.CompilerM op s () launchKernel kernel_name safety kernel_dims workgroup_dims args = do let kernel_dims' = Tuple kernel_dims workgroup_dims' = Tuple workgroup_dims kernel_name' = "self." ++ zEncodeString (nameToString kernel_name) ++ "_var" args' <- mapM processKernelArg args let failure_args = take (Imp.numFailureParams safety) [ Var "self.global_failure", Var "self.failure_is_an_option", Var "self.global_failure_args" ] Py.stm $ Exp $ Py.simpleCall (kernel_name' ++ ".set_args") $ failure_args ++ args' Py.stm $ Exp $ Py.simpleCall "cl.enqueue_nd_range_kernel" [Var "self.queue", Var kernel_name', kernel_dims', workgroup_dims'] finishIfSynchronous where processKernelArg :: Imp.KernelArg -> Py.CompilerM op s PyExp processKernelArg (Imp.ValueKArg e bt) = do e' <- Py.compileExp e return $ Py.simpleCall (Py.compilePrimToNp bt) [e'] processKernelArg (Imp.MemKArg v) = Py.compileVar v processKernelArg (Imp.SharedMemoryKArg (Imp.Count num_bytes)) = do num_bytes' <- Py.compileExp num_bytes return $ Py.simpleCall "cl.LocalMemory" [asLong num_bytes'] writeOpenCLScalar :: Py.WriteScalar Imp.OpenCL () writeOpenCLScalar mem i bt "device" val = do let nparr = Call (Var "np.array") [Arg val, ArgKeyword "dtype" $ Var $ Py.compilePrimType bt] Py.stm $ Exp $ Call (Var "cl.enqueue_copy") [ Arg $ Var "self.queue", Arg mem, Arg nparr, ArgKeyword "device_offset" $ BinOp "*" (asLong i) (Integer $ Imp.primByteSize bt), ArgKeyword "is_blocking" $ Var "synchronous" ] writeOpenCLScalar _ _ _ space _ = error $ "Cannot write to '" ++ space ++ "' memory space." readOpenCLScalar :: Py.ReadScalar Imp.OpenCL () readOpenCLScalar mem i bt "device" = do val <- newVName "read_res" let val' = Var $ pretty val let nparr = Call (Var "np.empty") [ Arg $ Integer 1, ArgKeyword "dtype" (Var $ Py.compilePrimType bt) ] Py.stm $ Assign val' nparr Py.stm $ Exp $ Call (Var "cl.enqueue_copy") [ Arg $ Var "self.queue", Arg val', Arg mem, ArgKeyword "device_offset" $ BinOp "*" (asLong i) (Integer $ Imp.primByteSize bt), ArgKeyword "is_blocking" $ Var "synchronous" ] Py.stm $ Exp $ Py.simpleCall "sync" [Var "self"] return $ Index val' $ IdxExp $ Integer 0 readOpenCLScalar _ _ _ space = error $ "Cannot read from '" ++ space ++ "' memory space." allocateOpenCLBuffer :: Py.Allocate Imp.OpenCL () allocateOpenCLBuffer mem size "device" = Py.stm $ Assign mem $ Py.simpleCall "opencl_alloc" [Var "self", size, String $ pretty mem] allocateOpenCLBuffer _ _ space = error $ "Cannot allocate in '" ++ space ++ "' space" copyOpenCLMemory :: Py.Copy Imp.OpenCL () copyOpenCLMemory destmem destidx Imp.DefaultSpace srcmem srcidx (Imp.Space "device") nbytes bt = do let divide = BinOp "//" nbytes (Integer $ Imp.primByteSize bt) end = BinOp "+" destidx divide dest = Index destmem (IdxRange destidx end) Py.stm $ ifNotZeroSize nbytes $ Exp $ Call (Var "cl.enqueue_copy") [ Arg $ Var "self.queue", Arg dest, Arg srcmem, ArgKeyword "device_offset" $ asLong srcidx, ArgKeyword "is_blocking" $ Var "synchronous" ] copyOpenCLMemory destmem destidx (Imp.Space "device") srcmem srcidx Imp.DefaultSpace nbytes bt = do let divide = BinOp "//" nbytes (Integer $ Imp.primByteSize bt) end = BinOp "+" srcidx divide src = Index srcmem (IdxRange srcidx end) Py.stm $ ifNotZeroSize nbytes $ Exp $ Call (Var "cl.enqueue_copy") [ Arg $ Var "self.queue", Arg destmem, Arg src, ArgKeyword "device_offset" $ asLong destidx, ArgKeyword "is_blocking" $ Var "synchronous" ] copyOpenCLMemory destmem destidx (Imp.Space "device") srcmem srcidx (Imp.Space "device") nbytes _ = do Py.stm $ ifNotZeroSize nbytes $ Exp $ Call (Var "cl.enqueue_copy") [ Arg $ Var "self.queue", Arg destmem, Arg srcmem, ArgKeyword "dest_offset" $ asLong destidx, ArgKeyword "src_offset" $ asLong srcidx, ArgKeyword "byte_count" $ asLong nbytes ] finishIfSynchronous copyOpenCLMemory destmem destidx Imp.DefaultSpace srcmem srcidx Imp.DefaultSpace nbytes _ = Py.copyMemoryDefaultSpace destmem destidx srcmem srcidx nbytes copyOpenCLMemory _ _ destspace _ _ srcspace _ _ = error $ "Cannot copy to " ++ show destspace ++ " from " ++ show srcspace staticOpenCLArray :: Py.StaticArray Imp.OpenCL () staticOpenCLArray name "device" t vs = do mapM_ Py.atInit <=< Py.collect $ do -- Create host-side Numpy array with intended values. Py.stm $ Assign (Var name') $ case vs of Imp.ArrayValues vs' -> Call (Var "np.array") [ Arg $ List $ map Py.compilePrimValue vs', ArgKeyword "dtype" $ Var $ Py.compilePrimToNp t ] Imp.ArrayZeros n -> Call (Var "np.zeros") [ Arg $ Integer $ fromIntegral n, ArgKeyword "dtype" $ Var $ Py.compilePrimToNp t ] let num_elems = case vs of Imp.ArrayValues vs' -> length vs' Imp.ArrayZeros n -> n -- Create memory block on the device. static_mem <- newVName "static_mem" let size = Integer $ toInteger num_elems * Imp.primByteSize t allocateOpenCLBuffer (Var (Py.compileName static_mem)) size "device" -- Copy Numpy array to the device memory block. Py.stm $ ifNotZeroSize size $ Exp $ Call (Var "cl.enqueue_copy") [ Arg $ Var "self.queue", Arg $ Var $ Py.compileName static_mem, Arg $ Call (Var "normaliseArray") [Arg (Var name')], ArgKeyword "is_blocking" $ Var "synchronous" ] -- Store the memory block for later reference. Py.stm $ Assign (Field (Var "self") name') $ Var $ Py.compileName static_mem Py.stm $ Assign (Var name') (Field (Var "self") name') where name' = Py.compileName name staticOpenCLArray _ space _ _ = error $ "PyOpenCL backend cannot create static array in memory space '" ++ space ++ "'" packArrayOutput :: Py.EntryOutput Imp.OpenCL () packArrayOutput mem "device" bt ept dims = do mem' <- Py.compileVar mem dims' <- mapM Py.compileDim dims return $ Call (Var "cl.array.Array") [ Arg $ Var "self.queue", Arg $ Tuple dims', Arg $ Var $ Py.compilePrimTypeExt bt ept, ArgKeyword "data" mem' ] packArrayOutput _ sid _ _ _ = error $ "Cannot return array from " ++ sid ++ " space." unpackArrayInput :: Py.EntryInput Imp.OpenCL () unpackArrayInput mem "device" t s dims e = do let type_is_ok = BinOp "and" (BinOp "in" (Py.simpleCall "type" [e]) (List [Var "np.ndarray", Var "cl.array.Array"])) (BinOp "==" (Field e "dtype") (Var (Py.compilePrimToExtNp t s))) Py.stm $ Assert type_is_ok $ String "Parameter has unexpected type" zipWithM_ (Py.unpackDim e) dims [0 ..] let memsize' = Py.simpleCall "np.int64" [Field e "nbytes"] pyOpenCLArrayCase = [Assign mem $ Field e "data"] numpyArrayCase <- Py.collect $ do allocateOpenCLBuffer mem memsize' "device" Py.stm $ ifNotZeroSize memsize' $ Exp $ Call (Var "cl.enqueue_copy") [ Arg $ Var "self.queue", Arg mem, Arg $ Call (Var "normaliseArray") [Arg e], ArgKeyword "is_blocking" $ Var "synchronous" ] Py.stm $ If (BinOp "==" (Py.simpleCall "type" [e]) (Var "cl.array.Array")) pyOpenCLArrayCase numpyArrayCase unpackArrayInput _ sid _ _ _ _ = error $ "Cannot accept array from " ++ sid ++ " space." ifNotZeroSize :: PyExp -> PyStmt -> PyStmt ifNotZeroSize e s = If (BinOp "!=" e (Integer 0)) [s] [] finishIfSynchronous :: Py.CompilerM op s () finishIfSynchronous = Py.stm $ If (Var "synchronous") [Exp $ Py.simpleCall "sync" [Var "self"]] []