# Stub code for OpenCL setup.

import sys

import numpy as np
import pyopencl as cl

if cl.version.VERSION < (2015, 2):
    raise Exception(
        "Futhark requires at least PyOpenCL version 2015.2. Installed version is %s."
        % cl.version.VERSION_TEXT
    )


def parse_preferred_device(s):
    """Split an optional ``#N`` prefix off a device preference string.

    If ``s`` is longer than one character and starts with ``#``, the decimal
    digits that follow are read as a number, any whitespace after them is
    skipped, and ``(rest_of_string, number)`` is returned.  A ``#`` followed
    by no digits yields the number 0.  Any other string is returned
    unchanged together with 0.
    """
    if len(s) > 1 and s[0] == "#":
        i = 1
        pref_num = 0
        while i < len(s) and s[i].isdigit():
            pref_num = pref_num * 10 + int(s[i])
            i += 1
        while i < len(s) and s[i].isspace():
            i += 1
        return (s[i:], pref_num)
    return (s, 0)


def get_prefered_context(interactive=False, platform_pref=None, device_pref=None):
    """Create a ``cl.Context`` for the device selected by the preferences.

    Parameters:
      interactive: if true, defer entirely to
        ``cl.create_some_context(interactive=True)``.
      platform_pref: substring that the platform name must contain, or a
        falsy value to accept every platform.
      device_pref: substring that the device name must contain, optionally
        prefixed with ``#N`` (see ``parse_preferred_device``) to pick the
        N'th (0-based) matching device instead of the first.

    Returns a context containing exactly the chosen device.

    Raises ``Exception`` if fewer than N+1 devices satisfy the constraints.
    """
    if device_pref is not None:
        (device_pref, device_num) = parse_preferred_device(device_pref)
    else:
        device_num = 0

    if interactive:
        return cl.create_some_context(interactive=True)

    def blacklisted(p, d):
        # Only applied when the caller expressed no preference at all:
        # skip devices whose name contains "Intel(R) Core(TM)" on the
        # platform named "Apple".
        return (
            platform_pref is None
            and device_pref is None
            and p.name == "Apple"
            and d.name.find("Intel(R) Core(TM)") >= 0
        )

    def platform_ok(p):
        return not platform_pref or p.name.find(platform_pref) >= 0

    def device_ok(d):
        return not device_pref or d.name.find(device_pref) >= 0

    # Count acceptable devices in enumeration order until we reach the
    # requested index.
    device_matches = 0
    for p in cl.get_platforms():
        if not platform_ok(p):
            continue
        for d in p.get_devices():
            if blacklisted(p, d) or not device_ok(d):
                continue
            if device_matches == device_num:
                return cl.Context(devices=[d])
            device_matches += 1

    raise Exception("No OpenCL platform and device matching constraints found.")


def param_assignment(s):
    """Parse a ``name=value`` string into ``(name, int(value))``.

    Raises ``ValueError`` if ``s`` does not contain exactly one ``=`` or if
    the value is not an integer literal.
    """
    name, value = s.split("=")
    return (name, int(value))


def check_types(self, required_types):
    """Verify that ``self.device`` supports the types the program needs.

    Currently only ``"f64"`` is checked: if it is required and the device
    reports a preferred double vector width of 0, an ``Exception`` is raised.
    """
    if "f64" in required_types:
        if (
            self.device.get_info(cl.device_info.PREFERRED_VECTOR_WIDTH_DOUBLE)
            == 0
        ):
            raise Exception(
                "Program uses double-precision floats, but this is not supported on chosen device: %s"
                % self.device.name
            )


def apply_size_heuristics(self, size_heuristics, sizes):
    """Fill in unset entries of ``sizes`` using ``size_heuristics``.

    Each heuristic is a ``(platform_name, device_type, size, valuef)`` tuple.
    It applies when ``sizes[size]`` is still ``None``, the name of
    ``self.platform`` contains ``platform_name``, and ``self.device.type``
    has every bit of ``device_type`` set; the entry is then set to
    ``valuef(self.device)``.  Heuristics are tried in order, so the first
    applicable one for a given size wins.

    ``sizes`` is updated in place and also returned.
    """
    for platform_name, device_type, size, valuef in size_heuristics:
        if (
            sizes[size] is None
            and self.platform.name.find(platform_name) >= 0
            and (self.device.type & device_type) == device_type
        ):
            sizes[size] = valuef(self.device)
    return sizes


def initialise_opencl_object(
    self,
    program_src="",
    build_options=[],
    command_queue=None,
    interactive=False,
    platform_pref=None,
    device_pref=None,
    default_group_size=None,
    default_num_groups=None,
    default_tile_size=None,
    default_reg_tile_size=None,
    default_threshold=None,
    size_heuristics=[],
    required_types=[],
    all_sizes={},
    user_sizes={},
):
    """Set up OpenCL state on ``self`` and build ``program_src``.

    The context and queue are taken from ``command_queue`` when given;
    otherwise a context is picked with ``get_prefered_context`` and a new
    queue is created for it.  The following attributes are assigned on
    ``self``: ``ctx``, ``queue``, ``device``, ``platform``, ``pool``,
    ``max_group_size``, ``max_tile_size``, ``max_threshold``,
    ``max_num_groups``, ``max_local_memory``, ``free_list``,
    ``global_failure``, ``global_failure_args``, ``failure_is_an_option``
    and ``sizes``.

    ``self.global_failure_args_max`` is read here and must already be set by
    the caller.  The name ``sizes`` is not defined in this stub; presumably
    it is a module-level dict supplied by the code this stub is embedded
    in -- TODO confirm.  Its ``default_*`` entries override the
    corresponding arguments and are removed from it.

    Parameters:
      build_options: extra compiler options; not modified.
      size_heuristics: see ``apply_size_heuristics``.
      required_types: see ``check_types``.
      all_sizes: maps size name to a dict with ``"class"`` and ``"value"``
        keys.  The ``"value"`` entries are overwritten in place from
        ``user_sizes``.
      user_sizes: maps size name to a user-chosen value; every key must be
        present in ``all_sizes``.

    Returns the built ``cl.Program``.

    Raises ``Exception`` for an unknown key in ``user_sizes``, plus
    whatever ``check_types`` and ``get_prefered_context`` raise.
    """
    # Work on a private copy: appending to the argument itself would pile up
    # duplicate -D flags in the caller's list (or in the shared default
    # list) every time this function is called.
    build_options = list(build_options)

    if command_queue is None:
        self.ctx = get_prefered_context(interactive, platform_pref, device_pref)
        self.queue = cl.CommandQueue(self.ctx)
    else:
        self.ctx = command_queue.context
        self.queue = command_queue
    self.device = self.queue.device
    self.platform = self.device.platform
    self.pool = cl.tools.MemoryPool(cl.tools.ImmediateAllocator(self.queue))

    check_types(self, required_types)

    max_group_size = int(self.device.max_work_group_size)
    max_tile_size = int(np.sqrt(self.device.max_work_group_size))

    self.max_group_size = max_group_size
    self.max_tile_size = max_tile_size
    self.max_threshold = 0
    self.max_num_groups = 0

    self.max_local_memory = int(self.device.local_mem_size)

    # Futhark reserves 4 bytes of local memory for its own purposes.
    self.max_local_memory -= 4

    # See comment in rts/c/opencl.h.
    if self.platform.name.find("NVIDIA CUDA") >= 0:
        self.max_local_memory -= 12
    elif self.platform.name.find("AMD") >= 0:
        self.max_local_memory -= 16

    self.free_list = {}

    # A 32-bit failure flag, initialised to -1; sync() treats any value
    # >= 0 as a failure.
    self.global_failure = self.pool.allocate(np.int32().itemsize)
    cl.enqueue_fill_buffer(
        self.queue, self.global_failure, np.int32(-1), 0, np.int32().itemsize
    )
    self.global_failure_args = self.pool.allocate(
        np.int64().itemsize * (self.global_failure_args_max + 1)
    )
    self.failure_is_an_option = np.int32(0)

    # Entries in `sizes` take precedence over the keyword arguments and are
    # consumed (removed) once read.
    default_group_size = sizes.pop("default_group_size", default_group_size)
    default_num_groups = sizes.pop("default_num_groups", default_num_groups)
    default_tile_size = sizes.pop("default_tile_size", default_tile_size)
    default_reg_tile_size = sizes.pop(
        "default_reg_tile_size", default_reg_tile_size
    )
    default_threshold = sizes.pop("default_threshold", default_threshold)

    # Remember whether these were chosen explicitly, so that we only print a
    # note when an explicit choice (not a heuristic one) gets clamped.
    default_group_size_set = default_group_size is not None
    default_tile_size_set = default_tile_size is not None

    default_sizes = apply_size_heuristics(
        self,
        size_heuristics,
        {
            "group_size": default_group_size,
            "tile_size": default_tile_size,
            "reg_tile_size": default_reg_tile_size,
            "num_groups": default_num_groups,
            "lockstep_width": None,
            "threshold": default_threshold,
        },
    )
    default_group_size = default_sizes["group_size"]
    default_num_groups = default_sizes["num_groups"]
    default_threshold = default_sizes["threshold"]
    default_tile_size = default_sizes["tile_size"]
    default_reg_tile_size = default_sizes["reg_tile_size"]
    lockstep_width = default_sizes["lockstep_width"]

    if default_group_size > max_group_size:
        if default_group_size_set:
            sys.stderr.write(
                "Note: Device limits group size to {} (down from {})\n".format(
                    max_group_size, default_group_size
                )
            )
        default_group_size = max_group_size

    if default_tile_size > max_tile_size:
        if default_tile_size_set:
            sys.stderr.write(
                "Note: Device limits tile size to {} (down from {})\n".format(
                    max_tile_size, default_tile_size
                )
            )
        default_tile_size = max_tile_size

    for k, v in user_sizes.items():
        if k in all_sizes:
            all_sizes[k]["value"] = v
        else:
            raise Exception(
                "Unknown size: {}\nKnown sizes: {}".format(
                    k, " ".join(all_sizes.keys())
                )
            )

    self.sizes = {}
    for k, v in all_sizes.items():
        if v["class"] == "group_size":
            max_value = max_group_size
            default_value = default_group_size
        elif v["class"] == "num_groups":
            max_value = max_group_size  # Intentional!
            default_value = default_num_groups
        elif v["class"] == "tile_size":
            max_value = max_tile_size
            default_value = default_tile_size
        elif v["class"] == "reg_tile_size":
            max_value = None
            default_value = default_reg_tile_size
        elif v["class"].startswith("threshold"):
            max_value = None
            default_value = default_threshold
        else:
            # Bespoke sizes have no limit or default.  The default must be
            # set explicitly; otherwise it would be unbound or carry over
            # from the previous iteration.
            max_value = None
            default_value = None

        if v["value"] is None:
            self.sizes[k] = default_value
        elif max_value is not None and v["value"] > max_value:
            sys.stderr.write(
                "Note: Device limits {} to {} (down from {})\n".format(
                    k, max_value, v["value"]
                )
            )
            self.sizes[k] = max_value
        else:
            self.sizes[k] = v["value"]

    # XXX: we perform only a subset of z-encoding here.  Really, the
    # compiler should provide us with the variables to which
    # parameters are mapped.
    #
    # NOTE(review): this condition is always true (a length is never
    # negative); `> 0` may have been intended.  Left unchanged because
    # callers currently always receive a built program.
    if len(program_src) >= 0:
        build_options += ["-DLOCKSTEP_WIDTH={}".format(lockstep_width)]
        build_options += ["-D{}={}".format("max_group_size", max_group_size)]
        build_options += [
            "-D{}={}".format(
                s.replace("z", "zz")
                .replace(".", "zi")
                .replace("#", "zh")
                .replace("'", "zq"),
                v,
            )
            for (s, v) in self.sizes.items()
        ]

        if self.platform.name == "Oclgrind":
            build_options += ["-DEMULATE_F16"]

        return cl.Program(self.ctx, program_src).build(build_options)


def opencl_alloc(self, min_size, tag):
    """Allocate at least ``min_size`` bytes from ``self.pool``.

    A request for 0 bytes is rounded up to 1 byte.  ``tag`` is accepted but
    not used.
    """
    min_size = 1 if min_size == 0 else min_size
    assert min_size > 0
    return self.pool.allocate(min_size)


def opencl_free_all(self):
    """Release the memory currently held by ``self.pool``."""
    self.pool.free_held()


def sync(self):
    """Read back the failure flag and raise if a failure was recorded.

    Performs a blocking copy of ``self.global_failure`` to the host and
    resets ``self.failure_is_an_option`` to 0.  If the flag is >= 0, it is
    reset to -1 on the device, the failure arguments are read back, and an
    ``Exception`` is raised whose message is ``self.failure_msgs[flag]``
    formatted with those arguments.  ``self.failure_msgs`` and
    ``self.global_failure_args_max`` must have been set by the caller.
    """
    failure = np.empty(1, dtype=np.int32)
    cl.enqueue_copy(self.queue, failure, self.global_failure, is_blocking=True)
    self.failure_is_an_option = np.int32(0)
    if failure[0] >= 0:
        # Reset failure information.
        cl.enqueue_fill_buffer(
            self.queue,
            self.global_failure,
            np.int32(-1),
            0,
            np.int32().itemsize,
        )

        # Read failure args.
        failure_args = np.empty(
            self.global_failure_args_max + 1, dtype=np.int64
        )
        cl.enqueue_copy(
            self.queue,
            failure_args,
            self.global_failure_args,
            is_blocking=True,
        )

        raise Exception(self.failure_msgs[failure[0]].format(*failure_args))