# Stub code for OpenCL setup.

import pyopencl as cl
import pyopencl.tools
import numpy as np
import sys

if cl.version.VERSION < (2015,2):
    raise Exception('Futhark requires at least PyOpenCL version 2015.2.  Installed version is %s.' %
                    cl.version.VERSION_TEXT)

def parse_preferred_device(s):
    pref_num = 0
    if len(s) > 1 and s[0] == '#':
        i = 1
        while i < len(s):
            if not s[i].isdigit():
                break
            else:
                pref_num = pref_num * 10 + int(s[i])
            i += 1
        while i < len(s) and s[i].isspace():
            i += 1
        return (s[i:], pref_num)
    else:
        return (s, 0)

def get_prefered_context(interactive=False, platform_pref=None, device_pref=None):
    if device_pref != None:
        (device_pref, device_num) = parse_preferred_device(device_pref)
    else:
        device_num = 0

    if interactive:
        return cl.create_some_context(interactive=True)

    def blacklisted(p, d):
        return platform_pref == None and device_pref == None and \
            p.name == "Apple" and d.name.find("Intel(R) Core(TM)") >= 0
    def platform_ok(p):
        return not platform_pref or p.name.find(platform_pref) >= 0
    def device_ok(d):
        return not device_pref or d.name.find(device_pref) >= 0

    device_matches = 0

    for p in cl.get_platforms():
        if not platform_ok(p):
            continue
        for d in p.get_devices():
            if blacklisted(p, d) or not device_ok(d):
                continue
            if device_matches == device_num:
                return cl.Context(devices=[d])
            else:
                device_matches += 1

    raise Exception('No OpenCL platform and device matching constraints found.')

def size_assignment(s):
    name, value = s.split('=')
    return (name, int(value))

def check_types(self, required_types):
    if 'f64' in required_types:
        if self.device.get_info(cl.device_info.PREFERRED_VECTOR_WIDTH_DOUBLE) == 0:
            raise Exception('Program uses double-precision floats, but this is not supported on chosen device: %s' %
                            self.device.name)

def apply_size_heuristics(self, size_heuristics, sizes):
    for (platform_name, device_type, size, valuef) in size_heuristics:
        if sizes[size] == None \
           and self.platform.name.find(platform_name) >= 0 \
           and (self.device.type & device_type) == device_type:
            sizes[size] = valuef(self.device)
    return sizes

def initialise_opencl_object(self,
                             program_src='',
                             command_queue=None,
                             interactive=False,
                             platform_pref=None,
                             device_pref=None,
                             default_group_size=None,
                             default_num_groups=None,
                             default_tile_size=None,
                             default_threshold=None,
                             size_heuristics=[],
                             required_types=[],
                             all_sizes={},
                             user_sizes={}):
    if command_queue is None:
        self.ctx = get_prefered_context(interactive, platform_pref, device_pref)
        self.queue = cl.CommandQueue(self.ctx)
    else:
        self.ctx = command_queue.context
        self.queue = command_queue
    self.device = self.queue.device
    self.platform = self.device.platform
    self.pool = cl.tools.MemoryPool(cl.tools.ImmediateAllocator(self.queue))
    device_type = self.device.type

    check_types(self, required_types)

    max_group_size = int(self.device.max_work_group_size)
    max_tile_size = int(np.sqrt(self.device.max_work_group_size))

    self.max_group_size = max_group_size
    self.max_tile_size = max_tile_size
    self.max_threshold = 0
    self.max_num_groups = 0

    self.max_local_memory = int(self.device.local_mem_size)

    # Futhark reserves 4 bytes of local memory for its own purposes.
    self.max_local_memory -= 4

    # See comment in rts/c/opencl.h.
    if self.platform.name.find('NVIDIA CUDA') >= 0:
        self.max_local_memory -= 12

    self.free_list = {}

    self.global_failure = self.pool.allocate(np.int32().itemsize)
    cl.enqueue_fill_buffer(self.queue, self.global_failure, np.int32(-1),
                           0, np.int32().itemsize)
    self.global_failure_args = self.pool.allocate(np.int32().itemsize *
                                                  (self.global_failure_args_max+1))
    self.failure_is_an_option = np.int32(0)

    # User-provided default sizes override the built-in defaults.
    if 'default_group_size' in user_sizes:
        default_group_size = user_sizes['default_group_size']
        del user_sizes['default_group_size']

    if 'default_num_groups' in user_sizes:
        default_num_groups = user_sizes['default_num_groups']
        del user_sizes['default_num_groups']

    if 'default_tile_size' in user_sizes:
        default_tile_size = user_sizes['default_tile_size']
        del user_sizes['default_tile_size']

    if 'default_threshold' in user_sizes:
        default_threshold = user_sizes['default_threshold']
        del user_sizes['default_threshold']

    default_group_size_set = default_group_size != None
    default_tile_size_set = default_tile_size != None

    default_sizes = apply_size_heuristics(self, size_heuristics,
                                          {'group_size': default_group_size,
                                           'tile_size': default_tile_size,
                                           'num_groups': default_num_groups,
                                           'lockstep_width': None,
                                           'threshold': default_threshold})
    default_group_size = default_sizes['group_size']
    default_num_groups = default_sizes['num_groups']
    default_threshold = default_sizes['threshold']
    default_tile_size = default_sizes['tile_size']
    lockstep_width = default_sizes['lockstep_width']

    if default_group_size > max_group_size:
        if default_group_size_set:
            sys.stderr.write('Note: Device limits group size to {} (down from {})\n'.
                             format(max_group_size, default_group_size))
        default_group_size = max_group_size

    if default_tile_size > max_tile_size:
        if default_tile_size_set:
            sys.stderr.write('Note: Device limits tile size to {} (down from {})\n'.
                             format(max_tile_size, default_tile_size))
        default_tile_size = max_tile_size

    for (k, v) in user_sizes.items():
        if k in all_sizes:
            all_sizes[k]['value'] = v
        else:
            raise Exception('Unknown size: {}\nKnown sizes: {}'.format(
                k, ' '.join(all_sizes.keys())))

    self.sizes = {}
    for (k, v) in all_sizes.items():
        if v['class'] == 'group_size':
            max_value = max_group_size
            default_value = default_group_size
        elif v['class'] == 'num_groups':
            max_value = max_group_size # Intentional!
            default_value = default_num_groups
        elif v['class'] == 'tile_size':
            max_value = max_tile_size
            default_value = default_tile_size
        elif v['class'].startswith('threshold'):
            max_value = None
            default_value = default_threshold
        else:
            # Bespoke sizes have no limit or default.
            max_value = None
            default_value = None
        if v['value'] == None:
            self.sizes[k] = default_value
        elif max_value != None and v['value'] > max_value:
            sys.stderr.write('Note: Device limits {} to {} (down from {})\n'.
                             format(k, max_value, v['value']))
            self.sizes[k] = max_value
        else:
            self.sizes[k] = v['value']

    # XXX: we perform only a subset of z-encoding here.  Really, the
    # compiler should provide us with the variables to which
    # parameters are mapped.
    if len(program_src) > 0:
        return cl.Program(self.ctx, program_src).build(
            ["-DLOCKSTEP_WIDTH={}".format(lockstep_width)]
            + ["-D{}={}".format(s.replace('z', 'zz').replace('.', 'zi').replace('#', 'zh'), v)
               for (s, v) in self.sizes.items()])

def opencl_alloc(self, min_size, tag):
    min_size = 1 if min_size == 0 else min_size
    assert min_size > 0
    return self.pool.allocate(min_size)

def opencl_free_all(self):
    self.pool.free_held()

def sync(self):
    failure = np.empty(1, dtype=np.int32)
    cl.enqueue_copy(self.queue, failure, self.global_failure, is_blocking=True)
    self.failure_is_an_option = np.int32(0)
    if failure[0] >= 0:
        # Reset failure information.
        cl.enqueue_fill_buffer(self.queue, self.global_failure, np.int32(-1),
                               0, np.int32().itemsize)

        # Read failure args.
        failure_args = np.empty(self.global_failure_args_max+1, dtype=np.int32)
        cl.enqueue_copy(self.queue, failure_args, self.global_failure_args,
                        is_blocking=True)

        raise Exception(self.failure_msgs[failure[0]].format(*failure_args))
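
# Minimal usage sketch (an assumption for illustration, not emitted by the
# compiler): the functions above take an explicit 'self' because the Futhark
# compiler generates a program class whose constructor calls
# initialise_opencl_object with the kernel source, size table, and heuristics
# it knows about.  The class name, kernel, size name, and heuristics below
# are hypothetical stand-ins showing how the pieces fit together.
if __name__ == '__main__':
    class example_program:
        # Normally provided by the generated program class.
        global_failure_args_max = 0
        failure_msgs = []

        def __init__(self):
            self.program = initialise_opencl_object(
                self,
                program_src='__kernel void dummy(__global int *p) { p[0] = 42; }',
                default_group_size=256,
                default_num_groups=128,
                default_tile_size=16,
                default_threshold=32768,
                size_heuristics=[
                    ('NVIDIA CUDA', cl.device_type.GPU, 'lockstep_width',
                     lambda dev: np.int32(32)),
                    ('', cl.device_type.GPU, 'lockstep_width',
                     lambda dev: np.int32(1)),
                    ('', cl.device_type.CPU, 'lockstep_width',
                     lambda dev: np.int32(1))],
                all_sizes={'main.group_size_0':
                           {'class': 'group_size', 'value': None}},
                user_sizes={})

    prog = example_program()
    print('Selected device:', prog.device.name)
    print('Tuned sizes:', prog.sizes)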