#include "cbits/stubs.h" #include /* * Make sure that the linker always touches this module so that it notices the * below constructor function. Calling this empty function as part of * 'Foreign.CUDA.Driver.initialise' should be sufficient to prevent it from ever * being stripped. */ void enable_constructors() { } /* * GHC-8 introduced a new (simpler) 64-bit allocator, which on startup 'mmap's * 1TB of address space and then commits sub-portions of that memory as needed. * * The CUDA driver also appears to 'mmap' a large chunk of address space on * 'cuInit', probably as the arena for shuffling memory to and from the device, * but attempts to do so at a _fixed_ address. If the GHC RTS has already taken * that address at the time we call 'cuInit', driver initialisation will fail * with an "out of memory" error. * * The workaround is to call 'cuInit' before initialising the RTS. Then the * RTS's allocation will avoid CUDA's allocation, since the RTS doesn't care * where in the address space it gets that memory. Embedding the following * __attribute__((constructor)) function in the library does the trick nicely, * and the linker will ensure that this gets executed when the shared library is * loaded (during program startup). * * Another way around this, without actually calling 'cuInit', would be to just * reserve the regions that 'cuInit' requires in the constructor function so * that the RTS avoids them, then release them before calling 'cuInit'. However, * since the CUDA driver is closed and we don't know exactly which regions to * reserve, that approach would be fragile. * * See: https://github.com/tmcdonell/cuda/issues/39 */ #ifdef CUDA_PRELOAD __attribute__((constructor)) void preinitialise_cuda() { CUresult status = cuInit (0); if ( status != CUDA_SUCCESS ) { #if CUDA_VERSION >= 6000 const char* str = NULL; cuGetErrorString(status, &str); fprintf(stderr, "Failed to pre-initialise CUDA: %s\n", str); #else fprintf(stderr, "Failed to pre-initialise CUDA (%d)\n", status); #endif } } #endif