/* * render_opencl.c * * Created on: 26.01.2018 * Author: Superleo1810 */ #include "render_opencl.h" void init_opencl(config_t *cfg) { config_opencl = cfg; output = (cl_uint *) malloc((config_opencl->width) * (config_opencl->height) * sizeof(cl_uchar4)); context = NULL; width_cl = config_opencl->width; // Leave it in, cl needs different endianness cl_platform_id platform_id; cl_uint ret_num_devices; cl_uint ret_num_platforms; size_t device_list_size; cl_program program; size_t cl_src_sz; clGetPlatformIDs(1, &platform_id, &ret_num_platforms); clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, &ret_num_devices); context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret); ret = clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, NULL, &device_list_size); devices = (cl_device_id *)malloc(device_list_size); num_devices = (cl_uint)(device_list_size/sizeof(cl_device_id)); clGetContextInfo(context, CL_CONTEXT_DEVICES, device_list_size, devices, NULL); FILE *fp; char *cl_src, *path, *flags = (char *)malloc(200 * sizeof(char)); flags[0] = '\0'; switch(config_opencl->config_opencl.fpu) { case OPENCL_FPU_32: switch(config_opencl->config_opencl.set_func) { case SFUNC_JULIA: path = "cl/julia32.cl"; break; case SFUNC_MANDELBROT: default: path = "cl/mandelbrot32.cl"; break; } break; case OPENCL_FPU_64: switch(config_opencl->config_opencl.set_func) { case SFUNC_JULIA: path = "cl/julia64.cl"; break; case SFUNC_MANDELBROT: default: path = "cl/mandelbrot64.cl"; break; } int khrFP64 = 0; int amdFP64 = 0; for (cl_uint i = 0; i < num_devices; i++) { char deviceExtensions[8192]; ret = clGetDeviceInfo(devices[i], CL_DEVICE_EXTENSIONS, sizeof(deviceExtensions), deviceExtensions, 0); if (strstr(deviceExtensions, "cl_khr_fp64")) { khrFP64++; } else { if (strstr(deviceExtensions, "cl_amd_fp64")) { amdFP64++; } } } if (khrFP64 == num_devices) { flags = strcat(flags, "-D KHR_DP_EXTENSION "); } else if (amdFP64 == num_devices) { flags = strcat(flags, ""); } break; case OPENCL_FPU_128: printf("128 bit precision not implemented yet\n"); break; } cl_src = (char *)malloc(MAX_SOURCE_SIZE * sizeof(char)); fp = fopen(path, "r"); cl_src_sz = fread(cl_src, 1, MAX_SOURCE_SIZE, fp); fclose(fp); for (cl_uint i = 0; i < num_devices; i++) { cl_command_queue_properties prop = 0; // if (sampleArgs->timing) // { // prop |= CL_QUEUE_PROFILING_ENABLE; // } commandQueue[i] = clCreateCommandQueue(context, devices[i], prop, &ret); outputBuffer[i] = clCreateBuffer(context, CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR, (sizeof(cl_uint) * (config_opencl->width) * (config_opencl->height)) / num_devices, NULL, &ret); } if (config_opencl->config_opencl.fma) { flags = strcat(flags, "-D MUL_ADD=fma "); } else { flags = strcat(flags, "-D MUL_ADD=mad "); } printf("flags: %s\n", flags); program = clCreateProgramWithSource(context, 1, (const char **)&cl_src, (const size_t *)&cl_src_sz, &ret); ret = clBuildProgram(program, num_devices, devices, flags, NULL, NULL); for (cl_uint i = 0; i < num_devices; i++) { kernel_vector[i] = clCreateKernel(program, "calculate", &ret); } } void render_opencl(d64 x_min, d64 y_min, d64 x_max, d64 y_max) { cl_event events[MAX_DEVICES]; cl_int eventStatus = CL_QUEUED; size_t globalThreads[1]; size_t localThreads[1]; size_t kernelWorkGroupSize; cl_kernel kernel; cl_double x_min_cl = (cl_double) x_min; cl_double y_min_cl = (cl_double) y_min; cl_double x_max_cl = (cl_double) x_max; cl_double y_max_cl = (cl_double) y_max; cl_double y_max_t; cl_float y_max_t_f; cl_double x_delta = ((x_max_cl - x_min_cl) / (cl_double) config_opencl->width); cl_double y_delta = -((y_max_cl - y_min_cl) / (cl_double) config_opencl->height); cl_float x_delta_f = (float) x_delta; cl_float y_delta_f = (float) y_delta; cl_float x_min_f = (float) x_min; globalThreads[0] = ((config_opencl->width) * (config_opencl->height)) / num_devices; localThreads[0] = 256; globalThreads[0] >>= 2; for (cl_uint i = 0; i < num_devices; i++) { kernel = kernel_vector[i]; ret = clGetKernelWorkGroupInfo(kernel, devices[i], CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &kernelWorkGroupSize, 0); if ((cl_uint) (localThreads[0]) > kernelWorkGroupSize) { localThreads[0] = kernelWorkGroupSize; } y_max_t = (((y_min_cl + y_max_cl) / 2.0) + (y_max_cl - y_min_cl) / 2.0 - ((cl_double) i * (y_max_cl - y_min_cl)) / (cl_double) num_devices); ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &outputBuffer[i]); y_max_t_f = (float) y_max_t; //printf("x_delta: %f, y_delta: %f, x_delta_f: %f, y_delta_f: %f, x_min_f: %f, y_max_t: %f, y_max_t_f: %f\n", x_delta, y_delta, x_delta_f, y_delta_f, x_min_f, y_max_t, y_max_t_f); switch (config_opencl->config_opencl.fpu) { case OPENCL_FPU_32: // lel ret = clSetKernelArg(kernel, 1, sizeof(cl_float), (void *) &x_min_f); ret = clSetKernelArg(kernel, 2, sizeof(cl_float), (void *) &y_max_t_f); ret = clSetKernelArg(kernel, 3, sizeof(cl_float), (void *) &x_delta_f); ret = clSetKernelArg(kernel, 4, sizeof(cl_float), (void *) &y_delta_f); break; case OPENCL_FPU_64: ret = clSetKernelArg(kernel, 1, sizeof(cl_double), (void *) &x_min_cl); ret = clSetKernelArg(kernel, 2, sizeof(cl_double), (void *) &y_max_t); ret = clSetKernelArg(kernel, 3, sizeof(cl_double), (void *) &x_delta); ret = clSetKernelArg(kernel, 4, sizeof(cl_double), (void *) &y_delta); break; case OPENCL_FPU_128: break; } ret = clSetKernelArg(kernel, 5, sizeof(cl_uint), (void *) &config_opencl->iterations); ret = clSetKernelArg(kernel, 6, sizeof(cl_int), (void *) &width_cl); ret = clEnqueueNDRangeKernel(commandQueue[i], kernel, 1, NULL, globalThreads, localThreads, 0, NULL, &events[i]); } for (cl_uint i = 0; i < num_devices; i++) { ret = clFlush(commandQueue[i]); } for (cl_uint i = 0; i < num_devices; i++) { ret = clWaitForEvents(1, &events[num_devices - i - 1]); ret = clReleaseEvent(events[num_devices - i - 1]); } for (cl_uint i = 0; i < num_devices; i++) { ret = clEnqueueReadBuffer(commandQueue[i], outputBuffer[i], CL_FALSE, 0, (config_opencl->width * config_opencl->height * sizeof(cl_int)) / num_devices, config_opencl->arr + (config_opencl->width * config_opencl->height / num_devices) * i, 0, NULL, &events[i]); } for (cl_uint i = 0; i < num_devices; i++) { ret = clFlush(commandQueue[i]); } for (cl_uint i = 0; i < num_devices; i++) { ret = clWaitForEvents(1, &events[num_devices - i - 1]); ret = clReleaseEvent(events[num_devices - i - 1]); } } void idle_opencl(void) { }