/* * render_opencl.c * * Created on: 26.01.2018 * Author: Superleo1810 */ #include "render_opencl.h" void init_opencl(OpenCLConfig *config) { printf("cl init\n"); x_min_s = -2.0; x_max_s = 1.0; y_min_s = -1.0; y_max_s = 1.0; x_min = x_min_s; x_max = x_max_s; y_min = y_min_s; y_max = y_max_s; config_opencl = config; output = (cl_uint *) malloc((config_opencl->width) * (config_opencl->height) * sizeof(cl_uchar4)); context = NULL; cl_platform_id platform_id; cl_uint ret_num_devices; cl_uint ret_num_platforms; size_t device_list_size; cl_program program; size_t cl_src_sz; clGetPlatformIDs(1, &platform_id, &ret_num_platforms); clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, &ret_num_devices); context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret); ret = clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, NULL, &device_list_size); devices = (cl_device_id *)malloc(device_list_size); num_devices = (cl_uint)(device_list_size/sizeof(cl_device_id)); clGetContextInfo(context, CL_CONTEXT_DEVICES, device_list_size, devices, NULL); printf("lists done\n"); FILE *fp; char *cl_src, *path, *flags = (char *)malloc(200 * sizeof(char)); flags[0] = '\0'; switch(config_opencl->fpu) { case OPENCL_FPU_32: switch(config_opencl->set_func) { case SFUNC_JULIA: path = "cl/julia32.cl"; break; case SFUNC_MANDELBROT: default: path = "cl/mandelbrot32.cl"; break; } break; case OPENCL_FPU_64: switch(config_opencl->set_func) { case SFUNC_JULIA: path = "cl/julia64.cl"; break; case SFUNC_MANDELBROT: default: path = "cl/mandelbrot64.cl"; break; } int khrFP64 = 0; int amdFP64 = 0; for (cl_uint i = 0; i < num_devices; i++) { char deviceExtensions[8192]; ret = clGetDeviceInfo(devices[i], CL_DEVICE_EXTENSIONS, sizeof(deviceExtensions), deviceExtensions, 0); if (strstr(deviceExtensions, "cl_khr_fp64")) { khrFP64++; } else { if (strstr(deviceExtensions, "cl_amd_fp64")) { amdFP64++; } } } if (khrFP64 == num_devices) { flags = strcat(flags, "-D KHR_DP_EXTENSION "); } else if (amdFP64 == num_devices) { flags = strcat(flags, ""); } break; case OPENCL_FPU_128: printf("128 bit precision not implemented yet\n"); break; } cl_src = (char *)malloc(MAX_SOURCE_SIZE * sizeof(char)); fp = fopen(path, "r"); cl_src_sz = fread(cl_src, 1, MAX_SOURCE_SIZE, fp); fclose(fp); printf("reading done\n"); for (cl_uint i = 0; i < num_devices; i++) { cl_command_queue_properties prop = 0; // if (sampleArgs->timing) // { // prop |= CL_QUEUE_PROFILING_ENABLE; // } commandQueue[i] = clCreateCommandQueue(context, devices[i], prop, &ret); outputBuffer[i] = clCreateBuffer(context, CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR, (sizeof(cl_uint) * (config_opencl->width) * (config_opencl->height)) / num_devices, NULL, &ret); } printf("comand puffer\n"); if (config_opencl->fma) { flags = strcat(flags, "-D MUL_ADD=fma "); } else { flags = strcat(flags, "-D MUL_ADD=mad "); } printf("flags concat\n"); program = clCreateProgramWithSource(context, 1, (const char **)&cl_src, (const size_t *)&cl_src_sz, &ret); ret = clBuildProgram(program, num_devices, devices, flags, NULL, NULL); for (cl_uint i = 0; i < num_devices; i++) { kernel_vector[i] = clCreateKernel(program, "calculate", &ret); } printf("cl init done\n"); } void render_opencl(void) { cl_event events[MAX_DEVICES]; cl_int eventStatus = CL_QUEUED; size_t globalThreads[1]; size_t localThreads[1]; size_t kernelWorkGroupSize; cl_kernel kernel; globalThreads[0] = ((config_opencl->width) * (config_opencl->height)) / num_devices; localThreads[0] = 256; globalThreads[0] >>= 2; for (cl_uint i = 0; i < num_devices; i++) { kernel = kernel_vector[i]; ret = clGetKernelWorkGroupInfo(kernel, devices[i], CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &kernelWorkGroupSize, 0); if ((cl_uint) (localThreads[0]) > kernelWorkGroupSize) { localThreads[0] = kernelWorkGroupSize; } xpos = 0.0; ypos = 0.0; xsize = 0.1; ysize = 0.1; xstep = (xsize / (double) config_opencl->width); ystep = (ysize / (double) config_opencl->height); leftx = (xpos - xsize / 2.0); topy = (ypos + ysize / 2.0 - ((double) i * ysize) / (double) num_devices); if (i == 0) { topy0 = topy; } printf("xsize: %f, ysize: %f, xstep: %f, ystep: %f, leftx: %f, topy: %f\n", xsize, ysize, xstep, ystep, leftx, topy); ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &outputBuffer[i]); cl_float leftxF = (float) leftx; cl_float topyF = (float) topy; cl_float xstepF = (float) xstep; cl_float ystepF = (float) ystep; switch (config_opencl->fpu) { case OPENCL_FPU_32: // lel ret = clSetKernelArg(kernel, 1, sizeof(cl_float), (void *) &leftxF); ret = clSetKernelArg(kernel, 2, sizeof(cl_float), (void *) &topyF); ret = clSetKernelArg(kernel, 3, sizeof(cl_float), (void *) &xstepF); ret = clSetKernelArg(kernel, 4, sizeof(cl_float), (void *) &ystepF); break; case OPENCL_FPU_64: ret = clSetKernelArg(kernel, 1, sizeof(cl_double), (void *) &leftx); ret = clSetKernelArg(kernel, 2, sizeof(cl_double), (void *) &topy); ret = clSetKernelArg(kernel, 3, sizeof(cl_double), (void *) &xstep); ret = clSetKernelArg(kernel, 4, sizeof(cl_double), (void *) &ystep); break; case OPENCL_FPU_128: break; } ret = clSetKernelArg(kernel, 5, sizeof(cl_uint), (void *) &config_opencl->iterations); ret = clSetKernelArg(kernel, 6, sizeof(cl_int), (void *) &config_opencl->width); ret = clEnqueueNDRangeKernel(commandQueue[i], kernel, 1, NULL, globalThreads, localThreads, 0, NULL, &events[i]); for (cl_uint i = 0; i < num_devices; i++) { ret = clFlush(commandQueue[i]); } for (cl_uint i = 0; i < num_devices; i++) { ret = clWaitForEvents(1, &events[num_devices - i - 1]); ret = clReleaseEvent(events[num_devices - i - 1]); } for (cl_uint i = 0; i < num_devices; i++) { ret = clEnqueueReadBuffer(commandQueue[i], outputBuffer[i], CL_FALSE, 0, ((config_opencl->width) * (config_opencl->height) * sizeof(u32)) / num_devices, config_opencl->arr + (((config_opencl->width) * (config_opencl->height) * i) / num_devices), 0, NULL, &events[i]); } for (cl_uint i = 0; i < num_devices; i++) { ret = clFlush(commandQueue[i]); } for (cl_uint i = 0; i < num_devices; i++) { ret = clWaitForEvents(1, &events[num_devices - i - 1]); ret = clReleaseEvent(events[num_devices - i - 1]); } } glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT); glBindTexture(GL_TEXTURE_2D, config_opencl->tex); glEnable(GL_TEXTURE_2D); glBegin(GL_QUADS); glTexCoord2i(0, 0); glVertex2i(0, 0); glTexCoord2i(0, 1); glVertex2i(0, config_opencl->height); glTexCoord2i(1, 1); glVertex2i(config_opencl->width, config_opencl->height); glTexCoord2i(1, 0); glVertex2i(config_opencl->width, 0); glEnd(); glDisable(GL_TEXTURE_2D); glBindTexture(GL_TEXTURE_2D, 0); glutSwapBuffers(); } void idle_opencl(void) { static int t_old; int t = 0, delta = 0; do { t = glutGet(GLUT_ELAPSED_TIME); delta = t - t_old; } while (delta < 16); // TODO: Hardcoded FPS t_old = t; //glGenTextures(1, &tex); glBindTexture(GL_TEXTURE_2D, config_opencl->tex); //glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST); //glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST); glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, config_opencl->width, config_opencl->height, 0, GL_RGBA, GL_UNSIGNED_BYTE, config_opencl->arr); glBindTexture(GL_TEXTURE_2D, 0); cl_ft += (config_opencl->speed * (delta / 1000.0)); x_min = x_min_s + config_opencl->zoom_func(cl_ft, (d64) 2.0 + config_opencl->to_x); y_min = y_min_s + config_opencl->zoom_func(cl_ft, (d64) 1.0 + config_opencl->to_y); x_max = x_max_s - config_opencl->zoom_func(cl_ft, (d64) 1.0 - config_opencl->to_x); y_max = y_max_s - config_opencl->zoom_func(cl_ft, (d64) 1.0 - config_opencl->to_y); glutPostRedisplay(); }