Bueno, este es el código en cuestión; se basa en un template modificado del SDK de ATI Stream:
main.cpp:
Código
#include "main.hpp"

/**
 * \brief Read a whole file into a string.
 *
 * The file is opened in binary mode and its bytes are returned unchanged.
 * If the file cannot be opened, a message is printed and the process exits
 * with status 1. If the file is empty or its size cannot be determined, an
 * empty string is returned.
 *
 * \param filename Path of the file to read.
 * \return The file contents.
 */
std::string convertToString(const char *filename)
{
    std::fstream f(filename, (std::fstream::in | std::fstream::binary));
    if(!f.is_open())
    {
        std::cout << "\nFile containg the kernel code(\".cl\") not found. Please copy the required file in the folder containg the executable.\n";
        exit(1);
    }

    f.seekg(0, std::fstream::end);
    const std::streamoff fileSize = f.tellg();
    f.seekg(0, std::fstream::beg);

    /* tellg() yields -1 on failure; never turn that into a huge allocation. */
    if(fileSize <= 0)
        return std::string();

    std::string contents(static_cast<size_t>(fileSize), '\0');
    f.read(&contents[0], static_cast<std::streamsize>(fileSize));
    /* Keep only what was actually read in case of a short read. */
    contents.resize(static_cast<size_t>(f.gcount()));
    return contents;
}

/**
 * \brief Allocate the host-side output array.
 *
 * Sets the global `width` to 102400 and allocates `width` cl_uint elements
 * into the global `output`.
 *
 * \return 0 on success, 1 if the allocation fails.
 */
int initializeHost(void)
{
    width = 102400;
    output = NULL;

    const size_t sizeInBytes = static_cast<size_t>(width) * sizeof(cl_uint);
    output = (cl_uint *) malloc(sizeInBytes);
    if(output == NULL)
    {
        std::cout<<"Error: Failed to allocate output memory on host\n";
        return 1;
    }
    return 0;
}

/**
 * \brief Pick an OpenCL platform.
 *
 * Chooses the platform whose vendor string is "Advanced Micro Devices, Inc."
 * when present; otherwise the last platform enumerated. `*chosen` stays NULL
 * when no platform exists.
 *
 * \param chosen Receives the selected platform.
 * \return 0 on success, 1 if an OpenCL query fails.
 */
static int selectPlatform(cl_platform_id *chosen)
{
    *chosen = NULL;

    cl_uint numPlatforms = 0;
    cl_int status = clGetPlatformIDs(0, NULL, &numPlatforms);
    if(status != CL_SUCCESS)
    {
        std::cout << "Error: Getting Platforms. (clGetPlatformsIDs)\n";
        return 1;
    }
    if(numPlatforms == 0)
        return 0;

    cl_platform_id *platforms = new cl_platform_id[numPlatforms];
    int result = 0;

    status = clGetPlatformIDs(numPlatforms, platforms, NULL);
    if(status != CL_SUCCESS)
    {
        std::cout << "Error: Getting Platform Ids. (clGetPlatformsIDs)\n";
        result = 1;
    }

    for(cl_uint i = 0; result == 0 && i < numPlatforms; ++i)
    {
        char pbuff[100];
        status = clGetPlatformInfo(platforms[i], CL_PLATFORM_VENDOR,
                                   sizeof(pbuff), pbuff, NULL);
        if(status != CL_SUCCESS)
        {
            std::cout << "Error: Getting Platform Info.(clGetPlatformInfo)\n";
            result = 1;
            break;
        }
        *chosen = platforms[i];
        if(!strcmp(pbuff, "Advanced Micro Devices, Inc."))
            break;
    }

    /* Single exit point so the array is released on every path. */
    delete[] platforms;
    return result;
}

/**
 * \brief Print the compiler log of a program build for one device.
 *
 * Does nothing if the log cannot be retrieved or is empty.
 */
static void printBuildLog(cl_program prog, cl_device_id device)
{
    size_t logSize = 0;
    if(clGetProgramBuildInfo(prog, device, CL_PROGRAM_BUILD_LOG,
                             0, NULL, &logSize) != CL_SUCCESS || logSize == 0)
        return;

    std::string log(logSize, '\0');
    if(clGetProgramBuildInfo(prog, device, CL_PROGRAM_BUILD_LOG,
                             logSize, &log[0], NULL) == CL_SUCCESS)
        std::cout << log.c_str() << std::endl;
}

/**
 * \brief Set up every OpenCL object the program uses.
 *
 * Selects a platform, creates a context for the requested device type,
 * fetches the device list, creates a command queue on the first device,
 * wraps the host `output` array in a buffer, and builds "kernel.cl" to obtain
 * the "Prime" kernel. Results are stored in the file's global handles.
 *
 * \param UseGPU true to create a GPU context, false for a CPU context.
 * \return 0 on success, 1 on any failure (a message is printed).
 */
int initializeCL(bool UseGPU)
{
    cl_int status = 0;

    cl_platform_id platform = NULL;
    if(selectPlatform(&platform) != 0)
        return 1;

    if(NULL == platform)
    {
        std::cout << "NULL platform found so Exiting Application." << std::endl;
        return 1;
    }

    cl_context_properties cps[3] =
    {
        CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0
    };

    /* Create an OpenCL context */
    const cl_device_type deviceType =
        UseGPU ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU;
    context = clCreateContextFromType(cps, deviceType, NULL, NULL, &status);
    if(status != CL_SUCCESS)
    {
        std::cout<<"Error: Creating Context. (clCreateContextFromType)\n";
        return 1;
    }

    /* First, get the size of device list data */
    size_t deviceListSize = 0;
    status = clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, NULL, &deviceListSize);
    if(status != CL_SUCCESS)
    {
        std::cout<< "Error: Getting Context Info "
                    "(device list size, clGetContextInfo)\n";
        return 1;
    }

    /* Detect OpenCL devices. An empty list must be rejected here because
     * devices[0] is used unconditionally below. */
    devices = (deviceListSize != 0) ? (cl_device_id *)malloc(deviceListSize) : NULL;
    if(devices == 0)
    {
        std::cout<<"Error: No devices found.\n";
        return 1;
    }

    /* Now, get the device list data */
    status = clGetContextInfo(context, CL_CONTEXT_DEVICES, deviceListSize, devices, NULL);
    if(status != CL_SUCCESS)
    {
        std::cout<< "Error: Getting Context Info "
                    "(device list, clGetContextInfo)\n";
        return 1;
    }

    /* Create an OpenCL command queue */
    commandQueue = clCreateCommandQueue(context, devices[0], 0, &status);
    if(status != CL_SUCCESS)
    {
        std::cout<<"Creating Command Queue. (clCreateCommandQueue)\n";
        return 1;
    }

    /* Create OpenCL memory buffers */
    outputBuffer = clCreateBuffer(context,
                                  CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR,
                                  sizeof(cl_uint) * width,
                                  output,
                                  &status);
    if(status != CL_SUCCESS)
    {
        std::cout<<"Error: clCreateBuffer (outputBuffer)\n";
        return 1;
    }

    /* Load CL file, build CL program object, create CL kernel object */
    const std::string sourceStr = convertToString("kernel.cl");
    if(sourceStr.empty())
    {
        std::cout<<"Error: Kernel source file is empty or unreadable\n";
        return 1;
    }
    const char *source = sourceStr.c_str();
    const size_t sourceSize[] = { sourceStr.size() };

    program = clCreateProgramWithSource(context, 1, &source, sourceSize, &status);
    if(status != CL_SUCCESS)
    {
        std::cout<< "Error: Loading Source into cl_program "
                    "(clCreateProgramWithSource)\n";
        return 1;
    }

    /* Build only for the device the command queue was created on. */
    status = clBuildProgram(program, 1, devices, NULL, NULL, NULL);
    if(status != CL_SUCCESS)
    {
        std::cout<<"Error: Building Program (clBuildProgram)\n";
        printBuildLog(program, devices[0]);
        return 1;
    }

    /* get a kernel object handle for a kernel with the given name */
    kernel = clCreateKernel(program, "Prime", &status);
    if(status != CL_SUCCESS)
    {
        std::cout<<"Error: Creating Kernel from program. (clCreateKernel)\n";
        return 1;
    }

    return 0;
}

/**
 * \brief Query the device's maximum work-item count in dimension 0.
 *
 * The size list is heap-allocated because the device decides how many
 * dimensions it reports; a fixed 3-element array could be overrun.
 *
 * \param maxDims Number of work-item dimensions reported by the device.
 * \param dim0    Receives the limit for dimension 0.
 * \return 0 on success, 1 on failure (a message is printed).
 */
static int queryMaxWorkItemsDim0(cl_uint maxDims, size_t *dim0)
{
    if(maxDims == 0)
    {
        std::cout<<"Error: Getting Device Info. (clGetDeviceInfo)\n";
        return 1;
    }

    size_t *itemSizes = (size_t *) malloc(sizeof(size_t) * maxDims);
    if(itemSizes == NULL)
    {
        std::cout<<"Error: Failed to allocate work item size list\n";
        return 1;
    }

    const cl_int status = clGetDeviceInfo(devices[0],
                                          CL_DEVICE_MAX_WORK_ITEM_SIZES,
                                          sizeof(size_t) * maxDims,
                                          (void*)itemSizes,
                                          NULL);
    if(status == CL_SUCCESS)
        *dim0 = itemSizes[0];
    free(itemSizes);

    if(status != CL_SUCCESS)
    {
        std::cout<<"Error: Getting Device Info. (clGetDeviceInfo)\n";
        return 1;
    }
    return 0;
}

/**
 * \brief Choose a 1-D work-group size.
 *
 * Clamps `preferred` to both device limits, then lowers it until it divides
 * `globalSize` evenly, because clEnqueueNDRangeKernel is given an explicit
 * local size and the global size must be a multiple of it.
 *
 * \return The chosen size; 0 only if a limit was 0.
 */
static size_t chooseLocalSize(size_t preferred, size_t maxItemsDim0,
                              size_t maxGroup, size_t globalSize)
{
    size_t local = preferred;
    if(local > maxItemsDim0)
        local = maxItemsDim0;
    if(local > maxGroup)
        local = maxGroup;
    while(local > 1 && globalSize % local != 0)
        --local;
    return local;
}

/**
 * \brief Run OpenCL program
 *
 * Queries device and kernel limits, binds the output buffer to kernel
 * argument 0, runs the kernel over `width` work-items, waits for it to
 * finish and reads the buffer back into `output`.
 *
 * \return 0 on success, 1 on any failure (a message is printed).
 */
int runCLKernels(void)
{
    cl_int status;
    cl_uint maxDims = 0;
    size_t maxWorkGroupSize = 0;
    size_t maxItemsDim0 = 0;

    /* Query device capabilities. */
    status = clGetDeviceInfo(devices[0], CL_DEVICE_MAX_WORK_GROUP_SIZE,
                             sizeof(size_t), (void*)&maxWorkGroupSize, NULL);
    if(status != CL_SUCCESS)
    {
        std::cout<<"Error: Getting Device Info. (clGetDeviceInfo)\n";
        return 1;
    }

    status = clGetDeviceInfo(devices[0], CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS,
                             sizeof(cl_uint), (void*)&maxDims, NULL);
    if(status != CL_SUCCESS)
    {
        std::cout<<"Error: Getting Device Info. (clGetDeviceInfo)\n";
        return 1;
    }

    if(queryMaxWorkItemsDim0(maxDims, &maxItemsDim0) != 0)
        return 1;

    /* Without this query totalLocalMemory stays 0 and the local-memory check
     * below would reject any kernel that uses local memory. */
    status = clGetDeviceInfo(devices[0], CL_DEVICE_LOCAL_MEM_SIZE,
                             sizeof(cl_ulong), (void*)&totalLocalMemory, NULL);
    if(status != CL_SUCCESS)
    {
        std::cout<<"Error: Getting Device Info. (clGetDeviceInfo)\n";
        return 1;
    }

    /* the output array to the kernel */
    status = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&outputBuffer);
    if(status != CL_SUCCESS)
    {
        std::cout<<"Error: Setting kernel argument. (output)\n";
        return 1;
    }

    status = clGetKernelWorkGroupInfo(kernel, devices[0],
                                      CL_KERNEL_LOCAL_MEM_SIZE,
                                      sizeof(cl_ulong), &usedLocalMemory, NULL);
    if(status != CL_SUCCESS)
    {
        std::cout<<"clGetKernelWorkGroupInfo CL_KERNEL_LOCAL_MEM_SIZE failed." << std::endl;
        return 1;
    }

    if(usedLocalMemory > totalLocalMemory)
    {
        std::cout << "Unsupported: Insufficient local memory on device" << std::endl;
        return 1;
    }

    /* Check group size against group size returned by kernel */
    status = clGetKernelWorkGroupInfo(kernel, devices[0],
                                      CL_KERNEL_WORK_GROUP_SIZE,
                                      sizeof(size_t), &kernelWorkGroupSize, 0);
    if(status != CL_SUCCESS)
    {
        std::cout<<"clGetKernelWorkGroupInfo CL_KERNEL_WORK_GROUP_SIZE failed." << std::endl;
        return 1;
    }

    if(groupSize > kernelWorkGroupSize)
    {
        std::cout << "Out of Resources!" << std::endl;
        std::cout << "Group Size specified : " << groupSize << std::endl;
        std::cout << "Max Group Size supported on the kernel : "
                  << kernelWorkGroupSize << std::endl;
        std::cout << "Falling back to " << kernelWorkGroupSize << std::endl;
        groupSize = kernelWorkGroupSize;
    }

    /* groupSize == 0 means no size was requested: use the kernel's maximum
     * instead of one work-item per group, which leaves most of a GPU idle. */
    const size_t preferred = (groupSize != 0) ? groupSize : kernelWorkGroupSize;
    const size_t localSize =
        chooseLocalSize(preferred, maxItemsDim0, maxWorkGroupSize, width);
    if(localSize == 0)
    {
        std::cout << "Unsupported: Device "
                     "does not support requested number of work items.";
        return 1;
    }

    /* Enqueue a kernel run call. */
    size_t globalThreads[] = { width };
    size_t localThreads[] = { localSize };
    cl_event kernelDone;
    status = clEnqueueNDRangeKernel(commandQueue, kernel, 1, NULL,
                                    globalThreads, localThreads,
                                    0, NULL, &kernelDone);
    if(status != CL_SUCCESS)
    {
        std::cout<< "Error: Enqueueing kernel onto command queue. "
                    "(clEnqueueNDRangeKernel)\n";
        return 1;
    }

    /* wait for the kernel call to finish execution */
    status = clWaitForEvents(1, &kernelDone);
    if(status != CL_SUCCESS)
    {
        std::cout<< "Error: Waiting for kernel run to finish. "
                    "(clWaitForEvents)\n";
        return 1;
    }

    status = clReleaseEvent(kernelDone);
    if(status != CL_SUCCESS)
    {
        std::cout<< "Error: Release event object. "
                    "(clReleaseEvent)\n";
        return 1;
    }

    /* Enqueue readBuffer */
    cl_event readDone;
    status = clEnqueueReadBuffer(commandQueue, outputBuffer, CL_TRUE, 0,
                                 width * sizeof(cl_uint), output,
                                 0, NULL, &readDone);
    if(status != CL_SUCCESS)
    {
        std::cout << "Error: clEnqueueReadBuffer failed. "
                     "(clEnqueueReadBuffer)\n";
        return 1;
    }

    /* Wait for the read buffer to finish execution */
    status = clWaitForEvents(1, &readDone);
    if(status != CL_SUCCESS)
    {
        std::cout<< "Error: Waiting for read buffer call to finish. "
                    "(clWaitForEvents)\n";
        return 1;
    }

    status = clReleaseEvent(readDone);
    if(status != CL_SUCCESS)
    {
        std::cout<< "Error: Release event object. "
                    "(clReleaseEvent)\n";
        return 1;
    }

    return 0;
}

/**
 * \brief Release OpenCL resources (Context, Memory etc.)
 *
 * Releases the kernel, program, buffer, command queue and context in that
 * order, stopping at the first failure.
 *
 * \return 0 on success, 1 if any release call fails.
 */
int cleanupCL(void)
{
    cl_int status;

    status = clReleaseKernel(kernel);
    if(status != CL_SUCCESS)
    {
        std::cout<<"Error: In clReleaseKernel \n";
        return 1;
    }

    status = clReleaseProgram(program);
    if(status != CL_SUCCESS)
    {
        std::cout<<"Error: In clReleaseProgram\n";
        return 1;
    }

    status = clReleaseMemObject(outputBuffer);
    if(status != CL_SUCCESS)
    {
        std::cout<<"Error: In clReleaseMemObject (outputBuffer)\n";
        return 1;
    }

    status = clReleaseCommandQueue(commandQueue);
    if(status != CL_SUCCESS)
    {
        std::cout<<"Error: In clReleaseCommandQueue\n";
        return 1;
    }

    status = clReleaseContext(context);
    if(status != CL_SUCCESS)
    {
        std::cout<<"Error: In clReleaseContext\n";
        return 1;
    }

    return 0;
}

/**
 * \brief Releases program's resources
 *
 * Frees the host output array and the device list and resets both pointers
 * so the next benchmark pass starts clean.
 */
void cleanupHost(void)
{
    if(output != NULL)
    {
        free(output);
        output = NULL;
    }
    if(devices != NULL)
    {
        free(devices);
        devices = NULL;
    }
}

/**
 * \brief Benchmark the kernel once on the GPU and once on the CPU.
 *
 * Each pass allocates host memory, initializes OpenCL, times runCLKernels()
 * with clock(), prints the elapsed seconds and releases everything.
 *
 * \return 0 on success, 1 as soon as any step fails.
 */
int main(int argc, char * argv[])
{
    (void)argc;
    (void)argv;

    bool GPU = true;
    for(int pass = 0; pass < 2; pass++)
    {
        // Initialize Host application
        if(initializeHost()==1)
            return 1;

        // Initialize OpenCL resources
        // (An interactive "Use GPU?(Y/N)" prompt based on getch() used to
        // live here; both device types are now benchmarked in turn.)
        if(initializeCL(GPU)==1)
            return 1;

        // Run the CL program. Keep the tick counts as clock_t and divide in
        // double: a float cannot represent large tick values exactly.
        const clock_t started = clock();
        if(runCLKernels()==1)
            return 1;
        const double Benchtime =
            static_cast<double>(clock() - started) / CLOCKS_PER_SEC;

        // Print output array
        //for(cl_uint i = 0; i < width; i++) std::cout << output[i] << " ";
        if(GPU)
            std::cout << "With GPU: ";
        else
            std::cout << "With CPU: ";
        std::cout << Benchtime << " Seconds" << std::endl;

        // Releases OpenCL resources
        if(cleanupCL()==1)
            return 1;

        // Release host resources
        cleanupHost();

        GPU = false;
    }
    return 0;
}
Main.hpp:
Código
#ifndef MAIN_HPP_INCLUDED
#define MAIN_HPP_INCLUDED

#include <CL/cl.h>
#include <string.h>
#include <cstdlib>
#include <iostream>
#include <string>
#include <time.h>
#include <fstream>
#include <conio.h>

/*** GLOBALS ***/
/*
 * NOTE: these are definitions, not extern declarations, so this header can
 * only be included from a single translation unit.
 */

/* Host array of `width` elements, allocated by initializeHost() and used as
 * the backing store of outputBuffer. */
cl_uint *output;

/* Local-memory figures compared against each other in runCLKernels(). */
cl_ulong totalLocalMemory;
cl_ulong usedLocalMemory;

/* Work-group size the kernel supports on the device, filled in by
 * runCLKernels(). */
size_t kernelWorkGroupSize;

/* Requested work-group size; runCLKernels() lowers it to
 * kernelWorkGroupSize when it is larger. */
size_t groupSize;

/* Number of elements in `output`; set by initializeHost(). */
cl_uint width;

/* The memory buffer that is used as input/output for OpenCL kernel */
cl_mem outputBuffer;

cl_context          context;
cl_device_id        *devices;
cl_command_queue    commandQueue;

cl_program program;

/* This program uses only one kernel and this serves as a handle to it */
cl_kernel  kernel;

/*** FUNCTION DECLARATIONS ***/

/*
 * Sets `width` and allocates the host `output` array.
 * Returns 0 on success, 1 on allocation failure.
 */
int initializeHost(void);

/*
 * OpenCL related initialisations are done here.
 * Context, Device list, Command Queue are set up.
 * Calls are made to set up OpenCL memory buffers that this program uses
 * and to load the programs into memory and get kernel handles.
 *
 * UseGPU selects a GPU context when true and a CPU context when false.
 * Returns 0 on success, 1 on failure.
 */
int initializeCL(bool UseGPU);

/*
 * Reads the file named by `filename` and returns its contents as a string.
 * Used to load the kernel source.
 */
std::string convertToString(const char * filename);

/*
 * This is called once the OpenCL context, memory etc. are set up,
 * the program is loaded into memory and the kernel handles are ready.
 *
 * It sets the values for kernels' arguments and enqueues calls to the kernels
 * on to the command queue and waits till the calls have finished execution,
 * then reads the output buffer back into `output`.
 *
 * Returns 0 on success, 1 on failure.
 */
int runCLKernels(void);

/* Releases OpenCL resources (Context, Memory etc.) */
int cleanupCL(void);

/* Releases program's resources */
void cleanupHost(void);

/*
 * Prints no more than 256 elements of the given array.
 * Prints full array if length is less than 256.
 *
 * Prints Array name followed by elements.
 *
 * NOTE(review): no definition of this function appears in the accompanying
 * main.cpp; calling it would fail at link time unless one is provided.
 */
void print1DArray(
         const std::string arrayName,
         const unsigned long * arrayData,
         const unsigned int length);

#endif /* MAIN_HPP_INCLUDED */
Kernel.cl
Código
/*
 * Each work-item stores twice its own global index (dimension 0) into the
 * matching element of `output`.
 */
__kernel void Prime(__global unsigned int *output)
{
    const uint gid = get_global_id(0);
    const uint doubled = gid * 2;

    output[gid] = doubled;
}
El kernel es muy simple: iba a calcular primos, por eso se llama "Prime", pero bueno, todavía no me puse a hacerlo.
Un abrazo
APOKLIPTICO