OpenCL + Ati STREAM + CUDA.

Hola gente, ¿cómo va todo? Acabo de compilar un programa de prueba que utiliza OpenCL. OpenCL es una plataforma que unifica el GPGPU y realmente me pareció muy buena; el tema es que yo solo tengo una ATI HD4850 y no tengo ninguna Nvidia para probar. ¿Alguien podría hacerme el favor? O sea, en definitiva lo que necesitaría es el vendor ID de Nvidia para agregarlo al programa; una vez que haga esto, se puede adaptar para que funcione con ambas plataformas. ¿No es genial?

Bueno, este es el código en cuestión; se basa en una plantilla modificada del SDK de ATI Stream:

main.cpp:

Código

#include "main.hpp"
/*
 * \brief Read a whole file into a std::string.
 *
 * The file is opened in binary mode and its bytes are returned unchanged
 * (embedded NUL bytes included).
 *
 * \param filename  Path of the file to read.
 * \return The file contents; an empty string when the file is empty or its
 *         size cannot be determined.
 *
 * If the file cannot be opened, a message is printed to std::cout and the
 * process terminates with exit status 1.
 */
std::string
convertToString(const char *filename)
{
    std::fstream f(filename, (std::fstream::in | std::fstream::binary));

    if(!f.is_open())
    {
        std::cout << "\nFile containg the kernel code(\".cl\") not found. Please copy the required file in the folder containg the executable.\n";
        std::exit(1);
    }

    // Measure the file by seeking to its end; tellg() yields -1 on failure,
    // which must not be used as an allocation size.
    f.seekg(0, std::fstream::end);
    const std::streamoff fileSize = f.tellg();
    f.seekg(0, std::fstream::beg);
    if(fileSize <= 0)
    {
        return std::string();
    }

    // Read straight into the string's storage: no manual new[]/delete[] and
    // no truncation at the first NUL byte.
    std::string contents(static_cast<std::string::size_type>(fileSize), '\0');
    f.read(&contents[0], static_cast<std::streamsize>(fileSize));

    // Keep only what was actually read in case of a short read.
    contents.resize(static_cast<std::string::size_type>(f.gcount()));
    return contents;
}
int
initializeHost(void)
{
    width               = 102400;
    output                = NULL;
 
    /////////////////////////////////////////////////////////////////
    // Allocate and initialize memory used by host
    /////////////////////////////////////////////////////////////////
    cl_uint sizeInBytes = width * sizeof(cl_uint);
 
    output = (cl_uint *) malloc(sizeInBytes);
    if(output == NULL)
    {
        std::cout<<"Error: Failed to allocate output memory on host\n";
        return 1;
    }
 
    return 0;
}
int
initializeCL(bool UseGPU)
{
    cl_int status = 0;
    size_t deviceListSize;
 
    /*
     * Have a look at the available platforms and pick either
     * the AMD one if available or a reasonable default.
     */
 
    cl_uint numPlatforms;
    cl_platform_id platform = NULL;
    status = clGetPlatformIDs(0, NULL, &numPlatforms);
    if(status != CL_SUCCESS)
    {
        std::cout << "Error: Getting Platforms. (clGetPlatformsIDs)\n";
        return 1;
    }
 
    if(numPlatforms > 0)
    {
        cl_platform_id* platforms = new cl_platform_id[numPlatforms];
        status = clGetPlatformIDs(numPlatforms, platforms, NULL);
        if(status != CL_SUCCESS)
        {
            std::cout << "Error: Getting Platform Ids. (clGetPlatformsIDs)\n";
            return 1;
        }
        for(unsigned int i=0; i < numPlatforms; ++i)
        {
            char pbuff[100];
            status = clGetPlatformInfo(
                        platforms[i],
                        CL_PLATFORM_VENDOR,
                        sizeof(pbuff),
                        pbuff,
                        NULL);
            if(status != CL_SUCCESS)
            {
                std::cout << "Error: Getting Platform Info.(clGetPlatformInfo)\n";
                return 1;
            }
            platform = platforms[i];
            if(!strcmp(pbuff, "Advanced Micro Devices, Inc."))
            {
                break;
            }
        }
        delete platforms;
    }
 
    if(NULL == platform)
    {
        std::cout << "NULL platform found so Exiting Application." << std::endl;
        return 1;
    }
 
    /*
     * If we could find our platform, use it. Otherwise use just available platform.
     */
    cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0 };
 
    /////////////////////////////////////////////////////////////////
    // Create an OpenCL context
    /////////////////////////////////////////////////////////////////
    if(UseGPU) context = clCreateContextFromType(cps, CL_DEVICE_TYPE_GPU, NULL, NULL, &status);
    else context = clCreateContextFromType(cps, CL_DEVICE_TYPE_CPU, NULL, NULL, &status);
    if(status != CL_SUCCESS)
    {
        std::cout<<"Error: Creating Context. (clCreateContextFromType)\n";
        return 1;
    }
 
    /* First, get the size of device list data */
    status = clGetContextInfo(context,
                              CL_CONTEXT_DEVICES,
                              0,
                              NULL,
                              &deviceListSize);
    if(status != CL_SUCCESS)
    {
        std::cout<<
            "Error: Getting Context Info \
            (device list size, clGetContextInfo)\n";
        return 1;
    }
 
    /////////////////////////////////////////////////////////////////
    // Detect OpenCL devices
    /////////////////////////////////////////////////////////////////
    devices = (cl_device_id *)malloc(deviceListSize);
    if(devices == 0)
    {
        std::cout<<"Error: No devices found.\n";
        return 1;
    }
 
    /* Now, get the device list data */
    status = clGetContextInfo(
                 context,
                 CL_CONTEXT_DEVICES,
                 deviceListSize,
                 devices,
                 NULL);
    if(status != CL_SUCCESS)
    {
        std::cout<<
            "Error: Getting Context Info \
            (device list, clGetContextInfo)\n";
        return 1;
    }
 
    /////////////////////////////////////////////////////////////////
    // Create an OpenCL command queue
    /////////////////////////////////////////////////////////////////
    commandQueue = clCreateCommandQueue(
                       context,
                       devices[0],
                       0,
                       &status);
    if(status != CL_SUCCESS)
    {
        std::cout<<"Creating Command Queue. (clCreateCommandQueue)\n";
        return 1;
    }
 
    /////////////////////////////////////////////////////////////////
    // Create OpenCL memory buffers
    /////////////////////////////////////////////////////////////////
 
    outputBuffer = clCreateBuffer(
                       context,
                       CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR,
                       sizeof(cl_uint) * width,
                       output,
                       &status);
    if(status != CL_SUCCESS)
    {
        std::cout<<"Error: clCreateBuffer (outputBuffer)\n";
        return 1;
    }
 
 
    /////////////////////////////////////////////////////////////////
    // Load CL file, build CL program object, create CL kernel object
    /////////////////////////////////////////////////////////////////
    const char * filename  = "kernel.cl";
    std::string  sourceStr = convertToString(filename);
    const char * source    = sourceStr.c_str();
    size_t sourceSize[]    = { strlen(source) };
 
    program = clCreateProgramWithSource(
                  context,
                  1,
                  &source,
                  sourceSize,
                  &status);
    if(status != CL_SUCCESS)
    {
      std::cout<<
               "Error: Loading Binary into cl_program \
               (clCreateProgramWithBinary)\n";
      return 1;
    }
 
    /* create a cl program executable for all the devices specified */
    status = clBuildProgram(program, 1, devices, NULL, NULL, NULL);
    if(status != CL_SUCCESS)
    {
        std::cout<<"Error: Building Program (clBuildProgram)\n";
        return 1;
    }
 
    /* get a kernel object handle for a kernel with the given name */
    kernel = clCreateKernel(program, "Prime", &status);
    if(status != CL_SUCCESS)
    {
        std::cout<<"Error: Creating Kernel from program. (clCreateKernel)\n";
        return 1;
    }
 
    return 0;
}
 
 
/*
 * \brief Run OpenCL program
 *
 *        Bind host variables to kernel arguments
 *          Run the CL kernel
 */
int
runCLKernels(void)
{
    cl_int   status;
    cl_uint maxDims;
    cl_event events[2];
     size_t maxWorkGroupSize;
    size_t maxWorkItemSizes[3];
 
    /**
    * Query device capabilities. Maximum
    * work item dimensions and the maximmum
    * work item sizes
    */
    status = clGetDeviceInfo(
        devices[0],
        CL_DEVICE_MAX_WORK_GROUP_SIZE,
        sizeof(size_t),
        (void*)&maxWorkGroupSize,
        NULL);
    if(status != CL_SUCCESS)
    {
        std::cout<<"Error: Getting Device Info. (clGetDeviceInfo)\n";
        return 1;
    }
 
    status = clGetDeviceInfo(
        devices[0],
        CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS,
        sizeof(cl_uint),
        (void*)&maxDims,
        NULL);
    if(status != CL_SUCCESS)
    {
        std::cout<<"Error: Getting Device Info. (clGetDeviceInfo)\n";
        return 1;
    }
 
    status = clGetDeviceInfo(
        devices[0],
        CL_DEVICE_MAX_WORK_ITEM_SIZES,
        sizeof(size_t)*maxDims,
        (void*)maxWorkItemSizes,
        NULL);
    if(status != CL_SUCCESS)
    {
        std::cout<<"Error: Getting Device Info. (clGetDeviceInfo)\n";
        return 1;
    }
 
    /*** Set appropriate arguments to the kernel ***/
    /* the output array to the kernel */
    status = clSetKernelArg(
                    kernel,
                    0,
                    sizeof(cl_mem),
                    (void *)&outputBuffer);
    if(status != CL_SUCCESS)
    {
        std::cout<<"Error: Setting kernel argument. (output)\n";
        return 1;
    }
    status = clGetKernelWorkGroupInfo(kernel,
        devices[0],
        CL_KERNEL_LOCAL_MEM_SIZE,
        sizeof(cl_ulong),
        &usedLocalMemory,
        NULL);
    if(status != CL_SUCCESS)
    {
        std::cout<<"clGetKernelWorkGroupInfo CL_KERNEL_LOCAL_MEM_SIZE failed." << std::endl;
        return 1;
    }
 
    if(usedLocalMemory > totalLocalMemory)
    {
        std::cout << "Unsupported: Insufficient local memory on device" << std::endl;
        return 1;
    }
 
    /* Check group size against group size returned by kernel */
    status = clGetKernelWorkGroupInfo(kernel,
        devices[0],
        CL_KERNEL_WORK_GROUP_SIZE,
        sizeof(size_t),
        &kernelWorkGroupSize,
        0);
    if(status != CL_SUCCESS)
    {
        std::cout<<"clGetKernelWorkGroupInfo CL_KERNEL_COMPILE_WORK_GROUP_SIZE failed." << std::endl;
        return 1;
    }
 
    if(groupSize > kernelWorkGroupSize)
    {
        std::cout << "Out of Resources!" << std::endl;
        std::cout << "Group Size specified : " << groupSize << std::endl;
        std::cout << "Max Group Size supported on the kernel : " << kernelWorkGroupSize << std::endl;
        std::cout << "Falling back to " << kernelWorkGroupSize << std::endl;
        groupSize = kernelWorkGroupSize;
    }
    /*
     * Enqueue a kernel run call.
     */
    size_t globalThreads[] = {width};
    size_t localThreads[] = {1};
    if(localThreads[0] > maxWorkItemSizes[0] ||
       localThreads[0] > maxWorkGroupSize)
    {
        std::cout << "Unsupported: Device" "does not support requested number of work items.";
        return 1;
    }
    status = clEnqueueNDRangeKernel(
                 commandQueue,
                 kernel,
                 1,
                 NULL,
                 globalThreads,
                 localThreads,
                 0,
                 NULL,
                 &events[0]);
    if(status != CL_SUCCESS)
    {
        std::cout<<
            "Error: Enqueueing kernel onto command queue. \
            (clEnqueueNDRangeKernel)\n";
        return 1;
    }
 
 
    /* wait for the kernel call to finish execution */
    status = clWaitForEvents(1, &events[0]);
    if(status != CL_SUCCESS)
    {
        std::cout<<
            "Error: Waiting for kernel run to finish. \
            (clWaitForEvents)\n";
        return 1;
    }
 
    status = clReleaseEvent(events[0]);
    if(status != CL_SUCCESS)
    {
        std::cout<<
            "Error: Release event object. \
            (clReleaseEvent)\n";
        return 1;
    }
 
    /* Enqueue readBuffer*/
    status = clEnqueueReadBuffer(
                commandQueue,
                outputBuffer,
                CL_TRUE,
                0,
                width * sizeof(cl_uint),
                output,
                0,
                NULL,
                &events[1]);
 
    if(status != CL_SUCCESS)
    {
        std::cout <<
            "Error: clEnqueueReadBuffer failed. \
             (clEnqueueReadBuffer)\n";
 
        return 1;
    }
 
    /* Wait for the read buffer to finish execution */
    status = clWaitForEvents(1, &events[1]);
    if(status != CL_SUCCESS)
    {
        std::cout<<
            "Error: Waiting for read buffer call to finish. \
            (clWaitForEvents)\n";
        return 1;
    }
 
    status = clReleaseEvent(events[1]);
    if(status != CL_SUCCESS)
    {
        std::cout<<
            "Error: Release event object. \
            (clReleaseEvent)\n";
        return 1;
    }
 
    return 0;
}
 
 
/*
 * \brief Release OpenCL resources (Context, Memory etc.)
 */
int
cleanupCL(void)
{
    cl_int status;
 
    status = clReleaseKernel(kernel);
    if(status != CL_SUCCESS)
    {
        std::cout<<"Error: In clReleaseKernel \n";
        return 1;
    }
    status = clReleaseProgram(program);
    if(status != CL_SUCCESS)
    {
        std::cout<<"Error: In clReleaseProgram\n";
        return 1;
    }
    status = clReleaseMemObject(outputBuffer);
    if(status != CL_SUCCESS)
    {
        std::cout<<"Error: In clReleaseMemObject (outputBuffer)\n";
        return 1;
    }
    status = clReleaseCommandQueue(commandQueue);
    if(status != CL_SUCCESS)
    {
        std::cout<<"Error: In clReleaseCommandQueue\n";
        return 1;
    }
    status = clReleaseContext(context);
    if(status != CL_SUCCESS)
    {
        std::cout<<"Error: In clReleaseContext\n";
        return 1;
    }
 
    return 0;
}
 
 
/*
 * \brief Releases program's resources
 */
void
cleanupHost(void)
{
    if(output != NULL)
    {
        free(output);
        output = NULL;
    }
    if(devices != NULL)
    {
        free(devices);
        devices = NULL;
    }
}
 
 
 
int
main(int argc, char * argv[])
{
    bool GPU = true;
    for(int i = 0; i < 2; i++)
    {
    // Initialize Host application
    if(initializeHost()==1)
        return 1;
 
    // Initialize OpenCL resources
    if(initializeCL(GPU)==1)
    return 1;
/*  char C;
    do
    {
        std::cout << "Use GPU?(Y/N)" << std::endl;
        C = getch();
    }while(!((C == 'Y') || (C == 'y') || (C == 'n') || (C == 'N')));
    if(C == 'Y' || C == 'y')
    {
        if(initializeCL(true)==1)
            return 1;
    }
    else
    {
        if(initializeCL(false)==1)
            return 1;
    }*/
 
    // Run the CL program
    float Clocklast = clock();
    if(runCLKernels()==1)
        return 1;
    float Benchtime = (clock() - Clocklast) / CLOCKS_PER_SEC;
    // Print output array
    //for(int i = 0; i < width; i++) std::cout << output[i] << " ";
    if(GPU) std::cout << "With GPU: "; else std::cout << "With CPU: ";
    std::cout << Benchtime << " Seconds" << std::endl;
        // Releases OpenCL resources
    if(cleanupCL()==1)
        return 1;
 
    // Release host resources
    cleanupHost();
    GPU = false;
    }
    return 0;
}

Main.hpp:

Código

#include <CL/cl.h>
#include <string.h>
#include <cstdlib>
#include <iostream>
#include <time.h>
#include <fstream>
#include <conio.h>
/*** GLOBALS ***/
/*
 * NOTE(review): these are definitions, not extern declarations, and the
 * header has no include guard; including it from more than one translation
 * unit would produce duplicate definitions.
 */

/* Host array of `width` elements; allocated in initializeHost, freed in
 * cleanupHost, and used as the host pointer behind outputBuffer. */
cl_uint *output;
/* Device local-memory size that runCLKernels compares usedLocalMemory against. */
cl_ulong totalLocalMemory;
/* Kernel's CL_KERNEL_LOCAL_MEM_SIZE, queried in runCLKernels. */
cl_ulong usedLocalMemory;
/* Kernel's CL_KERNEL_WORK_GROUP_SIZE, queried in runCLKernels. */
size_t kernelWorkGroupSize;
/* Requested work-group size; lowered to kernelWorkGroupSize when it exceeds it. */
size_t groupSize;
/* Number of elements in `output`; also the global work size of the kernel launch. */
cl_uint width;
 
/* The OpenCL buffer passed to the kernel as argument 0 and read back into `output` */
cl_mem     outputBuffer;
 
/* Context created in initializeCL and released in cleanupCL */
cl_context          context;
/* malloc'ed device list of the context; only devices[0] is used */
cl_device_id        *devices;
/* Command queue created on devices[0] */
cl_command_queue    commandQueue;
 
/* Program created from the source text of "kernel.cl" */
cl_program program;
 
/* This program uses only one kernel ("Prime") and this serves as a handle to it */
cl_kernel  kernel;
 
 
/*** FUNCTION DECLARATIONS ***/
/*
 * OpenCL related initialisations are done here.
 * Context, Device list, Command Queue are set up.
 * Calls are made to set up OpenCL memory buffers that this program uses
 * and to load the programs into memory and get kernel handles.
 *
 * UseGPU selects CL_DEVICE_TYPE_GPU (true) or CL_DEVICE_TYPE_CPU (false).
 * Returns 0 on success and 1 on failure.
 */
int initializeCL(bool UseGPU);
 
/*
 * Reads the file named by `filename` and returns its contents as a string.
 * If the file cannot be opened, a message is printed and the process exits
 * with status 1.
 */
std::string convertToString(const char * filename);
 
/*
 * This is called once the OpenCL context, memory etc. are set up,
 * the program is loaded into memory and the kernel handles are ready.
 *
 * It sets the values for kernels' arguments and enqueues calls to the kernels
 * on to the command queue and waits till the calls have finished execution,
 * then reads the output buffer back into host memory.
 *
 * Returns 0 on success and 1 on failure.
 */
int runCLKernels(void);
 
/* Releases OpenCL resources (Context, Memory etc.); returns 0 on success, 1 on failure */
int cleanupCL(void);
 
/* Releases program's resources (the host output array and the device list) */
void cleanupHost(void);
/*
 * Prints no more than 256 elements of the given array.
 * Prints full array if length is less than 256.
 *
 * Prints Array name followed by elements.
 *
 * NOTE(review): no definition of this function is visible in this source;
 * the description above is inherited from the original template and
 * could not be checked.
 */
void print1DArray(
         const std::string arrayName,
         const unsigned long * arrayData,
         const unsigned int length);

Kernel.cl

Código

/*
 * Prime: each work item stores twice its global index (dimension 0) at
 * that index of `output`.
 */
__kernel
void Prime(
    __global unsigned int * output)
{
	const uint gid = get_global_id(0);
	output[gid] = gid + gid;
}

El kernel es muy simple, iba a calcular primos, por eso se llama "prime" pero bueno, todavía no me puse a hacerlo.

Un abrazo
APOKLIPTICO