CUDA Samples

This page provides a few simple samples.

 

Sample 1: Generate random numbers and COPY data to the GPU

/*
 * Fill a host buffer with number_of_values pseudo-random values taken from
 * drand48(), upload that buffer to the GPU and return the device pointer.
 *
 * The returned pointer refers to memory obtained from cudaMalloc.
 * NOTE(review): neither the host staging buffer nor the device buffer is
 * released inside this function - confirm who is responsible for freeing them.
 */
nn_precision*
generate_random_numbers(int number_of_values)
{
        /* host and device buffers have the same size, so compute it once */
        size_t          buffer_bytes = sizeof(nn_precision) * number_of_values;
        nn_precision    *host_values;
        nn_precision    *device_values;
        int             filled = 0;

        /* host staging buffer first, then its counterpart on the device */
        host_values = (nn_precision *) pg_palloc(buffer_bytes);
        CUDATOOLS_SAFE_CALL( cudaMalloc( (void**) &device_values, buffer_bytes) );

        /* draw one random value per slot of the host buffer */
        while (filled < number_of_values)
        {
                host_values[filled] = (nn_precision) drand48();
                filled++;
        }

        /* upload the values and hand the device pointer back to the caller */
        CUDATOOLS_SAFE_CALL( cudaMemcpy(device_values, host_values,
                buffer_bytes, cudaMemcpyHostToDevice) );

        return device_values;
}

CUDA offers a set of simple-to-use functions to copy data from the CPU to the GPU. On the GPU we can perform basically all operations which can be done on a CPU - just far more efficiently.

Sample 2: add up matrices

/*
 * a CPU version of add_matrix
 *
 * Element-wise sum of two N x N matrices stored as flat arrays of N*N floats:
 * element (i, j) lives at flat offset i + j*N and c receives a + b at every
 * such offset.  Nothing is written when N <= 0.
 */
void add_matrix ( float* a, float* b, float* c, int N )
{
    for ( int i = 0; i < N; ++i )
    {
        for ( int j = 0; j < N; ++j )
        {
            int index = i + j*N;
            c[index] = a[index] + b[index];
        }
    }
}

/*
 * Driver for the CPU sample.  a, b, c and N are not declared in this snippet;
 * the surrounding program is expected to provide them.
 */
int main()
{
    add_matrix( a, b, c, N );
    return 0;
}


/*
 * the equivalent GPU version (prototype code)
 *
 * Each thread handles at most one matrix element.  The column index i is
 * derived from the x dimension of the launch and the row index j from the
 * y dimension; the thread then computes c[i + j*N] = a[i + j*N] + b[i + j*N].
 * Threads with i >= N or j >= N do nothing, so the grid may overhang the
 * matrix.  a, b and c are dereferenced on the device and therefore have to
 * be device-accessible pointers to at least N*N floats.
 *
 * Kernels must be declared with a void return type.
 */
__global__ void add_matrix( float* a, float* b, float* c, int N )
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    int j = blockIdx.y * blockDim.y + threadIdx.y;
    if ( i < N && j < N )
    {
        /* only form the flat offset for threads that lie inside the matrix */
        int index = i + j*N;
        c[index] = a[index] + b[index];
    }
}

/*
 * Launch add_matrix over an N x N matrix using blocksize x blocksize thread
 * blocks.  a, b, c, N and blocksize are not declared in this snippet; the
 * surrounding program is expected to provide them.
 *
 * Returns 0 on success and 1 when the launch or the kernel execution
 * reported a CUDA error.
 */
int main()
{
    dim3 dimBlock( blocksize, blocksize );
    /* round up so the tail rows/columns are covered when N is not a
       multiple of the block size; the kernel's bounds check discards the
       surplus threads */
    dim3 dimGrid( (N + dimBlock.x - 1) / dimBlock.x,
                  (N + dimBlock.y - 1) / dimBlock.y );
    add_matrix<<<dimGrid, dimBlock>>>( a, b, c, N );

    /* a kernel launch returns no status itself: query it explicitly */
    if ( cudaGetLastError() != cudaSuccess )
        return 1;
    /* wait for the kernel so that execution errors surface before exit */
    if ( cudaDeviceSynchronize() != cudaSuccess )
        return 1;
    return 0;
}


/* ------------------------------------------------------ */
/*
 * a CPU version of add_matrix
 *
 * Element-wise sum of two N x N matrices stored as flat arrays of N*N floats:
 * element (i, j) lives at flat offset i + j*N and c receives a + b at every
 * such offset.  Nothing is written when N <= 0.
 */
void add_matrix ( float* a, float* b, float* c, int N )
{
    for ( int i = 0; i < N; ++i )
    {
        for ( int j = 0; j < N; ++j )
        {
            int index = i + j*N;
            c[index] = a[index] + b[index];
        }
    }
}

/*
 * Driver for the CPU sample.  a, b, c and N are not declared in this snippet;
 * the surrounding program is expected to provide them.
 */
int main()
{
    add_matrix( a, b, c, N );
    return 0;
}


/*
 * the equivalent GPU version (prototype code)
 *
 * Each thread handles at most one matrix element.  The column index i is
 * derived from the x dimension of the launch and the row index j from the
 * y dimension; the thread then computes c[i + j*N] = a[i + j*N] + b[i + j*N].
 * Threads with i >= N or j >= N do nothing, so the grid may overhang the
 * matrix.  a, b and c are dereferenced on the device and therefore have to
 * be device-accessible pointers to at least N*N floats.
 *
 * Kernels must be declared with a void return type.
 */
__global__ void add_matrix( float* a, float* b, float* c, int N )
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    int j = blockIdx.y * blockDim.y + threadIdx.y;
    if ( i < N && j < N )
    {
        /* only form the flat offset for threads that lie inside the matrix */
        int index = i + j*N;
        c[index] = a[index] + b[index];
    }
}

/*
 * Launch add_matrix over an N x N matrix using blocksize x blocksize thread
 * blocks.  a, b, c, N and blocksize are not declared in this snippet; the
 * surrounding program is expected to provide them.
 *
 * Returns 0 on success and 1 when the launch or the kernel execution
 * reported a CUDA error.
 */
int main()
{
    dim3 dimBlock( blocksize, blocksize );
    /* round up so the tail rows/columns are covered when N is not a
       multiple of the block size; the kernel's bounds check discards the
       surplus threads */
    dim3 dimGrid( (N + dimBlock.x - 1) / dimBlock.x,
                  (N + dimBlock.y - 1) / dimBlock.y );
    add_matrix<<<dimGrid, dimBlock>>>( a, b, c, N );

    /* a kernel launch returns no status itself: query it explicitly */
    if ( cudaGetLastError() != cudaSuccess )
        return 1;
    /* wait for the kernel so that execution errors surface before exit */
    if ( cudaDeviceSynchronize() != cudaSuccess )
        return 1;
    return 0;
}