CUDA Samples
This page provides a few simple samples.

Sample 1: Generate random numbers and COPY data to the GPU
/*
 * Generate random numbers on the host and copy them to the GPU.
 *
 * number_of_values: how many nn_precision elements to generate and upload.
 *
 * Returns the device pointer obtained from cudaMalloc; it refers to a buffer
 * holding number_of_values elements, each taken from drand48().
 *
 * NOTE(review): the host staging buffer obtained from pg_palloc is not
 * released inside this function, and the returned device buffer is not freed
 * here either - presumably the caller owns the device pointer and the host
 * allocation is reclaimed elsewhere; confirm.
 */
nn_precision*
generate_random_numbers(int number_of_values)
{
    /* one byte count shared by both allocations and by the copy */
    const size_t byte_count = sizeof(nn_precision) * number_of_values;
    nn_precision *device_values;
    nn_precision *host_values;
    int filled = 0;

    /* host staging buffer first, then the device destination */
    host_values = (nn_precision *) pg_palloc(byte_count);
    CUDATOOLS_SAFE_CALL( cudaMalloc((void **) &device_values, byte_count) );

    /* fill the staging buffer with random numbers */
    while (filled < number_of_values)
    {
        host_values[filled] = (nn_precision) drand48();
        ++filled;
    }

    /* upload to the device and hand back the device-side pointer */
    CUDATOOLS_SAFE_CALL( cudaMemcpy(device_values, host_values,
                                    byte_count, cudaMemcpyHostToDevice) );
    return device_values;
}
CUDA offers a set of simple-to-use functions to copy data from the CPU to the GPU. On the GPU we can perform basically all operations that can be done on a CPU - just far more efficiently.
Sample 2: Add up matrices
/*
 * CPU version of add_matrix: element-wise sum of two N x N matrices.
 *
 * a, b : input matrices, each holding N*N floats
 * c    : output matrix, N*N floats; receives a[k] + b[k] for every element k
 * N    : edge length of the square matrices; N <= 0 leaves c untouched
 *
 * Element (i, j) is stored at offset i + j*N.
 */
void add_matrix ( float* a, float* b, float* c, int N )
{
    /* j is the outer loop so that the inner loop walks consecutive offsets */
    for ( int j = 0; j < N; ++j )
    {
        /* widen before multiplying so j*N cannot overflow int for large N */
        const long long row = (long long) j * N;
        for ( int i = 0; i < N; ++i )
        {
            c[row + i] = a[row + i] + b[row + i];
        }
    }
}
/* Driver for the CPU version of add_matrix (prototype code). */
int main()
{
    /* NOTE(review): a, b, c and N are not declared in this snippet -
       presumably they are set up by the surrounding program; confirm. */
    add_matrix( a, b, c, N );
    return 0;
}
/*
 * The equivalent GPU version (prototype code): each thread adds one element.
 *
 * The thread's (i, j) position comes from its 2D block and thread indices and
 * maps to offset i + j*N. Threads whose position falls outside the N x N
 * matrix do nothing, so the launch grid may be larger than the matrix.
 *
 * a, b : input matrices, N*N floats each, dereferenced on the device
 * c    : output matrix, N*N floats, written on the device
 * N    : edge length of the square matrices
 *
 * A kernel must be declared with a void return type.
 */
__global__ void add_matrix( float* a, float* b, float* c, int N )
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    int j = blockIdx.y * blockDim.y + threadIdx.y;
    if ( i < N && j < N )
    {
        /* computed only for in-range threads; widened so j*N cannot overflow int */
        const long long index = (long long) j * N + i;
        c[index] = a[index] + b[index];
    }
}
/* Host-side launch of the GPU add_matrix kernel (prototype code). */
int main()
{
    /* NOTE(review): blocksize, a, b, c and N are not declared in this
       snippet - presumably they are set up by the surrounding program. */
    dim3 dimBlock( blocksize, blocksize );
    /* Round the block count up: a plain N/dimBlock.x truncates and would
       leave the last rows/columns unprocessed whenever N is not a multiple
       of the block size. The kernel's i < N && j < N guard makes the
       surplus threads harmless. */
    dim3 dimGrid( ( N + dimBlock.x - 1 ) / dimBlock.x,
                  ( N + dimBlock.y - 1 ) / dimBlock.y );
    add_matrix<<<dimGrid, dimBlock>>>( a, b, c, N );
}
/* ------------------------------------------------------ */
/*
 * CPU version of add_matrix: element-wise sum of two N x N matrices.
 *
 * a, b : input matrices, each holding N*N floats
 * c    : output matrix, N*N floats; receives a[k] + b[k] for every element k
 * N    : edge length of the square matrices; N <= 0 leaves c untouched
 *
 * Element (i, j) is stored at offset i + j*N.
 */
void add_matrix ( float* a, float* b, float* c, int N )
{
    /* j is the outer loop so that the inner loop walks consecutive offsets */
    for ( int j = 0; j < N; ++j )
    {
        /* widen before multiplying so j*N cannot overflow int for large N */
        const long long row = (long long) j * N;
        for ( int i = 0; i < N; ++i )
        {
            c[row + i] = a[row + i] + b[row + i];
        }
    }
}
/* Driver for the CPU version of add_matrix (prototype code). */
int main()
{
    /* NOTE(review): a, b, c and N are not declared in this snippet -
       presumably they are set up by the surrounding program; confirm. */
    add_matrix( a, b, c, N );
    return 0;
}
/*
 * The equivalent GPU version (prototype code): each thread adds one element.
 *
 * The thread's (i, j) position comes from its 2D block and thread indices and
 * maps to offset i + j*N. Threads whose position falls outside the N x N
 * matrix do nothing, so the launch grid may be larger than the matrix.
 *
 * a, b : input matrices, N*N floats each, dereferenced on the device
 * c    : output matrix, N*N floats, written on the device
 * N    : edge length of the square matrices
 *
 * A kernel must be declared with a void return type.
 */
__global__ void add_matrix( float* a, float* b, float* c, int N )
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    int j = blockIdx.y * blockDim.y + threadIdx.y;
    if ( i < N && j < N )
    {
        /* computed only for in-range threads; widened so j*N cannot overflow int */
        const long long index = (long long) j * N + i;
        c[index] = a[index] + b[index];
    }
}
/* Host-side launch of the GPU add_matrix kernel (prototype code). */
int main()
{
    /* NOTE(review): blocksize, a, b, c and N are not declared in this
       snippet - presumably they are set up by the surrounding program. */
    dim3 dimBlock( blocksize, blocksize );
    /* Round the block count up: a plain N/dimBlock.x truncates and would
       leave the last rows/columns unprocessed whenever N is not a multiple
       of the block size. The kernel's i < N && j < N guard makes the
       surplus threads harmless. */
    dim3 dimGrid( ( N + dimBlock.x - 1 ) / dimBlock.x,
                  ( N + dimBlock.y - 1 ) / dimBlock.y );
    add_matrix<<<dimGrid, dimBlock>>>( a, b, c, N );
}
