Answered step by step

Verified Expert Solution

Link Copied!

Question

1 Approved Answer

Posted on Sep 26, 2024

Write a CUDA program to implement the same functionalities as shown in the codes below , perform different experiments, and write a short description on

Write a CUDA program to implement the same functionalities as shown in the codes below, perform different experiments, and write a short description on what was made. The CUDA kernel function results and running time of the kernel(s) should be displayed as output. Thus, your main task is to write an efficient CUDA kernel to compute the SDH. In addition, your program should also include the following features.

Input/Output of Your Program: You have to modify the program to take a different number of command line arguments from what is in the codes. Your program should take care of bad or missing inputs and throw appropriate errors in a reasonable way. In particular, here is what we expect in launching your program:

./proj2 {#of_samples} {bucket_width} {block_size}

where proj2 is assumed to be the executable after compiling your project. The first two arguments are the same as in the given code, while the last one is the number of threads within each block that your CUDA kernel should be launched with. The output of your program should print out the SDH you computed as in the given code. Following the SDH, you should add a line to report the performance of your kernel; it should look like the following sample.

******** Total Running Time of Kernel = 2.0043 sec *******

*details of measuring kernel running time

1: cudaEvent_t start, stop;

2: cudaEventCreate(&start);

3: cudaEventCreate(&stop);

4: cudaEventRecord( start, 0 );

5: /* Your Kernel call goes here */

6: cudaEventRecord( stop, 0 );

7: cudaEventSynchronize( stop );

8: float elapsedTime;

9: cudaEventElapsedTime( &elapsedTime, start, stop );

10: printf( "Time to generate: %0.5f ms ", elapsedTime );

11: cudaEventDestroy( start );

12: cudaEventDestroy( stop );

==================================================================

The basic SDH algorithm implementation for 3D data

To compile: nvcc SDH.c -o SDH in the rc machines

==================================================================

#include <limits.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>

#include <cuda_runtime.h>

#define BOX_SIZE 23000 /* size of the data box on one dimension */

/* descriptors for single atom in the tree */

/* A single data point: three double-precision coordinates (x, y, z). */
typedef struct atomdesc {
    double x_pos, y_pos, z_pos;
} atom;

/*
 * One histogram bucket. Only the counter is stored; the bucket's lower and
 * upper bounds (formerly `float min` / `float max`) are not kept as fields.
 */
typedef struct hist_entry {
    /* 64-bit counter: the number of distances in a bucket might be huge */
    unsigned long long d_cnt;
} bucket;

/* Host-side histogram buffer; main() allocates num_buckets entries for it. */
bucket * histogram; /* list of all buckets in the histogram */

/* Set in main() from the first command-line argument. */
long long PDH_acnt; /* total number of data points */

/* Computed in main() as (int)(BOX_SIZE * 1.732 / PDH_res) + 1. */
int num_buckets; /* total number of buckets in the histogram */

/* Bucket width; set in main() from the second command-line argument. */
double PDH_res; /* value of w */

/* Host-side array of PDH_acnt atoms, allocated and filled in main(). */
atom * atom_list; /* list of all data points */

/* These are for an old way of tracking time */
/* (gettimeofday-based: main() fills startTime, report_running_time() fills endTime) */

struct timezone Idunno;

struct timeval startTime, endTime;

/*
 * Abort the program if a CUDA runtime call failed.
 *
 *   e   - status code returned by a CUDA runtime call
 *   out - short label identifying the call, included in the error message
 *
 * On cudaSuccess this returns normally. Otherwise it prints the label and the
 * CUDA error string to stderr and terminates with EXIT_FAILURE.
 */
void checkError(cudaError_t e, const char out[]) {
    if (e == cudaSuccess) {
        return;
    }

    /* Diagnostics go to stderr so they do not mix with the histogram output. */
    fprintf(stderr, "There is a CUDA Error: %s, %s ", out, cudaGetErrorString(e));
    exit(EXIT_FAILURE);
}

/*
 * Euclidean distance between two entries of an atom array (device function).
 *
 *   l    - array of atoms
 *   ind1 - index of the first atom in l
 *   ind2 - index of the second atom in l
 *
 * Returns the square root of the sum of the squared coordinate differences.
 * The indices are not range-checked here.
 */
__device__
double p2p_distance(atom *l, int ind1, int ind2) {
    atom *first  = &l[ind1];
    atom *second = &l[ind2];

    double dx = first->x_pos - second->x_pos;
    double dy = first->y_pos - second->y_pos;
    double dz = first->z_pos - second->z_pos;

    return sqrt(dx * dx + dy * dy + dz * dz);
}

/*
 * SDH kernel: one GPU thread per atom.
 *
 * Thread i (flat index over the x dimension of the launch) computes the
 * distance from atom i to every atom x with x > i, so each unordered pair is
 * counted exactly once, and increments the matching bucket atomically.
 *
 *   histogram_in - bucket array to accumulate into; counts are added to
 *                  whatever values it already holds
 *   list         - array of `size` atoms
 *   width        - bucket width; a distance d falls into bucket (int)(d / width)
 *   size         - number of atoms in `list`
 *
 * Launch layout: 1D grid of 1D blocks with at least `size` threads in total.
 * The number of buckets is not passed in, so the bucket index is not
 * range-checked here; the caller must size histogram_in for the largest
 * possible distance.
 */
__global__
void PDH_baseline(bucket *histogram_in, atom *list, double width, int size) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;

    /* The grid rarely divides the data evenly: surplus threads do nothing. */
    if (i >= size) {
        return;
    }

    for (int x = i + 1; x < size; x++) {
        double dist = p2p_distance(list, i, x);
        int h_pos = (int)(dist / width);

        /* Many threads may hit the same bucket, so the increment is atomic. */
        atomicAdd(&histogram_in[h_pos].d_cnt, 1ULL);
    }
}

/*
 * Take an end-of-run timestamp, print the time elapsed since the global
 * startTime was recorded, and return that elapsed time in seconds.
 *
 * Side effects: overwrites the global endTime and writes one message to stdout.
 */
double report_running_time() {
    gettimeofday(&endTime, &Idunno);

    long whole_seconds = endTime.tv_sec - startTime.tv_sec;
    long micro_seconds = endTime.tv_usec - startTime.tv_usec;

    /* Borrow one second when the microsecond part went negative. */
    if (micro_seconds < 0) {
        whole_seconds -= 1;
        micro_seconds += 1000000;
    }

    printf("Running time for GPU version: %ld.%06lds ", whole_seconds, micro_seconds);

    return (double)(whole_seconds * 1.0 + micro_seconds / 1000000.0);
}

/*
 * Brute-force SDH kernel: one GPU thread per (i, j) index pair.
 *
 * i comes from the x dimension of the launch and j from the y dimension.
 * Only threads with i < j do any work, so each unordered pair is handled once.
 *
 *   histogram - bucket array to accumulate into
 *   Atomlist  - array of atoms, indexed by i and j
 *   w         - bucket width; a distance d falls into bucket (int)(d / w)
 *
 * NOTE(review): neither the atom count nor the bucket count is passed in, so
 * i, j and the bucket index are not range-checked. The launch must not create
 * threads whose i or j lies past the end of Atomlist.
 */
__global__
void PDH2D_baseline(bucket *histogram, atom *Atomlist, double w){
    int i = (blockIdx.x * blockDim.x) + threadIdx.x;
    int j = (blockIdx.y * blockDim.y) + threadIdx.y;

    if (i < j) {
        double dist = p2p_distance(Atomlist, i, j);
        int h_pos = (int)(dist / w);

        /* A plain `d_cnt++` is a racy read-modify-write when several threads
           land in the same bucket; the increment has to be atomic. */
        atomicAdd(&histogram[h_pos].d_cnt, 1ULL);

        /* Per-pair trace output from the device (kept from the original). */
        printf("%d, %d : %d, %f ", i, j, h_pos, dist);
    }
}

/*
 * Print the counts in all buckets of the histogram.
 *
 *   histogram - array of num_buckets buckets (num_buckets is a file-level global)
 *
 * A bucket index label is printed before every fifth bucket. After the last
 * bucket the sum of all counts is printed, so the total number of distances
 * can be checked.
 */
void output_histogram(bucket *histogram){
    long long total_cnt = 0;

    for (int i = 0; i < num_buckets; i++) {
        if (i % 5 == 0) /* we print 5 buckets in a row */
            printf(" %02d: ", i);

        /* d_cnt is unsigned long long; cast so the argument matches %lld. */
        printf("%15lld ", (long long)histogram[i].d_cnt);
        total_cnt += (long long)histogram[i].d_cnt;

        /* we also want to make sure the total distance count is correct */
        if (i == num_buckets - 1)
            printf(" T:%lld ", total_cnt);
        else
            printf("| ");
    }
}

/*
 * Program entry point.
 *
 * Usage: ./proj2 {#of_samples} {bucket_width} [block_size]
 *
 *   #of_samples  - number of random atoms to generate (1 .. INT_MAX)
 *   bucket_width - histogram bucket width, must be > 0
 *   block_size   - threads per block for the kernel launch; optional,
 *                  defaults to 256, must not exceed the device limit
 *
 * Generates the atoms, runs PDH_baseline on the GPU, prints the elapsed time
 * and then the histogram. Returns 0 on success and EXIT_FAILURE on bad
 * arguments or allocation failure; CUDA failures terminate via checkError().
 */
int main(int argc, char **argv)
{
    const char *prog = (argc > 0 && argv[0] != NULL) ? argv[0] : "proj2";
    char *end = NULL;

    if (argc != 3 && argc != 4) {
        fprintf(stderr, "Usage: %s {#of_samples} {bucket_width} [block_size]\n", prog);
        return EXIT_FAILURE;
    }

    /* The kernel takes the atom count as an int, hence the INT_MAX cap. */
    long long samples = strtoll(argv[1], &end, 10);
    if (end == argv[1] || *end != '\0' || samples < 1 || samples > INT_MAX) {
        fprintf(stderr, "Error: #of_samples must be an integer in [1, %d], got '%s'\n",
                INT_MAX, argv[1]);
        return EXIT_FAILURE;
    }

    /* !(width > 0.0) also rejects NaN. */
    double width = strtod(argv[2], &end);
    if (end == argv[2] || *end != '\0' || !(width > 0.0)) {
        fprintf(stderr, "Error: bucket_width must be a number > 0, got '%s'\n", argv[2]);
        return EXIT_FAILURE;
    }

    /* Reject widths so small that the bucket count would overflow an int. */
    if (BOX_SIZE * 1.732 / width + 1.0 >= (double)INT_MAX) {
        fprintf(stderr, "Error: bucket_width '%s' is too small\n", argv[2]);
        return EXIT_FAILURE;
    }

    /* Validate the block size against what the current device supports. */
    int device = 0;
    int max_threads = 0;
    checkError(cudaGetDevice(&device), "Get Device");
    checkError(cudaDeviceGetAttribute(&max_threads, cudaDevAttrMaxThreadsPerBlock, device),
               "Query Max Threads Per Block");

    int block_size = (max_threads < 256) ? max_threads : 256;
    if (argc == 4) {
        long requested = strtol(argv[3], &end, 10);
        if (end == argv[3] || *end != '\0' || requested < 1 || requested > max_threads) {
            fprintf(stderr, "Error: block_size must be an integer in [1, %d], got '%s'\n",
                    max_threads, argv[3]);
            return EXIT_FAILURE;
        }
        block_size = (int)requested;
    }

    PDH_acnt = samples;
    PDH_res = width;

    /* NOTE(review): 1.732 is slightly below sqrt(3), so a distance very close
       to the full box diagonal could index one past the last bucket. */
    num_buckets = (int)(BOX_SIZE * 1.732 / PDH_res) + 1;

    size_t histogramSize = sizeof(bucket) * num_buckets;
    size_t atomSize = sizeof(atom) * PDH_acnt;

    /* calloc: the histogram is copied to the device and accumulated into,
       so every count must start at zero. */
    histogram = (bucket *)calloc(num_buckets, sizeof(bucket));
    atom_list = (atom *)malloc(atomSize);
    if (histogram == NULL || atom_list == NULL) {
        fprintf(stderr, "Error: out of host memory\n");
        free(histogram);
        free(atom_list);
        return EXIT_FAILURE;
    }

    srand(1);

    /* uniform distribution */
    for (long long i = 0; i < PDH_acnt; i++) {
        atom_list[i].x_pos = ((double)(rand()) / RAND_MAX) * BOX_SIZE;
        atom_list[i].y_pos = ((double)(rand()) / RAND_MAX) * BOX_SIZE;
        atom_list[i].z_pos = ((double)(rand()) / RAND_MAX) * BOX_SIZE;
    }

    /* Malloc Space on Device, copy to Device */
    bucket *d_histogram = NULL;
    atom *d_atom_list = NULL;

    checkError(cudaMalloc((void**) &d_histogram, histogramSize), "Malloc Histogram");
    checkError(cudaMalloc((void**) &d_atom_list, atomSize), "Malloc Atom List");
    checkError(cudaMemcpy(d_histogram, histogram, histogramSize, cudaMemcpyHostToDevice),
               "Copy Histogram to Device");
    checkError(cudaMemcpy(d_atom_list, atom_list, atomSize, cudaMemcpyHostToDevice),
               "Copy Atom_List to Device");

    /* start counting time */
    gettimeofday(&startTime, &Idunno);

    /* CUDA Kernel Call: one thread per atom, grid size rounded up */
    int grid_size = (int)((PDH_acnt + block_size - 1) / block_size);
    PDH_baseline<<<grid_size, block_size>>>(d_histogram, d_atom_list, PDH_res, (int)PDH_acnt);

    /* Launch-configuration errors show up here ... */
    checkError(cudaGetLastError(), "Checking Last Error, Kernel Launch");
    /* ... and faults during kernel execution at the next synchronisation. */
    checkError(cudaDeviceSynchronize(), "Kernel Execution");

    checkError(cudaMemcpy(histogram, d_histogram, histogramSize, cudaMemcpyDeviceToHost),
               "Copy Device Histogram to Host");

    /* check the total running time */
    report_running_time();

    /* print out the histogram */
    output_histogram(histogram);

    checkError(cudaFree(d_histogram), "Free Device Histogram");
    checkError(cudaFree(d_atom_list), "Free Device Atom_List");

    /* Free Memory */
    free(histogram);
    free(atom_list);

    /* Reset */
    checkError(cudaDeviceReset(), "Device Reset");

    return 0;
}

Step by Step Solution

There are 3 Steps involved in it

Step: 1

Get Instant Access to Expert-Tailored Solutions

See step-by-step solutions with expert insights and AI powered tools for academic success

Step: 2

Step: 3

Ace Your Homework with AI

Get the answers you need in no time with our AI-driven, step-by-step assistance

Get Started

Recommended Textbook for

Pro SQL Server Administration

Authors: Peter Carter

1st Edition

★★★★★

Provide an example of how the sort level Unspecified can be excluded from the Gender Dimension in a Pivot Table worksheet genderwaggap01.

Answered: 1 week ago

Previous Question Next Question