Divergence Example¶

Here is the code for divergence.cu

/*
 * Code to demonstrate thread divergence
 *
 * compile with:
 *   nvcc -o divergence divergence.cu
 */

#include <stdio.h>

__global__ void kernel_1(int *a) {
    int tid = threadIdx.x;
    int cell = tid % 32;
    a[cell]++;
}

__global__ void kernel_2(int *a) {
    int tid = threadIdx.x;
    int cell = tid % 32;
    switch(cell) {
    case 0:
      a[0]++;
      break;
    case 1:
      a[1]++;
      break;
    case 2:
      a[2]++;
      break;
    case 3:
      a[3]++;
      break;
    case 4:
      a[4]++;
      break;
    case 5:
      a[5]++;
      break;
    case 6:
      a[6]++;
      break;
    case 7:
      a[7]++;
      break;
    default:
      a[cell]++;
    }
}

void check(cudaError_t retVal) {
  //takes return value of a CUDA function and checks if it was an error
  if(retVal != cudaSuccess) {
    fprintf(stderr, "ERROR: %s\n", cudaGetErrorString(retVal));
    exit(1);
  }
}

int main() {
  int a[32];   //array used for bucketing (on host)
  int *a_dev;  //GPU version

  //allocate the memory on the GPU
  check(cudaMalloc((void**)&a_dev, 32*sizeof(int)));

  //initialize a[i] to 0
  for (int i=0; i<32; i++)
    a[i] = 0;

  //allocate timers
  cudaEvent_t start;
  check(cudaEventCreate(&start));
  cudaEvent_t stop;
  check(cudaEventCreate(&stop));

  //start timer
  check(cudaEventRecord(start,0));

  //transfer array to the GPU
  check(cudaMemcpy(a_dev, a, 32*sizeof(int), cudaMemcpyHostToDevice));

  //call the kernel
  int threads = 512;
  int blocks = 512;
  kernel_2<<<blocks,threads>>>(a_dev);

  //transfer array back to the host
  check(cudaMemcpy(a, a_dev, 32*sizeof(int), cudaMemcpyDeviceToHost));

  //stop timer and print time
  check(cudaEventRecord(stop,0));
  check(cudaEventSynchronize(stop));
  float diff;
  check(cudaEventElapsedTime(&diff, start, stop));
  printf("time: %f ms\n", diff);

  /*
    //print bucket contents
    for(int i=0; i<32; i++)
    printf("%d ", a[i]);
    printf("\n");
  */

  //free memory and deallocate timers
  check(cudaFree(a_dev));
  check(cudaEventDestroy(start));
  check(cudaEventDestroy(stop));
}

Previous topic

Add Vectors Example

Next topic