/* kernel routine starts with keyword __global__ */
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>

#define BLOCK_SIZE 512

/*
 * vecadd: element-wise vector addition, C[i] = A[i] + B[i] for 0 <= i < n.
 *
 * Launch layout: 1-D grid of 1-D blocks, one thread per element. The grid may
 * be rounded up past n; threads whose flat index is >= n do nothing.
 *
 *   A, B  input vectors, readable from the device, at least n elements each
 *   C     output vector, writable from the device, at least n elements
 *   n     number of elements to process. The default (INT_MAX) keeps older
 *         3-argument launches compiling with their previous, effectively
 *         unguarded behaviour; such callers must launch exactly one thread
 *         per element. New callers should always pass n.
 */
__global__ void vecadd(const float* A, const float* B, float* C, int n = INT_MAX)
{
  /* Widen before multiplying so the flat index cannot wrap in 32 bits. */
  const long long i = (long long) blockIdx.x * blockDim.x + threadIdx.x;  // blockIdx/blockDim/threadIdx are CUDA built-ins
  if (i < n) {
    C[i] = A[i] + B[i];
  }
}

/*
 * Report a failed CUDA runtime call on stderr.
 * Returns 1 when err is cudaSuccess, 0 otherwise.
 */
static int cuda_ok(cudaError_t err, const char *what)
{
  if (err == cudaSuccess) {
    return 1;
  }
  fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(err));
  return 0;
}

/*
 * Usage: prog [n]
 *
 * Adds two length-n vectors of ones on the GPU using unified (managed)
 * memory; n defaults to 1024. When compiled with -DCHECK the result vector
 * is written to the file "tmp333", one value per line.
 *
 * Returns EXIT_SUCCESS on success, EXIT_FAILURE on a bad argument, a CUDA
 * error, or a file error.
 */
int main(int argc, char * argv[])
{
  /* Everything is declared up front so that `goto cleanup` never jumps
     over an initialisation (ill-formed in C++). */
  float *unified_A = NULL, *unified_B = NULL, *unified_C = NULL;
  int n = 1024;
  int blocks = 0;
  size_t bytes = 0;
  int exit_code = EXIT_FAILURE;

  if (argc > 1) {
    char *end = NULL;
    const long requested = strtol(argv[1], &end, 10);
    /* Reject non-numeric input, trailing junk, and lengths that are
       non-positive or do not fit the kernel's int parameter. */
    if (end == argv[1] || *end != '\0' || requested < 1 || requested > INT_MAX) {
      fprintf(stderr, "usage: %s [n]   (n: vector length, 1..%d)\n", argv[0], INT_MAX);
      return EXIT_FAILURE;
    }
    n = (int) requested;
  }
  bytes = (size_t) n * sizeof(float);

  /* 1. allocate unified memory */
  if (!cuda_ok(cudaMallocManaged(&unified_A, bytes), "cudaMallocManaged(A)") ||
      !cuda_ok(cudaMallocManaged(&unified_B, bytes), "cudaMallocManaged(B)") ||
      !cuda_ok(cudaMallocManaged(&unified_C, bytes), "cudaMallocManaged(C)")) {
    goto cleanup;
  }

  /* initialize array A and B */
  for (int i = 0; i < n; ++i) {
    unified_A[i] = 1.0f;
    unified_B[i] = 1.0f;
  }

  /* 2. call kernel routine to execute on GPU */
  /* launch 1 thread per vector-element, BLOCK_SIZE threads per block;
     the block count is rounded up so a partial last block is covered
     (written as (n-1)/B+1 so it cannot overflow for n near INT_MAX) */
  blocks = (n - 1) / BLOCK_SIZE + 1;
  vecadd<<<blocks, BLOCK_SIZE>>>( unified_A, unified_B, unified_C, n );
  if (!cuda_ok(cudaGetLastError(), "vecadd launch")) {
    goto cleanup;
  }
  /* The launch is asynchronous: wait for completion before the host reads
     unified_C, and pick up any error raised while the kernel ran. */
  if (!cuda_ok(cudaDeviceSynchronize(), "vecadd execution")) {
    goto cleanup;
  }

#ifdef CHECK
  {
    FILE *fd = fopen("tmp333", "w");
    if (fd == NULL) {
      perror("tmp333");
      goto cleanup;
    }
    for (int i = 0; i < n; i++) {
      fprintf(fd, "%f\n", unified_C[i]);
    }
    if (fclose(fd) != 0) {
      perror("tmp333");
      goto cleanup;
    }
  }
#endif

  exit_code = EXIT_SUCCESS;

cleanup:
  /* free unified memory; cudaFree(NULL) is a harmless no-op */
  cudaFree(unified_A);
  cudaFree(unified_B);
  cudaFree(unified_C);

  return exit_code;
}