/*
 * Element-wise vector addition C = A + B on the GPU, using CUDA unified
 * (managed) memory so the same pointers are valid on host and device.
 *
 * Usage: <program> [n]
 *   n  number of elements per vector (default 1024, must be > 0)
 *
 * When compiled with -DCHECK the program also opens the file "tmp333" for
 * writing and dumps the result vector into it.
 *
 * NOTE(review): the original source reached review collapsed onto one line
 * with the <...> sequences stripped and the end of main() cut off. The
 * header names in the #include lines, the kernel launch configuration and
 * everything after the start of the CHECK dump loop were reconstructed --
 * confirm against the upstream file.
 */
#include <stdio.h>
#include <stdlib.h>

#define BLOCK_SIZE 512

/* Abort with file/line and the CUDA error string when a runtime call fails. */
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(err_));                             \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)

/*
 * kernel routine starts with keyword __global__
 *
 * Computes C[i] = A[i] + B[i] for every i in [0, n).
 * Expects a 1D grid of 1D blocks with at least n threads in total; threads
 * whose global index is >= n do nothing, so the grid may overshoot n.
 * Uses no shared memory.
 */
__global__ void vecadd(const float* A, const float* B, float* C, int n)
{
    /* threadIdx/blockIdx/blockDim are CUDA built-in variables.
       Widen before multiplying so the flat index cannot wrap for large n. */
    size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x;

    if (i < (size_t)n)
        C[i] = A[i] + B[i];
}

int main(int argc, char* argv[])
{
    float *unified_A, *unified_B, *unified_C;
    int n;

    if (argc == 1)
        n = 1024;
    else
        n = atoi(argv[1]);

    /* atoi() yields 0 for non-numeric input; a non-positive size would make
       both the allocations and the launch configuration invalid. */
    if (n <= 0) {
        fprintf(stderr, "usage: %s [n]   (n must be a positive integer)\n",
                argv[0]);
        return EXIT_FAILURE;
    }

    size_t bytes = (size_t)n * sizeof(float);

    /* 1. allocate unified memory */
    CUDA_CHECK(cudaMallocManaged(&unified_A, bytes));
    CUDA_CHECK(cudaMallocManaged(&unified_B, bytes));
    CUDA_CHECK(cudaMallocManaged(&unified_C, bytes));

    /* initialize array A and B */
    for (int i = 0; i < n; ++i) {
        unified_A[i] = 1.0f;
        unified_B[i] = 1.0f;
    }

    /* 2. call kernel routine to execute on GPU */
    /* launch 1 thread per vector-element, BLOCK_SIZE threads per block;
       round the block count up so a trailing partial block is covered
       (written without n + BLOCK_SIZE - 1 to avoid int overflow). */
    /* NOTE(review): launch configuration was stripped from the original
       ("vecadd<<>>"); reconstructed here -- confirm. */
    int blocks = n / BLOCK_SIZE + (n % BLOCK_SIZE != 0 ? 1 : 0);
    vecadd<<<blocks, BLOCK_SIZE>>>(unified_A, unified_B, unified_C, n);
    CUDA_CHECK(cudaGetLastError());      /* invalid launch configuration */

    /* The launch is asynchronous: wait for the kernel before the host
       touches managed memory again, and surface any execution error. */
    CUDA_CHECK(cudaDeviceSynchronize());

#ifdef CHECK
    {
        FILE* fd;
        fd = fopen("tmp333", "w");
        if (fd == NULL) {
            perror("tmp333");
            return EXIT_FAILURE;
        }
        /* NOTE(review): the original text is truncated at "for (i=0;i";
           the loop bound, the output format and the fclose() below are
           reconstructed -- confirm against the upstream file. */
        for (int i = 0; i < n; i++)
            fprintf(fd, "%f\n", unified_C[i]);
        fclose(fd);
    }
#endif

    /* NOTE(review): cleanup and return are not visible in the truncated
       original; added so the program releases what it allocated. */
    CUDA_CHECK(cudaFree(unified_A));
    CUDA_CHECK(cudaFree(unified_B));
    CUDA_CHECK(cudaFree(unified_C));

    return 0;
}