// Sums the first `n` ints of `array` and atomically adds the result to `*total`.
//
// Launch layout: 1D grid of 1D blocks; any grid size and any blockDim.x up to
// 1024 is supported (power of two NOT required). Only the x dimension is used
// for indexing, so blockDim.y and blockDim.z must be 1.
// Shared memory: 4 KiB static (one int slot per possible thread in the block).
//
// array - input values; at least `n` elements must be readable.
// total - accumulator; the kernel only ever adds to it, so the caller must
//         initialise it (typically to 0) before the launch.
// n     - number of elements to sum; n == 0 adds 0 to `*total`.
//
// The accumulation is done in `int`, so the usual int overflow caveats apply.
__global__ void sum_array(const int * array, int * total, unsigned int n) {
    // One slot per thread. Sized to 1024 (the per-block thread limit on
    // current CUDA devices) so blocks wider than 256 threads do not write
    // past the end of the array.
    __shared__ int partial_res[1024];

    // Index arithmetic is done in size_t: with 32-bit indices,
    // `input_idx += stride` could wrap around for large n / large grids and
    // re-enter the loop, counting elements twice.
    const size_t stride = (size_t)gridDim.x * blockDim.x;
    int partial_sum = 0;
    for (size_t input_idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
         input_idx < n;
         input_idx += stride) {
        partial_sum += array[input_idx];
    }
    partial_res[threadIdx.x] = partial_sum;
    __syncthreads();

    // reduction
    // Each round folds the upper part of the live range [0, active) onto the
    // lower part. `upper` is ceil(active / 2), so when `active` is odd the
    // middle slot is simply carried over to the next round instead of being
    // dropped; this keeps the result exact for any block size.
    unsigned int active = blockDim.x;
    while (active > 1) {
        const unsigned int upper = (active + 1) / 2;
        if (threadIdx.x < active / 2) {
            partial_res[threadIdx.x] += partial_res[threadIdx.x + upper];
        }
        // The loop condition is uniform across the block, so every thread
        // reaches this barrier the same number of times.
        __syncthreads();
        active = upper;
    }

    // One atomic per block publishes the block's partial sum.
    if (threadIdx.x == 0) {
        atomicAdd(total, partial_res[0]);
    }
}