|  | __global__ void sum_array(const int * array, int * total, unsigned int n) { | 
|  | unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x; | 
|  | unsigned int stride = gridDim.x * blockDim.x; | 
|  | unsigned int input_idx = idx; | 
|  |  | 
|  | __shared__ int partial_res[256]; | 
|  |  | 
|  | int partial_sum = 0; | 
|  | while (input_idx < n) { | 
|  | partial_sum += array[input_idx]; | 
|  | input_idx += stride; | 
|  | } | 
|  |  | 
|  | partial_res[threadIdx.x] = partial_sum; | 
|  |  | 
|  | __syncthreads(); | 
|  |  | 
|  | // reduction | 
|  | unsigned int i = blockDim.x / 2; | 
|  | while (i != 0) { | 
|  | if (threadIdx.x < i) { | 
|  | partial_res[threadIdx.x] += partial_res[threadIdx.x + i]; | 
|  | } | 
|  | __syncthreads(); | 
|  | i /= 2; | 
|  | } | 
|  |  | 
|  | if (threadIdx.x == 0) { | 
|  | atomicAdd(total, partial_res[0]); | 
|  | } | 
|  | } |