37 #ifndef PCL_GPU_DEVICE_REDUCE_HPP_
38 #define PCL_GPU_DEVICE_REDUCE_HPP_
/** \brief Block-wide, in-place tree reduction of the first CTA_SIZE elements of \a data.
  *
  * Each step folds the upper half of the still-active range into the lower half with
  * \a op. After the last step data[0] holds the combined value; the remaining slots are
  * left holding intermediate partial results.
  *
  * All threads of the block have to call this function together: steps with a stride of
  * 64 or more end in __syncthreads(), the narrower steps in __syncwarp(). The narrower
  * steps no longer depend on implicit warp lock-step execution, which is not guaranteed
  * on devices with independent thread scheduling. __syncwarp() requires CUDA 9 or newer.
  *
  * \tparam CTA_SIZE       number of elements to reduce; the halving steps only cover the
  *                        whole range for power-of-two sizes up to 1024
  * \tparam T              element type
  * \tparam BinaryFunction callable as op(T, T) whose result is assignable to T
  * \param[in,out] data buffer holding at least CTA_SIZE elements.
  *                     NOTE(review): presumably block-shared memory whose contents were
  *                     made visible (e.g. by __syncthreads()) before the call - confirm
  *                     at the call sites.
  * \param[in] op   binary combiner applied as op(accumulated, other)
  * \param[in] tid  index of the calling thread inside the reduction (defaults to
  *                 threadIdx.x). NOTE(review): the threads with tid < 32 are assumed to
  *                 form whole warps, otherwise __syncwarp() is not reached by every lane.
  */
template <unsigned int CTA_SIZE, typename T, typename BinaryFunction>
__device__ __forceinline__ void
reduce_block(volatile T* data, BinaryFunction op, unsigned int tid = threadIdx.x)
{
  // Running partial result of this thread; mirrored into data[tid] after every step.
  T val = data[tid];

  // Strides that span several warps: a block-wide barrier closes each step so the next
  // step sees the values written by other warps.
  if (CTA_SIZE >= 1024) { if (tid < 512) { data[tid] = val = op(val, data[tid + 512]); } __syncthreads(); }
  if (CTA_SIZE >=  512) { if (tid < 256) { data[tid] = val = op(val, data[tid + 256]); } __syncthreads(); }
  if (CTA_SIZE >=  256) { if (tid < 128) { data[tid] = val = op(val, data[tid + 128]); } __syncthreads(); }
  if (CTA_SIZE >=  128) { if (tid <  64) { data[tid] = val = op(val, data[tid +  64]); } __syncthreads(); }

  // From here on only 32 threads take part, so a warp-level barrier is sufficient.
  if (tid < 32)
  {
    // Within one step the slots read ([stride, 2 * stride)) and the slots written
    // ([0, stride)) are disjoint, so a single barrier after the write orders it against
    // the read of the next step. The per-step guard keeps idle lanes from reading past
    // the reduced range and from overwriting slots that are still needed.
    if (CTA_SIZE >= 64) {                 data[tid] = val = op(val, data[tid + 32]);   __syncwarp(); }
    if (CTA_SIZE >= 32) { if (tid < 16) { data[tid] = val = op(val, data[tid + 16]); } __syncwarp(); }
    if (CTA_SIZE >= 16) { if (tid <  8) { data[tid] = val = op(val, data[tid +  8]); } __syncwarp(); }
    if (CTA_SIZE >=  8) { if (tid <  4) { data[tid] = val = op(val, data[tid +  4]); } __syncwarp(); }
    if (CTA_SIZE >=  4) { if (tid <  2) { data[tid] = val = op(val, data[tid +  2]); } __syncwarp(); }
    // Last step: only thread 0 writes data[0]. No barrier follows, so other threads must
    // synchronize before reading the result.
    if (CTA_SIZE >=  2) { if (tid <  1) { data[tid] = val = op(val, data[tid +  1]); } }
  }
}