37 #ifndef PCL_GPU_OCTREE_SCAN_BLOCK_HPP
38 #define PCL_GPU_OCTREE_SCAN_BLOCK_HPP
/** \brief In-place prefix sum over one 32-element segment of \a ptr.
  *
  * Every calling thread owns element ptr[idx]. Its position inside the segment
  * is lane = idx & 31. Five doubling steps (offsets 1, 2, 4, 8, 16) each add the
  * element \c offset slots to the left, so afterwards ptr[idx] holds the sum of
  * the segment's elements from lane 0 up to and including this lane.
  *
  * \tparam Kind  when equal to \c inclusive the returned sum includes the
  *               caller's own element; otherwise the sum of the elements
  *               strictly before it is returned (0 for lane 0).
  * \tparam T     element type; must support operator+ and conversion from 0.
  * \param  ptr   buffer that is scanned in place; regardless of \a Kind it
  *               holds the inclusive sums on return.
  * \param  idx   index of the caller's element in \a ptr (defaults to threadIdx.x).
  * \return the inclusive or exclusive prefix sum for this lane, selected by \a Kind.
  *
  * NOTE(review): there is no explicit barrier between the doubling steps. The
  * code depends on the lanes of a segment advancing together and on the
  * \c volatile qualifier to keep each step's loads and stores in memory. On
  * architectures with independent thread scheduling that assumption should be
  * re-validated (e.g. with compute-sanitizer racecheck) - TODO confirm.
  */
template <ScanKind Kind, class T>
__device__ __forceinline__ T
scan_warp (volatile T *ptr, const unsigned int idx = threadIdx.x)
{
  // Position of this element inside its 32-wide segment (0..31).
  const unsigned int lane = idx & 31;

  // Doubling steps: the guard keeps each read inside the caller's own segment.
  if (lane >=  1) ptr[idx] = ptr[idx -  1] + ptr[idx];
  if (lane >=  2) ptr[idx] = ptr[idx -  2] + ptr[idx];
  if (lane >=  4) ptr[idx] = ptr[idx -  4] + ptr[idx];
  if (lane >=  8) ptr[idx] = ptr[idx -  8] + ptr[idx];
  if (lane >= 16) ptr[idx] = ptr[idx - 16] + ptr[idx];

  // Kind is a compile-time constant, so only one of the two returns survives.
  if (Kind == inclusive)
    return ptr[idx];

  // Exclusive result: the left neighbour's inclusive sum, or 0 for the first lane.
  return (lane > 0) ? ptr[idx - 1] : 0;
}
/** \brief In-place prefix sum over the elements owned by one thread block.
  *
  * The scan is built from per-warp scans:
  *   1. every 32-element segment is scanned with scan_warp;
  *   2. the last lane of each segment publishes the segment total to ptr[warpid];
  *   3. the first segment scans those totals (inclusive);
  *   4. every later segment adds the running total of the segments before it;
  *   5. the final value is written back to ptr[idx] and returned.
  *
  * The function contains __syncthreads(), so it must be reached by every thread
  * of the block. Segment totals are stored in ptr[0..31], which limits the scan
  * to 32 segments (1024 elements).
  *
  * \tparam Kind  \c inclusive or exclusive scan, forwarded to scan_warp in step 1.
  * \tparam T     element type; must support operator+ and conversion from 0.
  * \param  ptr   buffer scanned in place; presumably block-shared memory, since
  *               it is exchanged between threads across __syncthreads() - TODO confirm.
  * \param  idx   index of the caller's element in \a ptr (defaults to threadIdx.x).
  * \return the block-wide prefix sum for element \a idx.
  */
template <ScanKind Kind, class T>
__device__ __forceinline__ T
scan_block (volatile T *ptr, const unsigned int idx = threadIdx.x)
{
  const unsigned int lane   = idx & 31;  // position inside the 32-wide segment
  const unsigned int warpid = idx >> 5;  // which segment this element belongs to

  // Step 1: scan inside each segment.
  T val = scan_warp<Kind> (ptr, idx);

  // After scan_warp, ptr[idx] holds this lane's inclusive sum; for lane 31 that
  // is the segment total. Capture it in a register before the barrier so that
  // step 2 never reads a slot another segment may be overwriting (ptr[31] when
  // 32 segments are in use).
  const T warp_total = ptr[idx];

  __syncthreads ();

  // Step 2: one writer per segment publishes its total.
  if (lane == 31)
    ptr[warpid] = warp_total;

  __syncthreads ();

  // Step 3: the first segment turns the totals into running totals.
  if (warpid == 0)
    scan_warp<inclusive> (ptr, idx);

  __syncthreads ();

  // Step 4: segments after the first add everything that precedes them.
  // The guard also prevents the unsigned index warpid - 1 from wrapping.
  if (warpid > 0)
    val = ptr[warpid - 1] + val;

  __syncthreads ();

  // Step 5: store the final value (steps 2-3 overwrote ptr[0..31]) and return it.
  ptr[idx] = val;

  __syncthreads ();

  return val;
}
__device__ __forceinline__ T scan_warp(volatile T *ptr, const unsigned int idx=threadIdx.x)
__device__ __forceinline__ T scan_block(volatile T *ptr, const unsigned int idx=threadIdx.x)