#ifndef PCL_DEVICE_UTILS_BLOCK_HPP_
#define PCL_DEVICE_UTILS_BLOCK_HPP_

namespace pcl
{
  namespace device
  {
    // Block-wide helpers: every routine below must be called by all
    // threads of the CUDA block together.
    struct Block
    {
      // Index of the current block within the grid (x dimension only).
      static __device__ __forceinline__ unsigned int id()
      {
        return blockIdx.x;
      }
      // Total number of threads in the block.
      static __device__ __forceinline__ unsigned int stride()
      {
        return blockDim.x * blockDim.y * blockDim.z;
      }
      // Barrier across the whole block.
      static __device__ __forceinline__ void sync()
      {
        __syncthreads();
      }

      // Linear index of the calling thread within its (possibly 3D) block.
      static __device__ __forceinline__ int flattenedThreadId()
      {
        return threadIdx.z * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x;
      }
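      // Worked example (illustration only): with blockDim = (4, 2, 2) and
      // threadIdx = (1, 1, 1), flattenedThreadId() = 1*4*2 + 1*4 + 1 = 13.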
      // Cooperatively fill [beg, end): each thread writes every stride()-th
      // element, starting at its flattened id.
      template<typename It, typename T>
      static __device__ __forceinline__ void fill(It beg, It end, const T& value)
      {
        int STRIDE = stride();
        It t = beg + flattenedThreadId();

        for(; t < end; t += STRIDE)
          *t = value;
      }
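      // Usage sketch (illustration only; the kernel name and buffer size
      // are hypothetical): zero a shared-memory scratch buffer before use.
      //
      //   __global__ void example_fill_kernel()
      //   {
      //     __shared__ float scratch[256];
      //     Block::fill(scratch, scratch + 256, 0.f);
      //     Block::sync(); // make the writes visible to all threads
      //   }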
      // Cooperatively write an increasing sequence ("yota" as in std::iota):
      // *(beg + i) receives value + i.
      template<typename OutIt, typename T>
      static __device__ __forceinline__ void yota(OutIt beg, OutIt end, T value)
      {
        int STRIDE = stride();
        int tid = flattenedThreadId();
        value += tid;

        for(OutIt t = beg + tid; t < end; t += STRIDE, value += STRIDE)
          *t = value;
      }
      // Cooperatively copy [beg, end) to out.
      template<typename InIt, typename OutIt>
      static __device__ __forceinline__ void copy(InIt beg, InIt end, OutIt out)
      {
        int STRIDE = stride();
        InIt t = beg + flattenedThreadId();
        OutIt o = out + (t - beg);

        for(; t < end; t += STRIDE, o += STRIDE)
          *o = *t;
      }
      // Cooperatively apply the unary functor op to [beg, end), writing to out.
      template<typename InIt, typename OutIt, class UnOp>
      static __device__ __forceinline__ void transform(InIt beg, InIt end, OutIt out, UnOp op)
      {
        int STRIDE = stride();
        InIt t = beg + flattenedThreadId();
        OutIt o = out + (t - beg);

        for(; t < end; t += STRIDE, o += STRIDE)
          *o = op(*t);
      }
      // Cooperatively combine two input ranges with the binary functor op,
      // writing to out.
      template<typename InIt1, typename InIt2, typename OutIt, class BinOp>
      static __device__ __forceinline__ void transform(InIt1 beg1, InIt1 end1, InIt2 beg2, OutIt out, BinOp op)
      {
        int STRIDE = stride();
        InIt1 t1 = beg1 + flattenedThreadId();
        InIt2 t2 = beg2 + flattenedThreadId();
        OutIt o = out + (t1 - beg1);

        for(; t1 < end1; t1 += STRIDE, t2 += STRIDE, o += STRIDE)
          *o = op(*t1, *t2);
      }
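      // Usage sketch (illustration only; AddOp and the sizes are
      // hypothetical): element-wise sum of two shared-memory arrays.
      //
      //   struct AddOp {
      //     __device__ float operator()(float a, float b) const { return a + b; }
      //   };
      //
      //   __shared__ float a[128], b[128], c[128];
      //   Block::transform(a, a + 128, b, c, AddOp());
      //   Block::sync();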
      // Block-wide tree reduction over buffer[0..CTA_SIZE); the result ends
      // up in buffer[0]. CTA_SIZE must be a power of two matching the block
      // size. The final warp runs barrier-free, relying on warp-synchronous
      // execution over a volatile buffer (safe on pre-Volta hardware only).
      template<int CTA_SIZE, typename T, class BinOp>
      static __device__ __forceinline__ void reduce(volatile T* buffer, BinOp op)
      {
        int tid = flattenedThreadId();
        T val = buffer[tid];

        if (CTA_SIZE >= 1024) { if (tid < 512) buffer[tid] = val = op(val, buffer[tid + 512]); __syncthreads(); }
        if (CTA_SIZE >=  512) { if (tid < 256) buffer[tid] = val = op(val, buffer[tid + 256]); __syncthreads(); }
        if (CTA_SIZE >=  256) { if (tid < 128) buffer[tid] = val = op(val, buffer[tid + 128]); __syncthreads(); }
        if (CTA_SIZE >=  128) { if (tid <  64) buffer[tid] = val = op(val, buffer[tid +  64]); __syncthreads(); }

        if (tid < 32)
        {
          if (CTA_SIZE >= 64) { buffer[tid] = val = op(val, buffer[tid + 32]); }
          if (CTA_SIZE >= 32) { buffer[tid] = val = op(val, buffer[tid + 16]); }
          if (CTA_SIZE >= 16) { buffer[tid] = val = op(val, buffer[tid +  8]); }
          if (CTA_SIZE >=  8) { buffer[tid] = val = op(val, buffer[tid +  4]); }
          if (CTA_SIZE >=  4) { buffer[tid] = val = op(val, buffer[tid +  2]); }
          if (CTA_SIZE >=  2) { buffer[tid] = val = op(val, buffer[tid +  1]); }
        }
      }
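      // Usage sketch (illustration only; the kernel and SumOp are
      // hypothetical): block-wide sum over 256 values.
      //
      //   struct SumOp {
      //     __device__ float operator()(float a, float b) const { return a + b; }
      //   };
      //
      //   __global__ void example_reduce_kernel(const float* in, float* out)
      //   {
      //     __shared__ float smem[256];
      //     int tid = Block::flattenedThreadId();
      //     smem[tid] = in[blockIdx.x * 256 + tid];
      //     Block::sync();
      //     Block::reduce<256>(smem, SumOp());
      //     if (tid == 0)
      //       out[blockIdx.x] = smem[0];
      //   }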
      // As above, but seeds buffer[tid] with init and returns the reduced
      // value to every thread of the block.
      template<int CTA_SIZE, typename T, class BinOp>
      static __device__ __forceinline__ T reduce(volatile T* buffer, T init, BinOp op)
      {
        int tid = flattenedThreadId();
        T val = buffer[tid] = init;
        __syncthreads();

        if (CTA_SIZE >= 1024) { if (tid < 512) buffer[tid] = val = op(val, buffer[tid + 512]); __syncthreads(); }
        if (CTA_SIZE >=  512) { if (tid < 256) buffer[tid] = val = op(val, buffer[tid + 256]); __syncthreads(); }
        if (CTA_SIZE >=  256) { if (tid < 128) buffer[tid] = val = op(val, buffer[tid + 128]); __syncthreads(); }
        if (CTA_SIZE >=  128) { if (tid <  64) buffer[tid] = val = op(val, buffer[tid +  64]); __syncthreads(); }

        if (tid < 32)
        {
          if (CTA_SIZE >= 64) { buffer[tid] = val = op(val, buffer[tid + 32]); }
          if (CTA_SIZE >= 32) { buffer[tid] = val = op(val, buffer[tid + 16]); }
          if (CTA_SIZE >= 16) { buffer[tid] = val = op(val, buffer[tid +  8]); }
          if (CTA_SIZE >=  8) { buffer[tid] = val = op(val, buffer[tid +  4]); }
          if (CTA_SIZE >=  4) { buffer[tid] = val = op(val, buffer[tid +  2]); }
          if (CTA_SIZE >=  2) { buffer[tid] = val = op(val, buffer[tid +  1]); }
        }
        __syncthreads();

        return buffer[0];
      }
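      // Usage sketch (illustration only; SumOp as in the sketch above):
      // each thread contributes a private partial value and every thread
      // receives the block-wide total.
      //
      //   __shared__ float smem[256];
      //   float partial = 1.f; // per-thread contribution
      //   float total = Block::reduce<256>(smem, partial, SumOp());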
      // Reduction over the first n elements of data for arbitrary n (no
      // power-of-two requirement); the result is left in data[0].
      template<typename T, class BinOp>
      static __device__ __forceinline__ void reduce_n(T* data, unsigned int n, BinOp op)
      {
        int ftid = flattenedThreadId();
        int sft = stride();

        // Fold the tail first so that at most stride() elements remain.
        if (sft < n)
        {
          for (unsigned int i = sft + ftid; i < n; i += sft)
            data[ftid] = op(data[ftid], data[i]);

          __syncthreads();
          n = sft;
        }

        // Halve the active range until a single element is left.
        while (n > 1)
        {
          unsigned int half = n / 2;

          if (ftid < half)
            data[ftid] = op(data[ftid], data[n - ftid - 1]);

          __syncthreads();
          n = n - half;
        }
      }
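      // Usage sketch (illustration only; SumOp as above): reducing 200
      // values with a 256-thread block; n need not be a power of two or
      // equal to the block size.
      //
      //   __shared__ float smem[200];
      //   // ... fill smem cooperatively ...
      //   Block::reduce_n(smem, 200u, SumOp());
      //   Block::sync();
      //   // smem[0] now holds the result.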
    };
  }
}

#endif /* PCL_DEVICE_UTILS_BLOCK_HPP_ */