37 #ifndef PCL_DEVICE_UTILS_WARP_HPP_
38 #define PCL_DEVICE_UTILS_WARP_HPP_
54 static __device__ __forceinline__
unsigned int laneId()
57 asm(
"mov.u32 %0, %laneid;" :
"=r"(ret) );
64 asm(
"mov.u32 %0, %lanemask_le;" :
"=r"(ret) );
71 asm(
"mov.u32 %0, %lanemask_lt;" :
"=r"(ret) );
74 static __device__ __forceinline__
unsigned int id()
76 int tid = threadIdx.z * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x;
90 template<
typename It,
typename T>
91 static __device__ __forceinline__
void fill(It beg, It end,
const T& value)
97 template<
typename InIt,
typename OutIt>
98 static __device__ __forceinline__ OutIt
copy(InIt beg, InIt end, OutIt out)
100 unsigned int lane =
laneId();
102 OutIt o = out + lane;
109 template<
typename InIt,
typename OutIt,
class UnOp>
110 static __device__ __forceinline__ OutIt
transform(InIt beg, InIt end, OutIt out, UnOp op)
112 unsigned int lane =
laneId();
114 OutIt o = out + lane;
121 template<
typename InIt1,
typename InIt2,
typename OutIt,
class BinOp>
122 static __device__ __forceinline__ OutIt
transform(InIt1 beg1, InIt1 end1, InIt2 beg2, OutIt out, BinOp op)
124 unsigned int lane =
laneId();
125 InIt1 t1 = beg1 + lane;
126 InIt2 t2 = beg2 + lane;
127 OutIt o = out + lane;
134 template<
typename OutIt,
typename T>
135 static __device__ __forceinline__
void yota(OutIt beg, OutIt end, T value)
137 unsigned int lane =
laneId();
140 for(OutIt t = beg + lane; t < end; t +=
STRIDE, value +=
STRIDE)
144 template<
typename T,
class BinOp>
145 static __device__ __forceinline__
void reduce(
volatile T* buffer, BinOp op)
147 unsigned int lane =
laneId();
148 T val = buffer[lane];
152 buffer[lane] = val = op(val, buffer[lane + 16]);
153 buffer[lane] = val = op(val, buffer[lane + 8]);
154 buffer[lane] = val = op(val, buffer[lane + 4]);
155 buffer[lane] = val = op(val, buffer[lane + 2]);
156 buffer[lane] = val = op(val, buffer[lane + 1]);
160 template<
typename T,
class BinOp>
161 static __device__ __forceinline__ T
reduce(
volatile T* buffer, T init, BinOp op)
163 unsigned int lane =
laneId();
164 T val = buffer[lane] = init;
168 buffer[lane] = val = op(val, buffer[lane + 16]);
169 buffer[lane] = val = op(val, buffer[lane + 8]);
170 buffer[lane] = val = op(val, buffer[lane + 4]);
171 buffer[lane] = val = op(val, buffer[lane + 2]);
172 buffer[lane] = val = op(val, buffer[lane + 1]);
static __device__ __forceinline__ int laneMaskLe()
static __device__ __forceinline__ int binaryInclScan(int ballot_mask)
static __device__ __forceinline__ void reduce(volatile T *buffer, BinOp op)
static __device__ __forceinline__ unsigned int laneId()
Returns the warp lane ID of the calling thread.
static __device__ __forceinline__ OutIt transform(InIt1 beg1, InIt1 end1, InIt2 beg2, OutIt out, BinOp op)
static __device__ __forceinline__ int binaryExclScan(int ballot_mask)
static __device__ __forceinline__ int laneMaskLt()
static __device__ __forceinline__ OutIt copy(InIt beg, InIt end, OutIt out)
static __device__ __forceinline__ void fill(It beg, It end, const T &value)
static __device__ __forceinline__ T reduce(volatile T *buffer, T init, BinOp op)
static __device__ __forceinline__ unsigned int id()
static __device__ __forceinline__ void yota(OutIt beg, OutIt end, T value)
static __device__ __forceinline__ OutIt transform(InIt beg, InIt end, OutIt out, UnOp op)