Point Cloud Library (PCL)  1.14.1-dev
utils.hpp
/*
 * Software License Agreement (BSD License)
 *
 * Point Cloud Library (PCL) - www.pointclouds.org
 * Copyright (c) 2011, Willow Garage, Inc.
 *
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above
 *    copyright notice, this list of conditions and the following
 *    disclaimer in the documentation and/or other materials provided
 *    with the distribution.
 *  * Neither the name of Willow Garage, Inc. nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 */

#ifndef PCL_GPU_KINFU_CUDA_UTILS_HPP_
#define PCL_GPU_KINFU_CUDA_UTILS_HPP_

#include <pcl/common/utils.h> // pcl::utils::ignore

#include <limits>

#include <cuda.h>

namespace pcl
{
  namespace device
  {
    namespace kinfuLS
    {
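      // Small device-side helpers: a generic swap plus float3 vector operations
      // (dot, cross, element-wise arithmetic, norm, normalization).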
      template <class T>
      __device__ __host__ __forceinline__ void swap ( T& a, T& b )
      {
        T c(a); a = b; b = c;
      }

      __device__ __forceinline__ float
      dot(const float3& v1, const float3& v2)
      {
        return v1.x * v2.x + v1.y * v2.y + v1.z * v2.z;
      }

      __device__ __forceinline__ float3&
      operator+=(float3& vec, const float& v)
      {
        vec.x += v; vec.y += v; vec.z += v; return vec;
      }

      __device__ __forceinline__ float3
      operator+(const float3& v1, const float3& v2)
      {
        return make_float3(v1.x + v2.x, v1.y + v2.y, v1.z + v2.z);
      }

      __device__ __forceinline__ float3&
      operator*=(float3& vec, const float& v)
      {
        vec.x *= v; vec.y *= v; vec.z *= v; return vec;
      }

      __device__ __forceinline__ float3
      operator-(const float3& v1, const float3& v2)
      {
        return make_float3(v1.x - v2.x, v1.y - v2.y, v1.z - v2.z);
      }

      __device__ __forceinline__ float3
      operator*(const float3& v1, const float& v)
      {
        return make_float3(v1.x * v, v1.y * v, v1.z * v);
      }

      __device__ __forceinline__ float
      norm(const float3& v)
      {
        return sqrt(dot(v, v));
      }

      __device__ __forceinline__ float3
      normalized(const float3& v)
      {
        return v * rsqrt(dot(v, v));
      }

      __device__ __host__ __forceinline__ float3
      cross(const float3& v1, const float3& v2)
      {
        return make_float3(v1.y * v2.z - v1.z * v2.y, v1.z * v2.x - v1.x * v2.z, v1.x * v2.y - v1.y * v2.x);
      }

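      // Computes the roots of x^2 - b*x + c = 0: roots.x is set to 0 and the two
      // quadratic roots are stored in roots.y and roots.z. A (numerically) negative
      // discriminant is clamped to zero; it should not occur for the characteristic
      // polynomial of a symmetric matrix.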
      __device__ __forceinline__ void computeRoots2(const float& b, const float& c, float3& roots)
      {
        roots.x = 0.f;
        float d = b * b - 4.f * c;
        if (d < 0.f) // no real roots!!!! THIS SHOULD NOT HAPPEN!
          d = 0.f;

        float sd = sqrtf(d);

        roots.z = 0.5f * (b + sd);
        roots.y = 0.5f * (b - sd);
      }

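      // Computes the three real roots of x^3 - c2*x^2 + c1*x - c0 = 0 (the
      // characteristic polynomial of a symmetric 3x3 matrix) in closed form using
      // the trigonometric method. The roots are returned in increasing order:
      // roots.x <= roots.y <= roots.z.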
      __device__ __forceinline__ void
      computeRoots3(float c0, float c1, float c2, float3& roots)
      {
        if (std::abs(c0) < std::numeric_limits<float>::epsilon()) // one root is 0 -> quadratic equation
        {
          computeRoots2 (c2, c1, roots);
        }
        else
        {
          const float s_inv3 = 1.f / 3.f;
          const float s_sqrt3 = sqrtf(3.f);
          // Construct the parameters used in classifying the roots of the equation
          // and in solving the equation for the roots in closed form.
          float c2_over_3 = c2 * s_inv3;
          float a_over_3 = (c1 - c2 * c2_over_3) * s_inv3;
          if (a_over_3 > 0.f)
            a_over_3 = 0.f;

          float half_b = 0.5f * (c0 + c2_over_3 * (2.f * c2_over_3 * c2_over_3 - c1));

          float q = half_b * half_b + a_over_3 * a_over_3 * a_over_3;
          if (q > 0.f)
            q = 0.f;

          // Compute the eigenvalues by solving for the roots of the polynomial.
          float rho = sqrtf(-a_over_3);
          float theta = std::atan2 (sqrtf (-q), half_b) * s_inv3;
          float cos_theta = __cosf (theta);
          float sin_theta = __sinf (theta);
          roots.x = c2_over_3 + 2.f * rho * cos_theta;
          roots.y = c2_over_3 - rho * (cos_theta + s_sqrt3 * sin_theta);
          roots.z = c2_over_3 - rho * (cos_theta - s_sqrt3 * sin_theta);

          // Sort in increasing order.
          if (roots.x >= roots.y)
            swap(roots.x, roots.y);

          if (roots.y >= roots.z)
          {
            swap(roots.y, roots.z);

            if (roots.x >= roots.y)
              swap (roots.x, roots.y);
          }
          if (roots.x <= 0) // eigenvalues of a symmetric positive semi-definite matrix cannot be negative! Set it to 0.
            computeRoots2 (c2, c1, roots);
        }
      }

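      // Closed-form eigendecomposition of a 3x3 symmetric matrix, given as the six
      // packed coefficients of its upper triangle. compute() returns the eigenvalues
      // sorted in increasing order in evals and the matching eigenvectors in the
      // rows of evecs (evecs[0] pairs with evals.x, the smallest eigenvalue).
      //
      // A minimal usage sketch (the covariance values are hypothetical; this is the
      // typical pattern for estimating a surface normal from a local covariance):
      //
      //   float cov[6] = {cxx, cxy, cxz, cyy, cyz, czz}; // m00 m01 m02 m11 m12 m22
      //   Eigen33 eigen33 (cov);
      //   Eigen33::Mat33 tmp, vec_tmp, evecs;
      //   float3 evals;
      //   eigen33.compute (tmp, vec_tmp, evecs, evals);
      //   float3 normal = normalized (evecs[0]); // eigenvector of the smallest eigenvalue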
      struct Eigen33
      {
      public:
        template<int Rows>
        struct MiniMat
        {
          float3 data[Rows];
          __device__ __host__ __forceinline__ float3& operator[](int i) { return data[i]; }
          __device__ __host__ __forceinline__ const float3& operator[](int i) const { return data[i]; }
        };
        using Mat33 = MiniMat<3>;
        using Mat43 = MiniMat<4>;


        static __forceinline__ __device__ float3
        unitOrthogonal (const float3& src)
        {
          float3 perp;
          /* Let us compute the crossed product of *this with a vector
           * that is not too close to being colinear to *this.
           */

          /* unless the x and y coords are both close to zero, we can
           * simply take ( -y, x, 0 ) and normalize it.
           */
          if (!isMuchSmallerThan(src.x, src.z) || !isMuchSmallerThan(src.y, src.z))
          {
            float invnm = rsqrtf(src.x * src.x + src.y * src.y);
            perp.x = -src.y * invnm;
            perp.y =  src.x * invnm;
            perp.z = 0.0f;
          }
          /* if both x and y are close to zero, then the vector is close
           * to the z-axis, so it's far from colinear to the x-axis for instance.
           * So we take the crossed product with (1,0,0) and normalize it.
           */
          else
          {
            float invnm = rsqrtf(src.z * src.z + src.y * src.y);
            perp.x = 0.0f;
            perp.y = -src.z * invnm;
            perp.z =  src.y * invnm;
          }

          return perp;
        }

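        // mat_pkg_arg must point to the six packed coefficients of the symmetric
        // input matrix: m00, m01, m02, m11, m12, m22 (upper triangle, row-major).
        // Note that compute() rescales these values in place.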
        __device__ __forceinline__
        Eigen33(volatile float* mat_pkg_arg) : mat_pkg(mat_pkg_arg) {}

        __device__ __forceinline__ void
        compute(Mat33& tmp, Mat33& vec_tmp, Mat33& evecs, float3& evals)
        {
          // Scale the matrix so its entries are in [-1,1]. The scaling is applied
          // only when at least one matrix entry has magnitude larger than 1.

          float max01 = fmaxf( std::abs(mat_pkg[0]), std::abs(mat_pkg[1]) );
          float max23 = fmaxf( std::abs(mat_pkg[2]), std::abs(mat_pkg[3]) );
          float max45 = fmaxf( std::abs(mat_pkg[4]), std::abs(mat_pkg[5]) );
          float m0123 = fmaxf( max01, max23 );
          float scale = fmaxf( max45, m0123 );

          if (scale <= std::numeric_limits<float>::min())
            scale = 1.f;

          mat_pkg[0] /= scale;
          mat_pkg[1] /= scale;
          mat_pkg[2] /= scale;
          mat_pkg[3] /= scale;
          mat_pkg[4] /= scale;
          mat_pkg[5] /= scale;

          // The characteristic equation is x^3 - c2*x^2 + c1*x - c0 = 0. The
          // eigenvalues are the roots to this equation, all guaranteed to be
          // real-valued, because the matrix is symmetric.
          float c0 = m00() * m11() * m22()
                   + 2.f * m01() * m02() * m12()
                   - m00() * m12() * m12()
                   - m11() * m02() * m02()
                   - m22() * m01() * m01();
          float c1 = m00() * m11() -
                     m01() * m01() +
                     m00() * m22() -
                     m02() * m02() +
                     m11() * m22() -
                     m12() * m12();
          float c2 = m00() + m11() + m22();

          computeRoots3(c0, c1, c2, evals);

          if (evals.z - evals.x <= std::numeric_limits<float>::epsilon())
          {
            evecs[0] = make_float3(1.f, 0.f, 0.f);
            evecs[1] = make_float3(0.f, 1.f, 0.f);
            evecs[2] = make_float3(0.f, 0.f, 1.f);
          }
          else if (evals.y - evals.x <= std::numeric_limits<float>::epsilon())
          {
            // first and second equal
            tmp[0] = row0(); tmp[1] = row1(); tmp[2] = row2();
            tmp[0].x -= evals.z; tmp[1].y -= evals.z; tmp[2].z -= evals.z;

            vec_tmp[0] = cross(tmp[0], tmp[1]);
            vec_tmp[1] = cross(tmp[0], tmp[2]);
            vec_tmp[2] = cross(tmp[1], tmp[2]);

            float len1 = dot (vec_tmp[0], vec_tmp[0]);
            float len2 = dot (vec_tmp[1], vec_tmp[1]);
            float len3 = dot (vec_tmp[2], vec_tmp[2]);

            if (len1 >= len2 && len1 >= len3)
            {
              evecs[2] = vec_tmp[0] * rsqrtf (len1);
            }
            else if (len2 >= len1 && len2 >= len3)
            {
              evecs[2] = vec_tmp[1] * rsqrtf (len2);
            }
            else
            {
              evecs[2] = vec_tmp[2] * rsqrtf (len3);
            }

            evecs[1] = unitOrthogonal(evecs[2]);
            evecs[0] = cross(evecs[1], evecs[2]);
          }
          else if (evals.z - evals.y <= std::numeric_limits<float>::epsilon())
          {
            // second and third equal
            tmp[0] = row0(); tmp[1] = row1(); tmp[2] = row2();
            tmp[0].x -= evals.x; tmp[1].y -= evals.x; tmp[2].z -= evals.x;

            vec_tmp[0] = cross(tmp[0], tmp[1]);
            vec_tmp[1] = cross(tmp[0], tmp[2]);
            vec_tmp[2] = cross(tmp[1], tmp[2]);

            float len1 = dot(vec_tmp[0], vec_tmp[0]);
            float len2 = dot(vec_tmp[1], vec_tmp[1]);
            float len3 = dot(vec_tmp[2], vec_tmp[2]);

            if (len1 >= len2 && len1 >= len3)
            {
              evecs[0] = vec_tmp[0] * rsqrtf(len1);
            }
            else if (len2 >= len1 && len2 >= len3)
            {
              evecs[0] = vec_tmp[1] * rsqrtf(len2);
            }
            else
            {
              evecs[0] = vec_tmp[2] * rsqrtf(len3);
            }

            evecs[1] = unitOrthogonal( evecs[0] );
            evecs[2] = cross(evecs[0], evecs[1]);
          }
          else
          {
            tmp[0] = row0(); tmp[1] = row1(); tmp[2] = row2();
            tmp[0].x -= evals.z; tmp[1].y -= evals.z; tmp[2].z -= evals.z;

            vec_tmp[0] = cross(tmp[0], tmp[1]);
            vec_tmp[1] = cross(tmp[0], tmp[2]);
            vec_tmp[2] = cross(tmp[1], tmp[2]);

            float len1 = dot(vec_tmp[0], vec_tmp[0]);
            float len2 = dot(vec_tmp[1], vec_tmp[1]);
            float len3 = dot(vec_tmp[2], vec_tmp[2]);

            float mmax[3];

            unsigned int min_el = 2;
            unsigned int max_el = 2;
            if (len1 >= len2 && len1 >= len3)
            {
              mmax[2] = len1;
              evecs[2] = vec_tmp[0] * rsqrtf (len1);
            }
            else if (len2 >= len1 && len2 >= len3)
            {
              mmax[2] = len2;
              evecs[2] = vec_tmp[1] * rsqrtf (len2);
            }
            else
            {
              mmax[2] = len3;
              evecs[2] = vec_tmp[2] * rsqrtf (len3);
            }

            tmp[0] = row0(); tmp[1] = row1(); tmp[2] = row2();
            tmp[0].x -= evals.y; tmp[1].y -= evals.y; tmp[2].z -= evals.y;

            vec_tmp[0] = cross(tmp[0], tmp[1]);
            vec_tmp[1] = cross(tmp[0], tmp[2]);
            vec_tmp[2] = cross(tmp[1], tmp[2]);

            len1 = dot(vec_tmp[0], vec_tmp[0]);
            len2 = dot(vec_tmp[1], vec_tmp[1]);
            len3 = dot(vec_tmp[2], vec_tmp[2]);

            if (len1 >= len2 && len1 >= len3)
            {
              mmax[1] = len1;
              evecs[1] = vec_tmp[0] * rsqrtf (len1);
              min_el = len1 <= mmax[min_el] ? 1 : min_el;
              max_el = len1 > mmax[max_el] ? 1 : max_el;
            }
            else if (len2 >= len1 && len2 >= len3)
            {
              mmax[1] = len2;
              evecs[1] = vec_tmp[1] * rsqrtf (len2);
              min_el = len2 <= mmax[min_el] ? 1 : min_el;
              max_el = len2 > mmax[max_el] ? 1 : max_el;
            }
            else
            {
              mmax[1] = len3;
              evecs[1] = vec_tmp[2] * rsqrtf (len3);
              min_el = len3 <= mmax[min_el] ? 1 : min_el;
              max_el = len3 > mmax[max_el] ? 1 : max_el;
            }

            tmp[0] = row0(); tmp[1] = row1(); tmp[2] = row2();
            tmp[0].x -= evals.x; tmp[1].y -= evals.x; tmp[2].z -= evals.x;

            vec_tmp[0] = cross(tmp[0], tmp[1]);
            vec_tmp[1] = cross(tmp[0], tmp[2]);
            vec_tmp[2] = cross(tmp[1], tmp[2]);

            len1 = dot (vec_tmp[0], vec_tmp[0]);
            len2 = dot (vec_tmp[1], vec_tmp[1]);
            len3 = dot (vec_tmp[2], vec_tmp[2]);


            if (len1 >= len2 && len1 >= len3)
            {
              mmax[0] = len1;
              evecs[0] = vec_tmp[0] * rsqrtf (len1);
              min_el = len3 <= mmax[min_el] ? 0 : min_el;
              max_el = len3 > mmax[max_el] ? 0 : max_el;
            }
            else if (len2 >= len1 && len2 >= len3)
            {
              mmax[0] = len2;
              evecs[0] = vec_tmp[1] * rsqrtf (len2);
              min_el = len3 <= mmax[min_el] ? 0 : min_el;
              max_el = len3 > mmax[max_el] ? 0 : max_el;
            }
            else
            {
              mmax[0] = len3;
              evecs[0] = vec_tmp[2] * rsqrtf (len3);
              min_el = len3 <= mmax[min_el] ? 0 : min_el;
              max_el = len3 > mmax[max_el] ? 0 : max_el;
            }

            unsigned mid_el = 3 - min_el - max_el;
            evecs[min_el] = normalized( cross( evecs[(min_el+1) % 3], evecs[(min_el+2) % 3] ) );
            evecs[mid_el] = normalized( cross( evecs[(mid_el+1) % 3], evecs[(mid_el+2) % 3] ) );
          }
          // Rescale back to the original size.
          evals *= scale;
        }
      private:
        volatile float* mat_pkg;

        __device__ __forceinline__ float m00() const { return mat_pkg[0]; }
        __device__ __forceinline__ float m01() const { return mat_pkg[1]; }
        __device__ __forceinline__ float m02() const { return mat_pkg[2]; }
        __device__ __forceinline__ float m10() const { return mat_pkg[1]; }
        __device__ __forceinline__ float m11() const { return mat_pkg[3]; }
        __device__ __forceinline__ float m12() const { return mat_pkg[4]; }
        __device__ __forceinline__ float m20() const { return mat_pkg[2]; }
        __device__ __forceinline__ float m21() const { return mat_pkg[4]; }
        __device__ __forceinline__ float m22() const { return mat_pkg[5]; }

        __device__ __forceinline__ float3 row0() const { return make_float3( m00(), m01(), m02() ); }
        __device__ __forceinline__ float3 row1() const { return make_float3( m10(), m11(), m12() ); }
        __device__ __forceinline__ float3 row2() const { return make_float3( m20(), m21(), m22() ); }

        __device__ __forceinline__ static bool isMuchSmallerThan (float x, float y)
        {
          // copied from <eigen>/include/Eigen/src/Core/NumTraits.h
          constexpr float prec_sqr = std::numeric_limits<float>::epsilon() * std::numeric_limits<float>::epsilon();
          return x * x <= prec_sqr * y * y;
        }
      };

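      // Block-wide helpers: the number of threads per block (stride), a flattened
      // 1-D thread index, and shared-memory tree reductions over all block threads.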
      struct Block
      {
        static __device__ __forceinline__ unsigned int stride()
        {
          return blockDim.x * blockDim.y * blockDim.z;
        }

        static __device__ __forceinline__ int
        flattenedThreadId()
        {
          return threadIdx.z * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x;
        }

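        // Shared-memory tree reduction over CTA_SIZE values (CTA_SIZE is typically a
        // power of two matching the block size). Each round halves the number of
        // active threads; the last 32 values are folded without __syncthreads(),
        // relying on warp-synchronous execution through the volatile buffer.
        // Thread 0 ends up with the result in buffer[0].
        //
        // Minimal usage sketch (names are illustrative, not part of this header):
        //
        //   struct Plus { __device__ float operator()(float a, float b) const { return a + b; } };
        //   __shared__ float smem[256];
        //   smem[Block::flattenedThreadId()] = partial_value;
        //   __syncthreads();
        //   Block::reduce<256>(smem, Plus());  // thread 0 may now read the sum from smem[0]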
        template<int CTA_SIZE, typename T, class BinOp>
        static __device__ __forceinline__ void reduce(volatile T* buffer, BinOp op)
        {
          int tid = flattenedThreadId();
          T val = buffer[tid];

          if (CTA_SIZE >= 1024) { if (tid < 512) buffer[tid] = val = op(val, buffer[tid + 512]); __syncthreads(); }
          if (CTA_SIZE >=  512) { if (tid < 256) buffer[tid] = val = op(val, buffer[tid + 256]); __syncthreads(); }
          if (CTA_SIZE >=  256) { if (tid < 128) buffer[tid] = val = op(val, buffer[tid + 128]); __syncthreads(); }
          if (CTA_SIZE >=  128) { if (tid <  64) buffer[tid] = val = op(val, buffer[tid +  64]); __syncthreads(); }

          if (tid < 32)
          {
            if (CTA_SIZE >= 64) { buffer[tid] = val = op(val, buffer[tid + 32]); }
            if (CTA_SIZE >= 32) { buffer[tid] = val = op(val, buffer[tid + 16]); }
            if (CTA_SIZE >= 16) { buffer[tid] = val = op(val, buffer[tid +  8]); }
            if (CTA_SIZE >=  8) { buffer[tid] = val = op(val, buffer[tid +  4]); }
            if (CTA_SIZE >=  4) { buffer[tid] = val = op(val, buffer[tid +  2]); }
            if (CTA_SIZE >=  2) { buffer[tid] = val = op(val, buffer[tid +  1]); }
          }
        }

        template<int CTA_SIZE, typename T, class BinOp>
        static __device__ __forceinline__ T reduce(volatile T* buffer, T init, BinOp op)
        {
          int tid = flattenedThreadId();
          T val = buffer[tid] = init;
          __syncthreads();

          if (CTA_SIZE >= 1024) { if (tid < 512) buffer[tid] = val = op(val, buffer[tid + 512]); __syncthreads(); }
          if (CTA_SIZE >=  512) { if (tid < 256) buffer[tid] = val = op(val, buffer[tid + 256]); __syncthreads(); }
          if (CTA_SIZE >=  256) { if (tid < 128) buffer[tid] = val = op(val, buffer[tid + 128]); __syncthreads(); }
          if (CTA_SIZE >=  128) { if (tid <  64) buffer[tid] = val = op(val, buffer[tid +  64]); __syncthreads(); }

          if (tid < 32)
          {
            if (CTA_SIZE >= 64) { buffer[tid] = val = op(val, buffer[tid + 32]); }
            if (CTA_SIZE >= 32) { buffer[tid] = val = op(val, buffer[tid + 16]); }
            if (CTA_SIZE >= 16) { buffer[tid] = val = op(val, buffer[tid +  8]); }
            if (CTA_SIZE >=  8) { buffer[tid] = val = op(val, buffer[tid +  4]); }
            if (CTA_SIZE >=  4) { buffer[tid] = val = op(val, buffer[tid +  2]); }
            if (CTA_SIZE >=  2) { buffer[tid] = val = op(val, buffer[tid +  1]); }
          }
          __syncthreads();
          return buffer[0];
        }
      };

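      // Warp-level helpers built on the PTX special registers %laneid and
      // %lanemask_lt.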
      struct Warp
      {
        enum
        {
          LOG_WARP_SIZE = 5,
          WARP_SIZE = 1 << LOG_WARP_SIZE,
          STRIDE = WARP_SIZE
        };

        /** \brief Returns the warp lane ID of the calling thread. */
        static __device__ __forceinline__ unsigned int
        laneId ()
        {
          unsigned int ret;
          asm("mov.u32 %0, %laneid;" : "=r"(ret) );
          return ret;
        }

        static __device__ __forceinline__ unsigned int id()
        {
          int tid = threadIdx.z * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x;
          return tid >> LOG_WARP_SIZE;
        }

        static __device__ __forceinline__
        int laneMaskLt ()
        {
          unsigned int ret;
          asm("mov.u32 %0, %lanemask_lt;" : "=r"(ret) );
          return ret;
        }

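        // Given a warp-wide ballot mask, returns the number of set bits among lanes
        // with a lower ID than the calling thread, i.e. an exclusive binary scan of
        // the per-lane predicate.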
        static __device__ __forceinline__ int binaryExclScan(int ballot_mask)
        {
          return __popc(Warp::laneMaskLt() & ballot_mask);
        }
      };


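      // Warp-vote and reduction primitives kept behind a common interface; the
      // cta_buffer argument is retained for interface compatibility and ignored.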
      struct Emulation
      {
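        // Intra-warp sum in shared memory: the first 16 lanes repeatedly fold the
        // upper half into the lower half; every participating thread returns the
        // warp's total, read from the warp's first slot (ptr[tid - lane]).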
        static __device__ __forceinline__ int
        warp_reduce ( volatile int *ptr, const unsigned int tid )
        {
          const unsigned int lane = tid & 31; // index of thread in warp (0..31)

          if (lane < 16)
          {
            int partial = ptr[tid];

            ptr[tid] = partial = partial + ptr[tid + 16];
            ptr[tid] = partial = partial + ptr[tid +  8];
            ptr[tid] = partial = partial + ptr[tid +  4];
            ptr[tid] = partial = partial + ptr[tid +  2];
            ptr[tid] = partial = partial + ptr[tid +  1];
          }
          return ptr[tid - lane];
        }

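        // CUDA 9 replaced the implicit warp-synchronous vote intrinsics (__ballot,
        // __all) with explicit *_sync variants, hence the CUDA_VERSION guard below.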
        static __forceinline__ __device__ int
        Ballot(int predicate, volatile int* cta_buffer)
        {
          pcl::utils::ignore(cta_buffer);
#if CUDA_VERSION >= 9000
          return __ballot_sync (__activemask (), predicate);
#else
          return __ballot (predicate);
#endif
        }

        static __forceinline__ __device__ bool
        All(int predicate, volatile int* cta_buffer)
        {
          pcl::utils::ignore(cta_buffer);
#if CUDA_VERSION >= 9000
          return __all_sync (__activemask (), predicate);
#else
          return __all (predicate);
#endif
        }
      };
    }
  }
}

#endif /* PCL_GPU_KINFU_CUDA_UTILS_HPP_ */