Point Cloud Library (PCL)  1.14.0-dev
block.hpp
/*
 * Software License Agreement (BSD License)
 *
 * Copyright (c) 2011, Willow Garage, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above
 *    copyright notice, this list of conditions and the following
 *    disclaimer in the documentation and/or other materials provided
 *    with the distribution.
 *  * Neither the name of Willow Garage, Inc. nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Author: Anatoly Baskeheev, Itseez Ltd, (myname.mysurname@mycompany.com)
 */

#ifndef PCL_DEVICE_UTILS_BLOCK_HPP_
#define PCL_DEVICE_UTILS_BLOCK_HPP_

namespace pcl
{
    namespace device
    {
        /** Block-wide utility primitives. Every member function is a
          * collective operation and must be called by all threads of
          * the thread block. */
        struct Block
        {
            /// 1-D index of this block within the grid.
            static __device__ __forceinline__ unsigned int id()
            {
                return blockIdx.x;
            }

            /// Total number of threads in the block.
            static __device__ __forceinline__ unsigned int stride()
            {
                return blockDim.x * blockDim.y * blockDim.z;
            }

            /// Block-wide barrier.
            static __device__ __forceinline__ void sync()
            {
                __syncthreads();
            }

            /// Linear index of the calling thread within its (up to 3-D) block.
            static __device__ __forceinline__ int flattenedThreadId()
            {
                return threadIdx.z * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x;
            }
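
            // Worked example (illustrative, not in the original header): for
            // blockDim = (8, 4, 2), the thread at threadIdx = (1, 2, 1) gets
            // flattenedThreadId() = 1*8*4 + 2*8 + 1 = 49, and stride() = 64.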

            /// Block-wide std::fill: assigns value to every element of [beg, end).
            template<typename It, typename T>
            static __device__ __forceinline__ void fill(It beg, It end, const T& value)
            {
                int STRIDE = stride();
                It t = beg + flattenedThreadId();

                for(; t < end; t += STRIDE)
                    *t = value;
            }

            /// Block-wide analogue of std::iota (the "yota" spelling is the
            /// library's own): writes value, value+1, ... into [beg, end).
            template<typename OutIt, typename T>
            static __device__ __forceinline__ void yota(OutIt beg, OutIt end, T value)
            {
                int STRIDE = stride();
                int tid = flattenedThreadId();
                value += tid;

                for(OutIt t = beg + tid; t < end; t += STRIDE, value += STRIDE)
                    *t = value;
            }
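
            // Illustrative sketch (not part of the original header): zeroing a
            // shared histogram and numbering a tile of indices. Names and sizes
            // are assumptions for the example.
            //
            //   __shared__ int hist[256], idx[256];
            //   Block::fill(hist, hist + 256, 0);
            //   Block::yota(idx, idx + 256, 0);
            //   Block::sync();   // both arrays are now safe to read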

            /// Block-wide std::copy from [beg, end) to out.
            template<typename InIt, typename OutIt>
            static __device__ __forceinline__ void copy(InIt beg, InIt end, OutIt out)
            {
                int STRIDE = stride();
                InIt t = beg + flattenedThreadId();
                OutIt o = out + (t - beg);

                for(; t < end; t += STRIDE, o += STRIDE)
                    *o = *t;
            }

            /// Block-wide std::transform with a unary operation.
            template<typename InIt, typename OutIt, class UnOp>
            static __device__ __forceinline__ void transform(InIt beg, InIt end, OutIt out, UnOp op)
            {
                int STRIDE = stride();
                InIt t = beg + flattenedThreadId();
                OutIt o = out + (t - beg);

                for(; t < end; t += STRIDE, o += STRIDE)
                    *o = op(*t);
            }

            /// Block-wide std::transform with a binary operation.
            template<typename InIt1, typename InIt2, typename OutIt, class BinOp>
            static __device__ __forceinline__ void transform(InIt1 beg1, InIt1 end1, InIt2 beg2, OutIt out, BinOp op)
            {
                int STRIDE = stride();
                InIt1 t1 = beg1 + flattenedThreadId();
                InIt2 t2 = beg2 + flattenedThreadId();
                OutIt o = out + (t1 - beg1);

                for(; t1 < end1; t1 += STRIDE, t2 += STRIDE, o += STRIDE)
                    *o = op(*t1, *t2);
            }
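
            // Illustrative sketch (not part of the original header): element-wise
            // sum of two shared-memory tiles. The tile size and the "plus" functor
            // (defined in the reduce example below) are assumptions.
            //
            //   __shared__ float a[128], b[128], c[128];
            //   ... // all threads cooperatively populate a and b, then Block::sync()
            //   Block::transform(a, a + 128, b, c, plus());
            //   Block::sync();   // c is now safe to read by any thread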

            /// In-place block-wide reduction over buffer[0..CTA_SIZE); the caller
            /// must have written buffer and synchronized beforehand. The result is
            /// left in buffer[0] (visible to all threads after a further sync).
            template<int CTA_SIZE, typename T, class BinOp>
            static __device__ __forceinline__ void reduce(volatile T* buffer, BinOp op)
            {
                int tid = flattenedThreadId();
                T val = buffer[tid];

                if (CTA_SIZE >= 1024) { if (tid < 512) buffer[tid] = val = op(val, buffer[tid + 512]); __syncthreads(); }
                if (CTA_SIZE >=  512) { if (tid < 256) buffer[tid] = val = op(val, buffer[tid + 256]); __syncthreads(); }
                if (CTA_SIZE >=  256) { if (tid < 128) buffer[tid] = val = op(val, buffer[tid + 128]); __syncthreads(); }
                if (CTA_SIZE >=  128) { if (tid <  64) buffer[tid] = val = op(val, buffer[tid +  64]); __syncthreads(); }

                // Warp-synchronous tail: no barriers below this point, which relies
                // on the remaining 32 threads executing in lock-step (an assumption
                // that holds on pre-Volta GPUs; volatile keeps the shared-memory
                // reads and writes from being cached in registers).
                if (tid < 32)
                {
                    if (CTA_SIZE >= 64) { buffer[tid] = val = op(val, buffer[tid + 32]); }
                    if (CTA_SIZE >= 32) { buffer[tid] = val = op(val, buffer[tid + 16]); }
                    if (CTA_SIZE >= 16) { buffer[tid] = val = op(val, buffer[tid +  8]); }
                    if (CTA_SIZE >=  8) { buffer[tid] = val = op(val, buffer[tid +  4]); }
                    if (CTA_SIZE >=  4) { buffer[tid] = val = op(val, buffer[tid +  2]); }
                    if (CTA_SIZE >=  2) { buffer[tid] = val = op(val, buffer[tid +  1]); }
                }
            }

            /// Block-wide reduction of each thread's init value; returns the result
            /// to every thread. buffer must hold at least CTA_SIZE elements.
            template<int CTA_SIZE, typename T, class BinOp>
            static __device__ __forceinline__ T reduce(volatile T* buffer, T init, BinOp op)
            {
                int tid = flattenedThreadId();
                T val = buffer[tid] = init;
                __syncthreads();

                if (CTA_SIZE >= 1024) { if (tid < 512) buffer[tid] = val = op(val, buffer[tid + 512]); __syncthreads(); }
                if (CTA_SIZE >=  512) { if (tid < 256) buffer[tid] = val = op(val, buffer[tid + 256]); __syncthreads(); }
                if (CTA_SIZE >=  256) { if (tid < 128) buffer[tid] = val = op(val, buffer[tid + 128]); __syncthreads(); }
                if (CTA_SIZE >=  128) { if (tid <  64) buffer[tid] = val = op(val, buffer[tid +  64]); __syncthreads(); }

                // Warp-synchronous tail; see the comment in the in-place overload.
                if (tid < 32)
                {
                    if (CTA_SIZE >= 64) { buffer[tid] = val = op(val, buffer[tid + 32]); }
                    if (CTA_SIZE >= 32) { buffer[tid] = val = op(val, buffer[tid + 16]); }
                    if (CTA_SIZE >= 16) { buffer[tid] = val = op(val, buffer[tid +  8]); }
                    if (CTA_SIZE >=  8) { buffer[tid] = val = op(val, buffer[tid +  4]); }
                    if (CTA_SIZE >=  4) { buffer[tid] = val = op(val, buffer[tid +  2]); }
                    if (CTA_SIZE >=  2) { buffer[tid] = val = op(val, buffer[tid +  1]); }
                }
                __syncthreads();
                return buffer[0];
            }
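
            // Illustrative sketch (not part of the original header): summing n
            // floats with a single 256-thread block, launched as
            // sumKernel<<<1, 256>>>(in, n, out). The kernel name and the "plus"
            // functor are assumptions for the example.
            //
            //   struct plus
            //   {
            //       __device__ float operator()(float a, float b) const { return a + b; }
            //   };
            //
            //   __global__ void sumKernel(const float* in, int n, float* out)
            //   {
            //       __shared__ float buf[256];
            //       float v = 0.f;
            //       for (int i = Block::flattenedThreadId(); i < n; i += Block::stride())
            //           v += in[i];
            //       float total = Block::reduce<256>(buf, v, plus());
            //       if (Block::flattenedThreadId() == 0)
            //           *out = total;
            //   }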

            /// Block-wide reduction of data[0..n) for arbitrary n (need not be a
            /// power of two or match the block size); the result ends up in data[0].
            template <typename T, class BinOp>
            static __device__ __forceinline__ void reduce_n(T* data, unsigned int n, BinOp op)
            {
                int ftid = flattenedThreadId();
                int sft = stride();

                // More elements than threads: fold the tail into the first sft slots.
                if (sft < n)
                {
                    for (unsigned int i = sft + ftid; i < n; i += sft)
                        data[ftid] = op(data[ftid], data[i]);

                    __syncthreads();

                    n = sft;
                }

                // Halve the active range until a single element remains.
                while (n > 1)
                {
                    unsigned int half = n/2;

                    if (ftid < half)
                        data[ftid] = op(data[ftid], data[n - ftid - 1]);

                    __syncthreads();

                    n = n - half;
                }
            }
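
            // Illustrative sketch (not part of the original header): reducing a
            // 37-element shared array, a size the power-of-two reduce() overloads
            // above cannot handle directly. Names reuse the examples above.
            //
            //   __shared__ float vals[37];
            //   ... // all threads populate vals, then Block::sync()
            //   Block::reduce_n(vals, 37, plus());   // result in vals[0]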
        };
    }
}

#endif /* PCL_DEVICE_UTILS_BLOCK_HPP_ */