#ifndef PCL_DEVICE_UTILS_BLOCK_HPP_
#define PCL_DEVICE_UTILS_BLOCK_HPP_

namespace pcl
{
  namespace device
  {
    // Block-wide helpers: every routine below must be called by all
    // threads of the CUDA block together.
    struct Block
    {
      // Index of the current block within the grid (x dimension only).
      static __device__ __forceinline__ unsigned int id()
      {
        return blockIdx.x;
      }
      // Total number of threads in the block.
      static __device__ __forceinline__ unsigned int stride()
      {
        return blockDim.x * blockDim.y * blockDim.z;
      }
      // Barrier across the whole block.
      static __device__ __forceinline__ void sync()
      {
        __syncthreads();
      }

      // Linear index of the calling thread within its (possibly 3D) block.
      static __device__ __forceinline__ int flattenedThreadId()
      {
        return threadIdx.z * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x;
      }
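      // Worked example (illustration only): with blockDim = (4, 2, 2) and
      // threadIdx = (1, 1, 1), flattenedThreadId() = 1*4*2 + 1*4 + 1 = 13.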
      // Cooperatively fill [beg, end): each thread writes every stride()-th
      // element, starting at its flattened id.
      template<typename It, typename T>
      static __device__ __forceinline__ void fill(It beg, It end, const T& value)
      {
        int STRIDE = stride();
        It t = beg + flattenedThreadId();

        for(; t < end; t += STRIDE)
          *t = value;
      }
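      // Usage sketch (illustration only; the kernel name and buffer size
      // are hypothetical): zero a shared-memory scratch buffer before use.
      //
      //   __global__ void example_fill_kernel()
      //   {
      //     __shared__ float scratch[256];
      //     Block::fill(scratch, scratch + 256, 0.f);
      //     Block::sync(); // make the writes visible to all threads
      //   }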
      // Cooperatively write an increasing sequence ("yota" as in std::iota):
      // *(beg + i) receives value + i.
      template<typename OutIt, typename T>
      static __device__ __forceinline__ void yota(OutIt beg, OutIt end, T value)
      {
        int STRIDE = stride();
        int tid = flattenedThreadId();
        value += tid;

        for(OutIt t = beg + tid; t < end; t += STRIDE, value += STRIDE)
          *t = value;
      }
      // Cooperatively copy [beg, end) to out.
      template<typename InIt, typename OutIt>
      static __device__ __forceinline__ void copy(InIt beg, InIt end, OutIt out)
      {
        int STRIDE = stride();
        InIt t = beg + flattenedThreadId();
        OutIt o = out + (t - beg);

        for(; t < end; t += STRIDE, o += STRIDE)
          *o = *t;
      }
      // Cooperatively apply the unary functor op to [beg, end), writing to out.
      template<typename InIt, typename OutIt, class UnOp>
      static __device__ __forceinline__ void transform(InIt beg, InIt end, OutIt out, UnOp op)
      {
        int STRIDE = stride();
        InIt t = beg + flattenedThreadId();
        OutIt o = out + (t - beg);

        for(; t < end; t += STRIDE, o += STRIDE)
          *o = op(*t);
      }
      // Cooperatively combine two input ranges with the binary functor op,
      // writing to out.
      template<typename InIt1, typename InIt2, typename OutIt, class BinOp>
      static __device__ __forceinline__ void transform(InIt1 beg1, InIt1 end1, InIt2 beg2, OutIt out, BinOp op)
      {
        int STRIDE = stride();
        InIt1 t1 = beg1 + flattenedThreadId();
        InIt2 t2 = beg2 + flattenedThreadId();
        OutIt o = out + (t1 - beg1);

        for(; t1 < end1; t1 += STRIDE, t2 += STRIDE, o += STRIDE)
          *o = op(*t1, *t2);
      }
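      // Usage sketch (illustration only; AddOp and the sizes are
      // hypothetical): element-wise sum of two shared-memory arrays.
      //
      //   struct AddOp {
      //     __device__ float operator()(float a, float b) const { return a + b; }
      //   };
      //
      //   __shared__ float a[128], b[128], c[128];
      //   Block::transform(a, a + 128, b, c, AddOp());
      //   Block::sync();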
      // Block-wide tree reduction over buffer[0..CTA_SIZE); the result ends
      // up in buffer[0]. CTA_SIZE must be a power of two matching the block
      // size. The final warp runs barrier-free, relying on warp-synchronous
      // execution over a volatile buffer (safe on pre-Volta hardware only).
      template<int CTA_SIZE, typename T, class BinOp>
      static __device__ __forceinline__ void reduce(volatile T* buffer, BinOp op)
      {
        int tid = flattenedThreadId();
        T val = buffer[tid];

        if (CTA_SIZE >= 1024) { if (tid < 512) buffer[tid] = val = op(val, buffer[tid + 512]); __syncthreads(); }
        if (CTA_SIZE >=  512) { if (tid < 256) buffer[tid] = val = op(val, buffer[tid + 256]); __syncthreads(); }
        if (CTA_SIZE >=  256) { if (tid < 128) buffer[tid] = val = op(val, buffer[tid + 128]); __syncthreads(); }
        if (CTA_SIZE >=  128) { if (tid <  64) buffer[tid] = val = op(val, buffer[tid +  64]); __syncthreads(); }

        if (tid < 32)
        {
          if (CTA_SIZE >= 64) { buffer[tid] = val = op(val, buffer[tid + 32]); }
          if (CTA_SIZE >= 32) { buffer[tid] = val = op(val, buffer[tid + 16]); }
          if (CTA_SIZE >= 16) { buffer[tid] = val = op(val, buffer[tid +  8]); }
          if (CTA_SIZE >=  8) { buffer[tid] = val = op(val, buffer[tid +  4]); }
          if (CTA_SIZE >=  4) { buffer[tid] = val = op(val, buffer[tid +  2]); }
          if (CTA_SIZE >=  2) { buffer[tid] = val = op(val, buffer[tid +  1]); }
        }
      }
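      // Usage sketch (illustration only; the kernel and SumOp are
      // hypothetical): block-wide sum over 256 values.
      //
      //   struct SumOp {
      //     __device__ float operator()(float a, float b) const { return a + b; }
      //   };
      //
      //   __global__ void example_reduce_kernel(const float* in, float* out)
      //   {
      //     __shared__ float smem[256];
      //     int tid = Block::flattenedThreadId();
      //     smem[tid] = in[blockIdx.x * 256 + tid];
      //     Block::sync();
      //     Block::reduce<256>(smem, SumOp());
      //     if (tid == 0)
      //       out[blockIdx.x] = smem[0];
      //   }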
      // As above, but seeds buffer[tid] with init and returns the reduced
      // value to every thread of the block.
      template<int CTA_SIZE, typename T, class BinOp>
      static __device__ __forceinline__ T reduce(volatile T* buffer, T init, BinOp op)
      {
        int tid = flattenedThreadId();
        T val = buffer[tid] = init;
        __syncthreads();

        if (CTA_SIZE >= 1024) { if (tid < 512) buffer[tid] = val = op(val, buffer[tid + 512]); __syncthreads(); }
        if (CTA_SIZE >=  512) { if (tid < 256) buffer[tid] = val = op(val, buffer[tid + 256]); __syncthreads(); }
        if (CTA_SIZE >=  256) { if (tid < 128) buffer[tid] = val = op(val, buffer[tid + 128]); __syncthreads(); }
        if (CTA_SIZE >=  128) { if (tid <  64) buffer[tid] = val = op(val, buffer[tid +  64]); __syncthreads(); }

        if (tid < 32)
        {
          if (CTA_SIZE >= 64) { buffer[tid] = val = op(val, buffer[tid + 32]); }
          if (CTA_SIZE >= 32) { buffer[tid] = val = op(val, buffer[tid + 16]); }
          if (CTA_SIZE >= 16) { buffer[tid] = val = op(val, buffer[tid +  8]); }
          if (CTA_SIZE >=  8) { buffer[tid] = val = op(val, buffer[tid +  4]); }
          if (CTA_SIZE >=  4) { buffer[tid] = val = op(val, buffer[tid +  2]); }
          if (CTA_SIZE >=  2) { buffer[tid] = val = op(val, buffer[tid +  1]); }
        }
        __syncthreads();

        return buffer[0];
      }
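      // Usage sketch (illustration only; SumOp as in the sketch above):
      // each thread contributes a private partial value and every thread
      // receives the block-wide total.
      //
      //   __shared__ float smem[256];
      //   float partial = 1.f; // per-thread contribution
      //   float total = Block::reduce<256>(smem, partial, SumOp());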
      // Reduction over the first n elements of data for arbitrary n (no
      // power-of-two requirement); the result is left in data[0].
      template<typename T, class BinOp>
      static __device__ __forceinline__ void reduce_n(T* data, unsigned int n, BinOp op)
      {
        int ftid = flattenedThreadId();
        int sft = stride();

        // Fold the tail first so that at most stride() elements remain.
        if (sft < n)
        {
          for (unsigned int i = sft + ftid; i < n; i += sft)
            data[ftid] = op(data[ftid], data[i]);

          __syncthreads();
          n = sft;
        }

        // Halve the active range until a single element is left.
        while (n > 1)
        {
          unsigned int half = n / 2;

          if (ftid < half)
            data[ftid] = op(data[ftid], data[n - ftid - 1]);

          __syncthreads();
          n = n - half;
        }
      }
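      // Usage sketch (illustration only; SumOp as above): reducing 200
      // values with a 256-thread block; n need not be a power of two or
      // equal to the block size.
      //
      //   __shared__ float smem[200];
      //   // ... fill smem cooperatively ...
      //   Block::reduce_n(smem, 200u, SumOp());
      //   Block::sync();
      //   // smem[0] now holds the result.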
    };
  }
}

#endif /* PCL_DEVICE_UTILS_BLOCK_HPP_ */