OpenFPM_pdata  4.1.0
Project that contains the implementation of distributed structures
block_reduce.cuh
/******************************************************************************
 * Copyright (c) 2011, Duane Merrill.  All rights reserved.
 * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

/**
 * \file
 * cub::BlockReduce provides collective methods for computing a parallel reduction of items partitioned across a CUDA thread block.
 */

#pragma once

#include "specializations/block_reduce_raking.cuh"
#include "specializations/block_reduce_raking_commutative_only.cuh"
#include "specializations/block_reduce_warp_reductions.cuh"
#include "../util_ptx.cuh"
#include "../util_type.cuh"
#include "../thread/thread_operators.cuh"
#include "../util_namespace.cuh"

/// Optional outer namespace(s)
CUB_NS_PREFIX

/// CUB namespace
namespace cub {


/******************************************************************************
 * Algorithmic variants
 ******************************************************************************/

/**
 * BlockReduceAlgorithm enumerates alternative algorithms for parallel
 * reduction across a CUDA thread block.
 */
enum BlockReduceAlgorithm
{
    /// Raking-based reduction in shared memory, valid only for commutative reduction operators.
    BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY,

    /// Raking-based reduction in shared memory, supporting both commutative and non-commutative reduction operators.
    BLOCK_REDUCE_RAKING,

    /// Parallel reduction built from warp-synchronous warp reductions.
    BLOCK_REDUCE_WARP_REDUCTIONS,
};
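
// Illustrative sketch (not part of the original header): the ALGORITHM
// template parameter of BlockReduce below selects among these variants.
// The type alias name is hypothetical:
//
//     // Specialize BlockReduce for a 1D block of 128 threads of type int,
//     // explicitly requesting the raking variant:
//     typedef cub::BlockReduce<int, 128, cub::BLOCK_REDUCE_RAKING> BlockReduceT;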


/******************************************************************************
 * Block reduce
 ******************************************************************************/

/**
 * The BlockReduce class provides collective methods for computing a parallel
 * reduction of items partitioned across a CUDA thread block.
 */
template <
    typename                T,
    int                     BLOCK_DIM_X,
    BlockReduceAlgorithm    ALGORITHM   = BLOCK_REDUCE_WARP_REDUCTIONS,
    int                     BLOCK_DIM_Y = 1,
    int                     BLOCK_DIM_Z = 1,
    int                     PTX_ARCH    = CUB_PTX_ARCH>
class BlockReduce
{
private:

    /******************************************************************************
     * Constants and type definitions
     ******************************************************************************/

    /// Constants
    enum
    {
        /// The thread block size in threads
        BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
    };

    typedef BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH>          WarpReductions;
    typedef BlockReduceRakingCommutativeOnly<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH>   RakingCommutativeOnly;
    typedef BlockReduceRaking<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH>                  Raking;

    /// Internal specialization type
    typedef typename If<(ALGORITHM == BLOCK_REDUCE_WARP_REDUCTIONS),
        WarpReductions,
        typename If<(ALGORITHM == BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY),
            RakingCommutativeOnly,
            Raking>::Type>::Type InternalBlockReduce;       // BlockReduceRaking

    /// Shared memory storage layout type for BlockReduce
    typedef typename InternalBlockReduce::TempStorage _TempStorage;


    /******************************************************************************
     * Utility methods
     ******************************************************************************/

    /// Internal storage allocator
    __device__ __forceinline__ _TempStorage& PrivateStorage()
    {
        __shared__ _TempStorage private_storage;
        return private_storage;
    }


    /******************************************************************************
     * Thread fields
     ******************************************************************************/

    /// Shared storage reference
    _TempStorage &temp_storage;

    /// Linear thread-id
    unsigned int linear_tid;


public:

    /// \smemstorage{BlockReduce}
    struct TempStorage : Uninitialized<_TempStorage> {};


    /******************************************************************//**
     * \name Collective constructors
     *********************************************************************/
    //@{

    /// Collective constructor using a private static allocation of shared memory as temporary storage.
    __device__ __forceinline__ BlockReduce()
    :
        temp_storage(PrivateStorage()),
        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
    {}


    /// Collective constructor using the specified memory allocation as temporary storage.
    __device__ __forceinline__ BlockReduce(
        TempStorage &temp_storage)              ///< [in] Reference to memory allocation having layout type TempStorage
    :
        temp_storage(temp_storage.Alias()),
        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
    {}
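
    // Usage sketch (not part of the original header): the two construction
    // modes side by side; type alias and variable names are hypothetical.
    //
    //     typedef cub::BlockReduce<int, 128> BlockReduceT;
    //     __shared__ typename BlockReduceT::TempStorage temp_storage;
    //     BlockReduceT reduce_private;              // private static shared memory
    //     BlockReduceT reduce_shared(temp_storage); // caller-provided storage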


    //@}  end member group
    /******************************************************************//**
     * \name Generic reductions
     *********************************************************************/
    //@{

    /// Computes a block-wide reduction for thread0 using the specified binary reduction functor. Each thread contributes one input element.
    template <typename ReductionOp>
    __device__ __forceinline__ T Reduce(
        T               input,                  ///< [in] Calling thread's input
        ReductionOp     reduction_op)           ///< [in] Binary reduction functor
    {
        return InternalBlockReduce(temp_storage).template Reduce<true>(input, BLOCK_THREADS, reduction_op);
    }

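    // Usage sketch (not part of the original header): a block-wide max
    // reduction over a 1D block of 128 threads, one item per thread. The
    // kernel and variable names are hypothetical; cub::Max() is the functor
    // defined in thread_operators.cuh.
    //
    //     __global__ void ExampleKernel(int *d_in, int *d_out)
    //     {
    //         typedef cub::BlockReduce<int, 128> BlockReduceT;
    //         __shared__ typename BlockReduceT::TempStorage temp_storage;
    //         int thread_data = d_in[threadIdx.x];
    //         // Only thread0 receives the valid block-wide aggregate
    //         int aggregate = BlockReduceT(temp_storage).Reduce(thread_data, cub::Max());
    //         if (threadIdx.x == 0) d_out[0] = aggregate;
    //     }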

    /// Computes a block-wide reduction for thread0 using the specified binary reduction functor. Each thread contributes an array of consecutive input elements.
    template <
        int         ITEMS_PER_THREAD,
        typename    ReductionOp>
    __device__ __forceinline__ T Reduce(
        T               (&inputs)[ITEMS_PER_THREAD],    ///< [in] Calling thread's input segment
        ReductionOp     reduction_op)                   ///< [in] Binary reduction functor
    {
        // Reduce partials
        T partial = internal::ThreadReduce(inputs, reduction_op);
        return Reduce(partial, reduction_op);
    }

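    // Usage sketch (not part of the original header): each thread contributes
    // four consecutive items; they are first reduced per-thread, then
    // block-wide. Names are hypothetical.
    //
    //     typedef cub::BlockReduce<int, 128> BlockReduceT;
    //     __shared__ typename BlockReduceT::TempStorage temp_storage;
    //     int items[4];
    //     // ... each thread fills items[0..3] ...
    //     int aggregate = BlockReduceT(temp_storage).Reduce(items, cub::Max());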

    /// Computes a block-wide reduction for thread0 using the specified binary reduction functor. The first num_valid threads each contribute one input element.
    template <typename ReductionOp>
    __device__ __forceinline__ T Reduce(
        T               input,                  ///< [in] Calling thread's input
        ReductionOp     reduction_op,           ///< [in] Binary reduction functor
        int             num_valid)              ///< [in] Number of threads containing valid elements (may be less than BLOCK_THREADS)
    {
        // Determine if we can skip bounds checking
        if (num_valid >= BLOCK_THREADS)
        {
            return InternalBlockReduce(temp_storage).template Reduce<true>(input, num_valid, reduction_op);
        }
        else
        {
            return InternalBlockReduce(temp_storage).template Reduce<false>(input, num_valid, reduction_op);
        }
    }

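    // Usage sketch (not part of the original header): a partially-full tile
    // where only num_valid threads hold data. All threads must still call in,
    // but only the first num_valid inputs participate. Names are hypothetical.
    //
    //     int aggregate = BlockReduceT(temp_storage).Reduce(thread_data, cub::Max(), num_valid);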

    //@}  end member group
    /******************************************************************//**
     * \name Summation reductions
     *********************************************************************/
    //@{


    /// Computes a block-wide reduction for thread0 using addition (+) as the reduction operator. Each thread contributes one input element.
    __device__ __forceinline__ T Sum(
        T input)                                ///< [in] Calling thread's input
    {
        return InternalBlockReduce(temp_storage).template Sum<true>(input, BLOCK_THREADS);
    }

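    // Usage sketch (not part of the original header): a block-wide sum with
    // one item per thread; names are hypothetical.
    //
    //     typedef cub::BlockReduce<int, 128> BlockReduceT;
    //     __shared__ typename BlockReduceT::TempStorage temp_storage;
    //     int aggregate = BlockReduceT(temp_storage).Sum(thread_data);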

    /// Computes a block-wide reduction for thread0 using addition (+) as the reduction operator. Each thread contributes an array of consecutive input elements.
    template <int ITEMS_PER_THREAD>
    __device__ __forceinline__ T Sum(
        T (&inputs)[ITEMS_PER_THREAD])          ///< [in] Calling thread's input segment
    {
        // Reduce partials
        T partial = internal::ThreadReduce(inputs, cub::Sum());
        return Sum(partial);
    }

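    // Usage sketch (not part of the original header): summing four items per
    // thread; the per-thread partial is folded with cub::Sum() as shown in the
    // method body above. Names are hypothetical.
    //
    //     int items[4];
    //     // ... each thread fills items[0..3] ...
    //     int aggregate = BlockReduceT(temp_storage).Sum(items);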

    /// Computes a block-wide reduction for thread0 using addition (+) as the reduction operator. The first num_valid threads each contribute one input element.
    __device__ __forceinline__ T Sum(
        T   input,                              ///< [in] Calling thread's input
        int num_valid)                          ///< [in] Number of threads containing valid elements (may be less than BLOCK_THREADS)
    {
        // Determine if we can skip bounds checking
        if (num_valid >= BLOCK_THREADS)
        {
            return InternalBlockReduce(temp_storage).template Sum<true>(input, num_valid);
        }
        else
        {
            return InternalBlockReduce(temp_storage).template Sum<false>(input, num_valid);
        }
    }

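    // Usage sketch (not part of the original header): a guarded sum over a
    // partially-full tile of num_valid inputs; names are hypothetical.
    //
    //     int aggregate = BlockReduceT(temp_storage).Sum(thread_data, num_valid);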
    //@}  end member group

};

}               // CUB namespace
CUB_NS_POSTFIX  // Optional outer namespace(s)

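A minimal end-to-end sketch (not part of the original header) showing how this
class is typically used from a kernel. It assumes CUB is on the include path;
the kernel name, block size, and buffer names are hypothetical:

    #include <cub/block/block_reduce.cuh>

    __global__ void BlockSumKernel(int *d_in, int *d_out)
    {
        // Specialize BlockReduce for a 1D block of 128 threads of type int
        typedef cub::BlockReduce<int, 128> BlockReduceT;

        // Allocate shared memory for the collective
        __shared__ typename BlockReduceT::TempStorage temp_storage;

        // Each thread obtains one input item
        int thread_data = d_in[blockIdx.x * blockDim.x + threadIdx.x];

        // Compute the block-wide sum; only thread0 receives the valid aggregate
        int aggregate = BlockReduceT(temp_storage).Sum(thread_data);

        if (threadIdx.x == 0)
            d_out[blockIdx.x] = aggregate;
    }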