template<typename T,
int BLOCK_DIM_X,
BlockReduceAlgorithm ALGORITHM = BLOCK_REDUCE_WARP_REDUCTIONS,
int BLOCK_DIM_Y = 1,
int BLOCK_DIM_Z = 1,
int PTX_ARCH = CUB_PTX_ARCH>
class cub::BlockReduce< T, BLOCK_DIM_X, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >
The BlockReduce class provides collective methods for computing a parallel reduction of items partitioned across a CUDA thread block.

- Template Parameters
-
| T | Data type being reduced |
| BLOCK_DIM_X | The thread block length in threads along the X dimension |
| ALGORITHM | [optional] cub::BlockReduceAlgorithm enumerator specifying the underlying algorithm to use (default: cub::BLOCK_REDUCE_WARP_REDUCTIONS) |
| BLOCK_DIM_Y | [optional] The thread block length in threads along the Y dimension (default: 1) |
| BLOCK_DIM_Z | [optional] The thread block length in threads along the Z dimension (default: 1) |
| PTX_ARCH | [optional] \ptxversion |
- Overview
-
- Performance Considerations
- \granularity
- Very efficient (only one synchronization barrier).
- Incurs zero bank conflicts for most types
- Computation is slightly more efficient (i.e., having lower instruction overhead) for:
- Summation (vs. generic reduction)
- BLOCK_THREADS is a multiple of the architecture's warp size
- Every thread has a valid input (i.e., full vs. partial-tiles)
- See cub::BlockReduceAlgorithm for performance details regarding algorithmic alternatives
- A Simple Example
- \blockcollective{BlockReduce}
- The code snippet below illustrates a sum reduction of 512 integer items that are partitioned in a blocked arrangement across 128 threads where each thread owns 4 consecutive items.
#include <cub/cub.cuh>
__global__ void ExampleKernel(...)
{
int thread_data[4];
...
The BlockReduce class provides collective methods for computing a parallel reduction of items partitioned across a CUDA thread block.
__device__ __forceinline__ BlockReduce()
Collective constructor using a private static allocation of shared memory as temporary storage.
_TempStorage & temp_storage
Shared storage reference.
Aggregate of properties: from a list of objects, creates a struct that follows the OpenFPM native structure.
\smemstorage{BlockReduce}
Definition at line 221 of file block_reduce.cuh.
|
|
| __device__ __forceinline__ | BlockReduce () |
| | Collective constructor using a private static allocation of shared memory as temporary storage.
|
| |
| __device__ __forceinline__ | BlockReduce (TempStorage &temp_storage) |
| | Collective constructor using the specified memory allocation as temporary storage.
|
| |
|
| template<typename ReductionOp > |
| __device__ __forceinline__ T | Reduce (T input, ReductionOp reduction_op) |
| | Computes a block-wide reduction for thread0 using the specified binary reduction functor. Each thread contributes one input element.
|
| |
| template<int ITEMS_PER_THREAD, typename ReductionOp > |
| __device__ __forceinline__ T | Reduce (T(&inputs)[ITEMS_PER_THREAD], ReductionOp reduction_op) |
| | Computes a block-wide reduction for thread0 using the specified binary reduction functor. Each thread contributes an array of consecutive input elements.
|
| |
| template<typename ReductionOp > |
| __device__ __forceinline__ T | Reduce (T input, ReductionOp reduction_op, int num_valid) |
| | Computes a block-wide reduction for thread0 using the specified binary reduction functor. The first num_valid threads each contribute one input element.
|
| |
|
| __device__ __forceinline__ T | Sum (T input) |
| | Computes a block-wide reduction for thread0 using addition (+) as the reduction operator. Each thread contributes one input element.
|
| |
| template<int ITEMS_PER_THREAD> |
| __device__ __forceinline__ T | Sum (T(&inputs)[ITEMS_PER_THREAD]) |
| | Computes a block-wide reduction for thread0 using addition (+) as the reduction operator. Each thread contributes an array of consecutive input elements.
|
| |
| __device__ __forceinline__ T | Sum (T input, int num_valid) |
| | Computes a block-wide reduction for thread0 using addition (+) as the reduction operator. The first num_valid threads each contribute one input element.
|
| |
|
| enum | { BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z
} |
| | Constants. More...
|
| |
| typedef BlockReduceWarpReductions< T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH > | WarpReductions |
| |
| typedef BlockReduceRakingCommutativeOnly< T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH > | RakingCommutativeOnly |
| |
| typedef BlockReduceRaking< T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH > | Raking |
| |
| typedef If<(ALGORITHM==BLOCK_REDUCE_WARP_REDUCTIONS), WarpReductions, typename If<(ALGORITHM==BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY), RakingCommutativeOnly, Raking >::Type >::Type | InternalBlockReduce |
| | Internal specialization type.
|
| |
| typedef InternalBlockReduce::TempStorage | _TempStorage |
| | Shared memory storage layout type for BlockReduce.
|
| |