template<typename T,
int BLOCK_DIM_X,
BlockReduceAlgorithm ALGORITHM = BLOCK_REDUCE_WARP_REDUCTIONS,
int BLOCK_DIM_Y = 1,
int BLOCK_DIM_Z = 1,
int PTX_ARCH = CUB_PTX_ARCH>
class cub::BlockReduce< T, BLOCK_DIM_X, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >
The BlockReduce class provides collective methods for computing a parallel reduction of items partitioned across a CUDA thread block.
- Template Parameters
-
T | Data type being reduced |
BLOCK_DIM_X | The thread block length in threads along the X dimension |
ALGORITHM | [optional] cub::BlockReduceAlgorithm enumerator specifying the underlying algorithm to use (default: cub::BLOCK_REDUCE_WARP_REDUCTIONS) |
BLOCK_DIM_Y | [optional] The thread block length in threads along the Y dimension (default: 1) |
BLOCK_DIM_Z | [optional] The thread block length in threads along the Z dimension (default: 1) |
PTX_ARCH | [optional] The PTX compute capability for which to specialize this collective (default: CUB_PTX_ARCH, the PTX version targeted at compile time) |
- Overview
-
- Performance Considerations
- \granularity
- Very efficient (only one synchronization barrier).
- Incurs zero bank conflicts for most types
- Computation is slightly more efficient (i.e., having lower instruction overhead) for:
- Summation (vs. generic reduction)
- BLOCK_THREADS is a multiple of the architecture's warp size
- Every thread has a valid input (i.e., full vs. partial-tiles)
- See cub::BlockReduceAlgorithm for performance details regarding algorithmic alternatives
- A Simple Example
- \blockcollective{BlockReduce}
- The code snippet below illustrates a sum reduction of 512 integer items that are partitioned in a blocked arrangement across 128 threads where each thread owns 4 consecutive items.
#include <cub/cub.cuh>
__global__ void ExampleKernel(...)
{
int thread_data[4];
...
The BlockReduce class provides collective methods for computing a parallel reduction of items partitioned across a CUDA thread block.
__device__ __forceinline__ BlockReduce()
Collective constructor using a private static allocation of shared memory as temporary storage.
_TempStorage & temp_storage
Shared storage reference.
Aggregate of properties: from a list of objects, create a struct that follows the OpenFPM native structure.
\smemstorage{BlockReduce}
Definition at line 221 of file block_reduce.cuh.
|
|
__device__ __forceinline__ | BlockReduce () |
| Collective constructor using a private static allocation of shared memory as temporary storage.
|
|
__device__ __forceinline__ | BlockReduce (TempStorage &temp_storage) |
| Collective constructor using the specified memory allocation as temporary storage.
|
|
|
template<typename ReductionOp > |
__device__ __forceinline__ T | Reduce (T input, ReductionOp reduction_op) |
| Computes a block-wide reduction for thread 0 using the specified binary reduction functor. Each thread contributes one input element.
|
|
template<int ITEMS_PER_THREAD, typename ReductionOp > |
__device__ __forceinline__ T | Reduce (T(&inputs)[ITEMS_PER_THREAD], ReductionOp reduction_op) |
| Computes a block-wide reduction for thread 0 using the specified binary reduction functor. Each thread contributes an array of consecutive input elements.
|
|
template<typename ReductionOp > |
__device__ __forceinline__ T | Reduce (T input, ReductionOp reduction_op, int num_valid) |
| Computes a block-wide reduction for thread 0 using the specified binary reduction functor. The first num_valid threads each contribute one input element.
|
|
|
__device__ __forceinline__ T | Sum (T input) |
| Computes a block-wide reduction for thread 0 using addition (+) as the reduction operator. Each thread contributes one input element.
|
|
template<int ITEMS_PER_THREAD> |
__device__ __forceinline__ T | Sum (T(&inputs)[ITEMS_PER_THREAD]) |
| Computes a block-wide reduction for thread 0 using addition (+) as the reduction operator. Each thread contributes an array of consecutive input elements.
|
|
__device__ __forceinline__ T | Sum (T input, int num_valid) |
| Computes a block-wide reduction for thread 0 using addition (+) as the reduction operator. The first num_valid threads each contribute one input element.
|
|
|
enum | { BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z
} |
| Constants. More...
|
|
typedef BlockReduceWarpReductions< T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH > | WarpReductions |
|
typedef BlockReduceRakingCommutativeOnly< T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH > | RakingCommutativeOnly |
|
typedef BlockReduceRaking< T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH > | Raking |
|
typedef If<(ALGORITHM==BLOCK_REDUCE_WARP_REDUCTIONS), WarpReductions, typename If<(ALGORITHM==BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY), RakingCommutativeOnly, Raking >::Type >::Type | InternalBlockReduce |
| Internal specialization type.
|
|
typedef InternalBlockReduce::TempStorage | _TempStorage |
| Shared memory storage layout type for BlockReduce.
|
|