template<typename T, int BLOCK_DIM_X, BlockReduceAlgorithm ALGORITHM = BLOCK_REDUCE_WARP_REDUCTIONS, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>
class cub::BlockReduce< T, BLOCK_DIM_X, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >
The BlockReduce class provides collective methods for computing a parallel reduction of items partitioned across a CUDA thread block.
- Template Parameters
-
T | Data type being reduced |
BLOCK_DIM_X | The thread block length in threads along the X dimension |
ALGORITHM | [optional] cub::BlockReduceAlgorithm enumerator specifying the underlying algorithm to use (default: cub::BLOCK_REDUCE_WARP_REDUCTIONS) |
BLOCK_DIM_Y | [optional] The thread block length in threads along the Y dimension (default: 1) |
BLOCK_DIM_Z | [optional] The thread block length in threads along the Z dimension (default: 1) |
PTX_ARCH | [optional] The PTX compute capability for which to specialize this collective (e.g., 350 for sm_35), useful for determining the collective's temporary storage requirements from the host (default: the value of __CUDA_ARCH__ during the current compiler pass) |
- Overview
- A reduction (or fold) uses a binary combining operator to compute a single aggregate from a list of input elements.
- Performance Considerations
- Efficiency is increased with increased granularity (ITEMS_PER_THREAD). Performance is also typically increased until the additional register pressure or shared memory allocation size causes SM occupancy to fall too low.
- Very efficient (only one synchronization barrier).
- Incurs zero bank conflicts for most types
- Computation is slightly more efficient (i.e., having lower instruction overhead) for:
  - Summation (vs. generic reduction)
  - BLOCK_THREADS is a multiple of the architecture's warp size
  - Every thread has a valid input (i.e., full vs. partial-tiles)
- See cub::BlockReduceAlgorithm for performance details regarding algorithmic alternatives
- A Simple Example
- Every thread in the block uses the BlockReduce class by first specializing the BlockReduce type, then instantiating an instance with parameters for communication, and finally invoking one or more collective member functions.
- The code snippet below illustrates a sum reduction of 512 integer items that are partitioned in a blocked arrangement across 128 threads where each thread owns 4 consecutive items.
#include <cub/cub.cuh>
__global__ void ExampleKernel(...)
{
int thread_data[4];
...
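- A fuller version of this pattern is sketched below; the kernel name BlockSumKernel and the pointers d_in/d_out are illustrative placeholders, and the sketch assumes a 1D block of exactly 128 threads:
#include <cub/cub.cuh>

// Hypothetical kernel: 128 threads, each owning 4 consecutive items of d_in
__global__ void BlockSumKernel(int *d_in, int *d_out)
{
    // Specialize BlockReduce for a 1D block of 128 threads of type int
    typedef cub::BlockReduce<int, 128> BlockReduce;

    // Allocate shared memory for BlockReduce
    __shared__ typename BlockReduce::TempStorage temp_storage;

    // Load a blocked arrangement of 4 consecutive items per thread
    int thread_data[4];
    for (int i = 0; i < 4; ++i)
        thread_data[i] = d_in[threadIdx.x * 4 + i];

    // Compute the block-wide sum of all 512 items (valid only in thread0)
    int aggregate = BlockReduce(temp_storage).Sum(thread_data);

    if (threadIdx.x == 0)
        *d_out = aggregate;
}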
Definition at line 221 of file block_reduce.cuh.
__device__ __forceinline__ BlockReduce ()
    Collective constructor using a private static allocation of shared memory as temporary storage.

__device__ __forceinline__ BlockReduce (TempStorage &temp_storage)
    Collective constructor using the specified memory allocation as temporary storage.
template<typename ReductionOp >
__device__ __forceinline__ T Reduce (T input, ReductionOp reduction_op)
    Computes a block-wide reduction for thread0 using the specified binary reduction functor. Each thread contributes one input element.

template<int ITEMS_PER_THREAD, typename ReductionOp >
__device__ __forceinline__ T Reduce (T(&inputs)[ITEMS_PER_THREAD], ReductionOp reduction_op)
    Computes a block-wide reduction for thread0 using the specified binary reduction functor. Each thread contributes an array of consecutive input elements.

template<typename ReductionOp >
__device__ __forceinline__ T Reduce (T input, ReductionOp reduction_op, int num_valid)
    Computes a block-wide reduction for thread0 using the specified binary reduction functor. The first num_valid threads each contribute one input element.
__device__ __forceinline__ T Sum (T input)
    Computes a block-wide reduction for thread0 using addition (+) as the reduction operator. Each thread contributes one input element.

template<int ITEMS_PER_THREAD>
__device__ __forceinline__ T Sum (T(&inputs)[ITEMS_PER_THREAD])
    Computes a block-wide reduction for thread0 using addition (+) as the reduction operator. Each thread contributes an array of consecutive input elements.

__device__ __forceinline__ T Sum (T input, int num_valid)
    Computes a block-wide reduction for thread0 using addition (+) as the reduction operator. The first num_valid threads each contribute one input element.
|
enum | { BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z
} |
| Constants. More...
|
|
typedef BlockReduceWarpReductions< T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH > | WarpReductions |
|
typedef BlockReduceRakingCommutativeOnly< T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH > | RakingCommutativeOnly |
|
typedef BlockReduceRaking< T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH > | Raking |
|
typedef If<(ALGORITHM==BLOCK_REDUCE_WARP_REDUCTIONS), WarpReductions, typename If<(ALGORITHM==BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY), RakingCommutativeOnly, Raking >::Type >::Type | InternalBlockReduce |
| Internal specialization type.
|
|
typedef InternalBlockReduce::TempStorage | _TempStorage |
| Shared memory storage layout type for BlockReduce.
|
|
template<typename T , int BLOCK_DIM_X, BlockReduceAlgorithm ALGORITHM = BLOCK_REDUCE_WARP_REDUCTIONS, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>
__device__ __forceinline__ cub::BlockReduce< T, BLOCK_DIM_X, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::BlockReduce ( TempStorage & temp_storage )  [inline]
Collective constructor using the specified memory allocation as temporary storage.
- Parameters
-
[in] | temp_storage | Reference to memory allocation having layout type TempStorage |
Definition at line 298 of file block_reduce.cuh.
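As a sketch of how this constructor is typically used (the kernel name ExampleKernel and the pointers d_in/d_out are placeholders), the caller declares the TempStorage allocation in shared memory, passes it to the constructor, and separates successive collective calls that reuse the same allocation with __syncthreads():
#include <cub/cub.cuh>

__global__ void ExampleKernel(int *d_in, int *d_out)
{
    typedef cub::BlockReduce<int, 128> BlockReduce;

    // The explicit allocation lets the caller control placement and reuse
    __shared__ typename BlockReduce::TempStorage temp_storage;

    int item = d_in[threadIdx.x];

    // First collective call, constructed around the explicit allocation
    int block_sum = BlockReduce(temp_storage).Sum(item);

    // Barrier before the same allocation is reused by another collective call
    __syncthreads();

    int block_max = BlockReduce(temp_storage).Reduce(item, cub::Max());

    if (threadIdx.x == 0) { d_out[0] = block_sum; d_out[1] = block_max; }
}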
template<typename T , int BLOCK_DIM_X, BlockReduceAlgorithm ALGORITHM = BLOCK_REDUCE_WARP_REDUCTIONS, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>
template<typename ReductionOp >
__device__ __forceinline__ T cub::BlockReduce< T, BLOCK_DIM_X, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::Reduce ( T input, ReductionOp reduction_op )  [inline]
Computes a block-wide reduction for thread0 using the specified binary reduction functor. Each thread contributes one input element.
- The return value is undefined in threads other than thread0.
- For multi-dimensional thread blocks, threads are linearly ranked in row-major order.
- A subsequent __syncthreads() thread block barrier should be invoked after calling this method if the collective's temporary storage (e.g., temp_storage) is to be reused or repurposed.
- Snippet
- The code snippet below illustrates a max reduction of 128 integer items that are partitioned across 128 threads.
#include <cub/cub.cuh>
__global__ void ExampleKernel(...)
{
int thread_data;
...
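- A fuller sketch of this snippet, assuming a hypothetical kernel BlockMaxKernel with illustrative pointers d_in/d_out and using the cub::Max() functor as the reduction operator:
#include <cub/cub.cuh>

// Hypothetical kernel: 128 threads each contribute one item of d_in
__global__ void BlockMaxKernel(int *d_in, int *d_out)
{
    // Specialize BlockReduce for a 1D block of 128 threads of type int
    typedef cub::BlockReduce<int, 128> BlockReduce;

    // Allocate shared memory for BlockReduce
    __shared__ typename BlockReduce::TempStorage temp_storage;

    // Each thread obtains one input item
    int thread_data = d_in[threadIdx.x];

    // Compute the block-wide max (valid only in thread0)
    int aggregate = BlockReduce(temp_storage).Reduce(thread_data, cub::Max());

    if (threadIdx.x == 0)
        *d_out = aggregate;
}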
- Template Parameters
-
ReductionOp | [inferred] Binary reduction functor type having member T operator()(const T &a, const T &b) |
- Parameters
-
[in] | input | Calling thread's input |
[in] | reduction_op | Binary reduction functor |
Definition at line 348 of file block_reduce.cuh.
template<typename T , int BLOCK_DIM_X, BlockReduceAlgorithm ALGORITHM = BLOCK_REDUCE_WARP_REDUCTIONS, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>
template<int ITEMS_PER_THREAD, typename ReductionOp >
__device__ __forceinline__ T cub::BlockReduce< T, BLOCK_DIM_X, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::Reduce ( T (&inputs)[ITEMS_PER_THREAD], ReductionOp reduction_op )  [inline]
Computes a block-wide reduction for thread0 using the specified binary reduction functor. Each thread contributes an array of consecutive input elements.
- The return value is undefined in threads other than thread0.
- Efficiency is increased with increased granularity (ITEMS_PER_THREAD). Performance is also typically increased until the additional register pressure or shared memory allocation size causes SM occupancy to fall too low.
- A subsequent __syncthreads() thread block barrier should be invoked after calling this method if the collective's temporary storage (e.g., temp_storage) is to be reused or repurposed.
- Snippet
- The code snippet below illustrates a max reduction of 512 integer items that are partitioned in a blocked arrangement across 128 threads where each thread owns 4 consecutive items.
#include <cub/cub.cuh>
__global__ void ExampleKernel(...)
{
int thread_data[4];
...
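- A fuller sketch, with a hypothetical kernel BlockMaxSegmentsKernel and illustrative pointers d_in/d_out, showing the blocked four-items-per-thread arrangement:
#include <cub/cub.cuh>

// Hypothetical kernel: 128 threads, each owning 4 consecutive items of d_in
__global__ void BlockMaxSegmentsKernel(int *d_in, int *d_out)
{
    typedef cub::BlockReduce<int, 128> BlockReduce;
    __shared__ typename BlockReduce::TempStorage temp_storage;

    // Load a blocked arrangement of 4 consecutive items per thread
    int thread_data[4];
    for (int i = 0; i < 4; ++i)
        thread_data[i] = d_in[threadIdx.x * 4 + i];

    // Block-wide max across all 512 items (valid only in thread0)
    int aggregate = BlockReduce(temp_storage).Reduce(thread_data, cub::Max());

    if (threadIdx.x == 0)
        *d_out = aggregate;
}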
- Template Parameters
-
ITEMS_PER_THREAD | [inferred] The number of consecutive items partitioned onto each thread. |
ReductionOp | [inferred] Binary reduction functor type having member T operator()(const T &a, const T &b) |
- Parameters
-
[in] | inputs | Calling thread's input segment |
[in] | reduction_op | Binary reduction functor |
Definition at line 395 of file block_reduce.cuh.
template<typename T , int BLOCK_DIM_X, BlockReduceAlgorithm ALGORITHM = BLOCK_REDUCE_WARP_REDUCTIONS, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>
template<typename ReductionOp >
__device__ __forceinline__ T cub::BlockReduce< T, BLOCK_DIM_X, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::Reduce ( T input, ReductionOp reduction_op, int num_valid )  [inline]
Computes a block-wide reduction for thread0 using the specified binary reduction functor. The first num_valid threads each contribute one input element.
- The return value is undefined in threads other than thread0.
- For multi-dimensional thread blocks, threads are linearly ranked in row-major order.
- A subsequent __syncthreads() thread block barrier should be invoked after calling this method if the collective's temporary storage (e.g., temp_storage) is to be reused or repurposed.
- Snippet
- The code snippet below illustrates a max reduction of a partially-full tile of integer items that are partitioned across 128 threads.
#include <cub/cub.cuh>
__global__ void ExampleKernel(int num_valid, ...)
{
int thread_data;
if (threadIdx.x < num_valid) thread_data = ...
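- A fuller sketch of the partial-tile case, with a hypothetical kernel PartialTileMaxKernel and illustrative pointers d_in/d_out:
#include <cub/cub.cuh>

// Hypothetical kernel: only the first num_valid of 128 threads hold valid items
__global__ void PartialTileMaxKernel(int *d_in, int *d_out, int num_valid)
{
    typedef cub::BlockReduce<int, 128> BlockReduce;
    __shared__ typename BlockReduce::TempStorage temp_storage;

    // Initialize defensively; only the first num_valid inputs participate
    int thread_data = 0;
    if (threadIdx.x < num_valid)
        thread_data = d_in[threadIdx.x];

    // Max over the first num_valid inputs (valid only in thread0)
    int aggregate = BlockReduce(temp_storage).Reduce(thread_data, cub::Max(), num_valid);

    if (threadIdx.x == 0)
        *d_out = aggregate;
}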
- Template Parameters
-
ReductionOp | [inferred] Binary reduction functor type having member T operator()(const T &a, const T &b) |
- Parameters
-
[in] | input | Calling thread's input |
[in] | reduction_op | Binary reduction functor |
[in] | num_valid | Number of threads containing valid elements (may be less than BLOCK_THREADS) |
Definition at line 440 of file block_reduce.cuh.
template<typename T , int BLOCK_DIM_X, BlockReduceAlgorithm ALGORITHM = BLOCK_REDUCE_WARP_REDUCTIONS, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>
__device__ __forceinline__ T cub::BlockReduce< T, BLOCK_DIM_X, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::Sum ( T input )  [inline]
Computes a block-wide reduction for thread0 using addition (+) as the reduction operator. Each thread contributes one input element.
- The return value is undefined in threads other than thread0.
- For multi-dimensional thread blocks, threads are linearly ranked in row-major order.
- A subsequent __syncthreads() thread block barrier should be invoked after calling this method if the collective's temporary storage (e.g., temp_storage) is to be reused or repurposed.
- Snippet
- The code snippet below illustrates a sum reduction of 128 integer items that are partitioned across 128 threads.
#include <cub/cub.cuh>
__global__ void ExampleKernel(...)
{
int thread_data;
...
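- A fuller sketch, with a hypothetical kernel BlockSumSingleKernel and illustrative pointers d_in/d_out:
#include <cub/cub.cuh>

// Hypothetical kernel: 128 threads each contribute one item of d_in
__global__ void BlockSumSingleKernel(int *d_in, int *d_out)
{
    typedef cub::BlockReduce<int, 128> BlockReduce;
    __shared__ typename BlockReduce::TempStorage temp_storage;

    // Each thread obtains one input item
    int thread_data = d_in[threadIdx.x];

    // Block-wide sum of all 128 items (valid only in thread0)
    int aggregate = BlockReduce(temp_storage).Sum(thread_data);

    if (threadIdx.x == 0)
        *d_out = aggregate;
}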
- Parameters
-
[in] | input | Calling thread's input |
Definition at line 497 of file block_reduce.cuh.
template<typename T , int BLOCK_DIM_X, BlockReduceAlgorithm ALGORITHM = BLOCK_REDUCE_WARP_REDUCTIONS, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>
template<int ITEMS_PER_THREAD>
__device__ __forceinline__ T cub::BlockReduce< T, BLOCK_DIM_X, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::Sum ( T (&inputs)[ITEMS_PER_THREAD] )  [inline]
Computes a block-wide reduction for thread0 using addition (+) as the reduction operator. Each thread contributes an array of consecutive input elements.
- The return value is undefined in threads other than thread0.
- Efficiency is increased with increased granularity (ITEMS_PER_THREAD). Performance is also typically increased until the additional register pressure or shared memory allocation size causes SM occupancy to fall too low.
- A subsequent __syncthreads() thread block barrier should be invoked after calling this method if the collective's temporary storage (e.g., temp_storage) is to be reused or repurposed.
- Snippet
- The code snippet below illustrates a sum reduction of 512 integer items that are partitioned in a blocked arrangement across 128 threads where each thread owns 4 consecutive items.
#include <cub/cub.cuh>
__global__ void ExampleKernel(...)
{
int thread_data[4];
...
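- A fuller sketch, with a hypothetical kernel BlockSumSegmentsKernel and illustrative pointers d_in/d_out; here cub::LoadDirectBlocked is used as one way to load the blocked arrangement:
#include <cub/cub.cuh>

// Hypothetical kernel: 128 threads summing 512 items, 4 consecutive per thread
__global__ void BlockSumSegmentsKernel(int *d_in, int *d_out)
{
    typedef cub::BlockReduce<int, 128> BlockReduce;
    __shared__ typename BlockReduce::TempStorage temp_storage;

    // Load a blocked arrangement of 4 consecutive items per thread
    int thread_data[4];
    cub::LoadDirectBlocked(threadIdx.x, d_in, thread_data);

    // Block-wide sum of all 512 items (valid only in thread0)
    int aggregate = BlockReduce(temp_storage).Sum(thread_data);

    if (threadIdx.x == 0)
        *d_out = aggregate;
}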
- Template Parameters
-
ITEMS_PER_THREAD | [inferred] The number of consecutive items partitioned onto each thread. |
- Parameters
-
[in] | inputs | Calling thread's input segment |
Definition at line 539 of file block_reduce.cuh.
template<typename T , int BLOCK_DIM_X, BlockReduceAlgorithm ALGORITHM = BLOCK_REDUCE_WARP_REDUCTIONS, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>
__device__ __forceinline__ T cub::BlockReduce< T, BLOCK_DIM_X, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::Sum ( T input, int num_valid )  [inline]
Computes a block-wide reduction for thread0 using addition (+) as the reduction operator. The first num_valid threads each contribute one input element.
- The return value is undefined in threads other than thread0.
- For multi-dimensional thread blocks, threads are linearly ranked in row-major order.
- A subsequent __syncthreads() thread block barrier should be invoked after calling this method if the collective's temporary storage (e.g., temp_storage) is to be reused or repurposed.
- Snippet
- The code snippet below illustrates a sum reduction of a partially-full tile of integer items that are partitioned across 128 threads.
#include <cub/cub.cuh>
__global__ void ExampleKernel(int num_valid, ...)
{
int thread_data;
if (threadIdx.x < num_valid)
thread_data = ...
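- A fuller sketch of the partial-tile sum, with a hypothetical kernel PartialTileSumKernel and illustrative pointers d_in/d_out:
#include <cub/cub.cuh>

// Hypothetical kernel: only the first num_valid of 128 threads hold valid items
__global__ void PartialTileSumKernel(int *d_in, int *d_out, int num_valid)
{
    typedef cub::BlockReduce<int, 128> BlockReduce;
    __shared__ typename BlockReduce::TempStorage temp_storage;

    // Initialize defensively; only the first num_valid inputs participate
    int thread_data = 0;
    if (threadIdx.x < num_valid)
        thread_data = d_in[threadIdx.x];

    // Sum of the first num_valid inputs (valid only in thread0)
    int aggregate = BlockReduce(temp_storage).Sum(thread_data, num_valid);

    if (threadIdx.x == 0)
        *d_out = aggregate;
}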
- Parameters
-
[in] | input | Calling thread's input |
[in] | num_valid | Number of threads containing valid elements (may be less than BLOCK_THREADS) |
Definition at line 582 of file block_reduce.cuh.