BlockReduceWarpReductions provides variants of warp-reduction-based parallel reduction across a CUDA thread block. Supports non-commutative reduction operators.
More...
template<typename T, int BLOCK_DIM_X, int BLOCK_DIM_Y, int BLOCK_DIM_Z, int PTX_ARCH>
struct cub::BlockReduceWarpReductions< T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >
BlockReduceWarpReductions provides variants of warp-reduction-based parallel reduction across a CUDA thread block. Supports non-commutative reduction operators.
Template parameter PTX_ARCH: the PTX compute capability for which to specialize this collective.
Definition at line 57 of file block_reduce_warp_reductions.cuh.
|
__device__ __forceinline__ | BlockReduceWarpReductions (TempStorage &temp_storage) |
| Constructor.
|
|
template<bool FULL_TILE, typename ReductionOp , int SUCCESSOR_WARP> |
__device__ __forceinline__ T | ApplyWarpAggregates (ReductionOp reduction_op, T warp_aggregate, int num_valid, Int2Type< SUCCESSOR_WARP >) |
|
template<bool FULL_TILE, typename ReductionOp > |
__device__ __forceinline__ T | ApplyWarpAggregates (ReductionOp, T warp_aggregate, int, Int2Type< WARPS >) |
|
template<bool FULL_TILE, typename ReductionOp > |
__device__ __forceinline__ T | ApplyWarpAggregates (ReductionOp reduction_op, T warp_aggregate, int num_valid) |
| Returns block-wide aggregate in thread0. More...
|
|
template<bool FULL_TILE> |
__device__ __forceinline__ T | Sum (T input, int num_valid) |
| Computes a thread block-wide reduction using addition (+) as the reduction operator. The first num_valid threads each contribute one reduction partial. The return value is only valid for thread0. More...
|
|
template<bool FULL_TILE, typename ReductionOp > |
__device__ __forceinline__ T | Reduce (T input, int num_valid, ReductionOp reduction_op) |
| Computes a thread block-wide reduction using the specified reduction operator. The first num_valid threads each contribute one reduction partial. The return value is only valid for thread0. More...
|
|
◆ anonymous enum
template<typename T , int BLOCK_DIM_X, int BLOCK_DIM_Y, int BLOCK_DIM_Z, int PTX_ARCH>
Constants.
Enumerator |
---|
BLOCK_THREADS | The thread block size in threads.
|
WARP_THREADS | Number of warp threads.
|
WARPS | Number of active warps.
|
LOGICAL_WARP_SIZE | The logical warp size for warp reductions.
|
EVEN_WARP_MULTIPLE | Whether or not the logical warp size evenly divides the thread block size.
|
Definition at line 60 of file block_reduce_warp_reductions.cuh.
◆ ApplyWarpAggregates() [1/3]
template<typename T , int BLOCK_DIM_X, int BLOCK_DIM_Y, int BLOCK_DIM_Z, int PTX_ARCH>
template<bool FULL_TILE, typename ReductionOp , int SUCCESSOR_WARP>
__device__ __forceinline__ T cub::BlockReduceWarpReductions< T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::ApplyWarpAggregates |
( |
ReductionOp |
reduction_op, |
|
|
T |
warp_aggregate, |
|
|
int |
num_valid, |
|
|
Int2Type< SUCCESSOR_WARP > |
|
|
) |
| |
|
inline |
- Parameters
-
[in] | reduction_op | Binary reduction operator |
[in] | warp_aggregate | [lane0 only] Warp-wide aggregate reduction of input items |
[in] | num_valid | Number of valid elements (may be less than BLOCK_THREADS) |
Definition at line 114 of file block_reduce_warp_reductions.cuh.
◆ ApplyWarpAggregates() [2/3]
template<typename T , int BLOCK_DIM_X, int BLOCK_DIM_Y, int BLOCK_DIM_Z, int PTX_ARCH>
template<bool FULL_TILE, typename ReductionOp >
◆ ApplyWarpAggregates() [3/3]
template<typename T , int BLOCK_DIM_X, int BLOCK_DIM_Y, int BLOCK_DIM_Z, int PTX_ARCH>
template<bool FULL_TILE, typename ReductionOp >
__device__ __forceinline__ T cub::BlockReduceWarpReductions< T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::ApplyWarpAggregates |
( |
ReductionOp |
reduction_op, |
|
|
T |
warp_aggregate, |
|
|
int |
num_valid |
|
) |
| |
|
inline |
Returns block-wide aggregate in thread0.
- Parameters
-
[in] | reduction_op | Binary reduction operator |
[in] | warp_aggregate | [lane0 only] Warp-wide aggregate reduction of input items |
[in] | num_valid | Number of valid elements (may be less than BLOCK_THREADS) |
Definition at line 143 of file block_reduce_warp_reductions.cuh.
◆ Reduce()
template<typename T , int BLOCK_DIM_X, int BLOCK_DIM_Y, int BLOCK_DIM_Z, int PTX_ARCH>
template<bool FULL_TILE, typename ReductionOp >
__device__ __forceinline__ T cub::BlockReduceWarpReductions< T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::Reduce |
( |
T |
input, |
|
|
int |
num_valid, |
|
|
ReductionOp |
reduction_op |
|
) |
| |
|
inline |
Computes a thread block-wide reduction using the specified reduction operator. The first num_valid threads each contribute one reduction partial. The return value is only valid for thread0.
- Parameters
-
[in] | input | Calling thread's input partial reductions |
[in] | num_valid | Number of valid elements (may be less than BLOCK_THREADS) |
[in] | reduction_op | Binary reduction operator |
Definition at line 193 of file block_reduce_warp_reductions.cuh.
◆ Sum()
template<typename T , int BLOCK_DIM_X, int BLOCK_DIM_Y, int BLOCK_DIM_Z, int PTX_ARCH>
template<bool FULL_TILE>
Computes a thread block-wide reduction using addition (+) as the reduction operator. The first num_valid threads each contribute one reduction partial. The return value is only valid for thread0.
- Parameters
-
[in] | input | Calling thread's input partial reductions |
[in] | num_valid | Number of valid elements (may be less than BLOCK_THREADS) |
Definition at line 168 of file block_reduce_warp_reductions.cuh.
The documentation for this struct was generated from the following file: