#include "../../warp/warp_reduce.cuh"
#include "../../util_ptx.cuh"
#include "../../util_arch.cuh"
#include "../../util_namespace.cuh"
// Excerpts from the BlockReduceWarpReductions specialization ("..." marks elided code)

// Constructor (initializer-list fragment)
__device__ __forceinline__ BlockReduceWarpReductions(TempStorage &temp_storage)
:
    temp_storage(temp_storage.Alias()),
    linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)),
    // ... warp/lane identifier initializers elided ...
{}

// Recursive step: fold warp SUCCESSOR_WARP's shared aggregate into the running total
template <bool FULL_TILE, typename ReductionOp, int SUCCESSOR_WARP>
__device__ __forceinline__ T ApplyWarpAggregates(
    ReductionOp reduction_op, T warp_aggregate, int num_valid, Int2Type<SUCCESSOR_WARP>)
{
    // ...
    T addend = temp_storage.warp_aggregates[SUCCESSOR_WARP];
    // ... combine addend into warp_aggregate, then recurse on the next successor warp ...
}

// Termination case: every warp's aggregate has been folded in
template <bool FULL_TILE, typename ReductionOp>
__device__ __forceinline__ T ApplyWarpAggregates(
    ReductionOp, T warp_aggregate, int, Int2Type<WARPS>)
{
    return warp_aggregate;
}

// Returns the block-wide aggregate in thread0
template <bool FULL_TILE, typename ReductionOp>
__device__ __forceinline__ T ApplyWarpAggregates(
    ReductionOp reduction_op, T warp_aggregate, int num_valid)
{
    // Share the warp aggregate (in the full source only lane 0 performs this store)
    temp_storage.warp_aggregates[warp_id] = warp_aggregate;
    // ... barrier elided; thread0 then folds in the successor warps' aggregates ...
    return warp_aggregate;
}

// Computes a thread block-wide reduction using addition (+) as the reduction operator
template <bool FULL_TILE>
__device__ __forceinline__ T Sum(T input, int num_valid)
{
    // ... elided: reduction_op is a cub::Sum functor; warp_num_valid is LOGICAL_WARP_SIZE
    //     for full warps, otherwise num_valid - warp_offset ...
    T warp_aggregate = WarpReduce(temp_storage.warp_reduce[warp_id]).template Reduce<(FULL_TILE && EVEN_WARP_MULTIPLE)>(
        input, warp_num_valid, reduction_op);
    // Combine the per-warp aggregates into the block-wide aggregate
    return ApplyWarpAggregates<FULL_TILE>(reduction_op, warp_aggregate, num_valid);
}

// Computes a thread block-wide reduction using the specified reduction operator
template <bool FULL_TILE, typename ReductionOp>
__device__ __forceinline__ T Reduce(T input, int num_valid, ReductionOp reduction_op)
{
    // ... warp_num_valid computed as in Sum() ...
    T warp_aggregate = WarpReduce(temp_storage.warp_reduce[warp_id]).template Reduce<(FULL_TILE && EVEN_WARP_MULTIPLE)>(
        input, warp_num_valid, reduction_op);
    return ApplyWarpAggregates<FULL_TILE>(reduction_op, warp_aggregate, num_valid);
}
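Both Sum and Reduce follow the same two-phase pattern: every warp reduces its own inputs, each warp's lane 0 publishes its partial result to shared memory, and thread 0 folds the per-warp partials into the block-wide aggregate. A minimal, self-contained sketch of that pattern (an illustration using warp shuffles, not CUB's actual implementation; it assumes a 1D block whose size is a multiple of 32):

__global__ void BlockSumSketch(const float *d_in, float *d_out)
{
    const int WARP_SIZE = 32;
    const int NUM_WARPS = blockDim.x / WARP_SIZE;    // assumes blockDim.x % 32 == 0

    __shared__ float warp_aggregates[32];            // one slot per warp (<= 32 warps per block)

    int lane_id = threadIdx.x % WARP_SIZE;
    int warp_id = threadIdx.x / WARP_SIZE;

    // Phase 1: warp-level reduction with shuffle-down
    float val = d_in[blockIdx.x * blockDim.x + threadIdx.x];
    for (int offset = WARP_SIZE / 2; offset > 0; offset /= 2)
        val += __shfl_down_sync(0xffffffff, val, offset);

    // Lane 0 of each warp shares its warp aggregate
    if (lane_id == 0)
        warp_aggregates[warp_id] = val;
    __syncthreads();

    // Phase 2: thread 0 folds the per-warp aggregates into the block-wide result
    if (threadIdx.x == 0)
    {
        float block_aggregate = warp_aggregates[0];
        for (int w = 1; w < NUM_WARPS; ++w)
            block_aggregate += warp_aggregates[w];
        d_out[blockIdx.x] = block_aggregate;
    }
}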
#define CUB_MIN(a, b)
Select minimum(a, b)
__device__ __forceinline__ int RowMajorTid(int block_dim_x, int block_dim_y, int block_dim_z)
Returns the row-major linear thread identifier for a multidimensional thread block.
__device__ __forceinline__ unsigned int LaneId()
Returns the warp lane ID of the calling thread.
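A minimal sketch of what these two helpers compute (illustrative implementations under common assumptions, not the definitions from util_ptx.cuh; the Sketch suffix marks them as hypothetical):

// Row-major linear thread id: x varies fastest, then y, then z.
__device__ __forceinline__ int RowMajorTidSketch(int block_dim_x, int block_dim_y, int block_dim_z)
{
    (void)block_dim_z;  // the slowest-varying dimension does not enter the formula
    return threadIdx.x + block_dim_x * (threadIdx.y + block_dim_y * threadIdx.z);
}

// Warp lane id: the thread's position within its 32-thread warp.
__device__ __forceinline__ unsigned int LaneIdSketch()
{
    return threadIdx.x & 31;   // valid for 1D blocks; CUB's LaneId() reads the hardware %laneid register
}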
Alias wrapper allowing storage to be unioned.
Shared memory storage layout type.
T warp_aggregates[WARPS]
Shared totals from each warp-synchronous reduction.
T block_prefix
Shared prefix for the entire thread block.
WarpReduce::TempStorage warp_reduce[WARPS]
Buffer for warp-synchronous reduction.
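Read together, the three members above describe a shared-memory layout along these lines (a sketch assembled from the descriptions here; the header's exact declaration may differ):

// Shared memory storage layout type
struct _TempStorage
{
    typename WarpReduce::TempStorage warp_reduce[WARPS];      // per-warp reduction buffers
    T                                warp_aggregates[WARPS];  // shared totals from each warp
    T                                block_prefix;            // shared prefix for the entire thread block
};

// Alias wrapper allowing storage to be unioned
struct TempStorage : Uninitialized<_TempStorage> {};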
BlockReduceWarpReductions provides variants of warp-reduction-based parallel reduction across a CUDA thread block.
__device__ __forceinline__ BlockReduceWarpReductions(TempStorage &temp_storage)
Constructor.
__device__ __forceinline__ T Sum(T input, int num_valid)
Computes a thread block-wide reduction using addition (+) as the reduction operator. The first num_valid threads each contribute one reduction partial. The return value is only valid for thread0.
BLOCK_THREADS
The thread block size in threads.
WARP_THREADS
Number of warp threads.
EVEN_WARP_MULTIPLE
Whether or not the logical warp size evenly divides the thread block size.
LOGICAL_WARP_SIZE
The logical warp size for warp reductions.
WARPS
Number of active warps.
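These are compile-time constants derived from the block shape and the warp width. A sketch of plausible definitions (illustrative expressions using the CUB_MIN macro noted above; the header's exact definitions may differ):

enum
{
    // The thread block size in threads
    BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,

    // Number of warp threads (32 on current NVIDIA architectures)
    WARP_THREADS = 32,

    // Number of active warps, rounded up to cover a partial warp
    WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS,

    // The logical warp size for warp reductions (a block smaller than a warp uses one short warp)
    LOGICAL_WARP_SIZE = CUB_MIN(BLOCK_THREADS, WARP_THREADS),

    // Whether or not the logical warp size evenly divides the thread block size
    EVEN_WARP_MULTIPLE = (BLOCK_THREADS % LOGICAL_WARP_SIZE == 0),
};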
__device__ __forceinline__ T ApplyWarpAggregates(ReductionOp, T warp_aggregate, int, Int2Type< WARPS >)
Termination of the warp-aggregate folding (all WARPS warp aggregates have been applied); returns warp_aggregate unchanged.
__device__ __forceinline__ T Reduce(T input, int num_valid, ReductionOp reduction_op)
Computes a thread block-wide reduction using the specified reduction operator. The first num_valid threads each contribute one reduction partial. The return value is only valid for thread0.
WarpReduce< T, LOGICAL_WARP_SIZE, PTX_ARCH >::InternalWarpReduce WarpReduce
WarpReduce utility type.
__device__ __forceinline__ T ApplyWarpAggregates(ReductionOp reduction_op, T warp_aggregate, int num_valid)
Returns block-wide aggregate in thread0.
__device__ __forceinline__ T ApplyWarpAggregates(ReductionOp reduction_op, T warp_aggregate, int num_valid, Int2Type< SUCCESSOR_WARP >)
Recursive step of the warp-aggregate folding: folds warp SUCCESSOR_WARP's shared aggregate into the running total, then recurses on the next successor warp.
Int2Type
Allows for the treatment of an integral constant as a type at compile-time (e.g., to achieve static call dispatch based on constant integral values).
Uninitialized< T >
A storage-backing wrapper that allows types with non-trivial constructors to be aliased in unions.
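For context, this specialization is normally reached through the public cub::BlockReduce interface by selecting the BLOCK_REDUCE_WARP_REDUCTIONS algorithm. A minimal usage sketch (kernel name, block size, and launch shape are illustrative):

#include <cub/block/block_reduce.cuh>

// One partial sum per 128-thread block; only thread 0 holds the block-wide result.
__global__ void BlockSumKernel(const int *d_in, int *d_out, int num_items)
{
    typedef cub::BlockReduce<int, 128, cub::BLOCK_REDUCE_WARP_REDUCTIONS> BlockReduceT;
    __shared__ typename BlockReduceT::TempStorage temp_storage;

    int block_offset = blockIdx.x * blockDim.x;
    int idx          = block_offset + threadIdx.x;
    int item         = (idx < num_items) ? d_in[idx] : 0;

    // A partial final tile is handled via num_valid: only the first num_valid threads contribute.
    int num_valid = min(num_items - block_offset, (int)blockDim.x);

    int block_sum = BlockReduceT(temp_storage).Sum(item, num_valid);

    if (threadIdx.x == 0)
        d_out[blockIdx.x] = block_sum;
}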