36 #include "../../warp/warp_reduce.cuh" 37 #include "../../util_ptx.cuh" 38 #include "../../util_arch.cuh" 39 #include "../../util_namespace.cuh" 106 temp_storage(temp_storage.Alias()),
107 linear_tid(
RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)),
113 template <
bool FULL_TILE,
typename ReductionOp,
int SUCCESSOR_WARP>
122 T addend = temp_storage.warp_aggregates[SUCCESSOR_WARP];
128 template <
bool FULL_TILE,
typename ReductionOp>
135 return warp_aggregate;
142 typename ReductionOp>
151 temp_storage.warp_aggregates[warp_id] = warp_aggregate;
162 return warp_aggregate;
167 template <
bool FULL_TILE>
168 __device__ __forceinline__ T
Sum(
176 num_valid - warp_offset;
179 T warp_aggregate =
WarpReduce(temp_storage.warp_reduce[warp_id]).template Reduce<(FULL_TILE && EVEN_WARP_MULTIPLE)>(
185 return ApplyWarpAggregates<FULL_TILE>(
reduction_op, warp_aggregate, num_valid);
192 typename ReductionOp>
201 num_valid - warp_offset;
204 T warp_aggregate =
WarpReduce(temp_storage.warp_reduce[warp_id]).template Reduce<(FULL_TILE && EVEN_WARP_MULTIPLE)>(
210 return ApplyWarpAggregates<FULL_TILE>(
reduction_op, warp_aggregate, num_valid);
__device__ __forceinline__ BlockReduceWarpReductions(TempStorage &temp_storage)
Constructor.
Optional outer namespace(s)
Shared memory storage layout type.
Alias wrapper allowing storage to be unioned.
The logical warp size for warp reductions.
BlockReduceWarpReductions provides variants of warp-reduction-based parallel reduction across a CUDA thread block. Supports non-commutative reduction operators.
T block_prefix
Shared prefix for the entire thread block.
__device__ __forceinline__ unsigned int LaneId()
Returns the warp lane ID of the calling thread.
WarpReduce::TempStorage warp_reduce[WARPS]
Buffer for warp-synchronous scan.
__device__ __forceinline__ T Reduce(T input, int num_valid, ReductionOp reduction_op)
Computes a thread block-wide reduction using the specified reduction operator. The first num_valid threads each contribute one reduction partial. The return value is only valid for thread0.
__device__ __forceinline__ T Sum(T input, int num_valid)
Computes a thread block-wide reduction using addition (+) as the reduction operator. The first num_valid threads each contribute one reduction partial. The return value is only valid for thread0.
Whether or not the logical warp size evenly divides the thread block size.
If<(PTX_ARCH >= 300) && (IS_POW_OF_TWO), WarpReduceShfl< T, LOGICAL_WARP_THREADS, PTX_ARCH >, WarpReduceSmem< T, LOGICAL_WARP_THREADS, PTX_ARCH > >::Type InternalWarpReduce
Internal specialization. Use SHFL-based reduction if (architecture is >= SM30) and (LOGICAL_WARP_THREADS is a power-of-two).
OutputIteratorT OffsetT GridEvenShare< OffsetT > ReductionOpT reduction_op
[in] Binary reduction functor
Allows for the treatment of an integral constant as a type at compile-time (e.g., to achieve static call dispatch based on constant integral values).
__device__ __forceinline__ int RowMajorTid(int block_dim_x, int block_dim_y, int block_dim_z)
Returns the row-major linear thread identifier for a multidimensional thread block.
__device__ __forceinline__ T ApplyWarpAggregates(ReductionOp reduction_op, T warp_aggregate, int num_valid)
Returns block-wide aggregate in thread0.
A storage-backing wrapper that allows types with non-trivial constructors to be aliased in unions.
The thread block size in threads.
__device__ __forceinline__ T ApplyWarpAggregates(ReductionOp reduction_op, T warp_aggregate, int num_valid, Int2Type< SUCCESSOR_WARP >)
#define CUB_MIN(a, b)
Select minimum(a, b)
WarpReduce< T, LOGICAL_WARP_SIZE, PTX_ARCH >::InternalWarpReduce WarpReduce
WarpReduce utility type.
T warp_aggregates[WARPS]
Shared totals from each warp-synchronous scan.
__device__ __forceinline__ T ApplyWarpAggregates(ReductionOp, T warp_aggregate, int, Int2Type< WARPS >)