36#include "../../block/block_raking_layout.cuh"
37#include "../../warp/warp_reduce.cuh"
38#include "../../thread/thread_reduce.cuh"
39#include "../../util_ptx.cuh"
40#include "../../util_namespace.cuh"
118 unsigned int linear_tid;
125 temp_storage(temp_storage.Alias()),
126 linear_tid(
RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
130 template <
bool IS_FULL_TILE,
typename ReductionOp,
int ITERATION>
141 T addend = raking_segment[ITERATION];
147 template <
bool IS_FULL_TILE,
typename ReductionOp>
163 typename ReductionOp>
172 partial =
WarpReduce(temp_storage.warp_storage).template Reduce<IS_FULL_TILE>(
189 partial = raking_segment[0];
193 int valid_raking_threads = (IS_FULL_TILE) ?
197 partial =
WarpReduce(temp_storage.warp_storage).template Reduce<IS_FULL_TILE && RAKING_UNGUARDED>(
199 valid_raking_threads,
210 template <
bool IS_FULL_TILE>
211 __device__ __forceinline__ T
Sum(
217 return Reduce<IS_FULL_TILE>(partial, num_valid,
reduction_op);
BlockRadixRank provides operations for ranking unsigned integer types within a CUDA thread block.
__device__ __forceinline__ int RowMajorTid(int block_dim_x, int block_dim_y, int block_dim_z)
Returns the row-major linear thread identifier for a multidimensional thread block.
Optional outer namespace(s)
OutputIteratorT OffsetT GridEvenShare< OffsetT > ReductionOpT reduction_op
< [in] Binary reduction functor
Alias wrapper allowing storage to be unioned.
@ UNGUARDED
Whether or not we need bounds checking during raking (the number of reduction elements is not a multiple of the number of raking threads)
@ SEGMENT_LENGTH
Number of raking elements per warp-synchronous raking thread (rounded up)
@ RAKING_THREADS
Never use a raking thread that will have no valid data (e.g., when BLOCK_THREADS is 62 and SEGMENT_LENGTH is 2, we should only use 31 raking threads)
static __device__ __forceinline__ T * PlacementPtr(TempStorage &temp_storage, unsigned int linear_tid)
Returns the location for the calling thread to place data into the grid.
static __device__ __forceinline__ T * RakingPtr(TempStorage &temp_storage, unsigned int linear_tid)
Returns the location for the calling thread to begin sequential raking.
Alias wrapper allowing storage to be unioned.
BlockReduceRaking provides raking-based methods of parallel reduction across a CUDA thread block. Supports non-commutative reduction operators.
__device__ __forceinline__ BlockReduceRaking(TempStorage &temp_storage)
Constructor.
__device__ __forceinline__ T RakingReduction(ReductionOp, T *, T partial, int, Int2Type< SEGMENT_LENGTH >)
__device__ __forceinline__ T Reduce(T partial, int num_valid, ReductionOp reduction_op)
Computes a thread block-wide reduction using the specified reduction operator. The first num_valid threads each contribute one reduction partial.
@ BLOCK_THREADS
The thread block size in threads.
__device__ __forceinline__ T Sum(T partial, int num_valid)
Computes a thread block-wide reduction using addition (+) as the reduction operator. The first num_valid threads each contribute one reduction partial.
@ WARP_SYNCHRONOUS
Cooperative work can be entirely warp synchronous.
@ WARP_SYNCHRONOUS_UNGUARDED
Whether or not warp-synchronous reduction should be unguarded (i.e., the number of warp-reduction elements is a power of two)
@ RAKING_THREADS
Number of raking threads.
@ SEGMENT_LENGTH
Number of raking elements per warp synchronous raking thread.
@ RAKING_UNGUARDED
Whether or not accesses into smem are unguarded.
WarpReduce< T, BlockRakingLayout::RAKING_THREADS, PTX_ARCH >::InternalWarpReduce WarpReduce
WarpReduce utility type.
__device__ __forceinline__ T RakingReduction(ReductionOp reduction_op, T *raking_segment, T partial, int num_valid, Int2Type< ITERATION >)
BlockRakingLayout< T, BLOCK_THREADS, PTX_ARCH > BlockRakingLayout
Layout type for padded thread block raking grid.
Allows for the treatment of an integral constant as a type at compile-time (e.g., to achieve static call dispatch based on constant integral values)
Statically determine if N is a power-of-two.
A storage-backing wrapper that allows types with non-trivial constructors to be aliased in unions.
Shared memory storage layout type.
WarpReduce::TempStorage warp_storage
Storage for warp-synchronous reduction.
BlockRakingLayout::TempStorage raking_grid
Padded thread block raking grid.