#include "../../block/block_raking_layout.cuh"
#include "../../warp/warp_reduce.cuh"
#include "../../thread/thread_reduce.cuh"
#include "../../util_ptx.cuh"
#include "../../util_namespace.cuh"

/// Linear thread-id
unsigned int linear_tid;
/// Constructor
__device__ __forceinline__ BlockReduceRaking(TempStorage &temp_storage)
:
    temp_storage(temp_storage.Alias()),
    linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
{}
/// Raking reduction: fold element ITERATION of this thread's raking segment into partial,
/// then recurse on the next compile-time iteration.
template <bool IS_FULL_TILE, typename ReductionOp, int ITERATION>
__device__ __forceinline__ T RakingReduction(
    ReductionOp reduction_op, T *raking_segment, T partial, int num_valid, Int2Type<ITERATION> /*iteration*/)
{
    // Update partial only if the addend is in range (unguarded full tiles skip the check)
    if ((IS_FULL_TILE && RAKING_UNGUARDED) || (((linear_tid * SEGMENT_LENGTH) + ITERATION) < num_valid))
    {
        T addend = raking_segment[ITERATION];
        partial = reduction_op(partial, addend);
    }
    return RakingReduction<IS_FULL_TILE>(reduction_op, raking_segment, partial, num_valid, Int2Type<ITERATION + 1>());
}
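The ITERATION parameter acts as a compile-time loop counter: each call recurses with Int2Type<ITERATION + 1> until overload resolution selects the Int2Type<SEGMENT_LENGTH> terminator below, so the raking loop is fully unrolled at compile time. A minimal standalone sketch of the same idiom (the names Int2TypeSketch, SumUnrolled, and LEN are illustrative, not part of CUB):

// Illustrative stand-in for cub::Int2Type
template <int N> struct Int2TypeSketch { enum { VALUE = N }; };

enum { LEN = 4 };  // illustrative segment length

// Terminator: selected by overload resolution once the counter reaches LEN
__device__ int SumUnrolled(const int * /*data*/, int partial, Int2TypeSketch<LEN>)
{
    return partial;
}

// Recursive case: fold in element I, then advance the compile-time counter
template <int I>
__device__ int SumUnrolled(const int *data, int partial, Int2TypeSketch<I>)
{
    return SumUnrolled(data, partial + data[I], Int2TypeSketch<I + 1>());
}

// Usage inside a kernel: int total = SumUnrolled(data, 0, Int2TypeSketch<0>());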
/// Raking reduction: recursion terminator (the whole segment has been consumed)
template <bool IS_FULL_TILE, typename ReductionOp>
__device__ __forceinline__ T RakingReduction(
    ReductionOp /*reduction_op*/, T * /*raking_segment*/, T partial, int /*num_valid*/, Int2Type<SEGMENT_LENGTH> /*iteration*/)
{
    return partial;
}
/// Computes a thread block-wide reduction using the specified reduction operator.
/// The first num_valid threads each contribute one reduction partial.
/// The return value is only valid for thread 0.
template <bool IS_FULL_TILE, typename ReductionOp>
__device__ __forceinline__ T Reduce(T partial, int num_valid, ReductionOp reduction_op)
{
    if (WARP_SYNCHRONOUS)
    {
        // Cooperative work can be entirely warp synchronous
        partial = WarpReduce(temp_storage.warp_storage).template Reduce<IS_FULL_TILE>(
            partial, num_valid, reduction_op);
    }
    else
    {
        // Place partial into the shared-memory raking grid
        *BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid) = partial;
        CTA_SYNC();

        // Reduce parallelism to one raking warp
        if (linear_tid < RAKING_THREADS)
        {
            // Sequential raking reduction across this thread's segment
            T *raking_segment = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid);
            partial = raking_segment[0];
            partial = RakingReduction<IS_FULL_TILE>(reduction_op, raking_segment, partial, num_valid, Int2Type<1>());

            // Only raking threads whose segments hold valid data join the final warp reduction
            int valid_raking_threads = (IS_FULL_TILE) ?
                RAKING_THREADS :
                (num_valid + SEGMENT_LENGTH - 1) / SEGMENT_LENGTH;

            partial = WarpReduce(temp_storage.warp_storage).template Reduce<IS_FULL_TILE && RAKING_UNGUARDED>(
                partial, valid_raking_threads, reduction_op);
        }
    }
    return partial;
}
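For a partial tile, valid_raking_threads is the ceiling of num_valid / SEGMENT_LENGTH. With SEGMENT_LENGTH = 4 and num_valid = 25 (illustrative numbers, not from the source), valid_raking_threads = (25 + 4 - 1) / 4 = 7: only seven raking threads hold any valid data, so the trailing warp reduction is guarded to seven lanes.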
/// Computes a thread block-wide reduction using addition (+) as the reduction operator.
/// The first num_valid threads each contribute one reduction partial.
/// The return value is only valid for thread 0.
template <bool IS_FULL_TILE>
__device__ __forceinline__ T Sum(T partial, int num_valid)
{
    cub::Sum reduction_op;
    return Reduce<IS_FULL_TILE>(partial, num_valid, reduction_op);
}
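BlockReduceRaking is an internal specialization; user code normally reaches it through the public cub::BlockReduce class by selecting the BLOCK_REDUCE_RAKING algorithm. A minimal kernel sketch under that assumption (the kernel name and launch geometry are illustrative):

#include <cub/block/block_reduce.cuh>

__global__ void BlockSumKernel(const int *d_in, int *d_out)
{
    // Specialize BlockReduce for 128 threads and the raking algorithm
    typedef cub::BlockReduce<int, 128, cub::BLOCK_REDUCE_RAKING> BlockReduce;

    // Shared memory backing for the reduction
    __shared__ typename BlockReduce::TempStorage temp_storage;

    // Each thread contributes one item
    int thread_data = d_in[blockIdx.x * blockDim.x + threadIdx.x];

    // Block-wide sum; the return value is only valid for thread 0
    int block_sum = BlockReduce(temp_storage).Sum(thread_data);

    if (threadIdx.x == 0)
        d_out[blockIdx.x] = block_sum;
}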
RAKING_THREADS
    Number of raking threads. Never use a raking thread that will have no valid data (e.g., when BLOCK_THREADS is 62 and SEGMENT_LENGTH is 2, only 31 raking threads should be used).

CUB_NS_PREFIX
    Optional outer namespace(s).
__device__ __forceinline__ T Reduce(T partial, int num_valid, ReductionOp reduction_op)
    Computes a thread block-wide reduction using the specified reduction operator. The first num_valid threads each contribute one reduction partial. The return value is only valid for thread 0.
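Through the public cub::BlockReduce front-end this entry point also serves partially full tiles: pass num_valid and only the first num_valid threads' inputs participate. A sketch (the surrounding 128-thread kernel and the thread_data/num_valid variables are assumed):

// Block-wide maximum over a partially full tile (sketch)
typedef cub::BlockReduce<int, 128, cub::BLOCK_REDUCE_RAKING> BlockReduce;
__shared__ typename BlockReduce::TempStorage temp_storage;

// Only the first num_valid threads contribute; result is valid in thread 0 only
int block_max = BlockReduce(temp_storage).Reduce(thread_data, cub::Max(), num_valid);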
SEGMENT_LENGTH (BlockRakingLayout)
    Number of raking elements per warp-synchronous raking thread (rounded up).

__device__ __forceinline__ BlockReduceRaking(TempStorage &temp_storage)
    Constructor.
__device__ __forceinline__ T RakingReduction(ReductionOp, T *, T partial, int, Int2Type<SEGMENT_LENGTH>)
    Recursion terminator for the raking reduction: the whole segment has been consumed, so the accumulated partial is returned unchanged.

BlockReduceRaking
    BlockReduceRaking provides raking-based methods of parallel reduction across a CUDA thread block. Supports non-commutative reduction operators.

BLOCK_THREADS
    The thread block size in threads.

UNGUARDED (BlockRakingLayout)
    Whether or not we need bounds checking during raking (the number of reduction elements is not a multiple of the number of raking threads).

__device__ __forceinline__ T RakingReduction(ReductionOp reduction_op, T *raking_segment, T partial, int num_valid, Int2Type<ITERATION>)
    Performs one step of the sequential raking reduction and recurses on the next ITERATION.
static __device__ __forceinline__ T *PlacementPtr(TempStorage &temp_storage, unsigned int linear_tid)
    Returns the location for the calling thread to place data into the grid.

BlockRakingLayout<T, BLOCK_THREADS, PTX_ARCH> BlockRakingLayout
    Layout type for the padded thread block raking grid.

PowerOfTwo
    Statically determine if N is a power-of-two.

static __device__ __forceinline__ T *RakingPtr(TempStorage &temp_storage, unsigned int linear_tid)
    Returns the location for the calling thread to begin sequential raking.

If<(PTX_ARCH >= 300) && (IS_POW_OF_TWO), WarpReduceShfl<T, LOGICAL_WARP_THREADS, PTX_ARCH>, WarpReduceSmem<T, LOGICAL_WARP_THREADS, PTX_ARCH>>::Type InternalWarpReduce
    Internal specialization: use SHFL-based reduction if the architecture is >= SM30 and LOGICAL_WARP_THREADS is a power-of-two; otherwise use the shared-memory (smem) implementation.
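A minimal sketch of the shuffle pattern that the SHFL-based path builds on (this is the generic idiom, not CUB's actual WarpReduceShfl, and it assumes a full 32-lane warp summing ints):

// Tree-style warp sum using shuffle intrinsics; the total lands in lane 0
__device__ int WarpSumSketch(int val)
{
    for (int offset = 16; offset > 0; offset >>= 1)
        val += __shfl_down_sync(0xffffffff, val, offset);
    return val;
}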
WARP_SYNCHRONOUS
    Cooperative work can be entirely warp synchronous.
Int2Type
    Allows for the treatment of an integral constant as a type at compile-time (e.g., to achieve static call dispatch based on constant integral values).

__device__ __forceinline__ int RowMajorTid(int block_dim_x, int block_dim_y, int block_dim_z)
    Returns the row-major linear thread identifier for a multidimensional thread block.
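The row-major convention makes x the fastest-varying dimension. A sketch of the computation with the conventional formula (the helper name is illustrative; CUB's version is equivalent but short-circuits the y and z terms for 1D/2D blocks):

// Row-major linear thread id: z varies slowest, x fastest
//   tid = threadIdx.z * (dim_x * dim_y) + threadIdx.y * dim_x + threadIdx.x
__device__ int RowMajorTidSketch(int block_dim_x, int block_dim_y)
{
    return (threadIdx.z * block_dim_x * block_dim_y) +
           (threadIdx.y * block_dim_x) +
           threadIdx.x;
}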
TempStorage
    Alias wrapper allowing storage to be unioned.

Uninitialized
    A storage-backing wrapper that allows types with non-trivial constructors to be aliased in unions.

WARP_SYNCHRONOUS_UNGUARDED
    Whether or not warp-synchronous reduction should be unguarded (i.e., the number of warp-reduction elements is a power-of-two).
BlockRakingLayout::TempStorage raking_grid
    Padded thread block raking grid.

RAKING_UNGUARDED
    Whether or not accesses into smem are unguarded.

WarpReduce<T, BlockRakingLayout::RAKING_THREADS, PTX_ARCH>::InternalWarpReduce WarpReduce
    WarpReduce utility type.
__device__ __forceinline__ T Sum(T partial, int num_valid)
    Computes a thread block-wide reduction using addition (+) as the reduction operator. The first num_valid threads each contribute one reduction partial. The return value is only valid for thread 0.

_TempStorage
    Shared memory storage layout type.

WarpReduce::TempStorage warp_storage
    Storage for warp-synchronous reduction.

SEGMENT_LENGTH
    Number of raking elements per warp-synchronous raking thread.