37#include "../../warp/warp_reduce.cuh"
38#include "../../thread/thread_reduce.cuh"
39#include "../../util_ptx.cuh"
40#include "../../util_namespace.cuh"
113 unsigned int linear_tid;
120 temp_storage(temp_storage.Alias()),
121 linear_tid(
RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
126 template <
bool FULL_TILE>
127 __device__ __forceinline__ T
Sum(
148 partial = internal::ThreadReduce<SEGMENT_LENGTH>(raking_segment,
cub::Sum(), partial);
151 partial =
WarpReduce(temp_storage.warp_storage).Sum(partial);
162 typename ReductionOp>
170 return FallBack(temp_storage.fallback_storage).template Reduce<FULL_TILE>(partial, num_valid,
reduction_op);
185 partial = internal::ThreadReduce<SEGMENT_LENGTH>(raking_segment,
reduction_op, partial);
#define CUB_MAX(a, b)
Selects the maximum of a and b.
__device__ __forceinline__ int RowMajorTid(int block_dim_x, int block_dim_y, int block_dim_z)
Returns the row-major linear thread identifier for a multidimensional thread block.
Optional outer namespace(s)
OutputIteratorT, OffsetT, GridEvenShare< OffsetT >, ReductionOpT reduction_op
[in] Binary reduction functor
Alias wrapper allowing storage to be unioned.
static __device__ __forceinline__ T * PlacementPtr(TempStorage &temp_storage, unsigned int linear_tid)
Returns the location for the calling thread to place data into the grid.
static __device__ __forceinline__ T * RakingPtr(TempStorage &temp_storage, unsigned int linear_tid)
Returns the location for the calling thread to begin sequential raking.
Alias wrapper allowing storage to be unioned.
BlockReduceRakingCommutativeOnly provides raking-based methods of parallel reduction across a CUDA thread block.
WarpReduce< T, RAKING_THREADS, PTX_ARCH > WarpReduce
WarpReduce utility type.
BlockRakingLayout< T, SHARING_THREADS, PTX_ARCH > BlockRakingLayout
Layout type for padded thread block raking grid.
__device__ __forceinline__ T Sum(T partial, int num_valid)
Computes a thread block-wide reduction using addition (+) as the reduction operator.
@ RAKING_THREADS
Number of raking threads.
@ SEGMENT_LENGTH
Number of raking elements per warp synchronous raking thread.
@ USE_FALLBACK
Whether or not to use fall-back.
@ WARP_THREADS
Number of warp threads.
@ SHARING_THREADS
Number of threads actually sharing items with the raking threads.
@ BLOCK_THREADS
The thread block size in threads.
__device__ __forceinline__ BlockReduceRakingCommutativeOnly(TempStorage &temp_storage)
Constructor.
__device__ __forceinline__ T Reduce(T partial, int num_valid, ReductionOp reduction_op)
Computes a thread block-wide reduction using the specified reduction operator. The first num_valid threads each contribute one reduction partial.
Alias wrapper allowing storage to be unioned.
BlockReduceRaking provides raking-based methods of parallel reduction across a CUDA thread block.
A storage-backing wrapper that allows types with non-trivial constructors to be aliased in unions.
Shared memory storage layout type.
FallBack::TempStorage fallback_storage
Fall-back storage for non-commutative block scan.
BlockRakingLayout::TempStorage raking_grid
Padded thread block raking grid.
WarpReduce::TempStorage warp_storage
Storage for warp-synchronous reduction.