37 #include "../../warp/warp_reduce.cuh" 38 #include "../../thread/thread_reduce.cuh" 39 #include "../../util_ptx.cuh" 40 #include "../../util_namespace.cuh" 113 unsigned int linear_tid;
120 temp_storage(temp_storage.Alias()),
121 linear_tid(
RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
126 template <
bool FULL_TILE>
127 __device__ __forceinline__ T
Sum(
148 partial = internal::ThreadReduce<SEGMENT_LENGTH>(raking_segment,
cub::Sum(), partial);
151 partial =
WarpReduce(temp_storage.warp_storage).Sum(partial);
162 typename ReductionOp>
170 return FallBack(temp_storage.fallback_storage).template Reduce<FULL_TILE>(partial, num_valid,
reduction_op);
185 partial = internal::ThreadReduce<SEGMENT_LENGTH>(raking_segment,
reduction_op, partial);
Alias wrapper allowing storage to be unioned.
Optional outer namespace(s)
Number of raking elements per warp synchronous raking thread.
Shared memory storage layout type.
BlockRakingLayout< T, SHARING_THREADS, PTX_ARCH > BlockRakingLayout
Layout type for padded thread block raking grid.
BlockReduceRaking provides raking-based methods of parallel reduction across a CUDA thread block....
__device__ __forceinline__ BlockReduceRakingCommutativeOnly(TempStorage &temp_storage)
Constructor.
Alias wrapper allowing storage to be unioned.
WarpReduce< T, RAKING_THREADS, PTX_ARCH > WarpReduce
WarpReduce utility type.
__device__ __forceinline__ T Sum(T partial, int num_valid)
Computes a thread block-wide reduction using addition (+) as the reduction operator....
static __device__ __forceinline__ T * PlacementPtr(TempStorage &temp_storage, unsigned int linear_tid)
Returns the location for the calling thread to place data into the grid.
static __device__ __forceinline__ T * RakingPtr(TempStorage &temp_storage, unsigned int linear_tid)
Returns the location for the calling thread to begin sequential raking.
OutputIteratorT OffsetT GridEvenShare< OffsetT > ReductionOpT reduction_op
[in] Binary reduction functor
__device__ __forceinline__ int RowMajorTid(int block_dim_x, int block_dim_y, int block_dim_z)
Returns the row-major linear thread identifier for a multidimensional thread block.
WarpReduce::TempStorage warp_storage
Storage for warp-synchronous reduction.
Alias wrapper allowing storage to be unioned.
A storage-backing wrapper that allows types with non-trivial constructors to be aliased in unions.
Number of threads actually sharing items with the raking threads.
BlockRakingLayout::TempStorage raking_grid
Padded thread block raking grid.
The thread block size in threads.
Whether or not to use fall-back.
#define CUB_MAX(a, b)
Select maximum(a, b)
__device__ __forceinline__ T Reduce(T partial, int num_valid, ReductionOp reduction_op)
Computes a thread block-wide reduction using the specified reduction operator. The first num_valid threads each contribute one reduction partial...
BlockReduceRakingCommutativeOnly provides raking-based methods of parallel reduction across a CUDA thread block...
FallBack::TempStorage fallback_storage
Fall-back storage for non-commutative block reduction.
Number of raking threads.