OpenFPM_pdata  4.1.0
Project that contains the implementation of distributed structures
 
Loading...
Searching...
No Matches
cub::BlockReduceRaking< T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH > Struct Template Reference

BlockReduceRaking provides raking-based methods of parallel reduction across a CUDA thread block. Supports non-commutative reduction operators. More...

Detailed Description

template<typename T, int BLOCK_DIM_X, int BLOCK_DIM_Y, int BLOCK_DIM_Z, int PTX_ARCH>
struct cub::BlockReduceRaking< T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >

BlockReduceRaking provides raking-based methods of parallel reduction across a CUDA thread block. Supports non-commutative reduction operators.

Supports non-commutative binary reduction operators. Unlike commutative reduction operators (e.g., addition), the application of a non-commutative reduction operator (e.g., string concatenation) across a sequence of inputs must honor the relative ordering of items and partial reductions when applying the reduction operator.

Compared to the implementation of BlockReduceRakingCommutativeOnly (which does not support non-commutative operators), this implementation requires a few extra rounds of inter-thread communication. Template parameter PTX_ARCH: the PTX compute capability for which to specialize this collective.

Definition at line 68 of file block_reduce_raking.cuh.

Data Structures

union  _TempStorage
 Shared memory storage layout type. More...
 
struct  TempStorage
 Alias wrapper allowing storage to be unioned. More...
 

Public Types

enum  { BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z }
 Constants. More...
 
enum  {
  RAKING_THREADS = BlockRakingLayout::RAKING_THREADS , SEGMENT_LENGTH = BlockRakingLayout::SEGMENT_LENGTH , WARP_SYNCHRONOUS = (RAKING_THREADS == BLOCK_THREADS) , WARP_SYNCHRONOUS_UNGUARDED = PowerOfTwo<RAKING_THREADS>::VALUE ,
  RAKING_UNGUARDED = BlockRakingLayout::UNGUARDED
}
 Constants. More...
 
typedef BlockRakingLayout< T, BLOCK_THREADS, PTX_ARCH > BlockRakingLayout
 Layout type for padded thread block raking grid.
 
typedef WarpReduce< T, BlockRakingLayout::RAKING_THREADS, PTX_ARCH >::InternalWarpReduce WarpReduce
 WarpReduce utility type.
 

Public Member Functions

__device__ __forceinline__ BlockReduceRaking (TempStorage &temp_storage)
 Constructor.
 
template<bool IS_FULL_TILE, typename ReductionOp , int ITERATION>
__device__ __forceinline__ T RakingReduction (ReductionOp reduction_op, T *raking_segment, T partial, int num_valid, Int2Type< ITERATION >)
 
template<bool IS_FULL_TILE, typename ReductionOp >
__device__ __forceinline__ T RakingReduction (ReductionOp, T *, T partial, int, Int2Type< SEGMENT_LENGTH >)
 
template<bool IS_FULL_TILE, typename ReductionOp >
__device__ __forceinline__ T Reduce (T partial, int num_valid, ReductionOp reduction_op)
 Computes a thread block-wide reduction using the specified reduction operator. The first num_valid threads each contribute one reduction partial. The return value is only valid for thread0.
 
template<bool IS_FULL_TILE>
__device__ __forceinline__ T Sum (T partial, int num_valid)
 Computes a thread block-wide reduction using addition (+) as the reduction operator. The first num_valid threads each contribute one reduction partial. The return value is only valid for thread0.
 

Data Fields

_TempStorage & temp_storage
 
unsigned int linear_tid
 

Member Typedef Documentation

◆ BlockRakingLayout

template<typename T , int BLOCK_DIM_X, int BLOCK_DIM_Y, int BLOCK_DIM_Z, int PTX_ARCH>
typedef BlockRakingLayout<T, BLOCK_THREADS, PTX_ARCH> cub::BlockReduceRaking< T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::BlockRakingLayout

Layout type for padded thread block raking grid.

Definition at line 78 of file block_reduce_raking.cuh.

◆ WarpReduce

template<typename T , int BLOCK_DIM_X, int BLOCK_DIM_Y, int BLOCK_DIM_Z, int PTX_ARCH>
typedef WarpReduce<T,BlockRakingLayout::RAKING_THREADS,PTX_ARCH>::InternalWarpReduce cub::BlockReduceRaking< T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::WarpReduce

WarpReduce utility type.

Definition at line 81 of file block_reduce_raking.cuh.

Member Enumeration Documentation

◆ anonymous enum

template<typename T , int BLOCK_DIM_X, int BLOCK_DIM_Y, int BLOCK_DIM_Z, int PTX_ARCH>
anonymous enum

Constants.

Enumerator
BLOCK_THREADS 

The thread block size in threads.

Definition at line 71 of file block_reduce_raking.cuh.

◆ anonymous enum

template<typename T , int BLOCK_DIM_X, int BLOCK_DIM_Y, int BLOCK_DIM_Z, int PTX_ARCH>
anonymous enum

Constants.

Enumerator
RAKING_THREADS 

Number of raking threads.

SEGMENT_LENGTH 

Number of raking elements per warp synchronous raking thread.

WARP_SYNCHRONOUS 

Cooperative work can be entirely warp synchronous.

WARP_SYNCHRONOUS_UNGUARDED 

Whether or not warp-synchronous reduction should be unguarded (i.e., the number of warp-reduction elements is a power of two).

RAKING_UNGUARDED 

Whether or not accesses into smem are unguarded.

Definition at line 84 of file block_reduce_raking.cuh.

Constructor & Destructor Documentation

◆ BlockReduceRaking()

template<typename T , int BLOCK_DIM_X, int BLOCK_DIM_Y, int BLOCK_DIM_Z, int PTX_ARCH>
__device__ __forceinline__ cub::BlockReduceRaking< T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::BlockReduceRaking ( TempStorage & temp_storage)
inline

Constructor.

Definition at line 122 of file block_reduce_raking.cuh.

Member Function Documentation

◆ RakingReduction() [1/2]

template<typename T , int BLOCK_DIM_X, int BLOCK_DIM_Y, int BLOCK_DIM_Z, int PTX_ARCH>
template<bool IS_FULL_TILE, typename ReductionOp , int ITERATION>
__device__ __forceinline__ T cub::BlockReduceRaking< T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::RakingReduction ( ReductionOp  reduction_op,
T *  raking_segment,
T  partial,
int  num_valid,
Int2Type< ITERATION >   
)
inline
Parameters
[in]  reduction_op  Binary reduction operator
[in]  partial  [lane0 only] Warp-wide aggregate reduction of input items
[in]  num_valid  Number of valid elements (may be less than BLOCK_THREADS)

Definition at line 131 of file block_reduce_raking.cuh.

◆ RakingReduction() [2/2]

template<typename T , int BLOCK_DIM_X, int BLOCK_DIM_Y, int BLOCK_DIM_Z, int PTX_ARCH>
template<bool IS_FULL_TILE, typename ReductionOp >
__device__ __forceinline__ T cub::BlockReduceRaking< T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::RakingReduction ( ReductionOp  ,
T *  ,
T  partial,
int  ,
Int2Type< SEGMENT_LENGTH >  
)
inline
Parameters
[in]  partial  [lane0 only] Warp-wide aggregate reduction of input items

Definition at line 148 of file block_reduce_raking.cuh.

◆ Reduce()

template<typename T , int BLOCK_DIM_X, int BLOCK_DIM_Y, int BLOCK_DIM_Z, int PTX_ARCH>
template<bool IS_FULL_TILE, typename ReductionOp >
__device__ __forceinline__ T cub::BlockReduceRaking< T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::Reduce ( T  partial,
int  num_valid,
ReductionOp  reduction_op 
)
inline

Computes a thread block-wide reduction using the specified reduction operator. The first num_valid threads each contribute one reduction partial. The return value is only valid for thread0.

Parameters
[in]  partial  Calling thread's input partial reductions
[in]  num_valid  Number of valid elements (may be less than BLOCK_THREADS)
[in]  reduction_op  Binary reduction operator

Definition at line 164 of file block_reduce_raking.cuh.

◆ Sum()

template<typename T , int BLOCK_DIM_X, int BLOCK_DIM_Y, int BLOCK_DIM_Z, int PTX_ARCH>
template<bool IS_FULL_TILE>
__device__ __forceinline__ T cub::BlockReduceRaking< T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::Sum ( T  partial,
int  num_valid 
)
inline

Computes a thread block-wide reduction using addition (+) as the reduction operator. The first num_valid threads each contribute one reduction partial. The return value is only valid for thread0.

Parameters
[in]  partial  Calling thread's input partial reductions
[in]  num_valid  Number of valid elements (may be less than BLOCK_THREADS)

Definition at line 211 of file block_reduce_raking.cuh.

Field Documentation

◆ linear_tid

template<typename T , int BLOCK_DIM_X, int BLOCK_DIM_Y, int BLOCK_DIM_Z, int PTX_ARCH>
unsigned int cub::BlockReduceRaking< T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::linear_tid

Definition at line 118 of file block_reduce_raking.cuh.

◆ temp_storage

template<typename T , int BLOCK_DIM_X, int BLOCK_DIM_Y, int BLOCK_DIM_Z, int PTX_ARCH>
_TempStorage& cub::BlockReduceRaking< T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::temp_storage

Definition at line 117 of file block_reduce_raking.cuh.


The documentation for this struct was generated from the following file: