BlockReduceRaking provides raking-based methods of parallel reduction across a CUDA thread block. Supports non-commutative reduction operators. More...

Detailed Description

template<typename T, int BLOCK_DIM_X, int BLOCK_DIM_Y, int BLOCK_DIM_Z, int PTX_ARCH>
struct cub::BlockReduceRaking< T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >

BlockReduceRaking provides raking-based methods of parallel reduction across a CUDA thread block. Supports non-commutative reduction operators.

Supports non-commutative binary reduction operators. Unlike commutative reduction operators (e.g., addition), the application of a non-commutative reduction operator (e.g., string concatenation) across a sequence of inputs must honor the relative ordering of items and partial reductions when applying the reduction operator.

Compared to the implementation of BlockReduceRakingCommutativeOnly (which does not support non-commutative operators), this implementation requires a few extra rounds of inter-thread communication. The PTX_ARCH template parameter is the PTX compute capability for which to specialize this collective.

Definition at line 68 of file block_reduce_raking.cuh.

Data Structures
union	_TempStorage
	Shared memory storage layout type. More...

struct	TempStorage
	Alias wrapper allowing storage to be unioned. More...

Public Types
enum	{ BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z }
	Constants. More...

enum	{ RAKING_THREADS = BlockRakingLayout::RAKING_THREADS , SEGMENT_LENGTH = BlockRakingLayout::SEGMENT_LENGTH , WARP_SYNCHRONOUS = (RAKING_THREADS == BLOCK_THREADS) , WARP_SYNCHRONOUS_UNGUARDED = PowerOfTwo<RAKING_THREADS>::VALUE , RAKING_UNGUARDED = BlockRakingLayout::UNGUARDED }
	Constants. More...

typedef BlockRakingLayout< T, BLOCK_THREADS, PTX_ARCH >	BlockRakingLayout
	Layout type for padded thread block raking grid.

typedef WarpReduce< T, BlockRakingLayout::RAKING_THREADS, PTX_ARCH >::InternalWarpReduce	WarpReduce
	WarpReduce utility type.

Public Member Functions
__device__ __forceinline__	BlockReduceRaking (TempStorage &temp_storage)
	Constructor.

template<bool IS_FULL_TILE, typename ReductionOp , int ITERATION>
__device__ __forceinline__ T	RakingReduction (ReductionOp reduction_op, T *raking_segment, T partial, int num_valid, Int2Type< ITERATION >)

template<bool IS_FULL_TILE, typename ReductionOp >
__device__ __forceinline__ T	RakingReduction (ReductionOp, T *, T partial, int, Int2Type< SEGMENT_LENGTH >)

template<bool IS_FULL_TILE, typename ReductionOp >
__device__ __forceinline__ T	Reduce (T partial, int num_valid, ReductionOp reduction_op)
	Computes a thread block-wide reduction using the specified reduction operator. The first num_valid threads each contribute one reduction partial. The return value is only valid for thread₀.

template<bool IS_FULL_TILE>
__device__ __forceinline__ T	Sum (T partial, int num_valid)
	Computes a thread block-wide reduction using addition (+) as the reduction operator. The first num_valid threads each contribute one reduction partial. The return value is only valid for thread₀.

Data Fields
_TempStorage &	temp_storage

unsigned int	linear_tid

Member Typedef Documentation

◆ BlockRakingLayout

template<typename T , int BLOCK_DIM_X, int BLOCK_DIM_Y, int BLOCK_DIM_Z, int PTX_ARCH>

typedef BlockRakingLayout<T, BLOCK_THREADS, PTX_ARCH> cub::BlockReduceRaking< T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::BlockRakingLayout

Layout type for padded thread block raking grid.

Definition at line 78 of file block_reduce_raking.cuh.

◆ WarpReduce

template<typename T , int BLOCK_DIM_X, int BLOCK_DIM_Y, int BLOCK_DIM_Z, int PTX_ARCH>

typedef WarpReduce<T,BlockRakingLayout::RAKING_THREADS,PTX_ARCH>::InternalWarpReduce cub::BlockReduceRaking< T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::WarpReduce

WarpReduce utility type.

Definition at line 81 of file block_reduce_raking.cuh.

Member Enumeration Documentation

◆ anonymous enum

template<typename T , int BLOCK_DIM_X, int BLOCK_DIM_Y, int BLOCK_DIM_Z, int PTX_ARCH>

anonymous enum

Constants.

Enumerator
BLOCK_THREADS	The thread block size in threads.

Definition at line 71 of file block_reduce_raking.cuh.

◆ anonymous enum

template<typename T , int BLOCK_DIM_X, int BLOCK_DIM_Y, int BLOCK_DIM_Z, int PTX_ARCH>

anonymous enum

Constants.

Enumerator
RAKING_THREADS	Number of raking threads.
SEGMENT_LENGTH	Number of raking elements per warp-synchronous raking thread.
WARP_SYNCHRONOUS	Cooperative work can be entirely warp synchronous.
WARP_SYNCHRONOUS_UNGUARDED	Whether or not warp-synchronous reduction should be unguarded (i.e., the number of warp-reduction elements is a power of two).
RAKING_UNGUARDED	Whether or not accesses into smem are unguarded.

Definition at line 84 of file block_reduce_raking.cuh.

Constructor & Destructor Documentation

◆ BlockReduceRaking()

template<typename T , int BLOCK_DIM_X, int BLOCK_DIM_Y, int BLOCK_DIM_Z, int PTX_ARCH>

__device__ __forceinline__ cub::BlockReduceRaking< T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::BlockReduceRaking ( TempStorage & temp_storage )

inline

Constructor.

Definition at line 122 of file block_reduce_raking.cuh.

Member Function Documentation

◆ RakingReduction() [1/2]

template<typename T , int BLOCK_DIM_X, int BLOCK_DIM_Y, int BLOCK_DIM_Z, int PTX_ARCH>

template<bool IS_FULL_TILE, typename ReductionOp , int ITERATION>

__device__ __forceinline__ T cub::BlockReduceRaking< T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::RakingReduction	(	ReductionOp	reduction_op,
		T *	raking_segment,
		T	partial,
		int	num_valid,
		Int2Type< ITERATION >
	)

inline

Parameters

[in]	reduction_op	Binary reduction operator
[in]	partial	[lane₀ only] Warp-wide aggregate reduction of input items
[in]	num_valid	Number of valid elements (may be less than BLOCK_THREADS)

Definition at line 131 of file block_reduce_raking.cuh.

◆ RakingReduction() [2/2]

template<typename T , int BLOCK_DIM_X, int BLOCK_DIM_Y, int BLOCK_DIM_Z, int PTX_ARCH>

template<bool IS_FULL_TILE, typename ReductionOp >

__device__ __forceinline__ T cub::BlockReduceRaking< T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::RakingReduction	(	ReductionOp	,
		T *	,
		T	partial,
		int	,
		Int2Type< SEGMENT_LENGTH >
	)

inline

Parameters

[in] partial [lane₀ only] Warp-wide aggregate reduction of input items

Definition at line 148 of file block_reduce_raking.cuh.

◆ Reduce()

template<typename T , int BLOCK_DIM_X, int BLOCK_DIM_Y, int BLOCK_DIM_Z, int PTX_ARCH>

template<bool IS_FULL_TILE, typename ReductionOp >

__device__ __forceinline__ T cub::BlockReduceRaking< T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::Reduce	(	T	partial,
		int	num_valid,
		ReductionOp	reduction_op
	)

inline

Computes a thread block-wide reduction using the specified reduction operator. The first num_valid threads each contribute one reduction partial. The return value is only valid for thread₀.

Parameters

[in]	partial	Calling thread's input partial reductions
[in]	num_valid	Number of valid elements (may be less than BLOCK_THREADS)
[in]	reduction_op	Binary reduction operator

Definition at line 164 of file block_reduce_raking.cuh.

◆ Sum()

template<typename T , int BLOCK_DIM_X, int BLOCK_DIM_Y, int BLOCK_DIM_Z, int PTX_ARCH>

template<bool IS_FULL_TILE>

__device__ __forceinline__ T cub::BlockReduceRaking< T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::Sum	(	T	partial,
		int	num_valid
	)

inline

Computes a thread block-wide reduction using addition (+) as the reduction operator. The first num_valid threads each contribute one reduction partial. The return value is only valid for thread₀.

Parameters

[in]	partial	Calling thread's input partial reductions
[in]	num_valid	Number of valid elements (may be less than BLOCK_THREADS)

Definition at line 211 of file block_reduce_raking.cuh.

Field Documentation

◆ linear_tid

template<typename T , int BLOCK_DIM_X, int BLOCK_DIM_Y, int BLOCK_DIM_Z, int PTX_ARCH>

unsigned int cub::BlockReduceRaking< T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::linear_tid

Definition at line 118 of file block_reduce_raking.cuh.

◆ temp_storage

template<typename T , int BLOCK_DIM_X, int BLOCK_DIM_Y, int BLOCK_DIM_Z, int PTX_ARCH>

_TempStorage& cub::BlockReduceRaking< T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::temp_storage

Definition at line 117 of file block_reduce_raking.cuh.

The documentation for this struct was generated from the following file:

openfpm_data/src/util/cuda/cub_old/block/specializations/block_reduce_raking.cuh

Detailed Description

Data Structures

Public Types

Public Member Functions

Data Fields

Member Typedef Documentation

◆ BlockRakingLayout

◆ WarpReduce

Member Enumeration Documentation

◆ anonymous enum

◆ anonymous enum

Constructor & Destructor Documentation

◆ BlockReduceRaking()

Member Function Documentation

◆ RakingReduction() [1/2]

◆ RakingReduction() [2/2]

◆ Reduce()

◆ Sum()

Field Documentation

◆ linear_tid

◆ temp_storage