OpenFPM_pdata  4.1.0
Project that contains the implementation of distributed structures
 
Loading...
Searching...
No Matches
cub::BlockScanRaking< T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, MEMOIZE, PTX_ARCH > Struct Template Reference

BlockScanRaking provides variants of raking-based parallel prefix scan across a CUDA thread block. More...

Detailed Description

template<typename T, int BLOCK_DIM_X, int BLOCK_DIM_Y, int BLOCK_DIM_Z, bool MEMOIZE, int PTX_ARCH>
struct cub::BlockScanRaking< T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, MEMOIZE, PTX_ARCH >

BlockScanRaking provides variants of raking-based parallel prefix scan across a CUDA thread block.

Template parameter PTX_ARCH: the PTX compute capability for which to specialize this collective.

Definition at line 62 of file block_scan_raking.cuh.

Data Structures

struct  _TempStorage
 Shared memory storage layout type. More...
 
struct  TempStorage
 Alias wrapper allowing storage to be unioned. More...
 

Public Types

enum  { BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z }
 Constants. More...
 
enum  { RAKING_THREADS = BlockRakingLayout::RAKING_THREADS , SEGMENT_LENGTH = BlockRakingLayout::SEGMENT_LENGTH , WARP_SYNCHRONOUS = (BLOCK_THREADS == RAKING_THREADS) }
 Constants. More...
 
typedef BlockRakingLayout< T, BLOCK_THREADS, PTX_ARCH > BlockRakingLayout
 Layout type for padded thread block raking grid.
 
typedef WarpScan< T, RAKING_THREADS, PTX_ARCH > WarpScan
 WarpScan utility type.
 

Public Member Functions

template<int ITERATION, typename ScanOp >
__device__ __forceinline__ T GuardedReduce (T *raking_ptr, ScanOp scan_op, T raking_partial, Int2Type< ITERATION >)
 Templated reduction.
 
template<typename ScanOp >
__device__ __forceinline__ T GuardedReduce (T *, ScanOp, T raking_partial, Int2Type< SEGMENT_LENGTH >)
 Templated reduction (base case)
 
template<int ITERATION>
__device__ __forceinline__ void CopySegment (T *out, T *in, Int2Type< ITERATION >)
 Templated copy.
 
__device__ __forceinline__ void CopySegment (T *, T *, Int2Type< SEGMENT_LENGTH >)
 Templated copy (base case)
 
template<typename ScanOp >
__device__ __forceinline__ T Upsweep (ScanOp scan_op)
 Performs upsweep raking reduction, returning the aggregate.
 
template<typename ScanOp >
__device__ __forceinline__ void ExclusiveDownsweep (ScanOp scan_op, T raking_partial, bool apply_prefix=true)
 Performs exclusive downsweep raking scan.
 
template<typename ScanOp >
__device__ __forceinline__ void InclusiveDownsweep (ScanOp scan_op, T raking_partial, bool apply_prefix=true)
 Performs inclusive downsweep raking scan.
 
__device__ __forceinline__ BlockScanRaking (TempStorage &temp_storage)
 Constructor.
 
template<typename ScanOp >
__device__ __forceinline__ void ExclusiveScan (T input, T &exclusive_output, ScanOp scan_op)
 Computes an exclusive thread block-wide prefix scan using the specified binary scan_op functor. Each thread contributes one input element. With no initial value, the output computed for thread0 is undefined.
 
template<typename ScanOp >
__device__ __forceinline__ void ExclusiveScan (T input, T &output, const T &initial_value, ScanOp scan_op)
 Computes an exclusive thread block-wide prefix scan using the specified binary scan_op functor. Each thread contributes one input element.
 
template<typename ScanOp >
__device__ __forceinline__ void ExclusiveScan (T input, T &output, ScanOp scan_op, T &block_aggregate)
 Computes an exclusive thread block-wide prefix scan using the specified binary scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide block_aggregate of all inputs. With no initial value, the output computed for thread0 is undefined.
 
template<typename ScanOp >
__device__ __forceinline__ void ExclusiveScan (T input, T &output, const T &initial_value, ScanOp scan_op, T &block_aggregate)
 Computes an exclusive thread block-wide prefix scan using the specified binary scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide block_aggregate of all inputs.
 
template<typename ScanOp , typename BlockPrefixCallbackOp >
__device__ __forceinline__ void ExclusiveScan (T input, T &output, ScanOp scan_op, BlockPrefixCallbackOp &block_prefix_callback_op)
 Computes an exclusive thread block-wide prefix scan using the specified binary scan_op functor. Each thread contributes one input element. The call-back functor block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide block_aggregate of all inputs.
 
template<typename ScanOp >
__device__ __forceinline__ void InclusiveScan (T input, T &output, ScanOp scan_op)
 Computes an inclusive thread block-wide prefix scan using the specified binary scan_op functor. Each thread contributes one input element.
 
template<typename ScanOp >
__device__ __forceinline__ void InclusiveScan (T input, T &output, ScanOp scan_op, T &block_aggregate)
 Computes an inclusive thread block-wide prefix scan using the specified binary scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide block_aggregate of all inputs.
 
template<typename ScanOp , typename BlockPrefixCallbackOp >
__device__ __forceinline__ void InclusiveScan (T input, T &output, ScanOp scan_op, BlockPrefixCallbackOp &block_prefix_callback_op)
 Computes an inclusive thread block-wide prefix scan using the specified binary scan_op functor. Each thread contributes one input element. The call-back functor block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide block_aggregate of all inputs.
 

Data Fields

_TempStorage & temp_storage
 
unsigned int linear_tid
 
T cached_segment [SEGMENT_LENGTH]
 

Member Typedef Documentation

◆ BlockRakingLayout

template<typename T , int BLOCK_DIM_X, int BLOCK_DIM_Y, int BLOCK_DIM_Z, bool MEMOIZE, int PTX_ARCH>
typedef BlockRakingLayout<T, BLOCK_THREADS, PTX_ARCH> cub::BlockScanRaking< T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, MEMOIZE, PTX_ARCH >::BlockRakingLayout

Layout type for padded thread block raking grid.

Definition at line 76 of file block_scan_raking.cuh.

◆ WarpScan

template<typename T , int BLOCK_DIM_X, int BLOCK_DIM_Y, int BLOCK_DIM_Z, bool MEMOIZE, int PTX_ARCH>
typedef WarpScan<T, RAKING_THREADS, PTX_ARCH> cub::BlockScanRaking< T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, MEMOIZE, PTX_ARCH >::WarpScan

WarpScan utility type.

Definition at line 92 of file block_scan_raking.cuh.

Member Enumeration Documentation

◆ anonymous enum

template<typename T , int BLOCK_DIM_X, int BLOCK_DIM_Y, int BLOCK_DIM_Z, bool MEMOIZE, int PTX_ARCH>
anonymous enum

Constants.

Enumerator
BLOCK_THREADS 

The thread block size in threads.

Definition at line 69 of file block_scan_raking.cuh.

◆ anonymous enum

template<typename T , int BLOCK_DIM_X, int BLOCK_DIM_Y, int BLOCK_DIM_Z, bool MEMOIZE, int PTX_ARCH>
anonymous enum

Constants.

Enumerator
RAKING_THREADS 

Number of raking threads.

SEGMENT_LENGTH 

Number of raking elements per warp synchronous raking thread.

WARP_SYNCHRONOUS 

Cooperative work can be entirely warp synchronous.

Definition at line 79 of file block_scan_raking.cuh.

Constructor & Destructor Documentation

◆ BlockScanRaking()

template<typename T , int BLOCK_DIM_X, int BLOCK_DIM_Y, int BLOCK_DIM_Z, bool MEMOIZE, int PTX_ARCH>
__device__ __forceinline__ cub::BlockScanRaking< T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, MEMOIZE, PTX_ARCH >::BlockScanRaking ( TempStorage &  temp_storage)
inline

Constructor.

Definition at line 236 of file block_scan_raking.cuh.

Member Function Documentation

◆ CopySegment() [1/2]

template<typename T , int BLOCK_DIM_X, int BLOCK_DIM_Y, int BLOCK_DIM_Z, bool MEMOIZE, int PTX_ARCH>
__device__ __forceinline__ void cub::BlockScanRaking< T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, MEMOIZE, PTX_ARCH >::CopySegment ( T *  ,
T *  ,
Int2Type< SEGMENT_LENGTH >  
)
inline

Templated copy (base case)

Definition at line 164 of file block_scan_raking.cuh.

◆ CopySegment() [2/2]

template<typename T , int BLOCK_DIM_X, int BLOCK_DIM_Y, int BLOCK_DIM_Z, bool MEMOIZE, int PTX_ARCH>
template<int ITERATION>
__device__ __forceinline__ void cub::BlockScanRaking< T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, MEMOIZE, PTX_ARCH >::CopySegment ( T *  out,
T *  in,
Int2Type< ITERATION >   
)
inline

Templated copy.

Parameters
[out] out: Out array
[in] in: Input array

Definition at line 153 of file block_scan_raking.cuh.

◆ ExclusiveDownsweep()

template<typename T , int BLOCK_DIM_X, int BLOCK_DIM_Y, int BLOCK_DIM_Z, bool MEMOIZE, int PTX_ARCH>
template<typename ScanOp >
__device__ __forceinline__ void cub::BlockScanRaking< T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, MEMOIZE, PTX_ARCH >::ExclusiveDownsweep ( ScanOp  scan_op,
T  raking_partial,
bool  apply_prefix = true 
)
inline

Performs exclusive downsweep raking scan.

Definition at line 189 of file block_scan_raking.cuh.

◆ ExclusiveScan() [1/5]

template<typename T , int BLOCK_DIM_X, int BLOCK_DIM_Y, int BLOCK_DIM_Z, bool MEMOIZE, int PTX_ARCH>
template<typename ScanOp >
__device__ __forceinline__ void cub::BlockScanRaking< T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, MEMOIZE, PTX_ARCH >::ExclusiveScan ( T  input,
T &  exclusive_output,
ScanOp  scan_op 
)
inline

Computes an exclusive thread block-wide prefix scan using the specified binary scan_op functor. Each thread contributes one input element. With no initial value, the output computed for thread0 is undefined.

Parameters
[in] input: Calling thread's input item
[out] exclusive_output: Calling thread's output item (may be aliased to input)
[in] scan_op: Binary scan operator

Definition at line 250 of file block_scan_raking.cuh.

◆ ExclusiveScan() [2/5]

template<typename T , int BLOCK_DIM_X, int BLOCK_DIM_Y, int BLOCK_DIM_Z, bool MEMOIZE, int PTX_ARCH>
template<typename ScanOp >
__device__ __forceinline__ void cub::BlockScanRaking< T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, MEMOIZE, PTX_ARCH >::ExclusiveScan ( T  input,
T &  output,
const T &  initial_value,
ScanOp  scan_op 
)
inline

Computes an exclusive thread block-wide prefix scan using the specified binary scan_op functor. Each thread contributes one input element.

Parameters
[in] input: Calling thread's input items
[out] output: Calling thread's output items (may be aliased to input)
[in] initial_value: Initial value to seed the exclusive scan
[in] scan_op: Binary scan operator

Definition at line 291 of file block_scan_raking.cuh.

◆ ExclusiveScan() [3/5]

template<typename T , int BLOCK_DIM_X, int BLOCK_DIM_Y, int BLOCK_DIM_Z, bool MEMOIZE, int PTX_ARCH>
template<typename ScanOp >
__device__ __forceinline__ void cub::BlockScanRaking< T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, MEMOIZE, PTX_ARCH >::ExclusiveScan ( T  input,
T &  output,
const T &  initial_value,
ScanOp  scan_op,
T &  block_aggregate 
)
inline

Computes an exclusive thread block-wide prefix scan using the specified binary scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide block_aggregate of all inputs.

Parameters
[in] input: Calling thread's input items
[out] output: Calling thread's output items (may be aliased to input)
[in] initial_value: Initial value to seed the exclusive scan
[in] scan_op: Binary scan operator
[out] block_aggregate: Threadblock-wide aggregate reduction of input items

Definition at line 385 of file block_scan_raking.cuh.

◆ ExclusiveScan() [4/5]

template<typename T , int BLOCK_DIM_X, int BLOCK_DIM_Y, int BLOCK_DIM_Z, bool MEMOIZE, int PTX_ARCH>
template<typename ScanOp , typename BlockPrefixCallbackOp >
__device__ __forceinline__ void cub::BlockScanRaking< T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, MEMOIZE, PTX_ARCH >::ExclusiveScan ( T  input,
T &  output,
ScanOp  scan_op,
BlockPrefixCallbackOp &  block_prefix_callback_op 
)
inline

Computes an exclusive thread block-wide prefix scan using the specified binary scan_op functor. Each thread contributes one input element. The call-back functor block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide block_aggregate of all inputs.

Parameters
[in] input: Calling thread's input item
[out] output: Calling thread's output item (may be aliased to input)
[in] scan_op: Binary scan operator
[in,out] block_prefix_callback_op: [warp0 only] Call-back functor for specifying a thread block-wide prefix to be applied to all inputs.

Definition at line 438 of file block_scan_raking.cuh.

◆ ExclusiveScan() [5/5]

template<typename T , int BLOCK_DIM_X, int BLOCK_DIM_Y, int BLOCK_DIM_Z, bool MEMOIZE, int PTX_ARCH>
template<typename ScanOp >
__device__ __forceinline__ void cub::BlockScanRaking< T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, MEMOIZE, PTX_ARCH >::ExclusiveScan ( T  input,
T &  output,
ScanOp  scan_op,
T &  block_aggregate 
)
inline

Computes an exclusive thread block-wide prefix scan using the specified binary scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide block_aggregate of all inputs. With no initial value, the output computed for thread0 is undefined.

Parameters
[in] input: Calling thread's input item
[out] output: Calling thread's output item (may be aliased to input)
[in] scan_op: Binary scan operator
[out] block_aggregate: Threadblock-wide aggregate reduction of input items

Definition at line 334 of file block_scan_raking.cuh.

◆ GuardedReduce() [1/2]

template<typename T , int BLOCK_DIM_X, int BLOCK_DIM_Y, int BLOCK_DIM_Z, bool MEMOIZE, int PTX_ARCH>
template<typename ScanOp >
__device__ __forceinline__ T cub::BlockScanRaking< T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, MEMOIZE, PTX_ARCH >::GuardedReduce ( T *  ,
ScanOp  ,
T  raking_partial,
Int2Type< SEGMENT_LENGTH >  
)
inline

Templated reduction (base case)

Parameters
[in] raking_partial: Prefix to seed reduction with

Definition at line 141 of file block_scan_raking.cuh.

◆ GuardedReduce() [2/2]

template<typename T , int BLOCK_DIM_X, int BLOCK_DIM_Y, int BLOCK_DIM_Z, bool MEMOIZE, int PTX_ARCH>
template<int ITERATION, typename ScanOp >
__device__ __forceinline__ T cub::BlockScanRaking< T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, MEMOIZE, PTX_ARCH >::GuardedReduce ( T *  raking_ptr,
ScanOp  scan_op,
T  raking_partial,
Int2Type< ITERATION >   
)
inline

Templated reduction.

Parameters
[in] raking_ptr: Input array
[in] scan_op: Binary reduction operator
[in] raking_partial: Prefix to seed reduction with

Definition at line 123 of file block_scan_raking.cuh.

◆ InclusiveDownsweep()

template<typename T , int BLOCK_DIM_X, int BLOCK_DIM_Y, int BLOCK_DIM_Z, bool MEMOIZE, int PTX_ARCH>
template<typename ScanOp >
__device__ __forceinline__ void cub::BlockScanRaking< T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, MEMOIZE, PTX_ARCH >::InclusiveDownsweep ( ScanOp  scan_op,
T  raking_partial,
bool  apply_prefix = true 
)
inline

Performs inclusive downsweep raking scan.

Definition at line 211 of file block_scan_raking.cuh.

◆ InclusiveScan() [1/3]

template<typename T , int BLOCK_DIM_X, int BLOCK_DIM_Y, int BLOCK_DIM_Z, bool MEMOIZE, int PTX_ARCH>
template<typename ScanOp >
__device__ __forceinline__ void cub::BlockScanRaking< T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, MEMOIZE, PTX_ARCH >::InclusiveScan ( T  input,
T &  output,
ScanOp  scan_op 
)
inline

Computes an inclusive thread block-wide prefix scan using the specified binary scan_op functor. Each thread contributes one input element.

Parameters
[in] input: Calling thread's input item
[out] output: Calling thread's output item (may be aliased to input)
[in] scan_op: Binary scan operator

Definition at line 506 of file block_scan_raking.cuh.

◆ InclusiveScan() [2/3]

template<typename T , int BLOCK_DIM_X, int BLOCK_DIM_Y, int BLOCK_DIM_Z, bool MEMOIZE, int PTX_ARCH>
template<typename ScanOp , typename BlockPrefixCallbackOp >
__device__ __forceinline__ void cub::BlockScanRaking< T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, MEMOIZE, PTX_ARCH >::InclusiveScan ( T  input,
T &  output,
ScanOp  scan_op,
BlockPrefixCallbackOp &  block_prefix_callback_op 
)
inline

Computes an inclusive thread block-wide prefix scan using the specified binary scan_op functor. Each thread contributes one input element. The call-back functor block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide block_aggregate of all inputs.

Parameters
[in] input: Calling thread's input item
[out] output: Calling thread's output item (may be aliased to input)
[in] scan_op: Binary scan operator
[in,out] block_prefix_callback_op: [warp0 only] Call-back functor for specifying a thread block-wide prefix to be applied to all inputs.

Definition at line 601 of file block_scan_raking.cuh.

◆ InclusiveScan() [3/3]

template<typename T , int BLOCK_DIM_X, int BLOCK_DIM_Y, int BLOCK_DIM_Z, bool MEMOIZE, int PTX_ARCH>
template<typename ScanOp >
__device__ __forceinline__ void cub::BlockScanRaking< T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, MEMOIZE, PTX_ARCH >::InclusiveScan ( T  input,
T &  output,
ScanOp  scan_op,
T &  block_aggregate 
)
inline

Computes an inclusive thread block-wide prefix scan using the specified binary scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide block_aggregate of all inputs.

Parameters
[in] input: Calling thread's input item
[out] output: Calling thread's output item (may be aliased to input)
[in] scan_op: Binary scan operator
[out] block_aggregate: Threadblock-wide aggregate reduction of input items

Definition at line 548 of file block_scan_raking.cuh.

◆ Upsweep()

template<typename T , int BLOCK_DIM_X, int BLOCK_DIM_Y, int BLOCK_DIM_Z, bool MEMOIZE, int PTX_ARCH>
template<typename ScanOp >
__device__ __forceinline__ T cub::BlockScanRaking< T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, MEMOIZE, PTX_ARCH >::Upsweep ( ScanOp  scan_op)
inline

Performs upsweep raking reduction, returning the aggregate.

Definition at line 173 of file block_scan_raking.cuh.

Field Documentation

◆ cached_segment

template<typename T , int BLOCK_DIM_X, int BLOCK_DIM_Y, int BLOCK_DIM_Z, bool MEMOIZE, int PTX_ARCH>
T cub::BlockScanRaking< T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, MEMOIZE, PTX_ARCH >::cached_segment[SEGMENT_LENGTH]

Definition at line 114 of file block_scan_raking.cuh.

◆ linear_tid

template<typename T , int BLOCK_DIM_X, int BLOCK_DIM_Y, int BLOCK_DIM_Z, bool MEMOIZE, int PTX_ARCH>
unsigned int cub::BlockScanRaking< T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, MEMOIZE, PTX_ARCH >::linear_tid

Definition at line 113 of file block_scan_raking.cuh.

◆ temp_storage

template<typename T , int BLOCK_DIM_X, int BLOCK_DIM_Y, int BLOCK_DIM_Z, bool MEMOIZE, int PTX_ARCH>
_TempStorage& cub::BlockScanRaking< T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, MEMOIZE, PTX_ARCH >::temp_storage

Definition at line 112 of file block_scan_raking.cuh.


The documentation for this struct was generated from the following file: