OpenFPM_pdata  4.1.0
Project that contains the implementation of distributed structures
cub::BlockScanWarpScans< T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH > Struct Template Reference

BlockScanWarpScans provides warpscan-based variants of parallel prefix scan across a CUDA thread block. More...

Detailed Description

template<typename T, int BLOCK_DIM_X, int BLOCK_DIM_Y, int BLOCK_DIM_Z, int PTX_ARCH>
struct cub::BlockScanWarpScans< T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >

BlockScanWarpScans provides warpscan-based variants of parallel prefix scan across a CUDA thread block.

The PTX compute capability for which to specialize this collective

Definition at line 56 of file block_scan_warp_scans.cuh.

Data Structures

struct  _TempStorage
 Shared memory storage layout type. More...
 
struct  TempStorage
 Alias wrapper allowing storage to be unioned. More...
 

Public Types

enum  { WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH), BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS }
 Constants. More...
 
enum  { WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH), BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS }
 Constants. More...
 
enum  { BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, INNER_WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH), OUTER_WARP_THREADS = BLOCK_THREADS / INNER_WARP_THREADS, OUTER_WARPS = INNER_WARP_THREADS }
 Constants. More...
 
typedef WarpScan< T, WARP_THREADS, PTX_ARCH > WarpScanT
 WarpScan utility type.
 
typedef WarpScan< T, WARPS, PTX_ARCH > WarpAggregateScan
 WarpScan utility type.
 
typedef WarpScan< T, WARP_THREADS, PTX_ARCH > WarpScanT
 WarpScan utility type.
 
typedef WarpScan< T, WARPS, PTX_ARCH > WarpAggregateScanT
 WarpScan utility type.
 
typedef WarpScan< T, OUTER_WARP_THREADS, PTX_ARCH > OuterWarpScanT
 Outer WarpScan utility type.
 
typedef WarpScan< T, INNER_WARP_THREADS, PTX_ARCH > InnerWarpScanT
 Inner WarpScan utility type.
 
typedef OuterWarpScanT::TempStorage OuterScanArray[OUTER_WARPS]
 

Public Member Functions

struct __align__ (32) _TempStorage
 Shared memory storage layout type. More...
 
__device__ __forceinline__ BlockScanWarpScans (TempStorage &temp_storage)
 Constructor.
 
template<typename ScanOp , int WARP>
__device__ __forceinline__ void ApplyWarpAggregates (T &warp_prefix, ScanOp scan_op, T &block_aggregate, Int2Type< WARP >)
 
template<typename ScanOp >
__device__ __forceinline__ void ApplyWarpAggregates (T &, ScanOp, T &, Int2Type< WARPS >)
 
template<typename ScanOp >
__device__ __forceinline__ T ComputeWarpPrefix (ScanOp scan_op, T warp_aggregate, T &block_aggregate)
 Use the warp-wide aggregates to compute the calling warp's prefix. Also returns block-wide aggregate in all threads. More...
 
template<typename ScanOp >
__device__ __forceinline__ T ComputeWarpPrefix (ScanOp scan_op, T warp_aggregate, T &block_aggregate, const T &initial_value)
 Use the warp-wide aggregates and initial-value to compute the calling warp's prefix. Also returns block-wide aggregate in all threads. More...
 
template<typename ScanOp >
__device__ __forceinline__ void ExclusiveScan (T input, T &exclusive_output, ScanOp scan_op)
 Computes an exclusive thread block-wide prefix scan using the specified binary scan_op functor. Each thread contributes one input element. With no initial value, the output computed for thread0 is undefined. More...
 
template<typename ScanOp >
__device__ __forceinline__ void ExclusiveScan (T input, T &exclusive_output, const T &initial_value, ScanOp scan_op)
 Computes an exclusive thread block-wide prefix scan using the specified binary scan_op functor. Each thread contributes one input element. More...
 
template<typename ScanOp >
__device__ __forceinline__ void ExclusiveScan (T input, T &exclusive_output, ScanOp scan_op, T &block_aggregate)
 Computes an exclusive thread block-wide prefix scan using the specified binary scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide block_aggregate of all inputs. With no initial value, the output computed for thread0 is undefined. More...
 
template<typename ScanOp >
__device__ __forceinline__ void ExclusiveScan (T input, T &exclusive_output, const T &initial_value, ScanOp scan_op, T &block_aggregate)
 Computes an exclusive thread block-wide prefix scan using the specified binary scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide block_aggregate of all inputs. More...
 
template<typename ScanOp , typename BlockPrefixCallbackOp >
__device__ __forceinline__ void ExclusiveScan (T input, T &exclusive_output, ScanOp scan_op, BlockPrefixCallbackOp &block_prefix_callback_op)
 Computes an exclusive thread block-wide prefix scan using the specified binary scan_op functor. Each thread contributes one input element. The call-back functor block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide block_aggregate of all inputs. More...
 
template<typename ScanOp >
__device__ __forceinline__ void InclusiveScan (T input, T &inclusive_output, ScanOp scan_op)
 Computes an inclusive thread block-wide prefix scan using the specified binary scan_op functor. Each thread contributes one input element. More...
 
template<typename ScanOp >
__device__ __forceinline__ void InclusiveScan (T input, T &inclusive_output, ScanOp scan_op, T &block_aggregate)
 Computes an inclusive thread block-wide prefix scan using the specified binary scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide block_aggregate of all inputs. More...
 
template<typename ScanOp , typename BlockPrefixCallbackOp >
__device__ __forceinline__ void InclusiveScan (T input, T &exclusive_output, ScanOp scan_op, BlockPrefixCallbackOp &block_prefix_callback_op)
 Computes an inclusive thread block-wide prefix scan using the specified binary scan_op functor. Each thread contributes one input element. The call-back functor block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide block_aggregate of all inputs. More...
 
__device__ __forceinline__ BlockScanWarpScans (TempStorage &temp_storage)
 Constructor.
 
template<typename ScanOp , int WARP>
__device__ __forceinline__ void ApplyWarpAggregates (T &warp_prefix, ScanOp scan_op, T &block_aggregate, Int2Type< WARP > addend_warp)
 
template<typename ScanOp >
__device__ __forceinline__ void ApplyWarpAggregates (T &warp_prefix, ScanOp scan_op, T &block_aggregate, Int2Type< WARPS > addend_warp)
 
template<typename ScanOp >
__device__ __forceinline__ T ComputeWarpPrefix (ScanOp scan_op, T warp_aggregate, T &block_aggregate)
 Use the warp-wide aggregates to compute the calling warp's prefix. Also returns block-wide aggregate in all threads. More...
 
template<typename ScanOp >
__device__ __forceinline__ T ComputeWarpPrefix (ScanOp scan_op, T warp_aggregate, T &block_aggregate, const T &initial_value)
 Use the warp-wide aggregates and initial-value to compute the calling warp's prefix. Also returns block-wide aggregate in all threads. More...
 
template<typename ScanOp >
__device__ __forceinline__ void ExclusiveScan (T input, T &exclusive_output, ScanOp scan_op)
 Computes an exclusive thread block-wide prefix scan using the specified binary scan_op functor. Each thread contributes one input element. With no initial value, the output computed for thread0 is undefined. More...
 
template<typename ScanOp >
__device__ __forceinline__ void ExclusiveScan (T input, T &exclusive_output, const T &initial_value, ScanOp scan_op)
 Computes an exclusive thread block-wide prefix scan using the specified binary scan_op functor. Each thread contributes one input element. More...
 
template<typename ScanOp >
__device__ __forceinline__ void ExclusiveScan (T input, T &exclusive_output, ScanOp scan_op, T &block_aggregate)
 Computes an exclusive thread block-wide prefix scan using the specified binary scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide block_aggregate of all inputs. With no initial value, the output computed for thread0 is undefined. More...
 
template<typename ScanOp >
__device__ __forceinline__ void ExclusiveScan (T input, T &exclusive_output, const T &initial_value, ScanOp scan_op, T &block_aggregate)
 Computes an exclusive thread block-wide prefix scan using the specified binary scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide block_aggregate of all inputs. More...
 
template<typename ScanOp , typename BlockPrefixCallbackOp >
__device__ __forceinline__ void ExclusiveScan (T input, T &exclusive_output, ScanOp scan_op, BlockPrefixCallbackOp &block_prefix_callback_op)
 Computes an exclusive thread block-wide prefix scan using the specified binary scan_op functor. Each thread contributes one input element. The call-back functor block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide block_aggregate of all inputs. More...
 
template<typename ScanOp >
__device__ __forceinline__ void InclusiveScan (T input, T &inclusive_output, ScanOp scan_op)
 Computes an inclusive thread block-wide prefix scan using the specified binary scan_op functor. Each thread contributes one input element. More...
 
template<typename ScanOp >
__device__ __forceinline__ void InclusiveScan (T input, T &inclusive_output, ScanOp scan_op, T &block_aggregate)
 Computes an inclusive thread block-wide prefix scan using the specified binary scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide block_aggregate of all inputs. More...
 
template<typename ScanOp , typename BlockPrefixCallbackOp >
__device__ __forceinline__ void InclusiveScan (T input, T &exclusive_output, ScanOp scan_op, BlockPrefixCallbackOp &block_prefix_callback_op)
 Computes an inclusive thread block-wide prefix scan using the specified binary scan_op functor. Each thread contributes one input element. The call-back functor block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide block_aggregate of all inputs. More...
 
__device__ __forceinline__ BlockScanWarpScans (TempStorage &temp_storage)
 Constructor.
 
template<typename ScanOp >
__device__ __forceinline__ void ExclusiveScan (T input, T &exclusive_output, ScanOp scan_op)
 Computes an exclusive thread block-wide prefix scan using the specified binary scan_op functor. Each thread contributes one input element. With no initial value, the output computed for thread0 is undefined. More...
 
template<typename ScanOp >
__device__ __forceinline__ void ExclusiveScan (T input, T &exclusive_output, const T &initial_value, ScanOp scan_op)
 Computes an exclusive thread block-wide prefix scan using the specified binary scan_op functor. Each thread contributes one input element. More...
 
template<typename ScanOp >
__device__ __forceinline__ void ExclusiveScan (T input, T &exclusive_output, ScanOp scan_op, T &block_aggregate)
 Computes an exclusive thread block-wide prefix scan using the specified binary scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide block_aggregate of all inputs. With no initial value, the output computed for thread0 is undefined. More...
 
template<typename ScanOp >
__device__ __forceinline__ void ExclusiveScan (T input, T &exclusive_output, const T &initial_value, ScanOp scan_op, T &block_aggregate)
 Computes an exclusive thread block-wide prefix scan using the specified binary scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide block_aggregate of all inputs. More...
 
template<typename ScanOp , typename BlockPrefixCallbackOp >
__device__ __forceinline__ void ExclusiveScan (T input, T &exclusive_output, ScanOp scan_op, BlockPrefixCallbackOp &block_prefix_callback_op)
 Computes an exclusive thread block-wide prefix scan using the specified binary scan_op functor. Each thread contributes one input element. The call-back functor block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. More...
 
template<typename ScanOp >
__device__ __forceinline__ void InclusiveScan (T input, T &inclusive_output, ScanOp scan_op)
 Computes an inclusive thread block-wide prefix scan using the specified binary scan_op functor. Each thread contributes one input element. More...
 
template<typename ScanOp >
__device__ __forceinline__ void InclusiveScan (T input, T &inclusive_output, ScanOp scan_op, T &block_aggregate)
 Computes an inclusive thread block-wide prefix scan using the specified binary scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide block_aggregate of all inputs. More...
 
template<typename ScanOp , typename BlockPrefixCallbackOp >
__device__ __forceinline__ void InclusiveScan (T input, T &inclusive_output, ScanOp scan_op, BlockPrefixCallbackOp &block_prefix_callback_op)
 Computes an inclusive thread block-wide prefix scan using the specified binary scan_op functor. Each thread contributes one input element. The call-back functor block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. More...
 

Data Fields

_TempStorage & temp_storage
 
unsigned int linear_tid
 
unsigned int warp_id
 
unsigned int lane_id
 

Member Enumeration Documentation

◆ anonymous enum

template<typename T , int BLOCK_DIM_X, int BLOCK_DIM_Y, int BLOCK_DIM_Z, int PTX_ARCH>
anonymous enum

Constants.

Enumerator
WARP_THREADS 

Number of warp threads.

BLOCK_THREADS 

The thread block size in threads.

WARPS 

Number of active warps.

Definition at line 63 of file block_scan_warp_scans.cuh.

◆ anonymous enum

template<typename T , int BLOCK_DIM_X, int BLOCK_DIM_Y, int BLOCK_DIM_Z, int PTX_ARCH>
anonymous enum

Constants.

Enumerator
WARP_THREADS 

Number of warp threads.

BLOCK_THREADS 

The thread block size in threads.

WARPS 

Number of active warps.

Definition at line 63 of file block_scan_warp_scans2.cuh.

◆ anonymous enum

template<typename T , int BLOCK_DIM_X, int BLOCK_DIM_Y, int BLOCK_DIM_Z, int PTX_ARCH>
anonymous enum

Constants.

Enumerator
BLOCK_THREADS 

The thread block size in threads.

INNER_WARP_THREADS 

Number of warp threads.

OUTER_WARPS 

Number of outer scan warps.

Definition at line 63 of file block_scan_warp_scans3.cuh.

Member Function Documentation

◆ __align__()

template<typename T , int BLOCK_DIM_X, int BLOCK_DIM_Y, int BLOCK_DIM_Z, int PTX_ARCH>
struct cub::BlockScanWarpScans< T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::__align__ ( 32  )
inline

Shared memory storage layout type.

Buffer for warp-synchronous scans

Shared prefix for the entire thread block

Definition at line 83 of file block_scan_warp_scans.cuh.

◆ ApplyWarpAggregates() [1/3]

template<typename T , int BLOCK_DIM_X, int BLOCK_DIM_Y, int BLOCK_DIM_Z, int PTX_ARCH>
template<typename ScanOp , int WARP>
__device__ __forceinline__ void cub::BlockScanWarpScans< T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::ApplyWarpAggregates ( T &  warp_prefix,
ScanOp  scan_op,
T &  block_aggregate,
Int2Type< WARP >   
)
inline
Parameters
[out] warp_prefix: The calling thread's partial reduction
[in] scan_op: Binary scan operator
[out] block_aggregate: Threadblock-wide aggregate reduction of input items

Definition at line 126 of file block_scan_warp_scans.cuh.

◆ ApplyWarpAggregates() [2/3]

template<typename T , int BLOCK_DIM_X, int BLOCK_DIM_Y, int BLOCK_DIM_Z, int PTX_ARCH>
template<typename ScanOp , int WARP>
__device__ __forceinline__ void cub::BlockScanWarpScans< T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::ApplyWarpAggregates ( T &  warp_prefix,
ScanOp  scan_op,
T &  block_aggregate,
Int2Type< WARP >  addend_warp 
)
inline
Parameters
[out] warp_prefix: The calling thread's partial reduction
[in] scan_op: Binary scan operator
[out] block_aggregate: Threadblock-wide aggregate reduction of input items

Definition at line 126 of file block_scan_warp_scans2.cuh.

◆ ApplyWarpAggregates() [3/3]

template<typename T , int BLOCK_DIM_X, int BLOCK_DIM_Y, int BLOCK_DIM_Z, int PTX_ARCH>
template<typename ScanOp >
__device__ __forceinline__ void cub::BlockScanWarpScans< T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::ApplyWarpAggregates ( T &  warp_prefix,
ScanOp  scan_op,
T &  block_aggregate,
Int2Type< WARPS >  addend_warp 
)
inline
Parameters
[out] warp_prefix: The calling thread's partial reduction
[in] scan_op: Binary scan operator
[out] block_aggregate: Threadblock-wide aggregate reduction of input items

Definition at line 142 of file block_scan_warp_scans2.cuh.

◆ ComputeWarpPrefix() [1/4]

template<typename T , int BLOCK_DIM_X, int BLOCK_DIM_Y, int BLOCK_DIM_Z, int PTX_ARCH>
template<typename ScanOp >
__device__ __forceinline__ T cub::BlockScanWarpScans< T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::ComputeWarpPrefix ( ScanOp  scan_op,
T  warp_aggregate,
T &  block_aggregate 
)
inline

Use the warp-wide aggregates to compute the calling warp's prefix. Also returns block-wide aggregate in all threads.

Parameters
[in] scan_op: Binary scan operator
[in] warp_aggregate: [lane (WARP_THREADS - 1) only] Warp-wide aggregate reduction of input items
[out] block_aggregate: Threadblock-wide aggregate reduction of input items

Definition at line 152 of file block_scan_warp_scans.cuh.

◆ ComputeWarpPrefix() [2/4]

template<typename T , int BLOCK_DIM_X, int BLOCK_DIM_Y, int BLOCK_DIM_Z, int PTX_ARCH>
template<typename ScanOp >
__device__ __forceinline__ T cub::BlockScanWarpScans< T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::ComputeWarpPrefix ( ScanOp  scan_op,
T  warp_aggregate,
T &  block_aggregate 
)
inline

Use the warp-wide aggregates to compute the calling warp's prefix. Also returns block-wide aggregate in all threads.

Parameters
[in] scan_op: Binary scan operator
[in] warp_aggregate: [lane (WARP_THREADS - 1) only] Warp-wide aggregate reduction of input items
[out] block_aggregate: Threadblock-wide aggregate reduction of input items

Definition at line 152 of file block_scan_warp_scans2.cuh.

◆ ComputeWarpPrefix() [3/4]

template<typename T , int BLOCK_DIM_X, int BLOCK_DIM_Y, int BLOCK_DIM_Z, int PTX_ARCH>
template<typename ScanOp >
__device__ __forceinline__ T cub::BlockScanWarpScans< T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::ComputeWarpPrefix ( ScanOp  scan_op,
T  warp_aggregate,
T &  block_aggregate,
const T &  initial_value 
)
inline

Use the warp-wide aggregates and initial-value to compute the calling warp's prefix. Also returns block-wide aggregate in all threads.

Parameters
[in] scan_op: Binary scan operator
[in] warp_aggregate: [lane (WARP_THREADS - 1) only] Warp-wide aggregate reduction of input items
[out] block_aggregate: Threadblock-wide aggregate reduction of input items
[in] initial_value: Initial value to seed the exclusive scan

Definition at line 187 of file block_scan_warp_scans2.cuh.

◆ ComputeWarpPrefix() [4/4]

template<typename T , int BLOCK_DIM_X, int BLOCK_DIM_Y, int BLOCK_DIM_Z, int PTX_ARCH>
template<typename ScanOp >
__device__ __forceinline__ T cub::BlockScanWarpScans< T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::ComputeWarpPrefix ( ScanOp  scan_op,
T  warp_aggregate,
T &  block_aggregate,
const T &  initial_value 
)
inline

Use the warp-wide aggregates and initial-value to compute the calling warp's prefix. Also returns block-wide aggregate in all threads.

Parameters
[in] scan_op: Binary scan operator
[in] warp_aggregate: [lane (WARP_THREADS - 1) only] Warp-wide aggregate reduction of input items
[out] block_aggregate: Threadblock-wide aggregate reduction of input items
[in] initial_value: Initial value to seed the exclusive scan

Definition at line 187 of file block_scan_warp_scans.cuh.

◆ ExclusiveScan() [1/15]

template<typename T , int BLOCK_DIM_X, int BLOCK_DIM_Y, int BLOCK_DIM_Z, int PTX_ARCH>
template<typename ScanOp >
__device__ __forceinline__ void cub::BlockScanWarpScans< T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::ExclusiveScan ( T  input,
T &  exclusive_output,
ScanOp  scan_op 
)
inline

Computes an exclusive thread block-wide prefix scan using the specified binary scan_op functor. Each thread contributes one input element. With no initial value, the output computed for thread0 is undefined.

Parameters
[in] input: Calling thread's input item
[out] exclusive_output: Calling thread's output item (may be aliased to input)
[in] scan_op: Binary scan operator

Definition at line 137 of file block_scan_warp_scans3.cuh.

◆ ExclusiveScan() [2/15]

template<typename T , int BLOCK_DIM_X, int BLOCK_DIM_Y, int BLOCK_DIM_Z, int PTX_ARCH>
template<typename ScanOp >
__device__ __forceinline__ void cub::BlockScanWarpScans< T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::ExclusiveScan ( T  input,
T &  exclusive_output,
const T &  initial_value,
ScanOp  scan_op 
)
inline

Computes an exclusive thread block-wide prefix scan using the specified binary scan_op functor. Each thread contributes one input element.

Parameters
[in] input: Calling thread's input items
[out] exclusive_output: Calling thread's output items (may be aliased to input)
[in] initial_value: Initial value to seed the exclusive scan
[in] scan_op: Binary scan operator

Definition at line 150 of file block_scan_warp_scans3.cuh.

◆ ExclusiveScan() [3/15]

template<typename T , int BLOCK_DIM_X, int BLOCK_DIM_Y, int BLOCK_DIM_Z, int PTX_ARCH>
template<typename ScanOp >
__device__ __forceinline__ void cub::BlockScanWarpScans< T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::ExclusiveScan ( T  input,
T &  exclusive_output,
ScanOp  scan_op,
T &  block_aggregate 
)
inline

Computes an exclusive thread block-wide prefix scan using the specified binary scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide block_aggregate of all inputs. With no initial value, the output computed for thread0 is undefined.

Parameters
[in] input: Calling thread's input item
[out] exclusive_output: Calling thread's output item (may be aliased to input)
[in] scan_op: Binary scan operator
[out] block_aggregate: Threadblock-wide aggregate reduction of input items

Definition at line 163 of file block_scan_warp_scans3.cuh.

◆ ExclusiveScan() [4/15]

template<typename T , int BLOCK_DIM_X, int BLOCK_DIM_Y, int BLOCK_DIM_Z, int PTX_ARCH>
template<typename ScanOp >
__device__ __forceinline__ void cub::BlockScanWarpScans< T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::ExclusiveScan ( T  input,
T &  exclusive_output,
ScanOp  scan_op 
)
inline

Computes an exclusive thread block-wide prefix scan using the specified binary scan_op functor. Each thread contributes one input element. With no initial value, the output computed for thread0 is undefined.

Parameters
[in] input: Calling thread's input item
[out] exclusive_output: Calling thread's output item (may be aliased to input)
[in] scan_op: Binary scan operator

Definition at line 209 of file block_scan_warp_scans2.cuh.

◆ ExclusiveScan() [5/15]

template<typename T , int BLOCK_DIM_X, int BLOCK_DIM_Y, int BLOCK_DIM_Z, int PTX_ARCH>
template<typename ScanOp >
__device__ __forceinline__ void cub::BlockScanWarpScans< T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::ExclusiveScan ( T  input,
T &  exclusive_output,
ScanOp  scan_op 
)
inline

Computes an exclusive thread block-wide prefix scan using the specified binary scan_op functor. Each thread contributes one input element. With no initial value, the output computed for thread0 is undefined.

Parameters
[in] input: Calling thread's input item
[out] exclusive_output: Calling thread's output item (may be aliased to input)
[in] scan_op: Binary scan operator

Definition at line 209 of file block_scan_warp_scans.cuh.

◆ ExclusiveScan() [6/15]

template<typename T , int BLOCK_DIM_X, int BLOCK_DIM_Y, int BLOCK_DIM_Z, int PTX_ARCH>
template<typename ScanOp >
__device__ __forceinline__ void cub::BlockScanWarpScans< T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::ExclusiveScan ( T  input,
T &  exclusive_output,
const T &  initial_value,
ScanOp  scan_op,
T &  block_aggregate 
)
inline

Computes an exclusive thread block-wide prefix scan using the specified binary scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide block_aggregate of all inputs.

Parameters
[in] input: Calling thread's input items
[out] exclusive_output: Calling thread's output items (may be aliased to input)
[in] initial_value: Initial value to seed the exclusive scan
[in] scan_op: Binary scan operator
[out] block_aggregate: Threadblock-wide aggregate reduction of input items

Definition at line 210 of file block_scan_warp_scans3.cuh.

◆ ExclusiveScan() [7/15]

template<typename T , int BLOCK_DIM_X, int BLOCK_DIM_Y, int BLOCK_DIM_Z, int PTX_ARCH>
template<typename ScanOp >
__device__ __forceinline__ void cub::BlockScanWarpScans< T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::ExclusiveScan ( T  input,
T &  exclusive_output,
const T &  initial_value,
ScanOp  scan_op 
)
inline

Computes an exclusive thread block-wide prefix scan using the specified binary scan_op functor. Each thread contributes one input element.

Parameters
[in] input: Calling thread's input items
[out] exclusive_output: Calling thread's output items (may be aliased to input)
[in] initial_value: Initial value to seed the exclusive scan
[in] scan_op: Binary scan operator

Definition at line 222 of file block_scan_warp_scans.cuh.

◆ ExclusiveScan() [8/15]

template<typename T , int BLOCK_DIM_X, int BLOCK_DIM_Y, int BLOCK_DIM_Z, int PTX_ARCH>
template<typename ScanOp >
__device__ __forceinline__ void cub::BlockScanWarpScans< T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::ExclusiveScan ( T  input,
T &  exclusive_output,
const T &  initial_value,
ScanOp  scan_op 
)
inline

Computes an exclusive thread block-wide prefix scan using the specified binary scan_op functor. Each thread contributes one input element.

Parameters
[in] input: Calling thread's input items
[out] exclusive_output: Calling thread's output items (may be aliased to input)
[in] initial_value: Initial value to seed the exclusive scan
[in] scan_op: Binary scan operator

Definition at line 222 of file block_scan_warp_scans2.cuh.

◆ ExclusiveScan() [9/15]

template<typename T , int BLOCK_DIM_X, int BLOCK_DIM_Y, int BLOCK_DIM_Z, int PTX_ARCH>
template<typename ScanOp >
__device__ __forceinline__ void cub::BlockScanWarpScans< T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::ExclusiveScan ( T  input,
T &  exclusive_output,
ScanOp  scan_op,
T &  block_aggregate 
)
inline

Computes an exclusive thread block-wide prefix scan using the specified binary scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide block_aggregate of all inputs. With no initial value, the output computed for thread0 is undefined.

Parameters
[in] input: Calling thread's input item
[out] exclusive_output: Calling thread's output item (may be aliased to input)
[in] scan_op: Binary scan operator
[out] block_aggregate: Threadblock-wide aggregate reduction of input items

Definition at line 235 of file block_scan_warp_scans.cuh.

◆ ExclusiveScan() [10/15]

template<typename T , int BLOCK_DIM_X, int BLOCK_DIM_Y, int BLOCK_DIM_Z, int PTX_ARCH>
template<typename ScanOp >
__device__ __forceinline__ void cub::BlockScanWarpScans< T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::ExclusiveScan ( T  input,
T &  exclusive_output,
ScanOp  scan_op,
T &  block_aggregate 
)
inline

Computes an exclusive thread block-wide prefix scan using the specified binary scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide block_aggregate of all inputs. With no initial value, the output computed for thread0 is undefined.

Parameters
[in] input: Calling thread's input item
[out] exclusive_output: Calling thread's output item (may be aliased to input)
[in] scan_op: Binary scan operator
[out] block_aggregate: Threadblock-wide aggregate reduction of input items

Definition at line 235 of file block_scan_warp_scans2.cuh.

◆ ExclusiveScan() [11/15]

template<typename T , int BLOCK_DIM_X, int BLOCK_DIM_Y, int BLOCK_DIM_Z, int PTX_ARCH>
template<typename ScanOp , typename BlockPrefixCallbackOp >
__device__ __forceinline__ void cub::BlockScanWarpScans< T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::ExclusiveScan ( T  input,
T &  exclusive_output,
ScanOp  scan_op,
BlockPrefixCallbackOp &  block_prefix_callback_op 
)
inline

Computes an exclusive thread block-wide prefix scan using the specified binary scan_op functor. Each thread contributes one input element. The call-back functor block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs.

Parameters
[in] input: Calling thread's input item
[out] exclusive_output: Calling thread's output item (may be aliased to input)
[in] scan_op: Binary scan operator
[in,out] block_prefix_callback_op: [warp0 only] Call-back functor for specifying a thread block-wide prefix to be applied to all inputs.

Definition at line 259 of file block_scan_warp_scans3.cuh.

◆ ExclusiveScan() [12/15]

template<typename T , int BLOCK_DIM_X, int BLOCK_DIM_Y, int BLOCK_DIM_Z, int PTX_ARCH>
template<typename ScanOp >
__device__ __forceinline__ void cub::BlockScanWarpScans< T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::ExclusiveScan ( T  input,
T &  exclusive_output,
const T &  initial_value,
ScanOp  scan_op,
T &  block_aggregate 
)
inline

Computes an exclusive thread block-wide prefix scan using the specified binary scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide block_aggregate of all inputs.

Parameters
[in] input: Calling thread's input items
[out] exclusive_output: Calling thread's output items (may be aliased to input)
[in] initial_value: Initial value to seed the exclusive scan
[in] scan_op: Binary scan operator
[out] block_aggregate: Threadblock-wide aggregate reduction of input items

Definition at line 260 of file block_scan_warp_scans.cuh.

◆ ExclusiveScan() [13/15]

template<typename T , int BLOCK_DIM_X, int BLOCK_DIM_Y, int BLOCK_DIM_Z, int PTX_ARCH>
template<typename ScanOp >
__device__ __forceinline__ void cub::BlockScanWarpScans< T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::ExclusiveScan ( T  input,
T &  exclusive_output,
const T &  initial_value,
ScanOp  scan_op,
T &  block_aggregate 
)
inline

Computes an exclusive thread block-wide prefix scan using the specified binary scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide block_aggregate of all inputs.

Parameters
[in] input: Calling thread's input items
[out] exclusive_output: Calling thread's output items (may be aliased to input)
[in] initial_value: Initial value to seed the exclusive scan
[in] scan_op: Binary scan operator
[out] block_aggregate: Threadblock-wide aggregate reduction of input items

Definition at line 282 of file block_scan_warp_scans2.cuh.

◆ ExclusiveScan() [14/15]

template<typename T , int BLOCK_DIM_X, int BLOCK_DIM_Y, int BLOCK_DIM_Z, int PTX_ARCH>
template<typename ScanOp , typename BlockPrefixCallbackOp >
__device__ __forceinline__ void cub::BlockScanWarpScans< T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::ExclusiveScan ( T  input,
T &  exclusive_output,
ScanOp  scan_op,
BlockPrefixCallbackOp &  block_prefix_callback_op 
)
inline

Computes an exclusive thread block-wide prefix scan using the specified binary scan_op functor. Each thread contributes one input element. The call-back functor block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide block_aggregate of all inputs.

Parameters
[in]inputCalling thread's input item
[out]exclusive_outputCalling thread's output item (may be aliased to input)
[in]scan_opBinary scan operator
[in,out]block_prefix_callback_op[warp0 only] Call-back functor for specifying a thread block-wide prefix to be applied to all inputs.

Definition at line 285 of file block_scan_warp_scans.cuh.

◆ ExclusiveScan() [15/15]

template<typename T , int BLOCK_DIM_X, int BLOCK_DIM_Y, int BLOCK_DIM_Z, int PTX_ARCH>
template<typename ScanOp , typename BlockPrefixCallbackOp >
__device__ __forceinline__ void cub::BlockScanWarpScans< T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::ExclusiveScan ( T  input,
T &  exclusive_output,
ScanOp  scan_op,
BlockPrefixCallbackOp &  block_prefix_callback_op 
)
inline

Computes an exclusive thread block-wide prefix scan using the specified binary scan_op functor. Each thread contributes one input element. The call-back functor block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide block_aggregate of all inputs.

Parameters
[in]inputCalling thread's input item
[out]exclusive_outputCalling thread's output item (may be aliased to input)
[in]scan_opBinary scan operator
[in,out]block_prefix_callback_op[warp0 only] Call-back functor for specifying a thread block-wide prefix to be applied to all inputs.

Definition at line 329 of file block_scan_warp_scans2.cuh.

◆ InclusiveScan() [1/9]

template<typename T , int BLOCK_DIM_X, int BLOCK_DIM_Y, int BLOCK_DIM_Z, int PTX_ARCH>
template<typename ScanOp >
__device__ __forceinline__ void cub::BlockScanWarpScans< T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::InclusiveScan ( T  input,
T &  inclusive_output,
ScanOp  scan_op 
)
inline

Computes an inclusive thread block-wide prefix scan using the specified binary scan_op functor. Each thread contributes one input element.

Parameters
[in]inputCalling thread's input item
[out]inclusive_outputCalling thread's output item (may be aliased to input)
[in]scan_opBinary scan operator

Definition at line 312 of file block_scan_warp_scans3.cuh.

◆ InclusiveScan() [2/9]

template<typename T , int BLOCK_DIM_X, int BLOCK_DIM_Y, int BLOCK_DIM_Z, int PTX_ARCH>
template<typename ScanOp >
__device__ __forceinline__ void cub::BlockScanWarpScans< T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::InclusiveScan ( T  input,
T &  inclusive_output,
ScanOp  scan_op 
)
inline

Computes an inclusive thread block-wide prefix scan using the specified binary scan_op functor. Each thread contributes one input element.

Parameters
[in]inputCalling thread's input item
[out]inclusive_outputCalling thread's output item (may be aliased to input)
[in]scan_opBinary scan operator

Definition at line 324 of file block_scan_warp_scans.cuh.

◆ InclusiveScan() [3/9]

template<typename T , int BLOCK_DIM_X, int BLOCK_DIM_Y, int BLOCK_DIM_Z, int PTX_ARCH>
template<typename ScanOp >
__device__ __forceinline__ void cub::BlockScanWarpScans< T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::InclusiveScan ( T  input,
T &  inclusive_output,
ScanOp  scan_op,
T &  block_aggregate 
)
inline

Computes an inclusive thread block-wide prefix scan using the specified binary scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide block_aggregate of all inputs.

Parameters
[in]inputCalling thread's input item
[out]inclusive_outputCalling thread's output item (may be aliased to input)
[in]scan_opBinary scan operator
[out]block_aggregateThreadblock-wide aggregate reduction of input items

Definition at line 324 of file block_scan_warp_scans3.cuh.

◆ InclusiveScan() [4/9]

template<typename T , int BLOCK_DIM_X, int BLOCK_DIM_Y, int BLOCK_DIM_Z, int PTX_ARCH>
template<typename ScanOp >
__device__ __forceinline__ void cub::BlockScanWarpScans< T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::InclusiveScan ( T  input,
T &  inclusive_output,
ScanOp  scan_op,
T &  block_aggregate 
)
inline

Computes an inclusive thread block-wide prefix scan using the specified binary scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide block_aggregate of all inputs.

Parameters
[in]inputCalling thread's input item
[out]inclusive_outputCalling thread's output item (may be aliased to input)
[in]scan_opBinary scan operator
[out]block_aggregateThreadblock-wide aggregate reduction of input items

Definition at line 336 of file block_scan_warp_scans.cuh.

◆ InclusiveScan() [5/9]

template<typename T , int BLOCK_DIM_X, int BLOCK_DIM_Y, int BLOCK_DIM_Z, int PTX_ARCH>
template<typename ScanOp , typename BlockPrefixCallbackOp >
__device__ __forceinline__ void cub::BlockScanWarpScans< T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::InclusiveScan ( T  input,
T &  exclusive_output,
ScanOp  scan_op,
BlockPrefixCallbackOp &  block_prefix_callback_op 
)
inline

Computes an inclusive thread block-wide prefix scan using the specified binary scan_op functor. Each thread contributes one input element. The call-back functor block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide block_aggregate of all inputs.

Parameters
[in]inputCalling thread's input item
[out]exclusive_outputCalling thread's output item (may be aliased to input)
[in]scan_opBinary scan operator
[in,out]block_prefix_callback_op[warp0 only] Call-back functor for specifying a thread block-wide prefix to be applied to all inputs.

Definition at line 359 of file block_scan_warp_scans.cuh.

◆ InclusiveScan() [6/9]

template<typename T , int BLOCK_DIM_X, int BLOCK_DIM_Y, int BLOCK_DIM_Z, int PTX_ARCH>
template<typename ScanOp >
__device__ __forceinline__ void cub::BlockScanWarpScans< T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::InclusiveScan ( T  input,
T &  inclusive_output,
ScanOp  scan_op 
)
inline

Computes an inclusive thread block-wide prefix scan using the specified binary scan_op functor. Each thread contributes one input element.

Parameters
[in]inputCalling thread's input item
[out]inclusive_outputCalling thread's output item (may be aliased to input)
[in]scan_opBinary scan operator

Definition at line 368 of file block_scan_warp_scans2.cuh.

◆ InclusiveScan() [7/9]

template<typename T , int BLOCK_DIM_X, int BLOCK_DIM_Y, int BLOCK_DIM_Z, int PTX_ARCH>
template<typename ScanOp , typename BlockPrefixCallbackOp >
__device__ __forceinline__ void cub::BlockScanWarpScans< T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::InclusiveScan ( T  input,
T &  inclusive_output,
ScanOp  scan_op,
BlockPrefixCallbackOp &  block_prefix_callback_op 
)
inline

Computes an inclusive thread block-wide prefix scan using the specified binary scan_op functor. Each thread contributes one input element. The call-back functor block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs.

Parameters
[in]inputCalling thread's input item
[out]inclusive_outputCalling thread's output item (may be aliased to input)
[in]scan_opBinary scan operator
[in,out]block_prefix_callback_op[warp0 only] Call-back functor for specifying a thread block-wide prefix to be applied to all inputs.

Definition at line 370 of file block_scan_warp_scans3.cuh.

◆ InclusiveScan() [8/9]

template<typename T , int BLOCK_DIM_X, int BLOCK_DIM_Y, int BLOCK_DIM_Z, int PTX_ARCH>
template<typename ScanOp >
__device__ __forceinline__ void cub::BlockScanWarpScans< T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::InclusiveScan ( T  input,
T &  inclusive_output,
ScanOp  scan_op,
T &  block_aggregate 
)
inline

Computes an inclusive thread block-wide prefix scan using the specified binary scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide block_aggregate of all inputs.

Parameters
[in]inputCalling thread's input item
[out]inclusive_outputCalling thread's output item (may be aliased to input)
[in]scan_opBinary scan operator
[out]block_aggregateThreadblock-wide aggregate reduction of input items

Definition at line 380 of file block_scan_warp_scans2.cuh.

◆ InclusiveScan() [9/9]

template<typename T , int BLOCK_DIM_X, int BLOCK_DIM_Y, int BLOCK_DIM_Z, int PTX_ARCH>
template<typename ScanOp , typename BlockPrefixCallbackOp >
__device__ __forceinline__ void cub::BlockScanWarpScans< T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::InclusiveScan ( T  input,
T &  exclusive_output,
ScanOp  scan_op,
BlockPrefixCallbackOp &  block_prefix_callback_op 
)
inline

Computes an inclusive thread block-wide prefix scan using the specified binary scan_op functor. Each thread contributes one input element. The call-back functor block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide block_aggregate of all inputs.

Parameters
[in]inputCalling thread's input item
[out]exclusive_outputCalling thread's output item (may be aliased to input)
[in]scan_opBinary scan operator
[in,out]block_prefix_callback_op[warp0 only] Call-back functor for specifying a thread block-wide prefix to be applied to all inputs.

Definition at line 403 of file block_scan_warp_scans2.cuh.


The documentation for this struct was generated from the following files: