BlockScanRaking provides variants of raking-based parallel prefix scan across a CUDA thread block. More...
BlockScanRaking provides variants of raking-based parallel prefix scan across a CUDA thread block.
The PTX compute capability for which to specialize this collective
Definition at line 62 of file block_scan_raking.cuh.
Data Structures | |
struct | _TempStorage |
Shared memory storage layout type. More... | |
struct | TempStorage |
Alias wrapper allowing storage to be unioned. More... | |
Public Types | |
enum | { BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z } |
Constants. More... | |
enum | { RAKING_THREADS = BlockRakingLayout::RAKING_THREADS , SEGMENT_LENGTH = BlockRakingLayout::SEGMENT_LENGTH , WARP_SYNCHRONOUS = (BLOCK_THREADS == RAKING_THREADS) } |
Constants. More... | |
typedef BlockRakingLayout< T, BLOCK_THREADS, PTX_ARCH > | BlockRakingLayout |
Layout type for padded thread block raking grid. | |
typedef WarpScan< T, RAKING_THREADS, PTX_ARCH > | WarpScan |
WarpScan utility type. | |
Public Member Functions | |
template<int ITERATION, typename ScanOp > | |
__device__ __forceinline__ T | GuardedReduce (T *raking_ptr, ScanOp scan_op, T raking_partial, Int2Type< ITERATION >) |
Templated reduction. | |
template<typename ScanOp > | |
__device__ __forceinline__ T | GuardedReduce (T *, ScanOp, T raking_partial, Int2Type< SEGMENT_LENGTH >) |
Templated reduction (base case) | |
template<int ITERATION> | |
__device__ __forceinline__ void | CopySegment (T *out, T *in, Int2Type< ITERATION >) |
Templated copy. | |
__device__ __forceinline__ void | CopySegment (T *, T *, Int2Type< SEGMENT_LENGTH >) |
Templated copy (base case) | |
template<typename ScanOp > | |
__device__ __forceinline__ T | Upsweep (ScanOp scan_op) |
Performs upsweep raking reduction, returning the aggregate. | |
template<typename ScanOp > | |
__device__ __forceinline__ void | ExclusiveDownsweep (ScanOp scan_op, T raking_partial, bool apply_prefix=true) |
Performs exclusive downsweep raking scan. | |
template<typename ScanOp > | |
__device__ __forceinline__ void | InclusiveDownsweep (ScanOp scan_op, T raking_partial, bool apply_prefix=true) |
Performs inclusive downsweep raking scan. | |
__device__ __forceinline__ | BlockScanRaking (TempStorage &temp_storage) |
Constructor. | |
template<typename ScanOp > | |
__device__ __forceinline__ void | ExclusiveScan (T input, T &exclusive_output, ScanOp scan_op) |
Computes an exclusive thread block-wide prefix scan using the specified binary scan_op functor. Each thread contributes one input element. With no initial value, the output computed for thread0 is undefined. | |
template<typename ScanOp > | |
__device__ __forceinline__ void | ExclusiveScan (T input, T &output, const T &initial_value, ScanOp scan_op) |
Computes an exclusive thread block-wide prefix scan using the specified binary scan_op functor. Each thread contributes one input element. | |
template<typename ScanOp > | |
__device__ __forceinline__ void | ExclusiveScan (T input, T &output, ScanOp scan_op, T &block_aggregate) |
Computes an exclusive thread block-wide prefix scan using the specified binary scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide block_aggregate of all inputs. With no initial value, the output computed for thread0 is undefined. | |
template<typename ScanOp > | |
__device__ __forceinline__ void | ExclusiveScan (T input, T &output, const T &initial_value, ScanOp scan_op, T &block_aggregate) |
Computes an exclusive thread block-wide prefix scan using the specified binary scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide block_aggregate of all inputs. | |
template<typename ScanOp , typename BlockPrefixCallbackOp > | |
__device__ __forceinline__ void | ExclusiveScan (T input, T &output, ScanOp scan_op, BlockPrefixCallbackOp &block_prefix_callback_op) |
Computes an exclusive thread block-wide prefix scan using the specified binary scan_op functor. Each thread contributes one input element. The call-back functor block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide block_aggregate of all inputs. | |
template<typename ScanOp > | |
__device__ __forceinline__ void | InclusiveScan (T input, T &output, ScanOp scan_op) |
Computes an inclusive thread block-wide prefix scan using the specified binary scan_op functor. Each thread contributes one input element. | |
template<typename ScanOp > | |
__device__ __forceinline__ void | InclusiveScan (T input, T &output, ScanOp scan_op, T &block_aggregate) |
Computes an inclusive thread block-wide prefix scan using the specified binary scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide block_aggregate of all inputs. | |
template<typename ScanOp , typename BlockPrefixCallbackOp > | |
__device__ __forceinline__ void | InclusiveScan (T input, T &output, ScanOp scan_op, BlockPrefixCallbackOp &block_prefix_callback_op) |
Computes an inclusive thread block-wide prefix scan using the specified binary scan_op functor. Each thread contributes one input element. The call-back functor block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide block_aggregate of all inputs. | |
Data Fields | |
_TempStorage & | temp_storage |
unsigned int | linear_tid |
T | cached_segment [SEGMENT_LENGTH] |
typedef BlockRakingLayout<T, BLOCK_THREADS, PTX_ARCH> cub::BlockScanRaking< T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, MEMOIZE, PTX_ARCH >::BlockRakingLayout |
Layout type for padded thread block raking grid.
Definition at line 76 of file block_scan_raking.cuh.
typedef WarpScan<T, RAKING_THREADS, PTX_ARCH> cub::BlockScanRaking< T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, MEMOIZE, PTX_ARCH >::WarpScan |
WarpScan utility type.
Definition at line 92 of file block_scan_raking.cuh.
anonymous enum |
Constants.
Enumerator | |
---|---|
BLOCK_THREADS | The thread block size in threads. |
Definition at line 69 of file block_scan_raking.cuh.
anonymous enum |
Constants.
Enumerator | |
---|---|
RAKING_THREADS | Number of raking threads. |
SEGMENT_LENGTH | Number of raking elements per warp-synchronous raking thread. |
WARP_SYNCHRONOUS | Cooperative work can be entirely warp synchronous. |
Definition at line 79 of file block_scan_raking.cuh.
|
inline |
Constructor.
Definition at line 236 of file block_scan_raking.cuh.
|
inline |
Templated copy (base case)
Definition at line 164 of file block_scan_raking.cuh.
|
inline |
Templated copy.
[out] | out | Out array |
[in] | in | Input array |
Definition at line 153 of file block_scan_raking.cuh.
|
inline |
Performs exclusive downsweep raking scan.
Definition at line 189 of file block_scan_raking.cuh.
|
inline |
Computes an exclusive thread block-wide prefix scan using the specified binary scan_op
functor. Each thread contributes one input element. With no initial value, the output computed for thread0 is undefined.
[in] | input | Calling thread's input item |
[out] | exclusive_output | Calling thread's output item (may be aliased to input ) |
[in] | scan_op | Binary scan operator |
Definition at line 250 of file block_scan_raking.cuh.
|
inline |
Computes an exclusive thread block-wide prefix scan using the specified binary scan_op
functor. Each thread contributes one input element.
[in] | input | Calling thread's input item |
[out] | output | Calling thread's output item (may be aliased to input ) |
[in] | initial_value | Initial value to seed the exclusive scan |
[in] | scan_op | Binary scan operator |
Definition at line 291 of file block_scan_raking.cuh.
|
inline |
Computes an exclusive thread block-wide prefix scan using the specified binary scan_op
functor. Each thread contributes one input element. Also provides every thread with the block-wide block_aggregate
of all inputs.
[in] | input | Calling thread's input item |
[out] | output | Calling thread's output item (may be aliased to input ) |
[in] | initial_value | Initial value to seed the exclusive scan |
[in] | scan_op | Binary scan operator |
[out] | block_aggregate | Threadblock-wide aggregate reduction of input items |
Definition at line 385 of file block_scan_raking.cuh.
|
inline |
Computes an exclusive thread block-wide prefix scan using the specified binary scan_op
functor. Each thread contributes one input element. The call-back functor block_prefix_callback_op
is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide block_aggregate
of all inputs.
[in] | input | Calling thread's input item |
[out] | output | Calling thread's output item (may be aliased to input ) |
[in] | scan_op | Binary scan operator |
[in,out] | block_prefix_callback_op | [warp0 only] Call-back functor for specifying a thread block-wide prefix to be applied to all inputs. |
Definition at line 438 of file block_scan_raking.cuh.
|
inline |
Computes an exclusive thread block-wide prefix scan using the specified binary scan_op
functor. Each thread contributes one input element. Also provides every thread with the block-wide block_aggregate
of all inputs. With no initial value, the output computed for thread0 is undefined.
[in] | input | Calling thread's input item |
[out] | output | Calling thread's output item (may be aliased to input ) |
[in] | scan_op | Binary scan operator |
[out] | block_aggregate | Threadblock-wide aggregate reduction of input items |
Definition at line 334 of file block_scan_raking.cuh.
|
inline |
Templated reduction (base case)
[in] | raking_partial | Prefix to seed reduction with |
Definition at line 141 of file block_scan_raking.cuh.
|
inline |
Templated reduction.
[in] | raking_ptr | Input array |
[in] | scan_op | Binary reduction operator |
[in] | raking_partial | Prefix to seed reduction with |
Definition at line 123 of file block_scan_raking.cuh.
|
inline |
Performs inclusive downsweep raking scan.
Definition at line 211 of file block_scan_raking.cuh.
|
inline |
Computes an inclusive thread block-wide prefix scan using the specified binary scan_op
functor. Each thread contributes one input element.
[in] | input | Calling thread's input item |
[out] | output | Calling thread's output item (may be aliased to input ) |
[in] | scan_op | Binary scan operator |
Definition at line 506 of file block_scan_raking.cuh.
|
inline |
Computes an inclusive thread block-wide prefix scan using the specified binary scan_op
functor. Each thread contributes one input element. The call-back functor block_prefix_callback_op
is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide block_aggregate
of all inputs.
[in] | input | Calling thread's input item |
[out] | output | Calling thread's output item (may be aliased to input ) |
[in] | scan_op | Binary scan operator |
[in,out] | block_prefix_callback_op | [warp0 only] Call-back functor for specifying a thread block-wide prefix to be applied to all inputs. |
Definition at line 601 of file block_scan_raking.cuh.
|
inline |
Computes an inclusive thread block-wide prefix scan using the specified binary scan_op
functor. Each thread contributes one input element. Also provides every thread with the block-wide block_aggregate
of all inputs.
[in] | input | Calling thread's input item |
[out] | output | Calling thread's output item (may be aliased to input ) |
[in] | scan_op | Binary scan operator |
[out] | block_aggregate | Threadblock-wide aggregate reduction of input items |
Definition at line 548 of file block_scan_raking.cuh.
|
inline |
Performs upsweep raking reduction, returning the aggregate.
Definition at line 173 of file block_scan_raking.cuh.
T cub::BlockScanRaking< T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, MEMOIZE, PTX_ARCH >::cached_segment[SEGMENT_LENGTH] |
Definition at line 114 of file block_scan_raking.cuh.
unsigned int cub::BlockScanRaking< T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, MEMOIZE, PTX_ARCH >::linear_tid |
Definition at line 113 of file block_scan_raking.cuh.
_TempStorage& cub::BlockScanRaking< T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, MEMOIZE, PTX_ARCH >::temp_storage |
Definition at line 112 of file block_scan_raking.cuh.