36#include "../../util_arch.cuh"
37#include "../../util_ptx.cuh"
38#include "../../warp/warp_scan.cuh"
39#include "../../util_namespace.cuh"
85 T warp_aggregates[
WARPS];
101 unsigned int linear_tid;
102 unsigned int warp_id;
103 unsigned int lane_id;
114 temp_storage(temp_storage.Alias()),
115 linear_tid(
RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)),
125 template <
typename ScanOp,
int WARP>
133 warp_prefix = block_aggregate;
135 T addend = temp_storage.warp_aggregates[WARP];
136 block_aggregate =
scan_op(block_aggregate, addend);
141 template <
typename ScanOp>
151 template <
typename ScanOp>
159 temp_storage.warp_aggregates[warp_id] = warp_aggregate;
165 block_aggregate = temp_storage.warp_aggregates[0];
186 template <
typename ScanOp>
191 const T &initial_value)
195 warp_prefix =
scan_op(initial_value, warp_prefix);
198 warp_prefix = initial_value;
208 template <
typename ScanOp>
221 template <
typename ScanOp>
225 const T &initial_value,
234 template <
typename ScanOp>
243 WarpScanT(temp_storage.warp_scan[warp_id]).Scan(input, inclusive_output, exclusive_output,
scan_op);
251 exclusive_output =
scan_op(warp_prefix, exclusive_output);
253 exclusive_output = warp_prefix;
259 template <
typename ScanOp>
263 const T &initial_value,
269 WarpScanT(temp_storage.warp_scan[warp_id]).Scan(input, inclusive_output, exclusive_output,
scan_op);
275 exclusive_output =
scan_op(warp_prefix, exclusive_output);
277 exclusive_output = warp_prefix;
284 typename BlockPrefixCallbackOp>
289 BlockPrefixCallbackOp &block_prefix_callback_op)
298 T block_prefix = block_prefix_callback_op(block_aggregate);
302 temp_storage.block_prefix = block_prefix;
303 exclusive_output = block_prefix;
310 T block_prefix = temp_storage.block_prefix;
313 exclusive_output =
scan_op(block_prefix, exclusive_output);
323 template <
typename ScanOp>
335 template <
typename ScanOp>
342 WarpScanT(temp_storage.warp_scan[warp_id]).InclusiveScan(input, inclusive_output,
scan_op);
350 inclusive_output =
scan_op(warp_prefix, inclusive_output);
358 typename BlockPrefixCallbackOp>
363 BlockPrefixCallbackOp &block_prefix_callback_op)
371 T block_prefix = block_prefix_callback_op(block_aggregate);
375 temp_storage.block_prefix = block_prefix;
382 T block_prefix = temp_storage.block_prefix;
383 exclusive_output =
scan_op(block_prefix, exclusive_output);
The WarpScan class provides collective methods for computing a parallel prefix scan of items partitioned across a CUDA thread warp.
__device__ __forceinline__ int RowMajorTid(int block_dim_x, int block_dim_y, int block_dim_z)
Returns the row-major linear thread identifier for a multidimensional thread block.
__device__ __forceinline__ unsigned int LaneId()
Returns the warp lane ID of the calling thread.
Optional outer namespace(s)
OutputIteratorT ScanTileStateT int ScanOpT scan_op
Binary scan functor.
Alias wrapper allowing storage to be unioned.
Shared memory storage layout type.
BlockScanWarpScans provides warpscan-based variants of parallel prefix scan across a CUDA thread block.
__device__ __forceinline__ void ExclusiveScan(T input, T &exclusive_output, ScanOp scan_op)
Computes an exclusive thread block-wide prefix scan using the specified binary scan_op functor....
@ WARP_THREADS
Number of warp threads.
@ WARPS
Number of active warps.
__device__ __forceinline__ void InclusiveScan(T input, T &inclusive_output, ScanOp scan_op)
Computes an inclusive thread block-wide prefix scan using the specified binary scan_op functor....
struct __align__(32) _TempStorage
Shared memory storage layout type.
__device__ __forceinline__ T ComputeWarpPrefix(ScanOp scan_op, T warp_aggregate, T &block_aggregate)
Use the warp-wide aggregates to compute the calling warp's prefix. Also returns block-wide aggregate in all threads.
WarpScan< T, WARPS, PTX_ARCH > WarpAggregateScan
WarpScan utility type.
__device__ __forceinline__ BlockScanWarpScans(TempStorage &temp_storage)
Constructor.
WarpScan< T, WARP_THREADS, PTX_ARCH > WarpScanT
WarpScan utility type.
@ BLOCK_THREADS
The thread block size in threads.
__device__ __forceinline__ void ApplyWarpAggregates(T &warp_prefix, ScanOp scan_op, T &block_aggregate, Int2Type< WARP >)
Allows for the treatment of an integral constant as a type at compile-time (e.g., to achieve static call dispatch based on constant integral values).
A storage-backing wrapper that allows types with non-trivial constructors to be aliased in unions.