36#include "../../util_arch.cuh"
37#include "../../util_ptx.cuh"
38#include "../../warp/warp_scan.cuh"
39#include "../../util_namespace.cuh"
56struct BlockScanWarpScans
110 _TempStorage &temp_storage;
111 unsigned int linear_tid;
112 unsigned int warp_id;
113 unsigned int lane_id;
124 temp_storage(temp_storage.Alias()),
125 linear_tid(
RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)),
126 warp_id((
OUTER_WARPS == 1) ? 0 : linear_tid / OUTER_WARP_THREADS),
127 lane_id((
OUTER_WARPS == 1) ? linear_tid : linear_tid % OUTER_WARP_THREADS)
136 template <
typename ScanOp>
149 template <
typename ScanOp>
153 const T &initial_value,
162 template <
typename ScanOp>
171 OuterWarpScanT(temp_storage.aliasable.outer_warp_scan.Alias()[warp_id]).Scan(
172 input, inclusive_output, exclusive_output,
scan_op);
175 if (lane_id == OUTER_WARP_THREADS - 1)
176 temp_storage.warp_aggregates[warp_id] = inclusive_output;
182 T outer_warp_input = temp_storage.warp_aggregates[linear_tid];
183 T outer_warp_exclusive;
185 InnerWarpScanT(temp_storage.aliasable.inner_warp_scan).ExclusiveScan(
186 outer_warp_input, outer_warp_exclusive,
scan_op, block_aggregate);
188 temp_storage.block_aggregate = block_aggregate;
189 temp_storage.warp_aggregates[linear_tid] = outer_warp_exclusive;
197 block_aggregate = temp_storage.block_aggregate;
200 T outer_warp_exclusive = temp_storage.warp_aggregates[warp_id];
201 exclusive_output =
scan_op(outer_warp_exclusive, exclusive_output);
203 exclusive_output = outer_warp_exclusive;
209 template <
typename ScanOp>
213 const T &initial_value,
219 OuterWarpScanT(temp_storage.aliasable.outer_warp_scan.Alias()[warp_id]).Scan(
220 input, inclusive_output, exclusive_output,
scan_op);
223 if (lane_id == OUTER_WARP_THREADS - 1)
225 temp_storage.warp_aggregates[warp_id] = inclusive_output;
232 T outer_warp_input = temp_storage.warp_aggregates[linear_tid];
233 T outer_warp_exclusive;
235 InnerWarpScanT(temp_storage.aliasable.inner_warp_scan).ExclusiveScan(
236 outer_warp_input, outer_warp_exclusive, initial_value,
scan_op, block_aggregate);
238 temp_storage.block_aggregate = block_aggregate;
239 temp_storage.warp_aggregates[linear_tid] = outer_warp_exclusive;
245 block_aggregate = temp_storage.block_aggregate;
248 T outer_warp_exclusive = temp_storage.warp_aggregates[warp_id];
249 exclusive_output =
scan_op(outer_warp_exclusive, exclusive_output);
251 exclusive_output = outer_warp_exclusive;
258 typename BlockPrefixCallbackOp>
263 BlockPrefixCallbackOp &block_prefix_callback_op)
267 OuterWarpScanT(temp_storage.aliasable.outer_warp_scan.Alias()[warp_id]).Scan(
268 input, inclusive_output, exclusive_output,
scan_op);
271 if (lane_id == OUTER_WARP_THREADS - 1)
272 temp_storage.warp_aggregates[warp_id] = inclusive_output;
278 InnerWarpScanT inner_scan(temp_storage.aliasable.inner_warp_scan);
280 T upsweep = temp_storage.warp_aggregates[linear_tid];
281 T downsweep_prefix, block_aggregate;
286 T block_prefix = block_prefix_callback_op(block_aggregate);
287 block_prefix = inner_scan.
Broadcast(block_prefix, 0);
289 downsweep_prefix =
scan_op(block_prefix, downsweep_prefix);
291 downsweep_prefix = block_prefix;
293 temp_storage.warp_aggregates[linear_tid] = downsweep_prefix;
299 T outer_warp_exclusive = temp_storage.warp_aggregates[warp_id];
300 exclusive_output =
scan_op(outer_warp_exclusive, exclusive_output);
302 exclusive_output = outer_warp_exclusive;
311 template <
typename ScanOp>
323 template <
typename ScanOp>
331 OuterWarpScanT(temp_storage.aliasable.outer_warp_scan.Alias()[warp_id]).InclusiveScan(
332 input, inclusive_output,
scan_op);
335 if (lane_id == OUTER_WARP_THREADS - 1)
336 temp_storage.warp_aggregates[warp_id] = inclusive_output;
342 T outer_warp_input = temp_storage.warp_aggregates[linear_tid];
343 T outer_warp_exclusive;
345 InnerWarpScanT(temp_storage.aliasable.inner_warp_scan).ExclusiveScan(
346 outer_warp_input, outer_warp_exclusive,
scan_op, block_aggregate);
348 temp_storage.block_aggregate = block_aggregate;
349 temp_storage.warp_aggregates[linear_tid] = outer_warp_exclusive;
357 block_aggregate = temp_storage.block_aggregate;
360 T outer_warp_exclusive = temp_storage.warp_aggregates[warp_id];
361 inclusive_output =
scan_op(outer_warp_exclusive, inclusive_output);
369 typename BlockPrefixCallbackOp>
374 BlockPrefixCallbackOp &block_prefix_callback_op)
377 OuterWarpScanT(temp_storage.aliasable.outer_warp_scan.Alias()[warp_id]).InclusiveScan(
378 input, inclusive_output,
scan_op);
381 if (lane_id == OUTER_WARP_THREADS - 1)
382 temp_storage.warp_aggregates[warp_id] = inclusive_output;
388 InnerWarpScanT inner_scan(temp_storage.aliasable.inner_warp_scan);
390 T upsweep = temp_storage.warp_aggregates[linear_tid];
391 T downsweep_prefix, block_aggregate;
395 T block_prefix = block_prefix_callback_op(block_aggregate);
396 block_prefix = inner_scan.
Broadcast(block_prefix, 0);
398 downsweep_prefix =
scan_op(block_prefix, downsweep_prefix);
400 downsweep_prefix = block_prefix;
402 temp_storage.warp_aggregates[linear_tid] = downsweep_prefix;
408 T outer_warp_exclusive = temp_storage.warp_aggregates[warp_id];
409 inclusive_output =
scan_op(outer_warp_exclusive, inclusive_output);
The WarpScan class provides collective methods for computing a parallel prefix scan of items partitio...
__device__ __forceinline__ void ExclusiveScan(T input, T &exclusive_output, ScanOp scan_op)
Computes an exclusive prefix scan using the specified binary scan functor across the calling warp....
__device__ __forceinline__ T Broadcast(T input, unsigned int src_lane)
Broadcast the value input from warp-lanesrc_lane to all lanes in the warp.
__device__ __forceinline__ int RowMajorTid(int block_dim_x, int block_dim_y, int block_dim_z)
Returns the row-major linear thread identifier for a multidimensional thread block.
Optional outer namespace(s)
OutputIteratorT ScanTileStateT int ScanOpT scan_op
Binary scan functor.
Alias wrapper allowing storage to be unioned.
Shared memory storage layout type.
T block_aggregate
Shared prefix for the entire thread block.
__device__ __forceinline__ void ExclusiveScan(T input, T &exclusive_output, ScanOp scan_op, BlockPrefixCallbackOp &block_prefix_callback_op)
Computes an exclusive thread block-wide prefix scan using the specified binary scan_op functor....
WarpScan< T, INNER_WARP_THREADS, PTX_ARCH > InnerWarpScanT
Inner WarpScan utility type.
__device__ __forceinline__ void ExclusiveScan(T input, T &exclusive_output, ScanOp scan_op, T &block_aggregate)
Computes an exclusive thread block-wide prefix scan using the specified binary scan_op functor....
__device__ __forceinline__ void ExclusiveScan(T input, T &exclusive_output, ScanOp scan_op)
Computes an exclusive thread block-wide prefix scan using the specified binary scan_op functor....
__device__ __forceinline__ void InclusiveScan(T input, T &inclusive_output, ScanOp scan_op, BlockPrefixCallbackOp &block_prefix_callback_op)
Computes an inclusive thread block-wide prefix scan using the specified binary scan_op functor....
__device__ __forceinline__ void ExclusiveScan(T input, T &exclusive_output, const T &initial_value, ScanOp scan_op, T &block_aggregate)
Computes an exclusive thread block-wide prefix scan using the specified binary scan_op functor....
__device__ __forceinline__ void InclusiveScan(T input, T &inclusive_output, ScanOp scan_op)
Computes an inclusive thread block-wide prefix scan using the specified binary scan_op functor....
__device__ __forceinline__ void ExclusiveScan(T input, T &exclusive_output, const T &initial_value, ScanOp scan_op)
Computes an exclusive thread block-wide prefix scan using the specified binary scan_op functor....
__device__ __forceinline__ BlockScanWarpScans(TempStorage &temp_storage)
Constructor.
WarpScan< T, OUTER_WARP_THREADS, PTX_ARCH > OuterWarpScanT
Outer WarpScan utility type.
__device__ __forceinline__ void InclusiveScan(T input, T &inclusive_output, ScanOp scan_op, T &block_aggregate)
Computes an inclusive thread block-wide prefix scan using the specified binary scan_op functor....
@ INNER_WARP_THREADS
Number of warp threads.
@ BLOCK_THREADS
The thread block size in threads.
@ OUTER_WARPS
Number of outer scan warps.
A storage-backing wrapper that allows types with non-trivial constructors to be aliased in unions.
InnerWarpScanT::TempStorage inner_warp_scan
Buffer for warp-synchronous inner scan.
Uninitialized< OuterScanArray > outer_warp_scan
Buffer for warp-synchronous outer scans.