#include "../../util_ptx.cuh"
#include "../../util_arch.cuh"
#include "../../block/block_raking_layout.cuh"
#include "../../thread/thread_reduce.cuh"
#include "../../thread/thread_scan.cuh"
#include "../../warp/warp_scan.cuh"
#include "../../util_namespace.cuh"
// Linear thread-id of the calling thread within the thread block
unsigned int linear_tid;
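linear_tid is produced by RowMajorTid (documented in the member index below), which linearizes a potentially 3D thread index in row-major order. A minimal sketch of that mapping, assuming a fully general 3D block (CUB's actual implementation also short-circuits the 1D and 2D cases); the name is illustrative:

// Illustrative sketch only: row-major linearization of threadIdx,
// equivalent to (z * dimY + y) * dimX + x
__device__ __forceinline__ int RowMajorTidSketch(int block_dim_x, int block_dim_y)
{
    return ((threadIdx.z * block_dim_y) + threadIdx.y) * block_dim_x + threadIdx.x;
}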
// GuardedReduce: templated raking reduction across a thread's segment,
// guarded (when UNGUARDED is false) so that no invalid partial is read
template <
    int ITERATION,
    typename ScanOp>
    ...
        T addend = raking_ptr[ITERATION];
        raking_partial = scan_op(raking_partial, addend);
    ...

// GuardedReduce (base case): terminates the recursion at SEGMENT_LENGTH
template <
    typename ScanOp>
    ...
    return raking_partial;
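The two GuardedReduce overloads form a compile-time recursion: the Int2Type&lt;ITERATION&gt; tag selects the stepping overload until ITERATION reaches SEGMENT_LENGTH, where the more-specialized base case terminates the unroll. A minimal, self-contained sketch of the same pattern, assuming a fixed SEG_LEN and omitting the bounds guard (all names are illustrative, not CUB's):

template <int N>
struct Int2TypeSketch { enum { VALUE = N }; };

static const int SEG_LEN = 4;

// Base case: all SEG_LEN elements have been folded in
template <typename T, typename ScanOp>
__device__ __forceinline__ T GuardedReduceSketch(
    T* /*ptr*/, ScanOp /*op*/, T partial, Int2TypeSketch<SEG_LEN>)
{
    return partial;
}

// Stepping case: fold in element ITERATION, then recurse; the
// recursion unrolls completely at compile time because the tag type
// changes with each step
template <int ITERATION, typename T, typename ScanOp>
__device__ __forceinline__ T GuardedReduceSketch(
    T* ptr, ScanOp op, T partial, Int2TypeSketch<ITERATION>)
{
    partial = op(partial, ptr[ITERATION]);
    return GuardedReduceSketch(ptr, op, partial, Int2TypeSketch<ITERATION + 1>());
}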
// CopySegment: templated copy of a raking segment between shared
// memory and registers, one element per ITERATION
template <
    int ITERATION>
    ...
    out[ITERATION] = in[ITERATION];
// Upsweep: copy this raking thread's segment into registers, then
// reduce it, seeding the reduction with the segment's first element
template <
    typename ScanOp>
    ...
    T raking_partial = cached_segment[0];
// ExclusiveDownsweep: sequential exclusive scan of the cached segment,
// optionally seeded with a prefix (apply_prefix is false for the first
// raking thread of an exclusive scan that has no initial value)
template <
    typename ScanOp>
    ...
    bool apply_prefix = true)
    ...

// InclusiveDownsweep: same structure, but a sequential inclusive scan
template <
    typename ScanOp>
    ...
    bool apply_prefix = true)
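Together, these pieces implement the classic raking dataflow: every thread deposits its partial into a (padded) shared-memory grid, a single warp of raking threads serially reduces its segment (Upsweep), that warp cooperatively scans the per-segment reductions, and each raking thread then rescans its segment seeded with its exclusive partial (Downsweep). Below is a simplified, self-contained exclusive-sum sketch of this dataflow, assuming a 128-thread block, with no padding or guarding, and a hand-rolled warp scan in place of the WarpScan utility (all names are illustrative):

// Simplified raking exclusive-sum across one 128-thread block:
// 32 raking threads each own a 4-element segment of the shared grid.
__global__ void RakingExclusiveSumSketch(const int* in, int* out)
{
    const int BLOCK_THREADS  = 128;
    const int RAKING_THREADS = 32;
    const int SEGMENT_LENGTH = BLOCK_THREADS / RAKING_THREADS;  // 4

    __shared__ int grid[BLOCK_THREADS];
    int tid = threadIdx.x;

    // Placement: every thread deposits its partial into the grid
    grid[tid] = in[blockIdx.x * BLOCK_THREADS + tid];
    __syncthreads();

    if (tid < RAKING_THREADS)  // threads 0..31 are exactly warp 0
    {
        int* seg = grid + tid * SEGMENT_LENGTH;

        // Upsweep: serial reduction of this thread's segment
        int upsweep = 0;
        for (int i = 0; i < SEGMENT_LENGTH; ++i)
            upsweep += seg[i];

        // Warp-synchronous inclusive scan of the 32 upsweep partials
        int inclusive = upsweep;
        for (int offset = 1; offset < RAKING_THREADS; offset *= 2)
        {
            int n = __shfl_up_sync(0xffffffff, inclusive, offset);
            if (tid >= offset) inclusive += n;
        }
        int exclusive = inclusive - upsweep;

        // Downsweep: serial exclusive scan of the segment, seeded
        // with this raking thread's exclusive partial
        int running = exclusive;
        for (int i = 0; i < SEGMENT_LENGTH; ++i)
        {
            int v = seg[i];
            seg[i] = running;
            running += v;
        }
    }
    __syncthreads();

    // Each thread reads back its exclusive prefix
    out[blockIdx.x * BLOCK_THREADS + tid] = grid[tid];
}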
// Constructor: alias the caller's temporary storage and compute the
// row-major linear thread-id
temp_storage(temp_storage.Alias()),
linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
// ExclusiveScan (no initial value)
template <
    typename ScanOp>
    ...
    // WARP_SYNCHRONOUS path: short-circuit to a single warp scan
    WarpScan(temp_storage.warp_scan).ExclusiveScan(input, exclusive_output, scan_op);
    ...
    // Raking path: each thread places its partial into the raking grid
    *placement_ptr = input;
    ...
    // Raking threads warp-scan the upsweep partials
    WarpScan(temp_storage.warp_scan).ExclusiveScan(upsweep_partial, exclusive_partial, scan_op);
    ...
    // After the downsweep, each thread reads its prefix from the grid
    exclusive_output = *placement_ptr;
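BlockScanRaking is an internal specialization; user code normally reaches it through the public cub::BlockScan wrapper by requesting the BLOCK_SCAN_RAKING algorithm. A hypothetical kernel sketch (the 128-thread size and the max-scan operator are arbitrary choices, not from this file):

#include <climits>
#include <cub/block/block_scan.cuh>

__global__ void ExampleKernel(const int* d_in, int* d_out)
{
    // Specialize BlockScan for 128 threads, selecting the raking algorithm
    typedef cub::BlockScan<int, 128, cub::BLOCK_SCAN_RAKING> BlockScan;
    __shared__ typename BlockScan::TempStorage temp_storage;

    int thread_data = d_in[threadIdx.x];

    // Exclusive block-wide max-scan, seeded with INT_MIN
    BlockScan(temp_storage).ExclusiveScan(thread_data, thread_data, INT_MIN, cub::Max());

    d_out[threadIdx.x] = thread_data;
}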
// ExclusiveScan with an initial value
template <
    typename ScanOp>
    ...
    const T &initial_value,
    ...
    WarpScan(temp_storage.warp_scan).ExclusiveScan(input, output, initial_value, scan_op);
    ...
    *placement_ptr = input;
    ...
    WarpScan(temp_storage.warp_scan).ExclusiveScan(upsweep_partial, exclusive_partial, initial_value, scan_op);
    ...
    output = *placement_ptr;
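With an initial value, thread 0's exclusive output is well defined: it is the initial value itself. A small usage sketch, continuing the hypothetical kernel above:

// Inputs 0, 1, 2, ... across the block; seed the scan with 100
int thread_data = threadIdx.x;
BlockScan(temp_storage).ExclusiveScan(thread_data, thread_data, 100, cub::Sum());
// thread 0 now holds 100, thread 1 holds 100 + 0, thread 2 holds 100 + 0 + 1, ...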
// ExclusiveScan, also returning the block-wide aggregate
template <
    typename ScanOp>
    ...
    WarpScan(temp_storage.warp_scan).ExclusiveScan(input, output, scan_op, block_aggregate);
    ...
    *placement_ptr = input;
    ...
    // Raking threads compute both inclusive and exclusive partials
    WarpScan(temp_storage.warp_scan).Scan(upsweep_partial, inclusive_partial, exclusive_partial, scan_op);
    ...
    // The last raking thread's inclusive partial is the block aggregate
    temp_storage.block_aggregate = inclusive_partial;
    ...
    output = *placement_ptr;
    ...
    // All threads retrieve the aggregate from shared memory
    block_aggregate = temp_storage.block_aggregate;
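The aggregate-returning overloads are useful when the caller also needs the block-wide reduction, e.g., to publish a per-block total. A sketch in the same hypothetical setting as the earlier kernel:

int block_aggregate;
BlockScan(temp_storage).ExclusiveScan(thread_data, thread_data, cub::Sum(), block_aggregate);
// Every thread now also holds the sum of all 128 inputs in block_aggregate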
// ExclusiveScan with an initial value, also returning the block-wide aggregate
template <
    typename ScanOp>
    ...
    const T &initial_value,
    ...
    WarpScan(temp_storage.warp_scan).ExclusiveScan(input, output, initial_value, scan_op, block_aggregate);
    ...
    *placement_ptr = input;
    ...
    WarpScan(temp_storage.warp_scan).ExclusiveScan(upsweep_partial, exclusive_partial, initial_value, scan_op, block_aggregate);
    ...
    temp_storage.block_aggregate = block_aggregate;
    ...
    output = *placement_ptr;
    ...
    block_aggregate = temp_storage.block_aggregate;
// ExclusiveScan seeded by a caller-supplied prefix callback functor
template <
    typename ScanOp,
    typename BlockPrefixCallbackOp>
    ...
    BlockPrefixCallbackOp &block_prefix_callback_op)
    ...
    // WARP_SYNCHRONOUS path: scan, then obtain the block-wide prefix
    // from the callback and broadcast lane 0's value across the warp
    WarpScan warp_scan(temp_storage.warp_scan);
    warp_scan.ExclusiveScan(input, output, scan_op, block_aggregate);
    T block_prefix = block_prefix_callback_op(block_aggregate);
    block_prefix = warp_scan.Broadcast(block_prefix, 0);
    output = scan_op(block_prefix, output);
    ...
    // Thread 0's exclusive partial is invalid; it takes the prefix alone
    output = block_prefix;
    ...
    // Raking path
    *placement_ptr = input;
    ...
    WarpScan warp_scan(temp_storage.warp_scan);
    ...
    T exclusive_partial, block_aggregate;
    warp_scan.ExclusiveScan(upsweep_partial, exclusive_partial, scan_op, block_aggregate);
    ...
    T block_prefix = block_prefix_callback_op(block_aggregate);
    block_prefix = warp_scan.Broadcast(block_prefix, 0);
    ...
    // Seed the downsweep with the prefix combined with the exclusive
    // partial; the first raking thread uses the block prefix alone
    T downsweep_prefix = scan_op(block_prefix, exclusive_partial);
    downsweep_prefix = block_prefix;
    ...
    output = *placement_ptr;
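The prefix-callback overloads expect a stateful functor: it is invoked once per scan by the first warp with the block_aggregate, and the value returned by lane 0 becomes the seed that logically prefixes the block's inputs. A running-total functor in the style of the examples in CUB's BlockScan documentation (member and parameter names here are illustrative):

// Stateful callback functor: maintains a running prefix across tiles
struct RunningPrefixOp
{
    int running_total;

    __device__ RunningPrefixOp(int initial) : running_total(initial) {}

    // Called by the first warp once per scan; returns the prefix for
    // this tile and folds the tile's aggregate into the running total
    __device__ int operator()(int block_aggregate)
    {
        int old_prefix = running_total;
        running_total += block_aggregate;
        return old_prefix;
    }
};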
// InclusiveScan
template <
    typename ScanOp>
    ...
    WarpScan(temp_storage.warp_scan).InclusiveScan(input, output, scan_op);
    ...
    *placement_ptr = input;
    ...
    // Note: the raking path still performs an EXCLUSIVE warp scan of the
    // upsweep partials; inclusivity is restored by InclusiveDownsweep
    WarpScan(temp_storage.warp_scan).ExclusiveScan(upsweep_partial, exclusive_partial, scan_op);
    ...
    output = *placement_ptr;
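The inclusive block scan uses an exclusive warp scan of the upsweep partials because each raking segment needs the combined total of everything before it; inclusivity is then restored inside the segment. A sum-only sketch of the two sequential downsweep flavors over a cached segment (illustrative names, not CUB's ThreadScan internals):

// Exclusive flavor: element i receives prefix + seg[0..i-1]
__device__ void ExclusiveDownsweepSketch(int* seg, int len, int prefix)
{
    int running = prefix;
    for (int i = 0; i < len; ++i)
    {
        int v = seg[i];
        seg[i] = running;
        running += v;
    }
}

// Inclusive flavor: element i receives prefix + seg[0..i]
__device__ void InclusiveDownsweepSketch(int* seg, int len, int prefix)
{
    int running = prefix;
    for (int i = 0; i < len; ++i)
    {
        running += seg[i];
        seg[i] = running;
    }
}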
// InclusiveScan, also returning the block-wide aggregate
template <
    typename ScanOp>
    ...
    WarpScan(temp_storage.warp_scan).InclusiveScan(input, output, scan_op, block_aggregate);
    ...
    *placement_ptr = input;
    ...
    WarpScan(temp_storage.warp_scan).Scan(upsweep_partial, inclusive_partial, exclusive_partial, scan_op);
    ...
    // The last raking thread's inclusive partial is the block aggregate
    temp_storage.block_aggregate = inclusive_partial;
    ...
    output = *placement_ptr;
    ...
    block_aggregate = temp_storage.block_aggregate;
// InclusiveScan seeded by a caller-supplied prefix callback functor
template <
    typename ScanOp,
    typename BlockPrefixCallbackOp>
    ...
    BlockPrefixCallbackOp &block_prefix_callback_op)
    ...
    // WARP_SYNCHRONOUS path: scan, then obtain the block-wide prefix
    // from the callback and broadcast lane 0's value across the warp
    WarpScan warp_scan(temp_storage.warp_scan);
    warp_scan.InclusiveScan(input, output, scan_op, block_aggregate);
    T block_prefix = block_prefix_callback_op(block_aggregate);
    block_prefix = warp_scan.Broadcast(block_prefix, 0);
    output = scan_op(block_prefix, output);
    ...
    // Raking path
    *placement_ptr = input;
    ...
    WarpScan warp_scan(temp_storage.warp_scan);
    ...
    T exclusive_partial, block_aggregate;
    warp_scan.ExclusiveScan(upsweep_partial, exclusive_partial, scan_op, block_aggregate);
    ...
    T block_prefix = block_prefix_callback_op(block_aggregate);
    block_prefix = warp_scan.Broadcast(block_prefix, 0);
    ...
    // Seed the inclusive downsweep; the first raking thread uses the
    // block prefix alone
    T downsweep_prefix = scan_op(block_prefix, exclusive_partial);
    downsweep_prefix = block_prefix;
    ...
    output = *placement_ptr;
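Putting the callback machinery to use: a multi-tile scan feeds each tile with the running total of all previous tiles. A hypothetical kernel sketch using the RunningPrefixOp functor from above (note the __syncthreads() before temp_storage is reused in the next iteration):

__global__ void MultiTileInclusiveSum(const int* d_in, int* d_out, int num_items)
{
    typedef cub::BlockScan<int, 128, cub::BLOCK_SCAN_RAKING> BlockScan;
    __shared__ typename BlockScan::TempStorage temp_storage;

    RunningPrefixOp prefix_op(0);  // running total starts at 0

    // Consume the input in tiles of 128 items
    for (int tile_base = 0; tile_base < num_items; tile_base += 128)
    {
        int idx = tile_base + threadIdx.x;
        int thread_data = (idx < num_items) ? d_in[idx] : 0;

        // The callback seeds this tile's scan with the running prefix
        BlockScan(temp_storage).InclusiveSum(thread_data, thread_data, prefix_op);
        __syncthreads();

        if (idx < num_items) d_out[idx] = thread_data;
    }
}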
__device__ __forceinline__ T ThreadScanExclusive(T inclusive, T exclusive, T *input, T *output, ScanOp scan_op, Int2Type< LENGTH >)
__device__ __forceinline__ T ThreadScanInclusive(T inclusive, T *input, T *output, ScanOp scan_op, Int2Type< LENGTH >)
__device__ __forceinline__ int RowMajorTid(int block_dim_x, int block_dim_y, int block_dim_z)
Returns the row-major linear thread identifier for a multidimensional thread block.
CUB_NS_PREFIX
Optional outer namespace(s)
struct BlockRakingLayout::TempStorage
Alias wrapper allowing storage to be unioned.
@ UNGUARDED
Whether or not we need bounds checking during raking (the number of reduction elements is not a multiple of the number of raking threads)
@ SEGMENT_LENGTH
Number of raking elements per warp-synchronous raking thread (rounded up)
@ RAKING_THREADS
Never use a raking thread that will have no valid data (e.g., when BLOCK_THREADS is 62 and SEGMENT_LENGTH is 2, we should only use 31 raking threads)
static __device__ __forceinline__ T * PlacementPtr(TempStorage &temp_storage, unsigned int linear_tid)
Returns the location for the calling thread to place data into the grid.
static __device__ __forceinline__ T * RakingPtr(TempStorage &temp_storage, unsigned int linear_tid)
Returns the location for the calling thread to begin sequential raking.
struct BlockScanRaking::TempStorage
Alias wrapper allowing storage to be unioned.
struct BlockScanRaking::_TempStorage
Shared memory storage layout type.
BlockRakingLayout::TempStorage raking_grid
Padded thread block raking grid.
T block_aggregate
Block aggregate.
WarpScan::TempStorage warp_scan
Buffer for warp-synchronous scan.
BlockScanRaking provides variants of raking-based parallel prefix scan across a CUDA thread block.
BlockRakingLayout< T, BLOCK_THREADS, PTX_ARCH > BlockRakingLayout
Layout type for padded thread block raking grid.
__device__ __forceinline__ void ExclusiveScan(T input, T &exclusive_output, ScanOp scan_op)
Computes an exclusive thread block-wide prefix scan using the specified binary scan_op functor. Each thread contributes one input element. With no initial value, the output computed for thread 0 is undefined.
__device__ __forceinline__ T GuardedReduce(T *, ScanOp, T raking_partial, Int2Type< SEGMENT_LENGTH >)
Templated reduction (base case)
__device__ __forceinline__ void InclusiveScan(T input, T &output, ScanOp scan_op, T &block_aggregate)
Computes an inclusive thread block-wide prefix scan using the specified binary scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide block_aggregate of all inputs.
__device__ __forceinline__ void InclusiveScan(T input, T &output, ScanOp scan_op)
Computes an inclusive thread block-wide prefix scan using the specified binary scan_op functor. Each thread contributes one input element.
__device__ __forceinline__ BlockScanRaking(TempStorage &temp_storage)
Constructor.
__device__ __forceinline__ void CopySegment(T *out, T *in, Int2Type< ITERATION >)
Templated copy.
__device__ __forceinline__ void ExclusiveScan(T input, T &output, ScanOp scan_op, T &block_aggregate)
Computes an exclusive thread block-wide prefix scan using the specified binary scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide block_aggregate of all inputs. With no initial value, the output computed for thread 0 is undefined.
__device__ __forceinline__ void ExclusiveScan(T input, T &output, const T &initial_value, ScanOp scan_op)
Computes an exclusive thread block-wide prefix scan using the specified binary scan_op functor. Each thread contributes one input element.
__device__ __forceinline__ void InclusiveDownsweep(ScanOp scan_op, T raking_partial, bool apply_prefix=true)
Performs inclusive downsweep raking scan.
__device__ __forceinline__ void ExclusiveScan(T input, T &output, const T &initial_value, ScanOp scan_op, T &block_aggregate)
Computes an exclusive thread block-wide prefix scan using the specified binary scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide block_aggregate of all inputs.
@ BLOCK_THREADS
The thread block size in threads.
__device__ __forceinline__ void InclusiveScan(T input, T &output, ScanOp scan_op, BlockPrefixCallbackOp &block_prefix_callback_op)
Computes an inclusive thread block-wide prefix scan using the specified binary scan_op functor. Each thread contributes one input element. The call-back functor block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane 0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs.
@ WARP_SYNCHRONOUS
Cooperative work can be entirely warp synchronous.
@ SEGMENT_LENGTH
Number of raking elements per warp synchronous raking thread.
@ RAKING_THREADS
Number of raking threads.
WarpScan< T, RAKING_THREADS, PTX_ARCH > WarpScan
WarpScan utility type.
__device__ __forceinline__ void CopySegment(T *, T *, Int2Type< SEGMENT_LENGTH >)
Templated copy (base case)
__device__ __forceinline__ void ExclusiveScan(T input, T &output, ScanOp scan_op, BlockPrefixCallbackOp &block_prefix_callback_op)
Computes an exclusive thread block-wide prefix scan using the specified binary scan_op functor. Each thread contributes one input element. The call-back functor block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane 0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs.
__device__ __forceinline__ T Upsweep(ScanOp scan_op)
Performs upsweep raking reduction, returning the aggregate.
__device__ __forceinline__ T GuardedReduce(T *raking_ptr, ScanOp scan_op, T raking_partial, Int2Type< ITERATION >)
Templated reduction.
__device__ __forceinline__ void ExclusiveDownsweep(ScanOp scan_op, T raking_partial, bool apply_prefix=true)
Performs exclusive downsweep raking scan.
Allows for the treatment of an integral constant as a type at compile-time (e.g., to achieve static call dispatch based on constant integral values)
A storage-backing wrapper that allows types with non-trivial constructors to be aliased in unions.
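The Uninitialized wrapper solves a concrete problem: union members and __shared__ variables may not have non-trivial default constructors, so CUB backs T with raw storage and reinterprets it on demand via Alias(). An illustrative reimplementation of the idea (CUB's actual version in util_type.cuh is built on device words rather than raw bytes):

// Illustrative only: back T with raw aligned bytes so a type with a
// non-trivial constructor can sit in a union or in __shared__ memory
// without its constructor running
template <typename T>
struct UninitializedSketch
{
    alignas(T) unsigned char storage[sizeof(T)];

    // Reinterpret the raw storage as a T
    __host__ __device__ T& Alias()
    {
        return reinterpret_cast<T&>(storage);
    }
};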