#include "../../util_ptx.cuh"
#include "../../util_arch.cuh"
#include "../../block/block_raking_layout.cuh"
#include "../../thread/thread_reduce.cuh"
#include "../../thread/thread_scan.cuh"
#include "../../warp/warp_scan.cuh"
#include "../../util_namespace.cuh"
  113    unsigned int    linear_tid;
 
template <int ITERATION, typename ScanOp>
 
  131            T addend = raking_ptr[ITERATION];
 
raking_partial = scan_op(raking_partial, addend);
 
template <typename ScanOp>
 
  147        return raking_partial;
 
template <int ITERATION>
 
  158        out[ITERATION] = in[ITERATION];
 
template <typename ScanOp>
 
  181        T raking_partial = cached_segment[0];
 
template <typename ScanOp>
 
bool            apply_prefix = true)
 
template <typename ScanOp>
 
bool            apply_prefix = true)
 
  239        temp_storage(temp_storage.Alias()),
 
linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
 
template <typename ScanOp>
 
WarpScan(temp_storage.warp_scan).ExclusiveScan(input, exclusive_output, scan_op);
 
  264            *placement_ptr = input;
 
WarpScan(temp_storage.warp_scan).ExclusiveScan(upsweep_partial, exclusive_partial, scan_op);
 
  285            exclusive_output = *placement_ptr;
 
template <typename ScanOp>
 
  294        const T         &initial_value,     
 
WarpScan(temp_storage.warp_scan).ExclusiveScan(input, output, initial_value, scan_op);
 
  306            *placement_ptr = input;
 
WarpScan(temp_storage.warp_scan).ExclusiveScan(upsweep_partial, exclusive_partial, initial_value, scan_op);
 
  327            output = *placement_ptr;
 
template <typename ScanOp>
 
WarpScan(temp_storage.warp_scan).ExclusiveScan(input, output, scan_op, block_aggregate);
 
  349            *placement_ptr = input;
 
WarpScan(temp_storage.warp_scan).Scan(upsweep_partial, inclusive_partial, exclusive_partial, scan_op);
 
  369                    temp_storage.block_aggregate = inclusive_partial;
 
  375            output = *placement_ptr;
 
  378            block_aggregate = temp_storage.block_aggregate;
 
template <typename ScanOp>
 
  388        const T         &initial_value,     
 
WarpScan(temp_storage.warp_scan).ExclusiveScan(input, output, initial_value, scan_op, block_aggregate);
 
  401            *placement_ptr = input;
 
WarpScan(temp_storage.warp_scan).ExclusiveScan(upsweep_partial, exclusive_partial, initial_value, scan_op, block_aggregate);
 
  420                    temp_storage.block_aggregate = block_aggregate;
 
  426            output = *placement_ptr;
 
  429            block_aggregate = temp_storage.block_aggregate;
 
  437        typename BlockPrefixCallbackOp>
 
  442        BlockPrefixCallbackOp   &block_prefix_callback_op)      
 
  448            WarpScan warp_scan(temp_storage.warp_scan);
 
warp_scan.ExclusiveScan(input, output, scan_op, block_aggregate);
 
  452            T block_prefix = block_prefix_callback_op(block_aggregate);
 
  453            block_prefix = warp_scan.Broadcast(block_prefix, 0);
 
output = scan_op(block_prefix, output);
 
  457                output = block_prefix;
 
  463            *placement_ptr = input;
 
  470                WarpScan warp_scan(temp_storage.warp_scan);
 
  476                T exclusive_partial, block_aggregate;
 
warp_scan.ExclusiveScan(upsweep_partial, exclusive_partial, scan_op, block_aggregate);
 
  480                T block_prefix = block_prefix_callback_op(block_aggregate);
 
  481                block_prefix = warp_scan.Broadcast(block_prefix, 0);
 
T downsweep_prefix = scan_op(block_prefix, exclusive_partial);
 
  486                    downsweep_prefix = block_prefix;
 
  495            output = *placement_ptr;
 
template <typename ScanOp>
 
WarpScan(temp_storage.warp_scan).InclusiveScan(input, output, scan_op);
 
  520            *placement_ptr = input;
 
WarpScan(temp_storage.warp_scan).ExclusiveScan(upsweep_partial, exclusive_partial, scan_op);
 
  541            output = *placement_ptr;
 
template <typename ScanOp>
 
WarpScan(temp_storage.warp_scan).InclusiveScan(input, output, scan_op, block_aggregate);
 
  563            *placement_ptr = input;
 
WarpScan(temp_storage.warp_scan).Scan(upsweep_partial, inclusive_partial, exclusive_partial, scan_op);
 
  583                    temp_storage.block_aggregate = inclusive_partial;
 
  589            output = *placement_ptr;
 
  592            block_aggregate = temp_storage.block_aggregate;
 
  600        typename BlockPrefixCallbackOp>
 
  605        BlockPrefixCallbackOp   &block_prefix_callback_op)      
 
  611            WarpScan warp_scan(temp_storage.warp_scan);
 
warp_scan.InclusiveScan(input, output, scan_op, block_aggregate);
 
  615            T block_prefix = block_prefix_callback_op(block_aggregate);
 
  616            block_prefix = warp_scan.Broadcast(block_prefix, 0);
 
output = scan_op(block_prefix, output);
 
  625            *placement_ptr = input;
 
  632                WarpScan warp_scan(temp_storage.warp_scan);
 
  638                T exclusive_partial, block_aggregate;
 
warp_scan.ExclusiveScan(upsweep_partial, exclusive_partial, scan_op, block_aggregate);
 
  642                T block_prefix = block_prefix_callback_op(block_aggregate);
 
  643                block_prefix = warp_scan.Broadcast(block_prefix, 0);
 
T downsweep_prefix = scan_op(block_prefix, exclusive_partial);
 
  648                    downsweep_prefix = block_prefix;
 
  657            output = *placement_ptr;
 
__device__ __forceinline__ T ThreadScanExclusive(T inclusive, T exclusive, T *input, T *output, ScanOp scan_op, Int2Type< LENGTH >)
 
__device__ __forceinline__ T ThreadScanInclusive(T inclusive, T *input, T *output, ScanOp scan_op, Int2Type< LENGTH >)
 
__device__ __forceinline__ int RowMajorTid(int block_dim_x, int block_dim_y, int block_dim_z)
Returns the row-major linear thread identifier for a multidimensional thread block.
 
Optional outer namespace(s)
 
OutputIteratorT ScanTileStateT int ScanOpT scan_op
Binary scan functor.
 
Alias wrapper allowing storage to be unioned.
 
@ UNGUARDED
Whether or not we need bounds checking during raking (the number of reduction elements is not a multiple of the number of raking threads).
 
@ SEGMENT_LENGTH
Number of raking elements per warp-synchronous raking thread (rounded up)
 
@ RAKING_THREADS
Never use a raking thread that will have no valid data (e.g., when BLOCK_THREADS is 62 and SEGMENT_LENGTH is 2, we should only use 31 raking threads).
 
static __device__ __forceinline__ T * PlacementPtr(TempStorage &temp_storage, unsigned int linear_tid)
Returns the location for the calling thread to place data into the grid.
 
static __device__ __forceinline__ T * RakingPtr(TempStorage &temp_storage, unsigned int linear_tid)
Returns the location for the calling thread to begin sequential raking.
 
Alias wrapper allowing storage to be unioned.
 
Shared memory storage layout type.
 
BlockRakingLayout::TempStorage raking_grid
Padded thread block raking grid.
 
T block_aggregate
Block aggregate.
 
WarpScan::TempStorage warp_scan
Buffer for warp-synchronous scan.
 
BlockScanRaking provides variants of raking-based parallel prefix scan across a CUDA thread block.
 
BlockRakingLayout< T, BLOCK_THREADS, PTX_ARCH > BlockRakingLayout
Layout type for padded thread block raking grid.
 
__device__ __forceinline__ void ExclusiveScan(T input, T &exclusive_output, ScanOp scan_op)
Computes an exclusive thread block-wide prefix scan using the specified binary scan_op functor....
 
__device__ __forceinline__ T GuardedReduce(T *, ScanOp, T raking_partial, Int2Type< SEGMENT_LENGTH >)
Templated reduction (base case)
 
__device__ __forceinline__ void InclusiveScan(T input, T &output, ScanOp scan_op, T &block_aggregate)
Computes an inclusive thread block-wide prefix scan using the specified binary scan_op functor....
 
__device__ __forceinline__ void InclusiveScan(T input, T &output, ScanOp scan_op)
Computes an inclusive thread block-wide prefix scan using the specified binary scan_op functor....
 
__device__ __forceinline__ BlockScanRaking(TempStorage &temp_storage)
Constructor.
 
__device__ __forceinline__ void CopySegment(T *out, T *in, Int2Type< ITERATION >)
Templated copy.
 
__device__ __forceinline__ void ExclusiveScan(T input, T &output, ScanOp scan_op, T &block_aggregate)
Computes an exclusive thread block-wide prefix scan using the specified binary scan_op functor....
 
__device__ __forceinline__ void ExclusiveScan(T input, T &output, const T &initial_value, ScanOp scan_op)
Computes an exclusive thread block-wide prefix scan using the specified binary scan_op functor....
 
__device__ __forceinline__ void InclusiveDownsweep(ScanOp scan_op, T raking_partial, bool apply_prefix=true)
Performs inclusive downsweep raking scan.
 
__device__ __forceinline__ void ExclusiveScan(T input, T &output, const T &initial_value, ScanOp scan_op, T &block_aggregate)
Computes an exclusive thread block-wide prefix scan using the specified binary scan_op functor....
 
@ BLOCK_THREADS
The thread block size in threads.
 
__device__ __forceinline__ void InclusiveScan(T input, T &output, ScanOp scan_op, BlockPrefixCallbackOp &block_prefix_callback_op)
Computes an inclusive thread block-wide prefix scan using the specified binary scan_op functor....
 
@ WARP_SYNCHRONOUS
Cooperative work can be entirely warp synchronous.
 
@ SEGMENT_LENGTH
Number of raking elements per warp synchronous raking thread.
 
@ RAKING_THREADS
Number of raking threads.
 
WarpScan< T, RAKING_THREADS, PTX_ARCH > WarpScan
WarpScan utility type.
 
__device__ __forceinline__ void CopySegment(T *, T *, Int2Type< SEGMENT_LENGTH >)
Templated copy (base case)
 
__device__ __forceinline__ void ExclusiveScan(T input, T &output, ScanOp scan_op, BlockPrefixCallbackOp &block_prefix_callback_op)
Computes an exclusive thread block-wide prefix scan using the specified binary scan_op functor....
 
__device__ __forceinline__ T Upsweep(ScanOp scan_op)
Performs upsweep raking reduction, returning the aggregate.
 
__device__ __forceinline__ T GuardedReduce(T *raking_ptr, ScanOp scan_op, T raking_partial, Int2Type< ITERATION >)
Templated reduction.
 
__device__ __forceinline__ void ExclusiveDownsweep(ScanOp scan_op, T raking_partial, bool apply_prefix=true)
Performs exclusive downsweep raking scan.
 
Allows for the treatment of an integral constant as a type at compile-time (e.g., to achieve static call dispatch based on constant integral values).
 
A storage-backing wrapper that allows types with non-trivial constructors to be aliased in unions.