39#include "../block/block_load.cuh"
40#include "../block/block_store.cuh"
41#include "../block/block_scan.cuh"
42#include "../grid/grid_queue.cuh"
43#include "../iterator/cache_modified_input_iterator.cuh"
44#include "../util_namespace.cuh"
62 int _ITEMS_PER_THREAD,
92 typename AgentScanPolicyT,
93 typename InputIteratorT,
94 typename OutputIteratorT,
105 typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;
109 typename std::iterator_traits<InputIteratorT>::value_type,
110 typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT;
118 InputIteratorT>::Type
119 WrappedInputIteratorT;
125 BLOCK_THREADS = AgentScanPolicyT::BLOCK_THREADS,
126 ITEMS_PER_THREAD = AgentScanPolicyT::ITEMS_PER_THREAD,
127 TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD,
133 AgentScanPolicyT::BLOCK_THREADS,
134 AgentScanPolicyT::ITEMS_PER_THREAD,
135 AgentScanPolicyT::LOAD_ALGORITHM>
141 AgentScanPolicyT::BLOCK_THREADS,
142 AgentScanPolicyT::ITEMS_PER_THREAD,
143 AgentScanPolicyT::STORE_ALGORITHM>
149 AgentScanPolicyT::BLOCK_THREADS,
150 AgentScanPolicyT::SCAN_ALGORITHM>
201 __device__ __forceinline__
203 OutputT (&items)[ITEMS_PER_THREAD],
206 OutputT &block_aggregate,
217 __device__ __forceinline__
219 OutputT (&items)[ITEMS_PER_THREAD],
222 OutputT &block_aggregate,
232 template <
typename PrefixCallback>
233 __device__ __forceinline__
235 OutputT (&items)[ITEMS_PER_THREAD],
237 PrefixCallback &prefix_op,
247 template <
typename PrefixCallback>
248 __device__ __forceinline__
250 OutputT (&items)[ITEMS_PER_THREAD],
252 PrefixCallback &prefix_op,
264 __device__ __forceinline__
268 OutputIteratorT
d_out,
287 template <
bool IS_LAST_TILE>
295 OutputT items[ITEMS_PER_THREAD];
308 OutputT block_aggregate;
310 if ((!IS_LAST_TILE) && (threadIdx.x == 0))
343 if (num_remaining > TILE_ITEMS)
346 ConsumeTile<false>(num_remaining, tile_idx, tile_offset,
tile_state);
348 else if (num_remaining > 0)
351 ConsumeTile<true>(num_remaining, tile_idx, tile_offset,
tile_state);
369 int valid_items = TILE_ITEMS)
372 OutputT items[ITEMS_PER_THREAD];
384 OutputT block_aggregate;
386 prefix_op.running_total = block_aggregate;
412 if (range_offset + TILE_ITEMS <= range_end)
415 ConsumeTile<true, true>(range_offset, prefix_op);
416 range_offset += TILE_ITEMS;
419 while (range_offset + TILE_ITEMS <= range_end)
421 ConsumeTile<false, true>(range_offset, prefix_op);
422 range_offset += TILE_ITEMS;
426 if (range_offset < range_end)
428 int valid_items = range_end - range_offset;
429 ConsumeTile<false, false>(range_offset, prefix_op, valid_items);
435 int valid_items = range_end - range_offset;
436 ConsumeTile<true, false>(range_offset, prefix_op, valid_items);
452 while (range_offset + TILE_ITEMS <= range_end)
454 ConsumeTile<true, false>(range_offset, prefix_op);
455 range_offset += TILE_ITEMS;
459 if (range_offset < range_end)
461 int valid_items = range_end - range_offset;
462 ConsumeTile<false, false>(range_offset, prefix_op, valid_items);
The BlockLoad class provides collective data movement methods for loading a linear segment of items f...
The BlockScan class provides collective methods for computing a parallel prefix sum/scan of items par...
__device__ __forceinline__ void ExclusiveScan(T input, T &output, T initial_value, ScanOp scan_op)
Computes an exclusive block-wide prefix scan using the specified binary scan_op functor....
__device__ __forceinline__ void InclusiveScan(T input, T &output, ScanOp scan_op)
Computes an inclusive block-wide prefix scan using the specified binary scan_op functor....
The BlockStore class provides collective data movement methods for writing a blocked arrangement of i...
BlockStoreAlgorithm
cub::BlockStoreAlgorithm enumerates alternative algorithms for cub::BlockStore to write a blocked arr...
__device__ __forceinline__ void Store(OutputIteratorT block_itr, T(&items)[ITEMS_PER_THREAD])
Store items into a linear segment of memory.
BlockLoadAlgorithm
cub::BlockLoadAlgorithm enumerates alternative algorithms for cub::BlockLoad to read a linear segment...
CacheLoadModifier
Enumeration of cache modifiers for memory load operations.
__device__ __forceinline__ void Load(InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD])
Load a linear segment of items from memory.
Optional outer namespace(s)
KeyT const ValueT ValueT OffsetT OffsetT num_items
[in] Total number of input data items
UniqueOutputIteratorT ValuesInputIteratorT AggregatesOutputIteratorT NumRunsOutputIteratorT ScanTileStateT int start_tile
The starting tile for the current grid.
BlockScanAlgorithm
cub::BlockScanAlgorithm enumerates alternative algorithms for cub::BlockScan to compute a parallel prefix ...
OffsetT OffsetT
[in] Total number of input data items
UniqueOutputIteratorT ValuesInputIteratorT AggregatesOutputIteratorT NumRunsOutputIteratorT ScanTileStateT tile_state
Tile status interface.
< The BlockScan algorithm to use
@ ITEMS_PER_THREAD
Items per thread (per tile of input)
@ BLOCK_THREADS
Threads per thread block.
static const CacheLoadModifier LOAD_MODIFIER
Cache load modifier for reading input elements.
static const BlockScanAlgorithm SCAN_ALGORITHM
The BlockScan algorithm to use.
static const BlockLoadAlgorithm LOAD_ALGORITHM
The BlockLoad algorithm to use.
static const BlockStoreAlgorithm STORE_ALGORITHM
The BlockStore algorithm to use.
AgentScan implements a stateful abstraction of CUDA thread blocks for participating in device-wide pr...
OutputIteratorT d_out
Output data.
__device__ __forceinline__ void ConsumeRange(OffsetT range_offset, OffsetT range_end)
__device__ __forceinline__ AgentScan(TempStorage &temp_storage, InputIteratorT d_in, OutputIteratorT d_out, ScanOpT scan_op, InitValueT init_value)
_TempStorage & temp_storage
Reference to temp_storage.
__device__ __forceinline__ void ConsumeRange(int num_items, ScanTileStateT &tile_state, int start_tile)
__device__ __forceinline__ void ConsumeTile(OffsetT tile_offset, RunningPrefixCallbackOp &prefix_op, int valid_items=TILE_ITEMS)
__device__ __forceinline__ void ScanTile(OutputT(&items)[ITEMS_PER_THREAD], OutputT init_value, ScanOpT scan_op, OutputT &block_aggregate, Int2Type< false >)
__device__ __forceinline__ void ScanTile(OutputT(&items)[ITEMS_PER_THREAD], InitValueT, ScanOpT scan_op, OutputT &block_aggregate, Int2Type< true >)
InitValueT init_value
The init_value element for ScanOpT.
__device__ __forceinline__ void ConsumeRange(OffsetT range_offset, OffsetT range_end, OutputT prefix)
WrappedInputIteratorT d_in
Input data.
__device__ __forceinline__ void ConsumeTile(OffsetT num_remaining, int tile_idx, OffsetT tile_offset, ScanTileStateT &tile_state)
< Whether the current tile is the last tile
__device__ __forceinline__ void ScanTile(OutputT(&items)[ITEMS_PER_THREAD], ScanOpT scan_op, PrefixCallback &prefix_op, Int2Type< false >)
ScanOpT scan_op
Binary scan operator.
__device__ __forceinline__ void ScanTile(OutputT(&items)[ITEMS_PER_THREAD], ScanOpT scan_op, PrefixCallback &prefix_op, Int2Type< true >)
< Wrapped scan operator type
Type selection (IF ? ThenType : ElseType)
Allows for the treatment of an integral constant as a type at compile-time (e.g., to achieve static c...
A storage-backing wrapper that allows types with non-trivial constructors to be aliased in unions.