39#include "../block/block_load.cuh"
40#include "../block/block_store.cuh"
41#include "../block/block_scan.cuh"
42#include "../block/block_exchange.cuh"
43#include "../block/block_discontinuity.cuh"
44#include "../grid/grid_queue.cuh"
45#include "../iterator/cache_modified_input_iterator.cuh"
46#include "../util_namespace.cuh"
64 int _ITEMS_PER_THREAD,
97 typename AgentSelectIfPolicyT,
98 typename InputIteratorT,
99 typename FlagsInputIteratorT,
100 typename SelectedOutputIteratorT,
102 typename EqualityOpT,
112 typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;
116 typename std::iterator_traits<InputIteratorT>::value_type,
117 typename std::iterator_traits<SelectedOutputIteratorT>::value_type>::Type
OutputT;
120 typedef typename std::iterator_traits<FlagsInputIteratorT>::value_type FlagT;
132 BLOCK_THREADS = AgentSelectIfPolicyT::BLOCK_THREADS,
133 ITEMS_PER_THREAD = AgentSelectIfPolicyT::ITEMS_PER_THREAD,
134 TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD,
135 TWO_PHASE_SCATTER = (ITEMS_PER_THREAD > 1),
147 InputIteratorT>::Type
148 WrappedInputIteratorT;
153 FlagsInputIteratorT>::Type
154 WrappedFlagsInputIteratorT;
161 AgentSelectIfPolicyT::LOAD_ALGORITHM>
169 AgentSelectIfPolicyT::LOAD_ALGORITHM>
182 AgentSelectIfPolicyT::SCAN_ALGORITHM>
193 typedef OutputT ItemExchangeT[TILE_ITEMS];
237 __device__ __forceinline__
264 template <
bool IS_FIRST_TILE,
bool IS_LAST_TILE>
268 OutputT (&items)[ITEMS_PER_THREAD],
269 OffsetT (&selection_flags)[ITEMS_PER_THREAD],
273 for (
int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
276 selection_flags[ITEM] = 1;
278 if (!IS_LAST_TILE || (
OffsetT(threadIdx.x * ITEMS_PER_THREAD) + ITEM < num_tile_items))
279 selection_flags[ITEM] =
select_op(items[ITEM]);
287 template <
bool IS_FIRST_TILE,
bool IS_LAST_TILE>
292 OffsetT (&selection_flags)[ITEMS_PER_THREAD],
297 FlagT flags[ITEMS_PER_THREAD];
311 for (
int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
313 selection_flags[ITEM] = flags[ITEM];
321 template <
bool IS_FIRST_TILE,
bool IS_LAST_TILE>
325 OutputT (&items)[ITEMS_PER_THREAD],
326 OffsetT (&selection_flags)[ITEMS_PER_THREAD],
339 if (threadIdx.x == 0)
340 tile_predecessor =
d_in[tile_offset - 1];
349 for (
int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
352 if ((IS_LAST_TILE) && (
OffsetT(threadIdx.x * ITEMS_PER_THREAD) + ITEM >= num_tile_items))
353 selection_flags[ITEM] = 1;
365 template <
bool IS_LAST_TILE,
bool IS_FIRST_TILE>
367 OutputT (&items)[ITEMS_PER_THREAD],
368 OffsetT (&selection_flags)[ITEMS_PER_THREAD],
369 OffsetT (&selection_indices)[ITEMS_PER_THREAD],
374 for (
int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
376 if (selection_flags[ITEM])
378 if ((!IS_LAST_TILE) || selection_indices[ITEM] < num_selections)
390 template <
bool IS_LAST_TILE,
bool IS_FIRST_TILE>
392 OutputT (&items)[ITEMS_PER_THREAD],
393 OffsetT (&selection_flags)[ITEMS_PER_THREAD],
394 OffsetT (&selection_indices)[ITEMS_PER_THREAD],
396 int num_tile_selections,
405 for (
int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
407 int local_scatter_offset = selection_indices[ITEM] - num_selections_prefix;
408 if (selection_flags[ITEM])
416 for (
int item = threadIdx.x; item < num_tile_selections; item += BLOCK_THREADS)
426 template <
bool IS_LAST_TILE,
bool IS_FIRST_TILE>
428 OutputT (&items)[ITEMS_PER_THREAD],
429 OffsetT (&selection_flags)[ITEMS_PER_THREAD],
430 OffsetT (&selection_indices)[ITEMS_PER_THREAD],
432 int num_tile_selections,
439 int tile_num_rejections = num_tile_items - num_tile_selections;
443 for (
int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
445 int item_idx = (threadIdx.x * ITEMS_PER_THREAD) + ITEM;
446 int local_selection_idx = selection_indices[ITEM] - num_selections_prefix;
447 int local_rejection_idx = item_idx - local_selection_idx;
448 int local_scatter_offset = (selection_flags[ITEM]) ?
449 tile_num_rejections + local_selection_idx :
459 for (
int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
461 int item_idx = (ITEM * BLOCK_THREADS) + threadIdx.x;
462 int rejection_idx = item_idx;
463 int selection_idx = item_idx - tile_num_rejections;
464 OffsetT scatter_offset = (item_idx < tile_num_rejections) ?
465 num_items - num_rejected_prefix - rejection_idx - 1 :
466 num_selections_prefix + selection_idx;
470 if (!IS_LAST_TILE || (item_idx < num_tile_items))
481 template <
bool IS_LAST_TILE,
bool IS_FIRST_TILE>
483 OutputT (&items)[ITEMS_PER_THREAD],
484 OffsetT (&selection_flags)[ITEMS_PER_THREAD],
485 OffsetT (&selection_indices)[ITEMS_PER_THREAD],
487 int num_tile_selections,
493 if (KEEP_REJECTS || (TWO_PHASE_SCATTER && (num_tile_selections > BLOCK_THREADS)))
495 ScatterTwoPhase<IS_LAST_TILE, IS_FIRST_TILE>(
501 num_selections_prefix,
507 ScatterDirect<IS_LAST_TILE, IS_FIRST_TILE>(
523 template <
bool IS_LAST_TILE>
529 OutputT items[ITEMS_PER_THREAD];
530 OffsetT selection_flags[ITEMS_PER_THREAD];
531 OffsetT selection_indices[ITEMS_PER_THREAD];
540 InitializeSelections<true, IS_LAST_TILE>(
553 if (threadIdx.x == 0)
557 tile_state.SetInclusive(0, num_tile_selections);
562 num_tile_selections -= (TILE_ITEMS - num_tile_items);
565 Scatter<IS_LAST_TILE, true>(
573 num_tile_selections);
575 return num_tile_selections;
582 template <
bool IS_LAST_TILE>
589 OutputT items[ITEMS_PER_THREAD];
590 OffsetT selection_flags[ITEMS_PER_THREAD];
591 OffsetT selection_indices[ITEMS_PER_THREAD];
600 InitializeSelections<false, IS_LAST_TILE>(
613 OffsetT num_tile_selections = prefix_op.GetBlockAggregate();
614 OffsetT num_selections = prefix_op.GetInclusivePrefix();
615 OffsetT num_selections_prefix = prefix_op.GetExclusivePrefix();
616 OffsetT num_rejected_prefix = (tile_idx * TILE_ITEMS) - num_selections_prefix;
621 int num_discount = TILE_ITEMS - num_tile_items;
622 num_selections -= num_discount;
623 num_tile_selections -= num_discount;
627 Scatter<IS_LAST_TILE, false>(
633 num_selections_prefix,
637 return num_selections;
644 template <
bool IS_LAST_TILE>
654 num_selections = ConsumeFirstTile<IS_LAST_TILE>(num_tile_items, tile_offset,
tile_state);
658 num_selections = ConsumeSubsequentTile<IS_LAST_TILE>(num_tile_items, tile_idx, tile_offset,
tile_state);
661 return num_selections;
668 template <
typename NumSelectedIteratorT>
675 int tile_idx = (blockIdx.x * gridDim.y) + blockIdx.y;
676 OffsetT tile_offset = tile_idx * TILE_ITEMS;
681 ConsumeTile<false>(TILE_ITEMS, tile_idx, tile_offset,
tile_state);
687 OffsetT num_selections = ConsumeTile<true>(num_remaining, tile_idx, tile_offset,
tile_state);
689 if (threadIdx.x == 0)
The BlockDiscontinuity class provides collective methods for flagging discontinuities within an order...
__device__ __forceinline__ void FlagHeads(FlagT(&head_flags)[ITEMS_PER_THREAD], T(&input)[ITEMS_PER_THREAD], T(&preds)[ITEMS_PER_THREAD], FlagOp flag_op)
The BlockLoad class provides collective data movement methods for loading a linear segment of items f...
BlockRadixRank provides operations for ranking unsigned integer types within a CUDA thread block.
The BlockScan class provides collective methods for computing a parallel prefix sum/scan of items par...
__device__ __forceinline__ void ExclusiveSum(T input, T &output)
Computes an exclusive block-wide prefix scan using addition (+) as the scan operator....
BlockLoadAlgorithm
cub::BlockLoadAlgorithm enumerates alternative algorithms for cub::BlockLoad to read a linear segment...
CacheLoadModifier
Enumeration of cache modifiers for memory load operations.
__device__ __forceinline__ void Load(InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD])
Load a linear segment of items from memory.
Optional outer namespace(s)
FlagsInputIteratorT SelectedOutputIteratorT NumSelectedIteratorT d_num_selected_out
[out] Pointer to the total number of items selected (i.e., length of d_selected_out)
OffsetsOutputIteratorT LengthsOutputIteratorT NumRunsOutputIteratorT ScanTileStateT EqualityOpT OffsetT int num_tiles
< [in] Total number of tiles for the entire problem
BlockScanAlgorithm
BlockScanAlgorithm enumerates alternative algorithms for cub::BlockScan to compute a parallel prefix ...
OffsetT OffsetT
[in] Total number of input data items
UniqueOutputIteratorT ValuesInputIteratorT AggregatesOutputIteratorT NumRunsOutputIteratorT ScanTileStateT tile_state
Tile status interface.
UniqueOutputIteratorT ValuesInputIteratorT AggregatesOutputIteratorT NumRunsOutputIteratorT ScanTileStateT int EqualityOpT equality_op
KeyT equality operator.
< The BlockScan algorithm to use
static const CacheLoadModifier LOAD_MODIFIER
Cache load modifier for reading input elements.
@ ITEMS_PER_THREAD
Items per thread (per tile of input)
@ BLOCK_THREADS
Threads per thread block.
static const BlockScanAlgorithm SCAN_ALGORITHM
The BlockScan algorithm to use.
static const BlockLoadAlgorithm LOAD_ALGORITHM
The BlockLoad algorithm to use.
AgentSelectIf implements a stateful abstraction of CUDA thread blocks for participating in device-wid...
__device__ __forceinline__ void InitializeSelections(OffsetT, OffsetT num_tile_items, OutputT(&items)[ITEMS_PER_THREAD], OffsetT(&selection_flags)[ITEMS_PER_THREAD], Int2Type< USE_SELECT_OP >)
__device__ __forceinline__ OffsetT ConsumeFirstTile(int num_tile_items, OffsetT tile_offset, ScanTileStateT &tile_state)
__device__ __forceinline__ void ScatterTwoPhase(OutputT(&items)[ITEMS_PER_THREAD], OffsetT(&selection_flags)[ITEMS_PER_THREAD], OffsetT(&selection_indices)[ITEMS_PER_THREAD], int, int num_tile_selections, OffsetT num_selections_prefix, OffsetT, Int2Type< false >)
__device__ __forceinline__ OffsetT ConsumeSubsequentTile(int num_tile_items, int tile_idx, OffsetT tile_offset, ScanTileStateT &tile_state)
OffsetT num_items
Total number of input items.
__device__ __forceinline__ void InitializeSelections(OffsetT tile_offset, OffsetT num_tile_items, OutputT(&)[ITEMS_PER_THREAD], OffsetT(&selection_flags)[ITEMS_PER_THREAD], Int2Type< USE_SELECT_FLAGS >)
__device__ __forceinline__ void ConsumeRange(int num_tiles, ScanTileStateT &tile_state, NumSelectedIteratorT d_num_selected_out)
< Output iterator type for recording number of items selection_flags
WrappedFlagsInputIteratorT d_flags_in
Input selection flags (if applicable)
__device__ __forceinline__ void ScatterDirect(OutputT(&items)[ITEMS_PER_THREAD], OffsetT(&selection_flags)[ITEMS_PER_THREAD], OffsetT(&selection_indices)[ITEMS_PER_THREAD], OffsetT num_selections)
WrappedInputIteratorT d_in
Input items.
__device__ __forceinline__ AgentSelectIf(TempStorage &temp_storage, InputIteratorT d_in, FlagsInputIteratorT d_flags_in, SelectedOutputIteratorT d_selected_out, SelectOpT select_op, EqualityOpT equality_op, OffsetT num_items)
_TempStorage & temp_storage
Reference to temp_storage.
__device__ __forceinline__ void ScatterTwoPhase(OutputT(&items)[ITEMS_PER_THREAD], OffsetT(&selection_flags)[ITEMS_PER_THREAD], OffsetT(&selection_indices)[ITEMS_PER_THREAD], int num_tile_items, int num_tile_selections, OffsetT num_selections_prefix, OffsetT num_rejected_prefix, Int2Type< true >)
__device__ __forceinline__ void Scatter(OutputT(&items)[ITEMS_PER_THREAD], OffsetT(&selection_flags)[ITEMS_PER_THREAD], OffsetT(&selection_indices)[ITEMS_PER_THREAD], int num_tile_items, int num_tile_selections, OffsetT num_selections_prefix, OffsetT num_rejected_prefix, OffsetT num_selections)
__device__ __forceinline__ OffsetT ConsumeTile(int num_tile_items, int tile_idx, OffsetT tile_offset, ScanTileStateT &tile_state)
__device__ __forceinline__ void InitializeSelections(OffsetT tile_offset, OffsetT num_tile_items, OutputT(&items)[ITEMS_PER_THREAD], OffsetT(&selection_flags)[ITEMS_PER_THREAD], Int2Type< USE_DISCONTINUITY >)
SelectOpT select_op
Selection operator.
InequalityWrapper< EqualityOpT > inequality_op
T inequality operator.
SelectedOutputIteratorT d_selected_out
Unique output items.
\smemstorage{BlockDiscontinuity}
Type selection (IF ? ThenType : ElseType)
Inequality functor (wraps equality functor)
Allows for the treatment of an integral constant as a type at compile-time (e.g., to achieve static c...
A storage-backing wrapper that allows types with non-trivial constructors to be aliased in unions.
__host__ __device__ __forceinline__ T & Alias()
Alias.