#include "../block/block_load.cuh"
#include "../block/block_store.cuh"
#include "../block/block_scan.cuh"
#include "../block/block_exchange.cuh"
#include "../block/block_discontinuity.cuh"
#include "../grid/grid_queue.cuh"
#include "../iterator/cache_modified_input_iterator.cuh"
#include "../iterator/constant_input_iterator.cuh"
#include "../util_namespace.cuh"
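/**
 * Illustrative usage sketch (not part of this header): AgentRle is the
 * per-tile agent underlying cub::DeviceRunLengthEncode::NonTrivialRuns.
 * A minimal host-side invocation might look like the following (buffer
 * names are hypothetical; d_in holds num_items keys):
 *
 * \code
 * void   *d_temp_storage     = NULL;
 * size_t  temp_storage_bytes = 0;
 *
 * // First call: query required temporary storage size
 * cub::DeviceRunLengthEncode::NonTrivialRuns(d_temp_storage, temp_storage_bytes,
 *     d_in, d_offsets_out, d_lengths_out, d_num_runs_out, num_items);
 *
 * cudaMalloc(&d_temp_storage, temp_storage_bytes);
 *
 * // Second call: compute offsets and lengths of all non-trivial runs
 * cub::DeviceRunLengthEncode::NonTrivialRuns(d_temp_storage, temp_storage_bytes,
 *     d_in, d_offsets_out, d_lengths_out, d_num_runs_out, num_items);
 * \endcode
 */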
int  _ITEMS_PER_THREAD,
bool _STORE_WARP_TIME_SLICING,
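// Illustrative sketch: a concrete tuning policy fixes these compile-time
// knobs. The values below are hypothetical, and the parameter order is
// assumed from the policy's members (BLOCK_THREADS, ITEMS_PER_THREAD,
// LOAD_ALGORITHM, LOAD_MODIFIER, STORE_WARP_TIME_SLICING, SCAN_ALGORITHM):
//
//   typedef AgentRlePolicy<
//       128,                        // _BLOCK_THREADS
//       8,                          // _ITEMS_PER_THREAD
//       BLOCK_LOAD_WARP_TRANSPOSE,  // _LOAD_ALGORITHM
//       LOAD_LDG,                   // _LOAD_MODIFIER
//       true,                       // _STORE_WARP_TIME_SLICING
//       BLOCK_SCAN_WARP_SCANS>      // _SCAN_ALGORITHM
//   MyAgentRlePolicy;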
typename AgentRlePolicyT,
typename InputIteratorT,
typename OffsetsOutputIteratorT,
typename LengthsOutputIteratorT,
typename EqualityOpT,
// The input value type
typedef typename std::iterator_traits<InputIteratorT>::value_type T;
// The lengths output value type: if LengthsOutputIteratorT's value type is
// void, lengths default to OffsetT; otherwise use the iterator's value type
typedef typename If<(Equals<typename std::iterator_traits<LengthsOutputIteratorT>::value_type, void>::VALUE),
    OffsetT,
    typename std::iterator_traits<LengthsOutputIteratorT>::value_type>::Type LengthT;
WARP_THREADS     = CUB_WARP_THREADS(PTX_ARCH),
BLOCK_THREADS    = AgentRlePolicyT::BLOCK_THREADS,
ITEMS_PER_THREAD = AgentRlePolicyT::ITEMS_PER_THREAD,
WARP_ITEMS       = WARP_THREADS * ITEMS_PER_THREAD,
TILE_ITEMS       = BLOCK_THREADS * ITEMS_PER_THREAD,
WARPS            = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS,
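// Worked example (hypothetical values): with WARP_THREADS = 32,
// BLOCK_THREADS = 128, and ITEMS_PER_THREAD = 4, each warp covers
// WARP_ITEMS = 32 * 4 = 128 items, each tile covers TILE_ITEMS = 128 * 4 = 512
// items, and the block comprises WARPS = (128 + 31) / 32 = 4 warps.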
// Functor for flagging inequality between adjacent items. On the last tile,
// positions at or beyond num_remaining are forced to register as
// discontinuities so out-of-bounds items can never extend a run.
template <bool LAST_TILE>
struct OobInequalityOp
{
    OffsetT     num_remaining;
    EqualityOpT equality_op;

    __device__ __forceinline__ OobInequalityOp(OffsetT num_remaining, EqualityOpT equality_op)
    :
        num_remaining(num_remaining),
        equality_op(equality_op)
    {}

    template <typename Index>
    __host__ __device__ __forceinline__ bool operator()(T first, T second, Index idx)
    {
        if (!LAST_TILE || (idx < num_remaining))
            return !equality_op(first, second);
        else
            return true;
    }
};
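// Worked example (hypothetical): on the last tile with num_remaining = 5, the
// comparison at idx = 5 returns true regardless of the item values, so the run
// containing item 4 terminates at the logical end of the input rather than
// being extended by uninitialized out-of-bounds items.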
// Wrap a native input pointer with CacheModifiedInputIterator when possible
// (otherwise use InputIteratorT directly)
typedef typename If<IsPointer<InputIteratorT>::VALUE,
    CacheModifiedInputIterator<AgentRlePolicyT::LOAD_MODIFIER, T, OffsetT>,
    InputIteratorT>::Type WrappedInputIteratorT;
// Parameterized BlockLoad type for data items
typedef BlockLoad<
        T,
        AgentRlePolicyT::BLOCK_THREADS,
        AgentRlePolicyT::ITEMS_PER_THREAD,
        AgentRlePolicyT::LOAD_ALGORITHM>
    BlockLoadT;
unsigned long long       align;
WarpExchangePairsStorage exchange_pairs[ACTIVE_EXCHANGE_WARPS];
// Constructor
__device__ __forceinline__
AgentRle(
    TempStorage            &temp_storage,   // Reference to shared temporary storage
    InputIteratorT         d_in,            // Pointer to the input sequence of data items
    OffsetsOutputIteratorT d_offsets_out,   // Pointer to the output sequence of run offsets
    LengthsOutputIteratorT d_lengths_out,   // Pointer to the output sequence of run lengths
    EqualityOpT            equality_op,     // T equality operator
    OffsetT                num_items)       // Total number of input items
template <bool FIRST_TILE, bool LAST_TILE>
__device__ __forceinline__ void InitializeSelections(
    OffsetT          tile_offset,
    OffsetT          num_remaining,
    T                (&items)[ITEMS_PER_THREAD],
    LengthOffsetPair (&lengths_and_num_runs)[ITEMS_PER_THREAD])
{
    bool head_flags[ITEMS_PER_THREAD];
    bool tail_flags[ITEMS_PER_THREAD];

    OobInequalityOp<LAST_TILE> inequality_op(num_remaining, equality_op);
    if (FIRST_TILE && LAST_TILE)
    {
        // First-and-last tile: head-flag the first item and tail-flag the last item
        BlockDiscontinuityT(temp_storage.aliasable.discontinuity).FlagHeadsAndTails(
            head_flags, tail_flags, items, inequality_op);
    }
    else if (FIRST_TILE)
    {
        // First-but-not-last tile: requires the next tile's first item as successor
        T tile_successor_item;
        if (threadIdx.x == BLOCK_THREADS - 1)
            tile_successor_item = d_in[tile_offset + TILE_ITEMS];

        BlockDiscontinuityT(temp_storage.aliasable.discontinuity).FlagHeadsAndTails(
            head_flags, tail_flags, tile_successor_item, items, inequality_op);
    }
    else if (LAST_TILE)
    {
        // Last-but-not-first tile: requires the previous tile's last item as predecessor
        T tile_predecessor_item;
        if (threadIdx.x == 0)
            tile_predecessor_item = d_in[tile_offset - 1];

        BlockDiscontinuityT(temp_storage.aliasable.discontinuity).FlagHeadsAndTails(
            head_flags, tile_predecessor_item, tail_flags, items, inequality_op);
    }
    else
    {
        // Middle tile: requires both a predecessor and a successor item
        T tile_successor_item;
        if (threadIdx.x == BLOCK_THREADS - 1)
            tile_successor_item = d_in[tile_offset + TILE_ITEMS];

        T tile_predecessor_item;
        if (threadIdx.x == 0)
            tile_predecessor_item = d_in[tile_offset - 1];

        BlockDiscontinuityT(temp_storage.aliasable.discontinuity).FlagHeadsAndTails(
            head_flags, tile_predecessor_item, tail_flags, tile_successor_item, items, inequality_op);
    }
    // Zip head/tail flags into (run-count, run-length) scan pairs: key flags
    // the start of a non-trivial run; value marks membership in one
    #pragma unroll
    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
    {
        lengths_and_num_runs[ITEM].key   = head_flags[ITEM] && (!tail_flags[ITEM]);
        lengths_and_num_runs[ITEM].value = ((!head_flags[ITEM]) || (!tail_flags[ITEM]));
    }
}
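// Worked example (hypothetical 6-item sequence "a a b c c c"):
//   head_flags             = 1 0 1 1 0 0
//   tail_flags             = 0 1 1 0 0 1
//   key   (head && !tail)  = 1 0 0 1 0 0   -> start of each non-trivial run
//   value (!head || !tail) = 1 1 0 1 1 1   -> items inside non-trivial runs
//                                             ("b" is a trivial length-1 run)
// Scanning these pairs with the reduce-by-segment operator simultaneously
// numbers the non-trivial runs (key) and accumulates their lengths (value).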
// Warp-synchronous scan of run counts and lengths, producing tile-, warp-, and
// thread-level aggregates and exclusive prefixes
__device__ __forceinline__ void WarpScanAllocations(
    LengthOffsetPair &tile_aggregate,
    LengthOffsetPair &warp_aggregate,
    LengthOffsetPair &warp_exclusive_in_tile,
    LengthOffsetPair &thread_exclusive_in_warp,
    LengthOffsetPair (&lengths_and_num_runs)[ITEMS_PER_THREAD])
{
    // Perform warpscans
    unsigned int warp_id = ((WARPS == 1) ? 0 : threadIdx.x / WARP_THREADS);
    int lane_id = LaneId();

    LengthOffsetPair identity;
    identity.key   = 0;
    identity.value = 0;

    LengthOffsetPair thread_inclusive;
    LengthOffsetPair thread_aggregate = internal::ThreadReduce(lengths_and_num_runs, scan_op);
    WarpScanPairs(temp_storage.aliasable.warp_scan[warp_id]).Scan(
        thread_aggregate, thread_inclusive, thread_exclusive_in_warp, scan_op);

    // The last lane of each warp publishes its warp-wide aggregate
    if (lane_id == WARP_THREADS - 1)
        temp_storage.aliasable.warp_aggregates.Alias()[warp_id] = thread_inclusive;

    CTA_SYNC();

    // Read back per-warp aggregates to seed the tile-wide fold
    warp_exclusive_in_tile = identity;
    warp_aggregate         = temp_storage.aliasable.warp_aggregates.Alias()[warp_id];
    tile_aggregate         = temp_storage.aliasable.warp_aggregates.Alias()[0];
    #pragma unroll
    for (int WARP = 1; WARP < WARPS; ++WARP)
    {
        // The warp with index WARP takes the running fold as its exclusive prefix
        if (warp_id == WARP)
            warp_exclusive_in_tile = tile_aggregate;

        tile_aggregate = scan_op(tile_aggregate, temp_storage.aliasable.warp_aggregates.Alias()[WARP]);
    }
}
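// The fold above combines per-warp LengthOffsetPair aggregates with the
// reduce-by-segment scan operator. A minimal sketch of the assumed semantics
// (mirroring cub::ReduceBySegmentOp): run counts add, while the accumulated
// length restarts whenever the right-hand partial begins any new run:
//
//   LengthOffsetPair combine(LengthOffsetPair a, LengthOffsetPair b)
//   {
//       LengthOffsetPair c;
//       c.key   = a.key + b.key;               // total run count
//       c.value = (b.key) ? b.value            // a run boundary resets the length
//                         : a.value + b.value; // otherwise lengths accumulate
//       return c;
//   }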
// Two-phase scatter, specialized for warp time-slicing
template <bool FIRST_TILE>
__device__ __forceinline__ void ScatterTwoPhase(
    OffsetT          tile_num_runs_exclusive_in_global,
    OffsetT          warp_num_runs_aggregate,
    OffsetT          warp_num_runs_exclusive_in_tile,
    OffsetT          (&thread_num_runs_exclusive_in_warp)[ITEMS_PER_THREAD],
    LengthOffsetPair (&lengths_and_offsets)[ITEMS_PER_THREAD],
    Int2Type<true>   is_warp_time_slice)
{
    unsigned int warp_id = ((WARPS == 1) ? 0 : threadIdx.x / WARP_THREADS);
    int lane_id = LaneId();

    // Locally compact items within the warp (first warp)
    if (warp_id == 0)
    {
        WarpExchangePairs(temp_storage.aliasable.scatter_aliasable.exchange_pairs[0]).ScatterToStriped(
            lengths_and_offsets, thread_num_runs_exclusive_in_warp);
    }
    // Locally compact items within the warp (remaining warps, sharing the single time-sliced slot)
    #pragma unroll
    for (int SLICE = 1; SLICE < WARPS; ++SLICE)
    {
        CTA_SYNC();

        if (warp_id == SLICE)
        {
            WarpExchangePairs(temp_storage.aliasable.scatter_aliasable.exchange_pairs[0]).ScatterToStriped(
                lengths_and_offsets, thread_num_runs_exclusive_in_warp);
        }
    }
    // Global scatter
    #pragma unroll
    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
    {
        if ((ITEM * WARP_THREADS) < warp_num_runs_aggregate - lane_id)
        {
            OffsetT item_offset =
                tile_num_runs_exclusive_in_global +
                warp_num_runs_exclusive_in_tile +
                (ITEM * WARP_THREADS) + lane_id;

            d_offsets_out[item_offset] = lengths_and_offsets[ITEM].key;

            // Scatter length if not the very first (global) length
            if ((!FIRST_TILE) || (ITEM != 0) || (threadIdx.x > 0))
                d_lengths_out[item_offset - 1] = lengths_and_offsets[ITEM].value;
        }
    }
}
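// Note: each scattered pair carries the offset of run i together with the
// accumulated length of run i-1 (a run's length is only final once the next
// run begins), so lengths land one slot behind offsets and the very first
// length write is skipped. The final run's length is patched in at the end of
// ConsumeRange from the last tile's running total.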
// Two-phase scatter (no warp time-slicing)
template <bool FIRST_TILE>
__device__ __forceinline__ void ScatterTwoPhase(
    OffsetT          tile_num_runs_exclusive_in_global,
    OffsetT          warp_num_runs_aggregate,
    OffsetT          warp_num_runs_exclusive_in_tile,
    OffsetT          (&thread_num_runs_exclusive_in_warp)[ITEMS_PER_THREAD],
    LengthOffsetPair (&lengths_and_offsets)[ITEMS_PER_THREAD],
    Int2Type<false>  is_warp_time_slice)
{
    unsigned int warp_id = ((WARPS == 1) ? 0 : threadIdx.x / WARP_THREADS);
    int lane_id = LaneId();

    // Unzip, so offsets and lengths can be exchanged through separate storage
    OffsetT run_offsets[ITEMS_PER_THREAD];
    LengthT run_lengths[ITEMS_PER_THREAD];

    #pragma unroll
    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
    {
        run_offsets[ITEM] = lengths_and_offsets[ITEM].key;
        run_lengths[ITEM] = lengths_and_offsets[ITEM].value;
    }

    WarpExchangeOffsets(temp_storage.aliasable.scatter_aliasable.exchange_offsets[warp_id]).ScatterToStriped(
        run_offsets, thread_num_runs_exclusive_in_warp);

    WARP_SYNC(0xffffffff);

    WarpExchangeLengths(temp_storage.aliasable.scatter_aliasable.exchange_lengths[warp_id]).ScatterToStriped(
        run_lengths, thread_num_runs_exclusive_in_warp);
    // Global scatter
    #pragma unroll
    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
    {
        if ((ITEM * WARP_THREADS) + lane_id < warp_num_runs_aggregate)
        {
            OffsetT item_offset =
                tile_num_runs_exclusive_in_global +
                warp_num_runs_exclusive_in_tile +
                (ITEM * WARP_THREADS) + lane_id;

            d_offsets_out[item_offset] = run_offsets[ITEM];

            // Scatter length if not the very first (global) length
            if ((!FIRST_TILE) || (ITEM != 0) || (threadIdx.x > 0))
                d_lengths_out[item_offset - 1] = run_lengths[ITEM];
        }
    }
}
// Direct scatter
template <bool FIRST_TILE>
__device__ __forceinline__ void ScatterDirect(
    OffsetT          tile_num_runs_exclusive_in_global,
    OffsetT          warp_num_runs_aggregate,
    OffsetT          warp_num_runs_exclusive_in_tile,
    OffsetT          (&thread_num_runs_exclusive_in_warp)[ITEMS_PER_THREAD],
    LengthOffsetPair (&lengths_and_offsets)[ITEMS_PER_THREAD])
{
    #pragma unroll
    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
    {
        if (thread_num_runs_exclusive_in_warp[ITEM] < warp_num_runs_aggregate)
        {
            OffsetT item_offset =
                tile_num_runs_exclusive_in_global +
                warp_num_runs_exclusive_in_tile +
                thread_num_runs_exclusive_in_warp[ITEM];

            d_offsets_out[item_offset] = lengths_and_offsets[ITEM].key;

            // Scatter length if not the very first (global) length
            if (item_offset >= 1)
                d_lengths_out[item_offset - 1] = lengths_and_offsets[ITEM].value;
        }
    }
}
// Scatter the computed runs. Direct scattering is cheapest when the tile holds
// fewer runs than threads; otherwise a two-phase scatter first compacts runs
// in shared memory so that global writes coalesce.
template <bool FIRST_TILE>
__device__ __forceinline__ void Scatter(
    OffsetT          tile_num_runs_aggregate,
    OffsetT          tile_num_runs_exclusive_in_global,
    OffsetT          warp_num_runs_aggregate,
    OffsetT          warp_num_runs_exclusive_in_tile,
    OffsetT          (&thread_num_runs_exclusive_in_warp)[ITEMS_PER_THREAD],
    LengthOffsetPair (&lengths_and_offsets)[ITEMS_PER_THREAD])
{
    if ((ITEMS_PER_THREAD == 1) || (tile_num_runs_aggregate < BLOCK_THREADS))
    {
        // Direct scatter if the warp has any runs
        if (warp_num_runs_aggregate)
        {
            ScatterDirect<FIRST_TILE>(
                tile_num_runs_exclusive_in_global,
                warp_num_runs_aggregate,
                warp_num_runs_exclusive_in_tile,
                thread_num_runs_exclusive_in_warp,
                lengths_and_offsets);
        }
    }
    else
    {
        // Two-phase scatter
        ScatterTwoPhase<FIRST_TILE>(
            tile_num_runs_exclusive_in_global,
            warp_num_runs_aggregate,
            warp_num_runs_exclusive_in_tile,
            thread_num_runs_exclusive_in_warp,
            lengths_and_offsets,
            Int2Type<STORE_WARP_TIME_SLICING>());
    }
}
// Process a tile of input, returning the tile's running (inclusive) aggregate
// of run counts and lengths
template <bool LAST_TILE>
__device__ __forceinline__ LengthOffsetPair ConsumeTile(
    OffsetT        num_items,
    OffsetT        num_remaining,
    int            tile_idx,
    OffsetT        tile_offset,
    ScanTileStateT &tile_status)
{
    if (tile_idx == 0)
    {
        // First tile
        T items[ITEMS_PER_THREAD];      // Tile items (loaded from d_in + tile_offset via BlockLoadT)

        // Set flags
        LengthOffsetPair lengths_and_num_runs[ITEMS_PER_THREAD];
        InitializeSelections<true, LAST_TILE>(
            tile_offset,
            num_remaining,
            items,
            lengths_and_num_runs);
        // Exclusive scan of lengths and runs
        LengthOffsetPair tile_aggregate, warp_aggregate, warp_exclusive_in_tile, thread_exclusive_in_warp;
        WarpScanAllocations(
            tile_aggregate,
            warp_aggregate,
            warp_exclusive_in_tile,
            thread_exclusive_in_warp,
            lengths_and_num_runs);
        // Record the tile-wide inclusive aggregate for subsequent tiles
        if (!LAST_TILE && (threadIdx.x == 0))
            tile_status.SetInclusive(0, tile_aggregate);

        // Fold in the warp-wide running length if no run boundary precedes this thread within its warp
        if (thread_exclusive_in_warp.key == 0)
            thread_exclusive_in_warp.value += warp_exclusive_in_tile.value;
        OffsetT          thread_num_runs_exclusive_in_warp[ITEMS_PER_THREAD];
        LengthOffsetPair lengths_and_offsets[ITEMS_PER_THREAD];
        LengthOffsetPair lengths_and_num_runs2[ITEMS_PER_THREAD];

        // Downsweep scan through lengths_and_num_runs
        internal::ThreadScanExclusive(lengths_and_num_runs, lengths_and_num_runs2, scan_op, thread_exclusive_in_warp);

        // Zip
        #pragma unroll
        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
        {
            lengths_and_offsets[ITEM].value         = lengths_and_num_runs2[ITEM].value;
            lengths_and_offsets[ITEM].key           = tile_offset + (threadIdx.x * ITEMS_PER_THREAD) + ITEM;
            thread_num_runs_exclusive_in_warp[ITEM] = (lengths_and_num_runs[ITEM].key) ?
                                                          lengths_and_num_runs2[ITEM].key :      // keep
                                                          WARP_THREADS * ITEMS_PER_THREAD;       // discard
        }
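        // Note: thread_num_runs_exclusive_in_warp[ITEM] now holds the run's
        // rank within the warp when the item begins a non-trivial run, or the
        // out-of-range sentinel WARP_THREADS * ITEMS_PER_THREAD otherwise; the
        // scatter routines use the sentinel to skip non-run items.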
        OffsetT tile_num_runs_aggregate           = tile_aggregate.key;
        OffsetT tile_num_runs_exclusive_in_global = 0;
        OffsetT warp_num_runs_aggregate           = warp_aggregate.key;
        OffsetT warp_num_runs_exclusive_in_tile   = warp_exclusive_in_tile.key;
        // Scatter
        Scatter<true>(
            tile_num_runs_aggregate,
            tile_num_runs_exclusive_in_global,
            warp_num_runs_aggregate,
            warp_num_runs_exclusive_in_tile,
            thread_num_runs_exclusive_in_warp,
            lengths_and_offsets);
        // Return running total (inclusive of this tile)
        return tile_aggregate;
    }
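    // The first tile needs no look-back: its global exclusive prefix is
    // identically zero (tile_num_runs_exclusive_in_global above). Subsequent
    // tiles instead resolve their global prefix through the tile status
    // interface, as in the path below.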
    else
    {
        // Not the first tile
        T items[ITEMS_PER_THREAD];      // Tile items (loaded from d_in + tile_offset via BlockLoadT)

        // Set flags
        LengthOffsetPair lengths_and_num_runs[ITEMS_PER_THREAD];
        InitializeSelections<false, LAST_TILE>(
            tile_offset,
            num_remaining,
            items,
            lengths_and_num_runs);
        // Exclusive scan of lengths and runs
        LengthOffsetPair tile_aggregate, warp_aggregate, warp_exclusive_in_tile, thread_exclusive_in_warp;
        WarpScanAllocations(
            tile_aggregate,
            warp_aggregate,
            warp_exclusive_in_tile,
            thread_exclusive_in_warp,
            lengths_and_num_runs);
        // First warp resolves the tile-wide exclusive prefix via the tile status interface
        TilePrefixCallbackOpT prefix_op(tile_status, temp_storage.aliasable.prefix, scan_op, tile_idx);
        unsigned int warp_id = ((WARPS == 1) ? 0 : threadIdx.x / WARP_THREADS);
        if (warp_id == 0)
        {
            prefix_op(tile_aggregate);
            if (threadIdx.x == 0)
                temp_storage.tile_exclusive = prefix_op.exclusive_prefix;
        }

        CTA_SYNC();

        LengthOffsetPair tile_exclusive_in_global = temp_storage.tile_exclusive;
        // Fold the tile- and warp-wide running lengths into this thread's exclusive prefix
        LengthOffsetPair thread_exclusive = scan_op(tile_exclusive_in_global, warp_exclusive_in_tile);
        if (thread_exclusive_in_warp.key == 0)
            thread_exclusive_in_warp.value += thread_exclusive.value;
        OffsetT          thread_num_runs_exclusive_in_warp[ITEMS_PER_THREAD];
        LengthOffsetPair lengths_and_offsets[ITEMS_PER_THREAD];
        LengthOffsetPair lengths_and_num_runs2[ITEMS_PER_THREAD];

        // Downsweep scan through lengths_and_num_runs
        internal::ThreadScanExclusive(lengths_and_num_runs, lengths_and_num_runs2, scan_op, thread_exclusive_in_warp);

        // Zip
        #pragma unroll
        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
        {
            lengths_and_offsets[ITEM].value         = lengths_and_num_runs2[ITEM].value;
            lengths_and_offsets[ITEM].key           = tile_offset + (threadIdx.x * ITEMS_PER_THREAD) + ITEM;
            thread_num_runs_exclusive_in_warp[ITEM] = (lengths_and_num_runs[ITEM].key) ?
                                                          lengths_and_num_runs2[ITEM].key :      // keep
                                                          WARP_THREADS * ITEMS_PER_THREAD;       // discard
        }
        OffsetT tile_num_runs_aggregate           = tile_aggregate.key;
        OffsetT tile_num_runs_exclusive_in_global = tile_exclusive_in_global.key;
        OffsetT warp_num_runs_aggregate           = warp_aggregate.key;
        OffsetT warp_num_runs_exclusive_in_tile   = warp_exclusive_in_tile.key;
        // Scatter
        Scatter<false>(
            tile_num_runs_aggregate,
            tile_num_runs_exclusive_in_global,
            warp_num_runs_aggregate,
            warp_num_runs_exclusive_in_tile,
            thread_num_runs_exclusive_in_warp,
            lengths_and_offsets);
        // Return running total (inclusive of this tile)
        return prefix_op.inclusive_prefix;
    }
}
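// Sketch of the decoupled look-back contract assumed above: each tile posts its
// local aggregate to tile_status as soon as it is known; prefix_op then
// inspects predecessor tiles' status words, summing their aggregates (or
// consuming an already-complete inclusive prefix) until this tile's global
// exclusive prefix is resolved, after which this tile's inclusive prefix is
// published for its successors.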
template <typename NumRunsIteratorT>
__device__ __forceinline__ void ConsumeRange(
    int              num_tiles,       // Total number of input tiles
    ScanTileStateT   &tile_status,    // Global tile status interface
    NumRunsIteratorT d_num_runs_out)  // Output pointer for the total number of runs
{
    // Blocks are launched in increasing order, so just assign one tile of input per block
    int     tile_idx      = (blockIdx.x * gridDim.y) + blockIdx.y;  // Current tile index
    OffsetT tile_offset   = tile_idx * TILE_ITEMS;                  // Global offset for the current tile
    OffsetT num_remaining = num_items - tile_offset;                // Remaining items (including this tile)

    if (tile_idx < num_tiles - 1)
    {
        // Not the last tile (full)
        ConsumeTile<false>(num_items, num_remaining, tile_idx, tile_offset, tile_status);
    }
    else if (num_remaining > 0)
    {
        // The last tile (possibly partially full)
        LengthOffsetPair running_total = ConsumeTile<true>(num_items, num_remaining, tile_idx, tile_offset, tile_status);

        if (threadIdx.x == 0)
        {
            // Output the total number of runs encountered
            *d_num_runs_out = running_total.key;

            // The inclusive prefix carries the accumulated length of the last
            // run; patch it in, since no subsequent run ever closed it
            if (running_total.key > 0)
                d_lengths_out[running_total.key - 1] = running_total.value;
        }
    }
}