#include "../thread/thread_load.cuh"
#include "../block/block_load.cuh"
#include "../block/block_store.cuh"
#include "../block/block_radix_rank.cuh"
#include "../block/block_exchange.cuh"
#include "../util_type.cuh"
#include "../iterator/cache_modified_input_iterator.cuh"
#include "../util_namespace.cuh"
    int                         _ITEMS_PER_THREAD,      ///< Items per thread (per tile of input)
        TILE_ITEMS      = BLOCK_THREADS * ITEMS_PER_THREAD,
        RADIX_DIGITS    = 1 << RADIX_BITS,
    // Radix ranking type to use, selected at compile time from RANK_ALGORITHM
    typedef typename If<(RANK_ALGORITHM == RADIX_RANK_BASIC),
            BlockRadixRank<BLOCK_THREADS, RADIX_BITS, IS_DESCENDING, false, SCAN_ALGORITHM>,
            typename If<(RANK_ALGORITHM == RADIX_RANK_MEMOIZE),
                BlockRadixRank<BLOCK_THREADS, RADIX_BITS, IS_DESCENDING, true, SCAN_ALGORITHM>,
                BlockRadixRankMatch<BLOCK_THREADS, RADIX_BITS, IS_DESCENDING, SCAN_ALGORITHM>
            >::Type
        >::Type BlockRadixRankT;
    // Input iterator wrapper types (for applying the cache load modifier)
    typedef CacheModifiedInputIterator<LOAD_MODIFIER, UnsignedBits, OffsetT>    KeysItr;
    typedef CacheModifiedInputIterator<LOAD_MODIFIER, ValueT, OffsetT>          ValuesItr;

    // BlockLoad type (keys)
    typedef BlockLoad<
        UnsignedBits,
        BLOCK_THREADS,
        ITEMS_PER_THREAD,
        LOAD_ALGORITHM> BlockLoadKeysT;

    // BlockLoad type (values)
    typedef BlockLoad<
        ValueT,
        BLOCK_THREADS,
        ITEMS_PER_THREAD,
        LOAD_ALGORITHM> BlockLoadValuesT;

    // Value exchange array type
    typedef ValueT ValueExchangeT[TILE_ITEMS];
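    // --- Illustrative sketch (not part of the original file) ----------------
    // The nested If<...>::Type selection above is CUB's compile-time
    // "IF ? ThenType : ElseType" idiom from util_type.cuh. A minimal
    // equivalent, renamed IfSketch here to avoid clashing with cub::If:
    template <bool IF, typename ThenType, typename ElseType>
    struct IfSketch { typedef ThenType Type; };                              // predicate true -> ThenType

    template <typename ThenType, typename ElseType>
    struct IfSketch<false, ThenType, ElseType> { typedef ElseType Type; };   // predicate false -> ElseType

    // Chaining two selections, as above, picks among three ranking
    // implementations at compile time with no runtime dispatch.
    // -------------------------------------------------------------------------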
    // Shared memory storage layout (the phases don't overlap, hence the union)
    union __align__(16) _TempStorage
    {
        typename BlockLoadKeysT::TempStorage    load_keys;
        typename BlockLoadValuesT::TempStorage  load_values;
        typename BlockRadixRankT::TempStorage   radix_rank;

        struct
        {
            UnsignedBits    exchange_keys[TILE_ITEMS];
            OffsetT         relative_bin_offsets[RADIX_DIGITS];
        };

        Uninitialized<ValueExchangeT>           exchange_values;

        OffsetT         exclusive_digit_prefix[RADIX_DIGITS];
    };

    // Alias wrapper allowing storage to be unioned
    struct TempStorage : Uninitialized<_TempStorage> {};
    _TempStorage    &temp_storage;      // Shared storage for this CTA

    KeysItr         d_keys_in;          // Input keys
    ValuesItr       d_values_in;        // Input values
    UnsignedBits    *d_keys_out;        // Output keys
    ValueT          *d_values_out;      // Output values

    OffsetT         bin_offset[BINS_TRACKED_PER_THREAD];   // Global scatter base offset for each digit

    int             current_bit;        // Least-significant bit position of the current digit
    int             num_bits;           // Number of bits in the current digit
    int             short_circuit;      // Whether the pass can short-circuit to a copy
    // Scatter ranked keys through shared memory, then to device-accessible memory
    template <bool FULL_TILE>
    __device__ __forceinline__ void ScatterKeys(
        UnsignedBits    (&twiddled_keys)[ITEMS_PER_THREAD],
        OffsetT         (&relative_bin_offsets)[ITEMS_PER_THREAD],
        int             (&ranks)[ITEMS_PER_THREAD],
        OffsetT         valid_items)
    {
        // Exchange keys through shared memory by rank
        #pragma unroll
        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
        {
            temp_storage.exchange_keys[ranks[ITEM]] = twiddled_keys[ITEM];
        }

        CTA_SYNC();

        // Read back in striped order and scatter to global memory
        #pragma unroll
        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
        {
            UnsignedBits key            = temp_storage.exchange_keys[threadIdx.x + (ITEM * BLOCK_THREADS)];
            UnsignedBits digit          = BFE(key, current_bit, num_bits);
            relative_bin_offsets[ITEM]  = temp_storage.relative_bin_offsets[digit];

            // Un-twiddle
            key = Traits<KeyT>::TwiddleOut(key);

            if (FULL_TILE ||
                (static_cast<OffsetT>(threadIdx.x + (ITEM * BLOCK_THREADS)) < valid_items))
            {
                d_keys_out[relative_bin_offsets[ITEM] + threadIdx.x + (ITEM * BLOCK_THREADS)] = key;
            }
        }
    }
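    // --- Illustrative sketch (not part of the original file) ----------------
    // The essence of ScatterKeys, as a standalone toy kernel: scatter by rank
    // into shared memory, then read back striped so that global writes within
    // each digit run come from consecutive threads to consecutive addresses
    // (i.e., they coalesce). All names here are hypothetical; the toy assumes
    // identity digit offsets so phase 2 writes the tile back in ranked order.
    template <int THREADS, int ITEMS>
    __global__ void TwoPhaseScatterSketch(
        const unsigned int *d_in,       // tiles of keys, one tile per block
        const int          *d_rank,     // precomputed block-wide rank per key
        unsigned int       *d_out)      // output tiles
    {
        __shared__ unsigned int smem[THREADS * ITEMS];
        int base = blockIdx.x * THREADS * ITEMS;

        // Phase 1: scatter by rank into shared memory (cheap random access)
        for (int i = 0; i < ITEMS; ++i)
        {
            int idx = threadIdx.x * ITEMS + i;          // blocked arrangement in
            smem[d_rank[base + idx]] = d_in[base + idx];
        }
        __syncthreads();

        // Phase 2: striped read-out; thread t writes offsets t, t+THREADS, ...
        for (int i = 0; i < ITEMS; ++i)
        {
            int idx = threadIdx.x + i * THREADS;
            d_out[base + idx] = smem[idx];              // coalesced global writes
        }
    }
    // -------------------------------------------------------------------------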
    // Scatter ranked values through shared memory, then to device-accessible memory
    template <bool FULL_TILE>
    __device__ __forceinline__ void ScatterValues(
        ValueT          (&values)[ITEMS_PER_THREAD],
        OffsetT         (&relative_bin_offsets)[ITEMS_PER_THREAD],
        int             (&ranks)[ITEMS_PER_THREAD],
        OffsetT         valid_items)
    {
        CTA_SYNC();

        ValueExchangeT &exchange_values = temp_storage.exchange_values.Alias();

        // Exchange values through shared memory by rank
        #pragma unroll
        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
        {
            exchange_values[ranks[ITEM]] = values[ITEM];
        }

        CTA_SYNC();

        // Read back in striped order and scatter to global memory
        #pragma unroll
        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
        {
            ValueT value = exchange_values[threadIdx.x + (ITEM * BLOCK_THREADS)];

            if (FULL_TILE ||
                (static_cast<OffsetT>(threadIdx.x + (ITEM * BLOCK_THREADS)) < valid_items))
            {
                d_values_out[relative_bin_offsets[ITEM] + threadIdx.x + (ITEM * BLOCK_THREADS)] = value;
            }
        }
    }
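    // --- Illustrative sketch (not part of the original file) ----------------
    // The "twiddled" keys moving through ScatterKeys come from
    // Traits<KeyT>::TwiddleIn, which maps each key type to an unsigned bit
    // pattern whose ascending unsigned order matches the key's natural order;
    // TwiddleOut (used in ScatterKeys above) inverts the mapping. For IEEE-754
    // float the usual transform looks like this (sketch names, not CUB's):
    __device__ __forceinline__ unsigned int TwiddleInFloatSketch(unsigned int bits)
    {
        // negative floats: flip all bits (reverses their ordering);
        // non-negative floats: set the sign bit (moves them above negatives)
        unsigned int mask = (bits & 0x80000000u) ? 0xffffffffu : 0x80000000u;
        return bits ^ mask;
    }

    __device__ __forceinline__ unsigned int TwiddleOutFloatSketch(unsigned int bits)
    {
        // invert: the transformed sign bit tells us which case we came from
        unsigned int mask = (bits & 0x80000000u) ? 0x80000000u : 0xffffffffu;
        return bits ^ mask;
    }
    // For two's-complement integers, TwiddleIn is simply "flip the sign bit".
    // -------------------------------------------------------------------------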
    // Load a tile of keys (specialized for full tile, any ranking algorithm)
    template <int _RANK_ALGORITHM>
    __device__ __forceinline__ void LoadKeys(
        UnsignedBits                (&keys)[ITEMS_PER_THREAD],
        OffsetT                     block_offset,
        OffsetT                     valid_items,
        UnsignedBits                oob_item,
        Int2Type<true>              is_full_tile,
        Int2Type<_RANK_ALGORITHM>   rank_algorithm)
    {
        BlockLoadKeysT(temp_storage.load_keys).Load(
            d_keys_in + block_offset, keys);

        CTA_SYNC();
    }

    // Load a tile of keys (specialized for partial tile, any ranking algorithm)
    template <int _RANK_ALGORITHM>
    __device__ __forceinline__ void LoadKeys(
        UnsignedBits                (&keys)[ITEMS_PER_THREAD],
        OffsetT                     block_offset,
        OffsetT                     valid_items,
        UnsignedBits                oob_item,
        Int2Type<false>             is_full_tile,
        Int2Type<_RANK_ALGORITHM>   rank_algorithm)
    {
        // Register-pressure work-around: moving valid_items through shfl prevents
        // the compiler from reusing guards/addressing from prior guarded loads
        valid_items = ShuffleIndex<CUB_PTX_WARP_THREADS>(valid_items, 0, 0xffffffff);

        BlockLoadKeysT(temp_storage.load_keys).Load(
            d_keys_in + block_offset, keys, valid_items, oob_item);

        CTA_SYNC();
    }
    // Load a tile of keys (specialized for full tile, match ranking)
    __device__ __forceinline__ void LoadKeys(
        UnsignedBits                (&keys)[ITEMS_PER_THREAD],
        OffsetT                     block_offset,
        OffsetT                     valid_items,
        UnsignedBits                oob_item,
        Int2Type<true>              is_full_tile,
        Int2Type<RADIX_RANK_MATCH>  rank_algorithm)
    {
        LoadDirectWarpStriped(threadIdx.x, d_keys_in + block_offset, keys);
    }

    // Load a tile of keys (specialized for partial tile, match ranking)
    __device__ __forceinline__ void LoadKeys(
        UnsignedBits                (&keys)[ITEMS_PER_THREAD],
        OffsetT                     block_offset,
        OffsetT                     valid_items,
        UnsignedBits                oob_item,
        Int2Type<false>             is_full_tile,
        Int2Type<RADIX_RANK_MATCH>  rank_algorithm)
    {
        // Register-pressure work-around (see above)
        valid_items = ShuffleIndex<CUB_PTX_WARP_THREADS>(valid_items, 0, 0xffffffff);

        LoadDirectWarpStriped(threadIdx.x, d_keys_in + block_offset, keys, valid_items, oob_item);
    }
    // Load a tile of values (specialized for full tile, any ranking algorithm)
    template <int _RANK_ALGORITHM>
    __device__ __forceinline__ void LoadValues(
        ValueT                      (&values)[ITEMS_PER_THREAD],
        OffsetT                     block_offset,
        OffsetT                     valid_items,
        Int2Type<true>              is_full_tile,
        Int2Type<_RANK_ALGORITHM>   rank_algorithm)
    {
        BlockLoadValuesT(temp_storage.load_values).Load(
            d_values_in + block_offset, values);

        CTA_SYNC();
    }

    // Load a tile of values (specialized for partial tile, any ranking algorithm)
    template <int _RANK_ALGORITHM>
    __device__ __forceinline__ void LoadValues(
        ValueT                      (&values)[ITEMS_PER_THREAD],
        OffsetT                     block_offset,
        OffsetT                     valid_items,
        Int2Type<false>             is_full_tile,
        Int2Type<_RANK_ALGORITHM>   rank_algorithm)
    {
        // Register-pressure work-around (see above)
        valid_items = ShuffleIndex<CUB_PTX_WARP_THREADS>(valid_items, 0, 0xffffffff);

        BlockLoadValuesT(temp_storage.load_values).Load(
            d_values_in + block_offset, values, valid_items);

        CTA_SYNC();
    }
    // Load a tile of values (specialized for full tile, match ranking)
    __device__ __forceinline__ void LoadValues(
        ValueT                      (&values)[ITEMS_PER_THREAD],
        OffsetT                     block_offset,
        OffsetT                     valid_items,
        Int2Type<true>              is_full_tile,
        Int2Type<RADIX_RANK_MATCH>  rank_algorithm)
    {
        LoadDirectWarpStriped(threadIdx.x, d_values_in + block_offset, values);
    }

    // Load a tile of values (specialized for partial tile, match ranking)
    __device__ __forceinline__ void LoadValues(
        ValueT                      (&values)[ITEMS_PER_THREAD],
        OffsetT                     block_offset,
        OffsetT                     valid_items,
        Int2Type<false>             is_full_tile,
        Int2Type<RADIX_RANK_MATCH>  rank_algorithm)
    {
        // Register-pressure work-around (see above)
        valid_items = ShuffleIndex<CUB_PTX_WARP_THREADS>(valid_items, 0, 0xffffffff);

        LoadDirectWarpStriped(threadIdx.x, d_values_in + block_offset, values, valid_items);
    }
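    // --- Illustrative sketch (not part of the original file) ----------------
    // Each partial-tile overload above re-broadcasts valid_items from lane 0.
    // Functionally it is a no-op (every thread already holds the same value);
    // routing the value through a shuffle keeps the compiler from folding it
    // into guard/addressing logic carried over from earlier guarded loads,
    // which would hurt register pressure. The underlying primitive:
    __device__ __forceinline__ int BroadcastLane0Sketch(int value)
    {
        // all 32 lanes participate (mask 0xffffffff); lane 0 supplies the value
        return __shfl_sync(0xffffffff, value, 0);
    }
    // -------------------------------------------------------------------------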
    // Gather/scatter the values associated with the keys (keys-and-values sorting)
    template <bool FULL_TILE>
    __device__ __forceinline__ void GatherScatterValues(
        OffsetT         (&relative_bin_offsets)[ITEMS_PER_THREAD],
        int             (&ranks)[ITEMS_PER_THREAD],
        OffsetT         block_offset,
        OffsetT         valid_items,
        Int2Type<false> /*is_keys_only*/)
    {
        CTA_SYNC();

        ValueT values[ITEMS_PER_THREAD];

        LoadValues(values, block_offset, valid_items,
                   Int2Type<FULL_TILE>(), Int2Type<RANK_ALGORITHM>());

        ScatterValues<FULL_TILE>(
            values,
            relative_bin_offsets,
            ranks,
            valid_items);
    }
    // Gather/scatter values (specialized for keys-only sorting: no-op)
    template <bool FULL_TILE>
    __device__ __forceinline__ void GatherScatterValues(
        OffsetT         (&)[ITEMS_PER_THREAD],
        int             (&)[ITEMS_PER_THREAD],
        OffsetT,
        OffsetT,
        Int2Type<true>  /*is_keys_only*/)
    {}
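    // --- Illustrative sketch (not part of the original file) ----------------
    // The empty overload above is selected by tag dispatch: Int2Type<N> turns
    // an integral constant into a distinct type, so overload resolution
    // (rather than a runtime branch) picks the keys-only or keys-and-values
    // path, and likewise full-tile vs. partial-tile in LoadKeys/LoadValues.
    // Minimal form of the idiom (sketch names, not CUB's):
    template <int N>
    struct Int2TypeSketch { enum { VALUE = N }; };

    __device__ void DoWorkSketch(Int2TypeSketch<true>)  { /* keys-only: nothing to move */ }
    __device__ void DoWorkSketch(Int2TypeSketch<false>) { /* move the values, too */ }

    // Call site: DoWorkSketch(Int2TypeSketch<KEYS_ONLY>()); resolves at
    // compile time, so the unused branch is never instantiated or emitted.
    // -------------------------------------------------------------------------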
    // Process a tile of keys (and values) from the input range
    template <bool FULL_TILE>
    __device__ __forceinline__ void ProcessTile(
        OffsetT block_offset,
        const OffsetT &valid_items = TILE_ITEMS)
    {
        UnsignedBits    keys[ITEMS_PER_THREAD];
        int             ranks[ITEMS_PER_THREAD];
        OffsetT         relative_bin_offsets[ITEMS_PER_THREAD];

        // Assign a default (min/max) value to out-of-bounds slots so that, in
        // a partial tile, they rank after all valid keys
        UnsignedBits default_key = (IS_DESCENDING) ? LOWEST_KEY : MAX_KEY;

        // Load tile of keys
        LoadKeys(keys, block_offset, valid_items, default_key,
                 Int2Type<FULL_TILE>(), Int2Type<RANK_ALGORITHM>());
        // Twiddle key bits if necessary
        #pragma unroll
        for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++)
        {
            keys[KEY] = Traits<KeyT>::TwiddleIn(keys[KEY]);
        }

        // Rank the twiddled keys, obtaining this thread's tracked per-digit
        // exclusive prefixes as a side product
        int exclusive_digit_prefix[BINS_TRACKED_PER_THREAD];
        BlockRadixRankT(temp_storage.radix_rank).RankKeys(
            keys, ranks, current_bit, num_bits,
            exclusive_digit_prefix);
        CTA_SYNC();

        // Share this thread's tracked exclusive digit prefixes with the block
        #pragma unroll
        for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track)
        {
            int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track;
            if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS))
            {
                temp_storage.exclusive_digit_prefix[bin_idx] =
                    exclusive_digit_prefix[track];
            }
        }
        CTA_SYNC();

        // Derive each digit's inclusive prefix from its neighbor's exclusive prefix
        int inclusive_digit_prefix[BINS_TRACKED_PER_THREAD];

        #pragma unroll
        for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track)
        {
            int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track;
            if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS))
            {
                if (IS_DESCENDING)
                {
                    // Higher bins come first
                    inclusive_digit_prefix[track] = (bin_idx == 0) ?
                        (BLOCK_THREADS * ITEMS_PER_THREAD) :
                        temp_storage.exclusive_digit_prefix[bin_idx - 1];
                }
                else
                {
                    // Lower bins come first
                    inclusive_digit_prefix[track] = (bin_idx == RADIX_DIGITS - 1) ?
                        (BLOCK_THREADS * ITEMS_PER_THREAD) :
                        temp_storage.exclusive_digit_prefix[bin_idx + 1];
                }
            }
        }
        CTA_SYNC();

        // Update the global scatter base offsets for each digit
        #pragma unroll
        for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track)
        {
            int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track;
            if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS))
            {
                bin_offset[track] -= exclusive_digit_prefix[track];
                temp_storage.relative_bin_offsets[bin_idx] = bin_offset[track];
                bin_offset[track] += inclusive_digit_prefix[track];
            }
        }

        CTA_SYNC();

        // Scatter keys, then gather/scatter any associated values
        ScatterKeys<FULL_TILE>(keys, relative_bin_offsets, ranks, valid_items);
        GatherScatterValues<FULL_TILE>(relative_bin_offsets, ranks, block_offset, valid_items, Int2Type<KEYS_ONLY>());
    }
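    // --- Illustrative sketch (not part of the original file) ----------------
    // The offset bookkeeping above, in scalar form for one digit d.
    // bin_offset[d] is the running global position where this block's next key
    // with digit d must land. A key's destination is its block-wide rank plus
    // a per-digit relative offset; afterwards, the running offset advances by
    // the number of keys of digit d in the tile (inclusive - exclusive):
    //
    //     relative_bin_offset[d] = bin_offset[d] - exclusive_prefix[d];
    //     destination(key)       = relative_bin_offset[digit(key)] + rank(key);
    //     bin_offset[d]         += inclusive_prefix[d] - exclusive_prefix[d];
    //
    // which is exactly the "-= exclusive", store, "+= inclusive" sequence.
    // The inclusive prefix itself comes from a neighbor's exclusive prefix:
    __host__ __device__ __forceinline__ int InclusiveFromExclusiveSketch(
        const int *exclusive,   // per-digit exclusive prefixes within the tile
        int d, int radix_digits, int tile_items)
    {
        // ascending order: keys of digit d end where keys of digit d+1 begin
        return (d == radix_digits - 1) ? tile_items : exclusive[d + 1];
    }
    // -------------------------------------------------------------------------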
    // Copy tiles within the range of input
    template <
        typename InputIteratorT,
        typename T>
    __device__ __forceinline__ void Copy(
        InputIteratorT  d_in,
        T               *d_out,
        OffsetT         block_offset,
        OffsetT         block_end)
    {
        // Simply copy the input: full tiles first...
        while (block_offset + TILE_ITEMS <= block_end)
        {
            T items[ITEMS_PER_THREAD];

            LoadDirectStriped<BLOCK_THREADS>(threadIdx.x, d_in + block_offset, items);
            CTA_SYNC();
            StoreDirectStriped<BLOCK_THREADS>(threadIdx.x, d_out + block_offset, items);

            block_offset += TILE_ITEMS;
        }

        // ...then clean up the last partial tile with guarded I/O
        if (block_offset < block_end)
        {
            OffsetT valid_items = block_end - block_offset;

            T items[ITEMS_PER_THREAD];

            LoadDirectStriped<BLOCK_THREADS>(threadIdx.x, d_in + block_offset, items, valid_items);
            CTA_SYNC();
            StoreDirectStriped<BLOCK_THREADS>(threadIdx.x, d_out + block_offset, items, valid_items);
        }
    }
    // Copy tiles within the range of input (specialized for NullType: no values to copy)
    template <typename InputIteratorT>
    __device__ __forceinline__ void Copy(
        InputIteratorT,
        NullType *,
        OffsetT,
        OffsetT)
    {}
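    // --- Illustrative sketch (not part of the original file) ----------------
    // Striped vs. blocked arrangement, for orientation: with BLOCK_THREADS = 4
    // and ITEMS_PER_THREAD = 2, a blocked layout gives thread t items
    // {2t, 2t+1}, while the striped layout used by Copy gives thread t items
    // {t, t+4}. Striped keeps neighboring threads on neighboring addresses,
    // so the loads/stores above coalesce. The two index mappings:
    __device__ __forceinline__ int BlockedIndexSketch(int tid, int item, int items_per_thread)
    {
        return tid * items_per_thread + item;
    }
    __device__ __forceinline__ int StripedIndexSketch(int tid, int item, int block_threads)
    {
        return tid + item * block_threads;
    }
    // -------------------------------------------------------------------------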
    // Constructor: bin offsets supplied directly by the caller
    __device__ __forceinline__ AgentRadixSortDownsweep(
        TempStorage     &temp_storage,
        OffsetT         (&bin_offset)[BINS_TRACKED_PER_THREAD],
        OffsetT         num_items,
        const KeyT      *d_keys_in,
        KeyT            *d_keys_out,
        const ValueT    *d_values_in,
        ValueT          *d_values_out,
        int             current_bit,
        int             num_bits)
    :
        temp_storage(temp_storage.Alias()),
        d_keys_in(reinterpret_cast<const UnsignedBits*>(d_keys_in)),
        d_keys_out(reinterpret_cast<UnsignedBits*>(d_keys_out)),
        d_values_in(d_values_in),
        d_values_out(d_values_out),
        current_bit(current_bit),
        num_bits(num_bits),
        short_circuit(1)
    {
        #pragma unroll
        for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track)
        {
            this->bin_offset[track] = bin_offset[track];

            int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track;
            if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS))
            {
                // Short-circuit if every digit's count is zero or the whole problem
                short_circuit = short_circuit &&
                    ((bin_offset[track] == 0) || (bin_offset[track] == num_items));
            }
        }

        short_circuit = CTA_SYNC_AND(short_circuit);
    }
    // Constructor: bin offsets gathered from the digit histogram spine
    __device__ __forceinline__ AgentRadixSortDownsweep(
        TempStorage     &temp_storage,
        OffsetT         num_items,
        OffsetT         *d_spine,
        const KeyT      *d_keys_in,
        KeyT            *d_keys_out,
        const ValueT    *d_values_in,
        ValueT          *d_values_out,
        int             current_bit,
        int             num_bits)
    :
        temp_storage(temp_storage.Alias()),
        d_keys_in(reinterpret_cast<const UnsignedBits*>(d_keys_in)),
        d_keys_out(reinterpret_cast<UnsignedBits*>(d_keys_out)),
        d_values_in(d_values_in),
        d_values_out(d_values_out),
        current_bit(current_bit),
        num_bits(num_bits),
        short_circuit(1)
    {
        #pragma unroll
        for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track)
        {
            int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track;

            // Each of the first RADIX_DIGITS threads loads the offset for one digit
            if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS))
            {
                if (IS_DESCENDING)
                    bin_idx = RADIX_DIGITS - bin_idx - 1;

                // Short-circuit if the first block's histogram has only bin counts
                // of zero or problem-size
                OffsetT first_block_bin_offset = d_spine[gridDim.x * bin_idx];
                short_circuit = short_circuit &&
                    ((first_block_bin_offset == 0) || (first_block_bin_offset == num_items));

                // Load my block's bin offset for my bin
                bin_offset[track] = d_spine[(gridDim.x * bin_idx) + blockIdx.x];
            }
        }

        short_circuit = CTA_SYNC_AND(short_circuit);
    }
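    // --- Illustrative sketch (not part of the original file) ----------------
    // The short-circuit test above: if, for every digit, the first block's
    // spine offset is 0 or num_items, then all keys share one digit value at
    // this bit position, and ProcessRegion can degrade to a straight Copy
    // instead of a full ranking pass. CTA_SYNC_AND is CUB's wrapper over the
    // block-wide AND-reduction barrier; the raw CUDA form of the vote:
    __device__ bool PassIsTrivialSketch(
        const int *bin_count,    // hypothetical per-digit global counts
        int radix_digits,
        int num_items)
    {
        int mine = 1;
        for (int bin = threadIdx.x; bin < radix_digits; bin += blockDim.x)
            mine = mine && ((bin_count[bin] == 0) || (bin_count[bin] == num_items));

        // barrier + AND across all threads in the block
        return __syncthreads_and(mine) != 0;
    }
    // -------------------------------------------------------------------------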