#include "../thread/thread_reduce.cuh"
#include "../thread/thread_load.cuh"
#include "../warp/warp_reduce.cuh"
#include "../block/block_load.cuh"
#include "../util_type.cuh"
#include "../iterator/cache_modified_input_iterator.cuh"
#include "../util_namespace.cuh"
/**
 * Parameterizable tuning policy type for AgentRadixSortUpsweep
 */
template <
    int                 _BLOCK_THREADS,     ///< Threads per thread block
    int                 _ITEMS_PER_THREAD,  ///< Items per thread (per tile of input)
    CacheLoadModifier   _LOAD_MODIFIER,     ///< Cache load modifier for reading keys
    int                 _RADIX_BITS>        ///< The number of radix bits, i.e., log2(bins)
struct AgentRadixSortUpsweepPolicy
{
    enum { BLOCK_THREADS = _BLOCK_THREADS, ITEMS_PER_THREAD = _ITEMS_PER_THREAD, RADIX_BITS = _RADIX_BITS };

    static const CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER;  ///< Cache load modifier for reading keys
};
/**
 * \brief AgentRadixSortUpsweep implements a stateful abstraction of CUDA thread
 * blocks for participating in device-wide radix sort upsweep (digit counting).
 */
template <
    typename AgentRadixSortUpsweepPolicy,   ///< Parameterized tuning policy type
    typename KeyT,                          ///< Key type
    typename OffsetT>                       ///< Signed integer type for global offsets
struct AgentRadixSortUpsweep
{
    typedef typename Traits<KeyT>::UnsignedBits UnsignedBits;

    // Integer type for digit counters (to be packed into words of PackedCounters)
    typedef unsigned char DigitCounter;

    // Integer type for packing DigitCounters into columns of shared memory banks
    typedef unsigned int PackedCounter;
    static const CacheLoadModifier LOAD_MODIFIER = AgentRadixSortUpsweepPolicy::LOAD_MODIFIER;

    enum
    {
        RADIX_BITS          = AgentRadixSortUpsweepPolicy::RADIX_BITS,
        BLOCK_THREADS       = AgentRadixSortUpsweepPolicy::BLOCK_THREADS,
        KEYS_PER_THREAD     = AgentRadixSortUpsweepPolicy::ITEMS_PER_THREAD,

        RADIX_DIGITS        = 1 << RADIX_BITS,

        LOG_WARP_THREADS    = CUB_PTX_LOG_WARP_THREADS,
        WARP_THREADS        = 1 << LOG_WARP_THREADS,
        WARPS               = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS,

        TILE_ITEMS          = BLOCK_THREADS * KEYS_PER_THREAD,

        BYTES_PER_COUNTER   = sizeof(DigitCounter),
        PACKING_RATIO       = sizeof(PackedCounter) / sizeof(DigitCounter),
        LOG_PACKING_RATIO   = Log2<PACKING_RATIO>::VALUE,

        LOG_COUNTER_LANES   = CUB_MAX(0, RADIX_BITS - LOG_PACKING_RATIO),
        COUNTER_LANES       = 1 << LOG_COUNTER_LANES,

        // To prevent counter overflow, the digit counters are periodically
        // unpacked and aggregated back into registers; each counter lane is
        // assigned to a warp for aggregation
        LANES_PER_WARP      = CUB_MAX(1, (COUNTER_LANES + WARPS - 1) / WARPS),

        // Unroll tiles in batches of full tiles without risk of counter overflow
        UNROLL_COUNT        = CUB_MIN(64, 255 / KEYS_PER_THREAD),
        UNROLLED_ELEMENTS   = UNROLL_COUNT * TILE_ITEMS,
    };
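
    // Worked example (illustrative numbers, not from this header): with a
    // hypothetical policy of BLOCK_THREADS = 128, ITEMS_PER_THREAD = 16, and
    // RADIX_BITS = 5 on 32-thread warps:
    //   RADIX_DIGITS    = 32 bins
    //   PACKING_RATIO   = 4 byte-counters per 32-bit word (LOG_PACKING_RATIO = 2)
    //   COUNTER_LANES   = 32 / 4 = 8 lanes of packed words
    //   TILE_ITEMS      = 128 * 16 = 2048 keys per tile
    //   UNROLL_COUNT    = min(64, 255 / 16) = 15 tiles per batch, so a byte
    //   counter absorbs at most 15 * 16 = 240 <= 255 increments from its
    //   thread before being drained by UnpackDigitCounts().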
    // Input iterator wrapper type (for applying the cache modifier to key loads)
    typedef CacheModifiedInputIterator<LOAD_MODIFIER, UnsignedBits, OffsetT> KeysItr;

    /**
     * Shared memory storage layout
     */
    union __align__(16) _TempStorage
    {
        DigitCounter    thread_counters[COUNTER_LANES][BLOCK_THREADS][PACKING_RATIO];
        PackedCounter   packed_thread_counters[COUNTER_LANES][BLOCK_THREADS];
        OffsetT         block_counters[WARP_THREADS][RADIX_DIGITS];
    };

    /// Alias wrapper allowing storage to be unioned
    struct TempStorage : Uninitialized<_TempStorage> {};
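
    // The union is the point: packed_thread_counters aliases thread_counters,
    // so one packed-word write (e.g., in ResetDigitCounters) touches
    // PACKING_RATIO byte-counters at once, and block_counters reuses the same
    // bytes during extraction, once the byte-counters are no longer live.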
    // Shared storage for this CTA
    _TempStorage &temp_storage;

    // Thread-local counters for periodically aggregating composite-counter lanes
    OffsetT local_counts[LANES_PER_WARP][PACKING_RATIO];

    // Input keys
    KeysItr d_keys_in;

    // Bit position of the current radix digit
    int current_bit;

    // Number of bits of the current radix digit
    int num_bits;
    // Templated iteration over the tile of keys held by each thread
    template <int COUNT, int MAX>
    struct Iterate
    {
        // Bucket one key, then recurse on the rest
        static __device__ __forceinline__ void BucketKeys(
            AgentRadixSortUpsweep   &cta,
            UnsignedBits            keys[KEYS_PER_THREAD])
        {
            cta.Bucket(keys[COUNT]);
            Iterate<COUNT + 1, MAX>::BucketKeys(cta, keys);
        }
    };

    // Terminate the iteration
    template <int MAX>
    struct Iterate<MAX, MAX>
    {
        static __device__ __forceinline__ void BucketKeys(
            AgentRadixSortUpsweep &, UnsignedBits [KEYS_PER_THREAD]) {}
    };
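
    // The compile-time recursion (rather than a runtime loop over COUNT)
    // guarantees the bucketing of keys[0..KEYS_PER_THREAD-1] is fully
    // unrolled, keeping each key in a register.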
    // Decode a key and increment its shared-memory digit counter
    __device__ __forceinline__ void Bucket(UnsignedBits key)
    {
        // Extract the current digit bits from the (bit-twiddled) key
        UnsignedBits digit = BFE(Traits<KeyT>::TwiddleIn(key), current_bit, num_bits);

        // Byte-sized sub-counter within the packed word, and the counter lane holding it
        UnsignedBits sub_counter = digit & (PACKING_RATIO - 1);
        UnsignedBits row_offset  = digit >> LOG_PACKING_RATIO;

        // Increment the counter
        temp_storage.thread_counters[row_offset][threadIdx.x][sub_counter]++;
    }
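
    // For instance, with PACKING_RATIO = 4 (LOG_PACKING_RATIO = 2), a digit
    // value of 13 (binary 01101) increments sub_counter 13 & 3 = 1 inside the
    // packed word of counter lane 13 >> 2 = 3 in this thread's column.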
    // Reset the shared-memory digit counters (each packed write zeroes
    // PACKING_RATIO byte-counters at once)
    __device__ __forceinline__ void ResetDigitCounters()
    {
        #pragma unroll
        for (int LANE = 0; LANE < COUNTER_LANES; LANE++)
            temp_storage.packed_thread_counters[LANE][threadIdx.x] = 0;
    }
    // Reset the per-thread unpacked counters
    __device__ __forceinline__ void ResetUnpackedCounters()
    {
        #pragma unroll
        for (int LANE = 0; LANE < LANES_PER_WARP; LANE++)
        {
            #pragma unroll
            for (int UNPACKED_COUNTER = 0; UNPACKED_COUNTER < PACKING_RATIO; UNPACKED_COUNTER++)
                local_counts[LANE][UNPACKED_COUNTER] = 0;
        }
    }
    // Extract and aggregate the digit counters for each counter lane owned by this warp
    __device__ __forceinline__ void UnpackDigitCounts()
    {
        unsigned int warp_id  = threadIdx.x >> LOG_WARP_THREADS;
        unsigned int warp_tid = LaneId();

        #pragma unroll
        for (int LANE = 0; LANE < LANES_PER_WARP; LANE++)
        {
            const int counter_lane = (LANE * WARPS) + warp_id;
            if (counter_lane < COUNTER_LANES)
            {
                #pragma unroll
                for (int PACKED_COUNTER = 0; PACKED_COUNTER < BLOCK_THREADS; PACKED_COUNTER += WARP_THREADS)
                {
                    #pragma unroll
                    for (int UNPACKED_COUNTER = 0; UNPACKED_COUNTER < PACKING_RATIO; UNPACKED_COUNTER++)
                    {
                        OffsetT counter = temp_storage.thread_counters[counter_lane][warp_tid + PACKED_COUNTER][UNPACKED_COUNTER];
                        local_counts[LANE][UNPACKED_COUNTER] += counter;
                    }
                }
            }
        }
    }
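
    // Each warp owns whole counter lanes (lane indices warp_id, warp_id + WARPS, ...);
    // within a lane, the warp's threads sweep all BLOCK_THREADS thread columns in
    // strides of WARP_THREADS, so no two threads ever read the same column.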
    // Process a single, full tile of keys
    __device__ __forceinline__ void ProcessFullTile(OffsetT block_offset)
    {
        UnsignedBits keys[KEYS_PER_THREAD];

        // Load KEYS_PER_THREAD keys per thread, striped across the block, then bucket them
        LoadDirectStriped<BLOCK_THREADS>(threadIdx.x, d_keys_in + block_offset, keys);
        Iterate<0, KEYS_PER_THREAD>::BucketKeys(*this, keys);
    }
    // Constructor
    __device__ __forceinline__ AgentRadixSortUpsweep(
        TempStorage &temp_storage,
        const KeyT  *d_keys_in,
        int         current_bit,
        int         num_bits)
    :
        temp_storage(temp_storage.Alias()),
        d_keys_in(reinterpret_cast<const UnsignedBits*>(d_keys_in)),
        current_bit(current_bit),
        num_bits(num_bits)
    {}
    // Compute radix digit histograms over a range of input tiles
    __device__ __forceinline__ void ProcessRegion(OffsetT block_offset, const OffsetT &block_end)
    {
        ResetDigitCounters();
        ResetUnpackedCounters();

        // Consume batches of full tiles; between batches, drain the byte-sized
        // shared-memory counters into registers so they cannot overflow
        while (block_offset + UNROLLED_ELEMENTS <= block_end)
        {
            for (int i = 0; i < UNROLL_COUNT; ++i)
            {
                ProcessFullTile(block_offset);
                block_offset += TILE_ITEMS;
            }

            CTA_SYNC();
            UnpackDigitCounts();
            CTA_SYNC();
            ResetDigitCounters();
        }

        // ... (remaining full tiles and any partial tile are processed similarly)
    }
    // Extract counts, writing them to the counters array
    template <bool IS_DESCENDING>
    __device__ __forceinline__ void ExtractCounts(
        OffsetT *counters,
        int     bin_stride = 1,
        int     bin_offset = 0)
    {
        unsigned int warp_id  = threadIdx.x >> LOG_WARP_THREADS;
        unsigned int warp_tid = LaneId();
        // Place the unpacked digit counters into shared memory
        #pragma unroll
        for (int LANE = 0; LANE < LANES_PER_WARP; LANE++)
        {
            int counter_lane = (LANE * WARPS) + warp_id;
            if (counter_lane < COUNTER_LANES)
            {
                int digit_row = counter_lane << LOG_PACKING_RATIO;

                #pragma unroll
                for (int UNPACKED_COUNTER = 0; UNPACKED_COUNTER < PACKING_RATIO; UNPACKED_COUNTER++)
                {
                    int bin_idx = digit_row + UNPACKED_COUNTER;
                    temp_storage.block_counters[warp_tid][bin_idx] =
                        local_counts[LANE][UNPACKED_COUNTER];
                }
            }
        }

        CTA_SYNC();
        // Rake-reduce the bin counts: whole blocks of bins
        #pragma unroll
        for (int BIN_BASE = RADIX_DIGITS % BLOCK_THREADS;
            (BIN_BASE + BLOCK_THREADS) <= RADIX_DIGITS;
            BIN_BASE += BLOCK_THREADS)
        {
            int bin_idx = BIN_BASE + threadIdx.x;

            OffsetT bin_count = 0;
            #pragma unroll
            for (int i = 0; i < WARP_THREADS; ++i)
                bin_count += temp_storage.block_counters[i][bin_idx];

            // For descending sorts, bins are written in reverse order
            if (IS_DESCENDING)
                bin_idx = RADIX_DIGITS - bin_idx - 1;

            counters[(bin_stride * bin_idx) + bin_offset] = bin_count;
        }
        // Remaining bins (when RADIX_DIGITS is not a multiple of BLOCK_THREADS)
        if ((RADIX_DIGITS % BLOCK_THREADS != 0) && (threadIdx.x < RADIX_DIGITS))
        {
            int bin_idx = threadIdx.x;

            OffsetT bin_count = 0;
            #pragma unroll
            for (int i = 0; i < WARP_THREADS; ++i)
                bin_count += temp_storage.block_counters[i][bin_idx];

            if (IS_DESCENDING)
                bin_idx = RADIX_DIGITS - bin_idx - 1;

            counters[(bin_stride * bin_idx) + bin_offset] = bin_count;
        }
    }
    // Extract counts, placing them into the provided thread-local array
    template <int BINS_TRACKED_PER_THREAD>
    __device__ __forceinline__ void ExtractCounts(
        OffsetT (&bin_count)[BINS_TRACKED_PER_THREAD])
    {
        unsigned int warp_id  = threadIdx.x >> LOG_WARP_THREADS;
        unsigned int warp_tid = LaneId();
        // Place the unpacked digit counters into shared memory
        #pragma unroll
        for (int LANE = 0; LANE < LANES_PER_WARP; LANE++)
        {
            int counter_lane = (LANE * WARPS) + warp_id;
            if (counter_lane < COUNTER_LANES)
            {
                int digit_row = counter_lane << LOG_PACKING_RATIO;

                #pragma unroll
                for (int UNPACKED_COUNTER = 0; UNPACKED_COUNTER < PACKING_RATIO; UNPACKED_COUNTER++)
                {
                    int bin_idx = digit_row + UNPACKED_COUNTER;
                    temp_storage.block_counters[warp_tid][bin_idx] =
                        local_counts[LANE][UNPACKED_COUNTER];
                }
            }
        }

        CTA_SYNC();
        // Rake-reduce the bins tracked by this thread
        #pragma unroll
        for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track)
        {
            int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track;

            if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS))
            {
                bin_count[track] = 0;

                #pragma unroll
                for (int i = 0; i < WARP_THREADS; ++i)
                    bin_count[track] += temp_storage.block_counters[i][bin_idx];
            }
        }
    }
};
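
// Usage sketch (illustrative, not part of this header): an upsweep kernel
// built on this agent could look like the following, where UpsweepPolicyT,
// d_spine, and the even-share [block_offset, block_end) partitioning are
// assumptions made for the example:
//
//   template <typename UpsweepPolicyT, typename KeyT, typename OffsetT>
//   __global__ void ExampleUpsweepKernel(
//       const KeyT *d_keys_in, OffsetT *d_spine,
//       OffsetT block_offset, OffsetT block_end,
//       int current_bit, int num_bits)
//   {
//       typedef AgentRadixSortUpsweep<UpsweepPolicyT, KeyT, OffsetT> AgentT;
//       __shared__ typename AgentT::TempStorage temp_storage;
//
//       AgentT upsweep(temp_storage, d_keys_in, current_bit, num_bits);
//       upsweep.ProcessRegion(block_offset, block_end);
//       CTA_SYNC();
//
//       // One digit-count column per block: stride by gridDim.x, offset by blockIdx.x
//       upsweep.template ExtractCounts<false>(d_spine, gridDim.x, blockIdx.x);
//   }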