#include "../thread/thread_reduce.cuh"
#include "../thread/thread_scan.cuh"
#include "../block/block_scan.cuh"
#include "../util_ptx.cuh"
#include "../util_arch.cuh"
#include "../util_type.cuh"
#include "../util_namespace.cuh"
    // Defaults: memoize the outer raking scan in registers on sm_35+; assume four-byte shared-memory banks
    bool                MEMOIZE_OUTER_SCAN  = (CUB_PTX_ARCH >= 350) ? true : false,
    cudaSharedMemConfig SMEM_CONFIG         = cudaSharedMemBankSizeFourByte,
    typedef unsigned short DigitCounter;

    // Counter-packing type: 64-bit for eight-byte shared-memory bank mode, 32-bit otherwise
    typedef typename If<(SMEM_CONFIG == cudaSharedMemBankSizeEightByte),
        unsigned long long, unsigned int>::Type PackedCounter;

    BLOCK_THREADS           = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,

    RADIX_DIGITS            = 1 << RADIX_BITS,

    WARP_THREADS            = 1 << LOG_WARP_THREADS,
    WARPS                   = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS,

    BYTES_PER_COUNTER       = sizeof(DigitCounter),

    // Number of DigitCounters packed into each PackedCounter
    PACKING_RATIO           = sizeof(PackedCounter) / sizeof(DigitCounter),

    // Always at least one counter lane
    LOG_COUNTER_LANES       = CUB_MAX((RADIX_BITS - LOG_PACKING_RATIO), 0),
    COUNTER_LANES           = 1 << LOG_COUNTER_LANES,

    // Each thread's raking segment spans the counter lanes plus one lane of padding
    PADDED_COUNTER_LANES    = COUNTER_LANES + 1,
    RAKING_SEGMENT          = PADDED_COUNTER_LANES,
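    // Worked example (illustrative, not part of this file): with RADIX_BITS = 5,
    // DigitCounter = unsigned short (2 bytes) and PackedCounter = unsigned long long
    // (8 bytes, eight-byte bank mode):
    //   PACKING_RATIO     = 8 / 2 = 4        (LOG_PACKING_RATIO = 2)
    //   LOG_COUNTER_LANES = max(5 - 2, 0) = 3, so COUNTER_LANES = 8
    //   RADIX_DIGITS      = 32 = COUNTER_LANES * PACKING_RATIO,
    // i.e. each of the 32 digits maps to exactly one (counter_lane, sub_counter)
    // slot in the shared-memory counter grid.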
    // BlockScan type for scanning packed digit counters
    typedef BlockScan<PackedCounter, BLOCK_DIM_X, INNER_SCAN_ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> BlockScan;
    // Shared-memory layout: the digit counters alias the packed raking grid
    struct __align__(16) _TempStorage
    {
        DigitCounter digit_counters[PADDED_COUNTER_LANES][BLOCK_THREADS][PACKING_RATIO];
        // Private shared-memory allocation used when the caller supplies no TempStorage
        __shared__ _TempStorage private_storage;
        return private_storage;
        // Upsweep: raking reduction over this thread's segment of packed counters
        PackedCounter *smem_raking_ptr = temp_storage.aliasable.raking_grid[linear_tid];
        PackedCounter *raking_ptr;

        if (MEMOIZE_OUTER_SCAN)
        {
            // Copy the segment into registers so the downsweep can reuse it
            #pragma unroll
            for (int i = 0; i < RAKING_SEGMENT; i++)
                cached_segment[i] = smem_raking_ptr[i];

            raking_ptr = cached_segment;
        }
        else
        {
            raking_ptr = smem_raking_ptr;
        }
        return internal::ThreadReduce<RAKING_SEGMENT>(raking_ptr, Sum());
        // Downsweep: exclusive raking scan seeded with this thread's exclusive partial
        PackedCounter *smem_raking_ptr = temp_storage.aliasable.raking_grid[linear_tid];
        PackedCounter *raking_ptr = (MEMOIZE_OUTER_SCAN) ? cached_segment : smem_raking_ptr;

        internal::ThreadScanExclusive<RAKING_SEGMENT>(raking_ptr, raking_ptr, Sum(), raking_partial);

        if (MEMOIZE_OUTER_SCAN)
        {
            // Copy the scanned segment back to shared memory
            #pragma unroll
            for (int i = 0; i < RAKING_SEGMENT; i++)
                smem_raking_ptr[i] = cached_segment[i];
        }
        // Reset this thread's packed digit counters in every lane
        #pragma unroll
        for (int LANE = 0; LANE < PADDED_COUNTER_LANES; LANE++)
        {
            *((PackedCounter*) temp_storage.aliasable.digit_counters[LANE][linear_tid]) = 0;
        }
        // Propagate totals across the packed fields: each left shift by one field
        // width adds the lower fields' totals into the higher fields (see note below)
        #pragma unroll
        for (int PACKED = 1; PACKED < PACKING_RATIO; PACKED++)
        {
            block_prefix += block_aggregate << (sizeof(DigitCounter) * 8 * PACKED);
        }
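        // Illustrative note (not from this file): with two 16-bit counters packed in a
        // 32-bit word and block_aggregate = (hi_total << 16) | lo_total, the single
        // PACKED = 1 shift gives block_prefix = block_aggregate << 16 = lo_total << 16,
        // i.e. an exclusive prefix of 0 for the low field and lo_total for the high
        // field, which is exactly the seed the packed exclusive scan needs.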
        // Block-wide exclusive scan of the upsweep partials, then the raking downsweep
        BlockScan(temp_storage.block_scan).ExclusiveSum(raking_partial, exclusive_partial, prefix_call_back);
        ExclusiveDownsweep(exclusive_partial);
        // Default constructor: rank within a private static shared-memory allocation
        temp_storage(PrivateStorage()),
        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))

        // Constructor using caller-supplied temporary storage
        temp_storage(temp_storage.Alias()),
        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
    // Rank keys by the digit in bits [current_bit, current_bit + num_bits)
    template <typename UnsignedBits, int KEYS_PER_THREAD>
    __device__ __forceinline__ void RankKeys(
        UnsignedBits (&keys)[KEYS_PER_THREAD], int (&ranks)[KEYS_PER_THREAD],
        int current_bit, int num_bits)

        DigitCounter  thread_prefixes[KEYS_PER_THREAD];  // per-key count of prior same-digit keys in this thread
        DigitCounter* digit_counters[KEYS_PER_THREAD];   // per-key pointer to its shared-memory digit counter

        #pragma unroll
        for (int ITEM = 0; ITEM < KEYS_PER_THREAD; ++ITEM)
        {
            // Map the key's digit to a (counter_lane, sub_counter) slot
            unsigned int digit        = BFE(keys[ITEM], current_bit, num_bits);
            unsigned int sub_counter  = digit >> LOG_COUNTER_LANES;
            unsigned int counter_lane = digit & (COUNTER_LANES - 1);

            // Descending order indexes the counter grid from the top (guard elided in this listing)
            sub_counter  = PACKING_RATIO - 1 - sub_counter;
            counter_lane = COUNTER_LANES - 1 - counter_lane;

            // Load the thread-exclusive prefix, then store the inclusive prefix
            digit_counters[ITEM]  = &temp_storage.aliasable.digit_counters[counter_lane][linear_tid][sub_counter];
            thread_prefixes[ITEM] = *digit_counters[ITEM];
            *digit_counters[ITEM] = thread_prefixes[ITEM] + 1;
        }

        // After the block-wide counter scan, each counter holds the key's block-exclusive
        // digit prefix; add back the thread-local prefix to get the final rank
        #pragma unroll
        for (int ITEM = 0; ITEM < KEYS_PER_THREAD; ++ITEM)
        {
            ranks[ITEM] = thread_prefixes[ITEM] + *digit_counters[ITEM];
        }
    // Rank keys, also reporting per-bin exclusive digit prefixes for the lower RADIX_DIGITS threads
    template <typename UnsignedBits, int KEYS_PER_THREAD>
    __device__ __forceinline__ void RankKeys(
        UnsignedBits (&keys)[KEYS_PER_THREAD], int (&ranks)[KEYS_PER_THREAD],
        int current_bit, int num_bits, int (&exclusive_digit_prefix)[BINS_TRACKED_PER_THREAD])

            if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS))
            {
                // Descending order reverses the bin index (guard elided in this listing)
                bin_idx = RADIX_DIGITS - bin_idx - 1;

                // The exclusive prefixes all reside in counter column zero
                unsigned int counter_lane = (bin_idx & (COUNTER_LANES - 1));
                unsigned int sub_counter  = bin_idx >> (LOG_COUNTER_LANES);
                exclusive_digit_prefix[track] = temp_storage.aliasable.digit_counters[counter_lane][0][sub_counter];
            }
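// ---------------------------------------------------------------------------
// Usage sketch (illustrative, not part of block_radix_rank.cuh): a minimal
// kernel ranking four keys per thread through the RankKeys overload shown
// above.  The kernel name, block size, and radix width are assumptions for
// illustration, and the code presumes user code that has included
// <cub/block/block_radix_rank.cuh>.
__global__ void ExampleRankKernel(const unsigned int *d_keys_in, int *d_ranks_out)
{
    // Specialize for a 1D block of 128 threads, 5-bit digits (32 bins), ascending order
    typedef cub::BlockRadixRank<128, 5, false> BlockRadixRankT;
    __shared__ typename BlockRadixRankT::TempStorage temp_storage;

    // Each thread loads four consecutive keys of its block's 512-key tile
    unsigned int keys[4];
    int ranks[4];
    for (int i = 0; i < 4; ++i)
        keys[i] = d_keys_in[(blockIdx.x * 512) + (threadIdx.x * 4) + i];

    // Rank keys within the thread block by the digit in bits [0, 5)
    BlockRadixRankT(temp_storage).RankKeys(keys, ranks, 0, 5);

    for (int i = 0; i < 4; ++i)
        d_ranks_out[(blockIdx.x * 512) + (threadIdx.x * 4) + i] = ranks[i];
}
// ---------------------------------------------------------------------------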
    // BlockRadixRankMatch: ranking via warp-synchronous digit matching
    typedef int32_t RankT;
    typedef int32_t DigitCounterT;

    BLOCK_THREADS           = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,

    RADIX_DIGITS            = 1 << RADIX_BITS,

    WARP_THREADS            = 1 << LOG_WARP_THREADS,
    WARPS                   = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS,

    // Pad even counts up to the next odd value (padding for the shared-memory layout)
    PADDED_WARPS            = ((WARPS & 0x1) == 0) ? WARPS + 1 : WARPS,

    COUNTERS                = PADDED_WARPS * RADIX_DIGITS,
    RAKING_SEGMENT          = (COUNTERS + BLOCK_THREADS - 1) / BLOCK_THREADS,
    PADDED_RAKING_SEGMENT   = ((RAKING_SEGMENT & 0x1) == 0) ? RAKING_SEGMENT + 1 : RAKING_SEGMENT,
    // BlockScan type for scanning the digit counters
    typedef BlockScan<DigitCounterT, BLOCK_THREADS, INNER_SCAN_ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> BlockScanT;
        // Aliasable shared storage: per-warp digit counters, also viewed as a raking grid
        volatile DigitCounterT warp_digit_counters[RADIX_DIGITS][PADDED_WARPS];
        DigitCounterT          raking_grid[BLOCK_THREADS][PADDED_RAKING_SEGMENT];
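        // Illustrative note (not from this file): the two members above alias the same
        // storage.  The RADIX_DIGITS x PADDED_WARPS counter matrix is re-viewed as
        // BLOCK_THREADS rows of PADDED_RAKING_SEGMENT counters, so a single block-wide
        // exclusive scan over the raking grid yields, in digit-major order, the
        // exclusive prefix for every (digit, warp) counter at once.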
    // Rank keys by the digit in bits [current_bit, current_bit + num_bits), using warp-synchronous digit matching
    template <typename UnsignedBits, int KEYS_PER_THREAD>
    __device__ __forceinline__ void RankKeys(
        UnsignedBits (&keys)[KEYS_PER_THREAD], int (&ranks)[KEYS_PER_THREAD],
        int current_bit, int num_bits)

        // Zero this thread's slice of the shared digit counters
        #pragma unroll
        for (int ITEM = 0; ITEM < PADDED_RAKING_SEGMENT; ++ITEM)
            temp_storage.aliasable.raking_grid[linear_tid][ITEM] = 0;

        volatile DigitCounterT *digit_counters[KEYS_PER_THREAD];
        uint32_t warp_id      = linear_tid >> LOG_WARP_THREADS;
        uint32_t lane_mask_lt = LaneMaskLt();

        #pragma unroll
        for (int ITEM = 0; ITEM < KEYS_PER_THREAD; ++ITEM)
        {
            uint32_t digit = BFE(keys[ITEM], current_bit, num_bits);
            // Descending order reverses the digit (guard elided in this listing)
            digit = RADIX_DIGITS - digit - 1;

            // Mask of warp peers holding the same digit
            uint32_t peer_mask = MatchAny<RADIX_BITS>(digit);

            // This warp's shared counter for the digit, and its running total so far
            digit_counters[ITEM] = &temp_storage.aliasable.warp_digit_counters[digit][warp_id];
            DigitCounterT warp_digit_prefix = *digit_counters[ITEM];

            // How many peers share the digit, and how many of them occupy lower lanes
            int32_t digit_count       = __popc(peer_mask);
            int32_t peer_digit_prefix = __popc(peer_mask & lane_mask_lt);

            if (peer_digit_prefix == 0)
            {
                // The lowest-laned peer bumps the warp's counter for this digit
                *digit_counters[ITEM] = DigitCounterT(warp_digit_prefix + digit_count);
            }
            // Warp-local rank: keys from earlier strips plus lower-laned peers
            ranks[ITEM] = warp_digit_prefix + DigitCounterT(peer_digit_prefix);
        }

        // Block-wide exclusive scan of the warp digit counters (via the raking grid)
        DigitCounterT scan_counters[PADDED_RAKING_SEGMENT];
        #pragma unroll
        for (int ITEM = 0; ITEM < PADDED_RAKING_SEGMENT; ++ITEM)
            scan_counters[ITEM] = temp_storage.aliasable.raking_grid[linear_tid][ITEM];

        BlockScanT(temp_storage.block_scan).ExclusiveSum(scan_counters, scan_counters);

        #pragma unroll
        for (int ITEM = 0; ITEM < PADDED_RAKING_SEGMENT; ++ITEM)
            temp_storage.aliasable.raking_grid[linear_tid][ITEM] = scan_counters[ITEM];

        // Seed each rank with the scanned counter value for its (digit, warp) bin
        #pragma unroll
        for (int ITEM = 0; ITEM < KEYS_PER_THREAD; ++ITEM)
            ranks[ITEM] += *digit_counters[ITEM];
    // Overload that also reports per-bin exclusive digit prefixes for the lower RADIX_DIGITS threads
    template <typename UnsignedBits, int KEYS_PER_THREAD>
    __device__ __forceinline__ void RankKeys(
        UnsignedBits (&keys)[KEYS_PER_THREAD], int (&ranks)[KEYS_PER_THREAD],
        int current_bit, int num_bits, int (&exclusive_digit_prefix)[BINS_TRACKED_PER_THREAD])

            if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS))
            {
                // Descending order reverses the bin index (guard elided in this listing)
                bin_idx = RADIX_DIGITS - bin_idx - 1;

                exclusive_digit_prefix[track] = temp_storage.aliasable.warp_digit_counters[bin_idx][0];
            }
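// ---------------------------------------------------------------------------
// Illustrative sketch (not part of this file): the warp-level matching step
// used above, written with raw CUDA intrinsics.  Assumes a 1D block and
// sm_70+ for __match_any_sync; CUB's MatchAny<RADIX_BITS> produces the same
// peer mask portably.  The function name is hypothetical.
__device__ __forceinline__ int WarpRankByDigitSketch(unsigned int digit)
{
    unsigned int lane      = threadIdx.x & 31;                      // lane within the warp
    unsigned int peer_mask = __match_any_sync(0xffffffffu, digit);  // lanes with the same digit
    unsigned int lt_mask   = (1u << lane) - 1;                      // lanes below this one
    return __popc(peer_mask & lt_mask);                             // rank among same-digit peers
}
// ---------------------------------------------------------------------------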