#include "../thread/thread_reduce.cuh"
#include "../thread/thread_scan.cuh"
#include "../block/block_scan.cuh"
#include "../util_ptx.cuh"
#include "../util_arch.cuh"
#include "../util_type.cuh"
#include "../util_namespace.cuh"

    // (template parameters of BlockRadixRank)
    bool                    MEMOIZE_OUTER_SCAN      = (CUB_PTX_ARCH >= 350) ? true : false,
    cudaSharedMemConfig     SMEM_CONFIG             = cudaSharedMemBankSizeFourByte,
    // Integer type for digit counters (to be packed into words of type PackedCounter)
    typedef unsigned short DigitCounter;

    // Integer type for packing DigitCounters into columns of shared memory banks
    typedef typename If<(SMEM_CONFIG == cudaSharedMemBankSizeEightByte),
        unsigned long long,
        unsigned int>::Type PackedCounter;
    enum
    {
        // The thread block size in threads
        BLOCK_THREADS           = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,

        RADIX_DIGITS            = 1 << RADIX_BITS,

        LOG_WARP_THREADS        = CUB_LOG_WARP_THREADS(PTX_ARCH),
        WARP_THREADS            = 1 << LOG_WARP_THREADS,
        WARPS                   = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS,

        BYTES_PER_COUNTER       = sizeof(DigitCounter),

        PACKING_RATIO           = sizeof(PackedCounter) / sizeof(DigitCounter),
        LOG_PACKING_RATIO       = Log2<PACKING_RATIO>::VALUE,

        // Always at least one lane
        LOG_COUNTER_LANES       = CUB_MAX((RADIX_BITS - LOG_PACKING_RATIO), 0),
        COUNTER_LANES           = 1 << LOG_COUNTER_LANES,

        // The number of packed counters per thread (plus one for padding)
        PADDED_COUNTER_LANES    = COUNTER_LANES + 1,
        RAKING_SEGMENT          = PADDED_COUNTER_LANES,
    };
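    // Illustration (not in the original source): with RADIX_BITS = 4,
    // DigitCounter = unsigned short, and PackedCounter = unsigned int, these
    // constants work out to PACKING_RATIO = 2, LOG_PACKING_RATIO = 1,
    // LOG_COUNTER_LANES = max(4 - 1, 0) = 3, COUNTER_LANES = 8, and
    // PADDED_COUNTER_LANES = RAKING_SEGMENT = 9 (the extra lane pads the
    // raking grid to reduce shared-memory bank conflicts).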
    /// BlockScan type
    typedef BlockScan<
            PackedCounter,
            BLOCK_DIM_X,
            INNER_SCAN_ALGORITHM,
            BLOCK_DIM_Y,
            BLOCK_DIM_Z,
            PTX_ARCH>
        BlockScan;
    /// Shared memory storage layout type for BlockRadixRank
    struct __align__(16) _TempStorage
    {
        union Aliasable
        {
            DigitCounter        digit_counters[PADDED_COUNTER_LANES][BLOCK_THREADS][PACKING_RATIO];
            PackedCounter       raking_grid[BLOCK_THREADS][RAKING_SEGMENT];

        } aliasable;

        // Storage for scanning local ranks
        typename BlockScan::TempStorage block_scan;
    };
    /// Shared storage reference
    _TempStorage &temp_storage;

    /// Linear thread-id
    unsigned int linear_tid;

    /// Copy of raking segment, promoted to registers
    PackedCounter cached_segment[RAKING_SEGMENT];
    /// Internal storage allocator
    __device__ __forceinline__ _TempStorage& PrivateStorage()
    {
        __shared__ _TempStorage private_storage;
        return private_storage;
    }
    /// Performs upsweep raking reduction, returning the aggregate
    __device__ __forceinline__ PackedCounter Upsweep()
    {
        PackedCounter *smem_raking_ptr = temp_storage.aliasable.raking_grid[linear_tid];
        PackedCounter *raking_ptr;

        if (MEMOIZE_OUTER_SCAN)
        {
            // Copy data into registers
            #pragma unroll
            for (int i = 0; i < RAKING_SEGMENT; i++)
            {
                cached_segment[i] = smem_raking_ptr[i];
            }
            raking_ptr = cached_segment;
        }
        else
        {
            raking_ptr = smem_raking_ptr;
        }

        return internal::ThreadReduce<RAKING_SEGMENT>(raking_ptr, Sum());
    }
    /// Performs exclusive downsweep raking scan
    __device__ __forceinline__ void ExclusiveDownsweep(
        PackedCounter raking_partial)
    {
        PackedCounter *smem_raking_ptr = temp_storage.aliasable.raking_grid[linear_tid];

        PackedCounter *raking_ptr = (MEMOIZE_OUTER_SCAN) ?
            cached_segment :
            smem_raking_ptr;

        // Exclusive raking downsweep scan
        internal::ThreadScanExclusive<RAKING_SEGMENT>(raking_ptr, raking_ptr, Sum(), raking_partial);

        if (MEMOIZE_OUTER_SCAN)
        {
            // Copy data back to smem
            #pragma unroll
            for (int i = 0; i < RAKING_SEGMENT; i++)
            {
                smem_raking_ptr[i] = cached_segment[i];
            }
        }
    }
    /// Reset shared memory digit counters
    __device__ __forceinline__ void ResetCounters()
    {
        #pragma unroll
        for (int LANE = 0; LANE < PADDED_COUNTER_LANES; LANE++)
        {
            *((PackedCounter*) temp_storage.aliasable.digit_counters[LANE][linear_tid]) = 0;
        }
    }
    /// Block-scan prefix callback
    struct PrefixCallBack
    {
        __device__ __forceinline__ PackedCounter operator()(PackedCounter block_aggregate)
        {
            PackedCounter block_prefix = 0;

            // Propagate totals in packed fields
            #pragma unroll
            for (int PACKED = 1; PACKED < PACKING_RATIO; PACKED++)
            {
                block_prefix += block_aggregate << (sizeof(DigitCounter) * 8 * PACKED);
            }

            return block_prefix;
        }
    };
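    // Illustration (not in the original source): with PACKING_RATIO = 2 and
    // 16-bit DigitCounters packed into a 32-bit PackedCounter, the loop adds
    // block_aggregate << 16, seeding the upper counter field with the total
    // of the lower field so that the running prefix is exclusive across
    // packed fields as well as across threads.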
    /// Scan shared memory digit counters
    __device__ __forceinline__ void ScanCounters()
    {
        // Upsweep reduction of each thread's raking segment
        PackedCounter raking_partial = Upsweep();

        // Block-wide exclusive prefix sum of the raking partials
        PackedCounter exclusive_partial;
        PrefixCallBack prefix_call_back;
        BlockScan(temp_storage.block_scan).ExclusiveSum(raking_partial, exclusive_partial, prefix_call_back);

        // Downsweep scan with the exclusive partial
        ExclusiveDownsweep(exclusive_partial);
    }
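    // Taken together (explanatory note, not in the original source):
    // Upsweep() reduces each thread's raking segment, BlockScan computes a
    // block-wide exclusive sum of those partials (the callback folding the
    // grand total into the higher packed fields), and ExclusiveDownsweep()
    // redistributes the exclusive partials across the segment. The net
    // effect is an exclusive prefix sum over every packed digit counter in
    // shared memory.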
    /// Collective constructor using a private static allocation of shared memory as temporary storage
    __device__ __forceinline__ BlockRadixRank()
    :
        temp_storage(PrivateStorage()),
        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
    {}
    /// Collective constructor using the specified memory allocation as temporary storage
    __device__ __forceinline__ BlockRadixRank(
        TempStorage &temp_storage)
    :
        temp_storage(temp_storage.Alias()),
        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
    {}
    /// Rank keys
    template <
        typename        UnsignedBits,
        int             KEYS_PER_THREAD>
    __device__ __forceinline__ void RankKeys(
        UnsignedBits    (&keys)[KEYS_PER_THREAD],
        int             (&ranks)[KEYS_PER_THREAD],
        int             current_bit,
        int             num_bits)
    {
        DigitCounter    thread_prefixes[KEYS_PER_THREAD];   // For each key, the count of previous tile keys with the same digit
        DigitCounter*   digit_counters[KEYS_PER_THREAD];    // For each key, the location of its digit counter in smem

        // Reset shared memory digit counters
        ResetCounters();

        #pragma unroll
        for (int ITEM = 0; ITEM < KEYS_PER_THREAD; ++ITEM)
        {
            // Get digit, sub-counter, and counter lane
            unsigned int digit = BFE(keys[ITEM], current_bit, num_bits);
            unsigned int sub_counter = digit >> LOG_COUNTER_LANES;
            unsigned int counter_lane = digit & (COUNTER_LANES - 1);

            if (IS_DESCENDING)
            {
                sub_counter = PACKING_RATIO - 1 - sub_counter;
                counter_lane = COUNTER_LANES - 1 - counter_lane;
            }

            // Load thread-exclusive prefix from the smem digit counter, then store inclusive prefix
            digit_counters[ITEM] = &temp_storage.aliasable.digit_counters[counter_lane][linear_tid][sub_counter];
            thread_prefixes[ITEM] = *digit_counters[ITEM];
            *digit_counters[ITEM] = thread_prefixes[ITEM] + 1;
        }

        CTA_SYNC();

        // Scan shared memory counters
        ScanCounters();

        CTA_SYNC();

        // Extract the local ranks of each key: add in the block-wide exclusive prefix
        #pragma unroll
        for (int ITEM = 0; ITEM < KEYS_PER_THREAD; ++ITEM)
        {
            ranks[ITEM] = thread_prefixes[ITEM] + *digit_counters[ITEM];
        }
    }
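    // Usage sketch (illustrative, not from the original source): ranking a
    // tile of keys inside a hypothetical kernel. Assumes a 1D block of 128
    // threads, 4 keys per thread, and ascending 5-bit digits; the kernel
    // name and key layout are invented.
    //
    //     __global__ void ranking_kernel(unsigned int *d_keys)
    //     {
    //         typedef cub::BlockRadixRank<128, 5, false> BlockRadixRank;
    //         __shared__ typename BlockRadixRank::TempStorage temp_storage;
    //
    //         unsigned int keys[4];
    //         int          ranks[4];
    //         // ... load 4 keys per thread into keys[] ...
    //         BlockRadixRank(temp_storage).RankKeys(keys, ranks, 0, 5);
    //         // ranks[i] now holds each key's position within the tile
    //         // when ordered by digit bits [0, 5)
    //     }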
    /// Rank keys. For the lower RADIX_DIGITS threads, digit counts for each
    /// digit are provided for the corresponding thread.
    template <
        typename        UnsignedBits,
        int             KEYS_PER_THREAD>
    __device__ __forceinline__ void RankKeys(
        UnsignedBits    (&keys)[KEYS_PER_THREAD],
        int             (&ranks)[KEYS_PER_THREAD],
        int             current_bit,
        int             num_bits,
        int             (&exclusive_digit_prefix)[BINS_TRACKED_PER_THREAD])
    {
        // Rank keys
        RankKeys(keys, ranks, current_bit, num_bits);

        // Get the exclusive digit totals corresponding to the calling thread
        #pragma unroll
        for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track)
        {
            int bin_idx = (linear_tid * BINS_TRACKED_PER_THREAD) + track;

            if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS))
            {
                if (IS_DESCENDING)
                    bin_idx = RADIX_DIGITS - bin_idx - 1;

                // Obtain the digit counts. (These all reside in the first
                // counter column, resulting in unavoidable bank conflicts.)
                unsigned int counter_lane = (bin_idx & (COUNTER_LANES - 1));
                unsigned int sub_counter = bin_idx >> (LOG_COUNTER_LANES);

                exclusive_digit_prefix[track] = temp_storage.aliasable.digit_counters[counter_lane][0][sub_counter];
            }
        }
    }
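    // Note (assumption, not in the original source): BINS_TRACKED_PER_THREAD
    // is the number of bin-starting offsets tracked per thread. For example,
    // if RADIX_DIGITS = 256 and BLOCK_THREADS = 128, each thread would track
    // two bins, with thread t reading bins 2t and 2t+1 in the loop above.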
    typedef int32_t RankT;
    typedef int32_t DigitCounterT;

    enum
    {
        // The thread block size in threads
        BLOCK_THREADS           = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,

        RADIX_DIGITS            = 1 << RADIX_BITS,

        LOG_WARP_THREADS        = CUB_LOG_WARP_THREADS(PTX_ARCH),
        WARP_THREADS            = 1 << LOG_WARP_THREADS,
        WARPS                   = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS,

        // Pad even warp counts to odd
        PADDED_WARPS            = ((WARPS & 0x1) == 0) ?
                                    WARPS + 1 :
                                    WARPS,

        COUNTERS                = PADDED_WARPS * RADIX_DIGITS,
        RAKING_SEGMENT          = (COUNTERS + BLOCK_THREADS - 1) / BLOCK_THREADS,

        // Pad even raking segments to odd
        PADDED_RAKING_SEGMENT   = ((RAKING_SEGMENT & 0x1) == 0) ?
                                    RAKING_SEGMENT + 1 :
                                    RAKING_SEGMENT,
    };
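    // Illustration (not in the original source): with BLOCK_THREADS = 128,
    // RADIX_BITS = 5, and 32-thread warps: RADIX_DIGITS = 32, WARPS = 4,
    // PADDED_WARPS = 5, COUNTERS = 160, RAKING_SEGMENT = ceil(160/128) = 2,
    // and PADDED_RAKING_SEGMENT = 3. The odd padding staggers counter
    // columns across shared-memory banks.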
    /// BlockScan type
    typedef BlockScan<
            DigitCounterT,
            BLOCK_THREADS,
            INNER_SCAN_ALGORITHM,
            BLOCK_DIM_Y,
            BLOCK_DIM_Z,
            PTX_ARCH>
        BlockScanT;

    /// Shared memory storage layout type for BlockRadixRankMatch
    struct __align__(16) _TempStorage
    {
        typename BlockScanT::TempStorage    block_scan;

        union __align__(16) Aliasable
        {
            volatile DigitCounterT          warp_digit_counters[RADIX_DIGITS][PADDED_WARPS];
            DigitCounterT                   raking_grid[BLOCK_THREADS][PADDED_RAKING_SEGMENT];

        } aliasable;
    };
    /// Rank keys
    template <
        typename        UnsignedBits,
        int             KEYS_PER_THREAD>
    __device__ __forceinline__ void RankKeys(
        UnsignedBits    (&keys)[KEYS_PER_THREAD],
        int             (&ranks)[KEYS_PER_THREAD],
        int             current_bit,
        int             num_bits)
    {
        // Initialize shared digit counters
        #pragma unroll
        for (int ITEM = 0; ITEM < PADDED_RAKING_SEGMENT; ++ITEM)
            temp_storage.aliasable.raking_grid[linear_tid][ITEM] = 0;

        CTA_SYNC();

        // Each warp will strip-mine its section of input, one strip at a time
        volatile DigitCounterT  *digit_counters[KEYS_PER_THREAD];
        uint32_t                warp_id      = linear_tid >> LOG_WARP_THREADS;
        uint32_t                lane_mask_lt = LaneMaskLt();

        #pragma unroll
        for (int ITEM = 0; ITEM < KEYS_PER_THREAD; ++ITEM)
        {
            // My digit
            uint32_t digit = BFE(keys[ITEM], current_bit, num_bits);

            if (IS_DESCENDING)
                digit = RADIX_DIGITS - digit - 1;

            // Mask of peers who have the same digit as me
            uint32_t peer_mask = MatchAny<RADIX_BITS>(digit);

            // Pointer to smem digit counter for this key
            digit_counters[ITEM] = &temp_storage.aliasable.warp_digit_counters[digit][warp_id];

            // Number of occurrences in previous strips
            DigitCounterT warp_digit_prefix = *digit_counters[ITEM];

            WARP_SYNC(0xFFFFFFFF);

            // Number of peers having the same digit as me
            int32_t digit_count = __popc(peer_mask);

            // Number of lower-ranked peers having the same digit
            int32_t peer_digit_prefix = __popc(peer_mask & lane_mask_lt);

            if (peer_digit_prefix == 0)
            {
                // First thread for each digit updates the shared warp counter
                *digit_counters[ITEM] = DigitCounterT(warp_digit_prefix + digit_count);
            }

            WARP_SYNC(0xFFFFFFFF);

            // Number of prior keys having the same digit
            ranks[ITEM] = warp_digit_prefix + DigitCounterT(peer_digit_prefix);
        }

        CTA_SYNC();

        // Scan the warp counters through the raking grid
        DigitCounterT scan_counters[PADDED_RAKING_SEGMENT];

        #pragma unroll
        for (int ITEM = 0; ITEM < PADDED_RAKING_SEGMENT; ++ITEM)
            scan_counters[ITEM] = temp_storage.aliasable.raking_grid[linear_tid][ITEM];

        BlockScanT(temp_storage.block_scan).ExclusiveSum(scan_counters, scan_counters);

        #pragma unroll
        for (int ITEM = 0; ITEM < PADDED_RAKING_SEGMENT; ++ITEM)
            temp_storage.aliasable.raking_grid[linear_tid][ITEM] = scan_counters[ITEM];

        CTA_SYNC();

        // Seed ranks with counter values from previous warps
        #pragma unroll
        for (int ITEM = 0; ITEM < KEYS_PER_THREAD; ++ITEM)
            ranks[ITEM] += *digit_counters[ITEM];
    }
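    // Illustration (not in the original source): suppose lanes 3, 7, and 12
    // of a warp all hold digit 9. MatchAny returns the same peer_mask
    // ((1<<3) | (1<<7) | (1<<12)) for all three, so digit_count = 3, while
    // peer_digit_prefix = __popc(peer_mask & lane_mask_lt) is 0, 1, and 2
    // respectively. Lane 3 alone bumps the shared warp counter by 3; each
    // lane ranks itself as warp_digit_prefix + {0, 1, 2}.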
    /// Rank keys. For the lower RADIX_DIGITS threads, digit counts for each
    /// digit are provided for the corresponding thread.
    template <
        typename        UnsignedBits,
        int             KEYS_PER_THREAD>
    __device__ __forceinline__ void RankKeys(
        UnsignedBits    (&keys)[KEYS_PER_THREAD],
        int             (&ranks)[KEYS_PER_THREAD],
        int             current_bit,
        int             num_bits,
        int             (&exclusive_digit_prefix)[BINS_TRACKED_PER_THREAD])
    {
        RankKeys(keys, ranks, current_bit, num_bits);

        // Get exclusive count for each digit
        #pragma unroll
        for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track)
        {
            int bin_idx = (linear_tid * BINS_TRACKED_PER_THREAD) + track;

            if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS))
            {
                if (IS_DESCENDING)
                    bin_idx = RADIX_DIGITS - bin_idx - 1;

                exclusive_digit_prefix[track] = temp_storage.aliasable.warp_digit_counters[bin_idx][0];
            }
        }
    }
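    // Usage sketch (illustrative, not from the original source): the
    // match-based ranker is used the same way as BlockRadixRank, on
    // architectures providing warp-match and warp-sync primitives. The
    // kernel name and key layout below are invented.
    //
    //     __global__ void match_rank_kernel(unsigned int *d_keys)
    //     {
    //         typedef cub::BlockRadixRankMatch<128, 5, false> BlockRadixRankMatch;
    //         __shared__ typename BlockRadixRankMatch::TempStorage temp_storage;
    //
    //         unsigned int keys[4];
    //         int          ranks[4];
    //         // ... load 4 keys per thread into keys[] ...
    //         BlockRadixRankMatch(temp_storage).RankKeys(keys, ranks, 0, 5);
    //     }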