39#include "../util_ptx.cuh"
40#include "../util_arch.cuh"
41#include "../util_type.cuh"
42#include "../util_namespace.cuh"
123 int ITEMS_PER_THREAD,
124 typename ValueT = NullType,
126 bool MEMOIZE_OUTER_SCAN = (
CUB_PTX_ARCH >= 350) ?
true : false,
128 cudaSharedMemConfig SMEM_CONFIG = cudaSharedMemBankSizeFourByte,
143 BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
151 typedef typename KeyTraits::UnsignedBits UnsignedBits;
159 INNER_SCAN_ALGORITHM,
172 INNER_SCAN_ALGORITHM,
213 return private_storage;
218 UnsignedBits (&unsigned_keys)[ITEMS_PER_THREAD],
219 int (&ranks)[ITEMS_PER_THREAD],
233 UnsignedBits (&unsigned_keys)[ITEMS_PER_THREAD],
234 int (&ranks)[ITEMS_PER_THREAD],
248 ValueT (&values)[ITEMS_PER_THREAD],
249 int (&ranks)[ITEMS_PER_THREAD],
261 ValueT (&values)[ITEMS_PER_THREAD],
262 int (&ranks)[ITEMS_PER_THREAD],
273 template <
int IS_BLOCKED>
275 ValueT (&)[ITEMS_PER_THREAD],
276 int (&)[ITEMS_PER_THREAD],
282 template <
int DESCENDING,
int KEYS_ONLY>
284 KeyT (&keys)[ITEMS_PER_THREAD],
285 ValueT (&values)[ITEMS_PER_THREAD],
291 UnsignedBits (&unsigned_keys)[ITEMS_PER_THREAD] =
292 reinterpret_cast<UnsignedBits (&)[ITEMS_PER_THREAD]
>(keys);
296 for (
int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++)
298 unsigned_keys[KEY] = KeyTraits::TwiddleIn(unsigned_keys[KEY]);
307 int ranks[ITEMS_PER_THREAD];
308 RankKeys(unsigned_keys, ranks, begin_bit,
pass_bits, is_descending);
309 begin_bit += RADIX_BITS;
320 if (begin_bit >=
end_bit)
break;
327 for (
int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++)
329 unsigned_keys[KEY] = KeyTraits::TwiddleOut(unsigned_keys[KEY]);
335#ifndef DOXYGEN_SHOULD_SKIP_THIS
338 template <
int DESCENDING,
int KEYS_ONLY>
340 KeyT (&keys)[ITEMS_PER_THREAD],
341 ValueT (&values)[ITEMS_PER_THREAD],
347 UnsignedBits (&unsigned_keys)[ITEMS_PER_THREAD] =
348 reinterpret_cast<UnsignedBits (&)[ITEMS_PER_THREAD]
>(keys);
352 for (
int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++)
354 unsigned_keys[KEY] = KeyTraits::TwiddleIn(unsigned_keys[KEY]);
363 int ranks[ITEMS_PER_THREAD];
364 RankKeys(unsigned_keys, ranks, begin_bit,
pass_bits, is_descending);
365 begin_bit += RADIX_BITS;
393 for (
int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++)
395 unsigned_keys[KEY] = KeyTraits::TwiddleOut(unsigned_keys[KEY]);
415 temp_storage(PrivateStorage()),
416 linear_tid(
RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
426 temp_storage(temp_storage.Alias()),
427 linear_tid(
RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
474 __device__ __forceinline__
void Sort(
475 KeyT (&keys)[ITEMS_PER_THREAD],
477 int end_bit =
sizeof(KeyT) * 8)
529 __device__ __forceinline__
void Sort(
530 KeyT (&keys)[ITEMS_PER_THREAD],
531 ValueT (&values)[ITEMS_PER_THREAD],
533 int end_bit =
sizeof(KeyT) * 8)
576 KeyT (&keys)[ITEMS_PER_THREAD],
578 int end_bit =
sizeof(KeyT) * 8)
631 KeyT (&keys)[ITEMS_PER_THREAD],
632 ValueT (&values)[ITEMS_PER_THREAD],
634 int end_bit =
sizeof(KeyT) * 8)
686 KeyT (&keys)[ITEMS_PER_THREAD],
688 int end_bit =
sizeof(KeyT) * 8)
741 KeyT (&keys)[ITEMS_PER_THREAD],
742 ValueT (&values)[ITEMS_PER_THREAD],
744 int end_bit =
sizeof(KeyT) * 8)
789 KeyT (&keys)[ITEMS_PER_THREAD],
791 int end_bit =
sizeof(KeyT) * 8)
844 KeyT (&keys)[ITEMS_PER_THREAD],
845 ValueT (&values)[ITEMS_PER_THREAD],
847 int end_bit =
sizeof(KeyT) * 8)
The BlockExchange class provides collective methods for rearranging data partitioned across a CUDA th...
__device__ __forceinline__ void ScatterToBlocked(InputT input_items[ITEMS_PER_THREAD], OutputT output_items[ITEMS_PER_THREAD], OffsetT ranks[ITEMS_PER_THREAD], Int2Type< false >)
__device__ __forceinline__ void ScatterToStriped(InputT input_items[ITEMS_PER_THREAD], OutputT output_items[ITEMS_PER_THREAD], OffsetT ranks[ITEMS_PER_THREAD], Int2Type< false >)
BlockRadixRank provides operations for ranking unsigned integer types within a CUDA thread block.
__device__ __forceinline__ void RankKeys(UnsignedBits(&keys)[KEYS_PER_THREAD], int(&ranks)[KEYS_PER_THREAD], int current_bit, int num_bits)
Rank keys.
The BlockRadixSort class provides collective methods for sorting items partitioned across a CUDA thre...
unsigned int linear_tid
Linear thread-id.
__device__ __forceinline__ void ExchangeValues(ValueT(&values)[ITEMS_PER_THREAD], int(&ranks)[ITEMS_PER_THREAD], Int2Type< false >, Int2Type< true >)
ExchangeValues (specialized for key-value sort, to-blocked arrangement)
__device__ __forceinline__ void RankKeys(UnsignedBits(&unsigned_keys)[ITEMS_PER_THREAD], int(&ranks)[ITEMS_PER_THREAD], int begin_bit, int pass_bits, Int2Type< false >)
Rank keys (specialized for ascending sort)
_TempStorage & temp_storage
Shared storage reference.
__device__ __forceinline__ void SortBlockedToStriped(KeyT(&keys)[ITEMS_PER_THREAD], ValueT(&values)[ITEMS_PER_THREAD], int begin_bit, int end_bit, Int2Type< DESCENDING > is_descending, Int2Type< KEYS_ONLY > is_keys_only)
Sort blocked -> striped arrangement.
__device__ __forceinline__ _TempStorage & PrivateStorage()
Internal storage allocator.
__device__ __forceinline__ void SortDescending(KeyT(&keys)[ITEMS_PER_THREAD], ValueT(&values)[ITEMS_PER_THREAD], int begin_bit=0, int end_bit=sizeof(KeyT) *8)
Performs a descending block-wide radix sort across a blocked arrangement of keys and values.
__device__ __forceinline__ void ExchangeValues(ValueT(&values)[ITEMS_PER_THREAD], int(&ranks)[ITEMS_PER_THREAD], Int2Type< false >, Int2Type< false >)
ExchangeValues (specialized for key-value sort, to-striped arrangement)
__device__ __forceinline__ void SortDescending(KeyT(&keys)[ITEMS_PER_THREAD], int begin_bit=0, int end_bit=sizeof(KeyT) *8)
Performs a descending block-wide radix sort over a blocked arrangement of keys.
__device__ __forceinline__ void SortDescendingBlockedToStriped(KeyT(&keys)[ITEMS_PER_THREAD], int begin_bit=0, int end_bit=sizeof(KeyT) *8)
Performs a descending radix sort across a blocked arrangement of keys, leaving them in a striped arra...
BlockRadixRank< BLOCK_DIM_X, RADIX_BITS, true, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM, SMEM_CONFIG, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH > DescendingBlockRadixRank
Descending BlockRadixRank utility type.
__device__ __forceinline__ void Sort(KeyT(&keys)[ITEMS_PER_THREAD], ValueT(&values)[ITEMS_PER_THREAD], int begin_bit=0, int end_bit=sizeof(KeyT) *8)
Performs an ascending block-wide radix sort across a blocked arrangement of keys and values.
__device__ __forceinline__ void ExchangeValues(ValueT(&)[ITEMS_PER_THREAD], int(&)[ITEMS_PER_THREAD], Int2Type< true >, Int2Type< IS_BLOCKED >)
ExchangeValues (specialized for keys-only sort)
__device__ __forceinline__ void SortBlocked(KeyT(&keys)[ITEMS_PER_THREAD], ValueT(&values)[ITEMS_PER_THREAD], int begin_bit, int end_bit, Int2Type< DESCENDING > is_descending, Int2Type< KEYS_ONLY > is_keys_only)
Sort blocked arrangement.
__device__ __forceinline__ BlockRadixSort()
Collective constructor using a private static allocation of shared memory as temporary storage.
__device__ __forceinline__ void RankKeys(UnsignedBits(&unsigned_keys)[ITEMS_PER_THREAD], int(&ranks)[ITEMS_PER_THREAD], int begin_bit, int pass_bits, Int2Type< true >)
Rank keys (specialized for descending sort)
BlockExchange< KeyT, BLOCK_DIM_X, ITEMS_PER_THREAD, false, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH > BlockExchangeKeys
BlockExchange utility type for keys.
__device__ __forceinline__ void SortDescendingBlockedToStriped(KeyT(&keys)[ITEMS_PER_THREAD], ValueT(&values)[ITEMS_PER_THREAD], int begin_bit=0, int end_bit=sizeof(KeyT) *8)
Performs a descending radix sort across a blocked arrangement of keys and values, leaving them in a s...
__device__ __forceinline__ void Sort(KeyT(&keys)[ITEMS_PER_THREAD], int begin_bit=0, int end_bit=sizeof(KeyT) *8)
Performs an ascending block-wide radix sort over a blocked arrangement of keys.
__device__ __forceinline__ void SortBlockedToStriped(KeyT(&keys)[ITEMS_PER_THREAD], ValueT(&values)[ITEMS_PER_THREAD], int begin_bit=0, int end_bit=sizeof(KeyT) *8)
Performs an ascending radix sort across a blocked arrangement of keys and values, leaving them in a s...
__device__ __forceinline__ void SortBlockedToStriped(KeyT(&keys)[ITEMS_PER_THREAD], int begin_bit=0, int end_bit=sizeof(KeyT) *8)
Performs an ascending radix sort across a blocked arrangement of keys, leaving them in a striped arra...
BlockExchange< ValueT, BLOCK_DIM_X, ITEMS_PER_THREAD, false, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH > BlockExchangeValues
BlockExchange utility type for values.
BlockRadixRank< BLOCK_DIM_X, RADIX_BITS, false, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM, SMEM_CONFIG, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH > AscendingBlockRadixRank
Ascending BlockRadixRank utility type.
__device__ __forceinline__ BlockRadixSort(TempStorage &temp_storage)
Collective constructor using the specified memory allocation as temporary storage.
#define CUB_MIN(a, b)
Select minimum(a, b)
__device__ __forceinline__ int RowMajorTid(int block_dim_x, int block_dim_y, int block_dim_z)
Returns the row-major linear thread identifier for a multidimensional thread block.
Optional outer namespace(s)
KeyT const ValueT ValueT OffsetT int int end_bit
< [in] The past-the-end (most-significant) bit index needed for key comparison
KeyT const ValueT ValueT OffsetIteratorT OffsetIteratorT int
[in] The number of segments that comprise the sorting data
BlockScanAlgorithm
BlockScanAlgorithm enumerates alternative algorithms for cub::BlockScan to compute a parallel prefix ...
KeyT const ValueT ValueT OffsetIteratorT OffsetIteratorT int int pass_bits
< [in] Number of bits of current radix digit
\smemstorage{BlockExchange}
\smemstorage{BlockRadixSort}
Allows for the treatment of an integral constant as a type at compile-time (e.g., to achieve static c...
A simple "NULL" marker type.
A storage-backing wrapper that allows types with non-trivial constructors to be aliased in unions.
Shared memory storage layout type.
#define CUB_PTX_ARCH
CUB_PTX_ARCH reflects the PTX version targeted by the active compiler pass (or zero during the host p...