template<typename KeyT,
int BLOCK_DIM_X,
int ITEMS_PER_THREAD, typename ValueT = NullType,
int RADIX_BITS = 4, bool MEMOIZE_OUTER_SCAN = (CUB_PTX_ARCH >= 350) ? true : false,
BlockScanAlgorithm INNER_SCAN_ALGORITHM = BLOCK_SCAN_WARP_SCANS, cudaSharedMemConfig SMEM_CONFIG = cudaSharedMemBankSizeFourByte,
int BLOCK_DIM_Y = 1,
int BLOCK_DIM_Z = 1,
int PTX_ARCH = CUB_PTX_ARCH>
class cub::BlockRadixSort< KeyT, BLOCK_DIM_X, ITEMS_PER_THREAD, ValueT, RADIX_BITS, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM, SMEM_CONFIG, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >
The BlockRadixSort class provides collective methods for sorting items partitioned across a CUDA thread block using a radix sorting method.
- Template Parameters
-
KeyT | KeyT type |
BLOCK_DIM_X | The thread block length in threads along the X dimension |
ITEMS_PER_THREAD | The number of items per thread |
ValueT | [optional] ValueT type (default: cub::NullType, which indicates a keys-only sort) |
RADIX_BITS | [optional] The number of radix bits per digit place (default: 4 bits) |
MEMOIZE_OUTER_SCAN | [optional] Whether or not to buffer outer raking scan partials to incur fewer shared memory reads at the expense of higher register pressure (default: true for architectures SM35 and newer, false otherwise). |
INNER_SCAN_ALGORITHM | [optional] The cub::BlockScanAlgorithm algorithm to use (default: cub::BLOCK_SCAN_WARP_SCANS) |
SMEM_CONFIG | [optional] Shared memory bank mode (default: cudaSharedMemBankSizeFourByte ) |
BLOCK_DIM_Y | [optional] The thread block length in threads along the Y dimension (default: 1) |
BLOCK_DIM_Z | [optional] The thread block length in threads along the Z dimension (default: 1) |
PTX_ARCH | [optional] \ptxversion |
- Overview
- The radix sorting method arranges items into ascending order. It relies upon a positional representation for keys, i.e., each key is comprised of an ordered sequence of symbols (e.g., digits, characters, etc.) specified from least-significant to most-significant. For a given input sequence of keys and a set of rules specifying a total ordering of the symbolic alphabet, the radix sorting method produces a lexicographic ordering of those keys.
- BlockRadixSort can sort all of the built-in C++ numeric primitive types (
unsigned char
, int
, double
, etc.) as well as CUDA's __half
half-precision floating-point type. Within each key, the implementation treats fixed-length bit-sequences of RADIX_BITS
as radix digit places. Although the direct radix sorting method can only be applied to unsigned integral types, BlockRadixSort is able to sort signed and floating-point types via simple bit-wise transformations that ensure lexicographic key ordering.
- \rowmajor
- Performance Considerations
-
- A Simple Example
- \blockcollective{BlockRadixSort}
- The code snippet below illustrates a sort of 512 integer keys that are partitioned in a blocked arrangement across 128 threads where each thread owns 4 consecutive items.
#include <cub/cub.cuh>
__global__ void ExampleKernel(...)
{
int thread_keys[4];
...
...
The BlockRadixSort class provides collective methods for sorting items partitioned across a CUDA thre...
_TempStorage & temp_storage
Shared storage reference.
__device__ __forceinline__ BlockRadixSort()
Collective constructor using a private static allocation of shared memory as temporary storage.
\smemstorage{BlockRadixSort}
- Suppose the set of input
thread_keys
across the block of threads is { [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }
. The corresponding output thread_keys
in those threads will be { [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }
.
Definition at line 132 of file block_radix_sort.cuh.
|
template<int DESCENDING, int KEYS_ONLY> |
__device__ __forceinline__ void | SortBlockedToStriped (KeyT(&keys)[ITEMS_PER_THREAD], ValueT(&values)[ITEMS_PER_THREAD], int begin_bit, int end_bit, Int2Type< DESCENDING > is_descending, Int2Type< KEYS_ONLY > is_keys_only) |
| Sort blocked -> striped arrangement.
|
|
|
__device__ __forceinline__ | BlockRadixSort () |
| Collective constructor using a private static allocation of shared memory as temporary storage.
|
|
__device__ __forceinline__ | BlockRadixSort (TempStorage &temp_storage) |
| Collective constructor using the specified memory allocation as temporary storage.
|
|
|
__device__ __forceinline__ void | Sort (KeyT(&keys)[ITEMS_PER_THREAD], int begin_bit=0, int end_bit=sizeof(KeyT) *8) |
| Performs an ascending block-wide radix sort over a blocked arrangement of keys.
|
|
__device__ __forceinline__ void | Sort (KeyT(&keys)[ITEMS_PER_THREAD], ValueT(&values)[ITEMS_PER_THREAD], int begin_bit=0, int end_bit=sizeof(KeyT) *8) |
| Performs an ascending block-wide radix sort across a blocked arrangement of keys and values.
|
|
__device__ __forceinline__ void | SortDescending (KeyT(&keys)[ITEMS_PER_THREAD], int begin_bit=0, int end_bit=sizeof(KeyT) *8) |
| Performs a descending block-wide radix sort over a blocked arrangement of keys.
|
|
__device__ __forceinline__ void | SortDescending (KeyT(&keys)[ITEMS_PER_THREAD], ValueT(&values)[ITEMS_PER_THREAD], int begin_bit=0, int end_bit=sizeof(KeyT) *8) |
| Performs a descending block-wide radix sort across a blocked arrangement of keys and values.
|
|
|
__device__ __forceinline__ void | SortBlockedToStriped (KeyT(&keys)[ITEMS_PER_THREAD], int begin_bit=0, int end_bit=sizeof(KeyT) *8) |
| Performs an ascending radix sort across a blocked arrangement of keys, leaving them in a striped arrangement.
|
|
__device__ __forceinline__ void | SortBlockedToStriped (KeyT(&keys)[ITEMS_PER_THREAD], ValueT(&values)[ITEMS_PER_THREAD], int begin_bit=0, int end_bit=sizeof(KeyT) *8) |
| Performs an ascending radix sort across a blocked arrangement of keys and values, leaving them in a striped arrangement.
|
|
__device__ __forceinline__ void | SortDescendingBlockedToStriped (KeyT(&keys)[ITEMS_PER_THREAD], int begin_bit=0, int end_bit=sizeof(KeyT) *8) |
| Performs a descending radix sort across a blocked arrangement of keys, leaving them in a striped arrangement.
|
|
__device__ __forceinline__ void | SortDescendingBlockedToStriped (KeyT(&keys)[ITEMS_PER_THREAD], ValueT(&values)[ITEMS_PER_THREAD], int begin_bit=0, int end_bit=sizeof(KeyT) *8) |
| Performs a descending radix sort across a blocked arrangement of keys and values, leaving them in a striped arrangement.
|
|
|
enum | { BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z
, KEYS_ONLY = Equals<ValueT, NullType>::VALUE
} |
|
typedef Traits< KeyT > | KeyTraits |
|
typedef KeyTraits::UnsignedBits | UnsignedBits |
|
typedef BlockRadixRank< BLOCK_DIM_X, RADIX_BITS, false, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM, SMEM_CONFIG, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH > | AscendingBlockRadixRank |
| Ascending BlockRadixRank utility type.
|
|
typedef BlockRadixRank< BLOCK_DIM_X, RADIX_BITS, true, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM, SMEM_CONFIG, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH > | DescendingBlockRadixRank |
| Descending BlockRadixRank utility type.
|
|
typedef BlockExchange< KeyT, BLOCK_DIM_X, ITEMS_PER_THREAD, false, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH > | BlockExchangeKeys |
| BlockExchange utility type for keys.
|
|
typedef BlockExchange< ValueT, BLOCK_DIM_X, ITEMS_PER_THREAD, false, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH > | BlockExchangeValues |
| BlockExchange utility type for values.
|
|
|
__device__ __forceinline__ _TempStorage & | PrivateStorage () |
| Internal storage allocator.
|
|
__device__ __forceinline__ void | RankKeys (UnsignedBits(&unsigned_keys)[ITEMS_PER_THREAD], int(&ranks)[ITEMS_PER_THREAD], int begin_bit, int pass_bits, Int2Type< false >) |
| Rank keys (specialized for ascending sort)
|
|
__device__ __forceinline__ void | RankKeys (UnsignedBits(&unsigned_keys)[ITEMS_PER_THREAD], int(&ranks)[ITEMS_PER_THREAD], int begin_bit, int pass_bits, Int2Type< true >) |
| Rank keys (specialized for descending sort)
|
|
__device__ __forceinline__ void | ExchangeValues (ValueT(&values)[ITEMS_PER_THREAD], int(&ranks)[ITEMS_PER_THREAD], Int2Type< false >, Int2Type< true >) |
| ExchangeValues (specialized for key-value sort, to-blocked arrangement)
|
|
__device__ __forceinline__ void | ExchangeValues (ValueT(&values)[ITEMS_PER_THREAD], int(&ranks)[ITEMS_PER_THREAD], Int2Type< false >, Int2Type< false >) |
| ExchangeValues (specialized for key-value sort, to-striped arrangement)
|
|
template<int IS_BLOCKED> |
__device__ __forceinline__ void | ExchangeValues (ValueT(&)[ITEMS_PER_THREAD], int(&)[ITEMS_PER_THREAD], Int2Type< true >, Int2Type< IS_BLOCKED >) |
| ExchangeValues (specialized for keys-only sort)
|
|
template<int DESCENDING, int KEYS_ONLY> |
__device__ __forceinline__ void | SortBlocked (KeyT(&keys)[ITEMS_PER_THREAD], ValueT(&values)[ITEMS_PER_THREAD], int begin_bit, int end_bit, Int2Type< DESCENDING > is_descending, Int2Type< KEYS_ONLY > is_keys_only) |
| Sort blocked arrangement.
|
|