template<typename KeyT, int BLOCK_DIM_X, int ITEMS_PER_THREAD, typename ValueT = NullType, int RADIX_BITS = 4, bool MEMOIZE_OUTER_SCAN = (CUB_PTX_ARCH >= 350) ? true : false, BlockScanAlgorithm INNER_SCAN_ALGORITHM = BLOCK_SCAN_WARP_SCANS, cudaSharedMemConfig SMEM_CONFIG = cudaSharedMemBankSizeFourByte, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>
class cub::BlockRadixSort< KeyT, BLOCK_DIM_X, ITEMS_PER_THREAD, ValueT, RADIX_BITS, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM, SMEM_CONFIG, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >
The BlockRadixSort class provides collective methods for sorting items partitioned across a CUDA thread block using a radix sorting method.
- Template Parameters
-
KeyT | KeyT type |
BLOCK_DIM_X | The thread block length in threads along the X dimension |
ITEMS_PER_THREAD | The number of items per thread |
ValueT | [optional] ValueT type (default: cub::NullType, which indicates a keys-only sort) |
RADIX_BITS | [optional] The number of radix bits per digit place (default: 4 bits) |
MEMOIZE_OUTER_SCAN | [optional] Whether or not to buffer outer raking scan partials to incur fewer shared memory reads at the expense of higher register pressure (default: true for architectures SM35 and newer, false otherwise). |
INNER_SCAN_ALGORITHM | [optional] The cub::BlockScanAlgorithm algorithm to use (default: cub::BLOCK_SCAN_WARP_SCANS) |
SMEM_CONFIG | [optional] Shared memory bank mode (default: cudaSharedMemBankSizeFourByte ) |
BLOCK_DIM_Y | [optional] The thread block length in threads along the Y dimension (default: 1) |
BLOCK_DIM_Z | [optional] The thread block length in threads along the Z dimension (default: 1) |
PTX_ARCH | [optional] \ptxversion |
- Overview
- The radix sorting method arranges items into ascending order. It relies upon a positional representation for keys, i.e., each key is comprised of an ordered sequence of symbols (e.g., digits, characters, etc.) specified from least-significant to most-significant. For a given input sequence of keys and a set of rules specifying a total ordering of the symbolic alphabet, the radix sorting method produces a lexicographic ordering of those keys.
- BlockRadixSort can sort all of the built-in C++ numeric primitive types (
unsigned char
, int
, double
, etc.) as well as CUDA's __half
half-precision floating-point type. Within each key, the implementation treats fixed-length bit-sequences of RADIX_BITS
as radix digit places. Although the direct radix sorting method can only be applied to unsigned integral types, BlockRadixSort is able to sort signed and floating-point types via simple bit-wise transformations that ensure lexicographic key ordering.
- \rowmajor
- Performance Considerations
-
- A Simple Example
- \blockcollective{BlockRadixSort}
- The code snippet below illustrates a sort of 512 integer keys that are partitioned in a blocked arrangement across 128 threads where each thread owns 4 consecutive items.
#include <cub/cub.cuh>
__global__ void ExampleKernel(...)
{
__shared__
typename BlockRadixSort::TempStorage
temp_storage;
int thread_keys[4];
...
...
- Suppose the set of input
thread_keys
across the block of threads is { [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }
. The corresponding output thread_keys
in those threads will be { [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }
.
Definition at line 132 of file block_radix_sort.cuh.
|
template<int DESCENDING, int KEYS_ONLY> |
__device__ __forceinline__ void | SortBlockedToStriped (KeyT(&keys)[ITEMS_PER_THREAD], ValueT(&values)[ITEMS_PER_THREAD], int begin_bit, int end_bit, Int2Type< DESCENDING > is_descending, Int2Type< KEYS_ONLY > is_keys_only) |
| Sort blocked -> striped arrangement. More...
|
|
|
__device__ __forceinline__ | BlockRadixSort () |
| Collective constructor using a private static allocation of shared memory as temporary storage.
|
|
__device__ __forceinline__ | BlockRadixSort (TempStorage &temp_storage) |
| Collective constructor using the specified memory allocation as temporary storage. More...
|
|
|
__device__ __forceinline__ void | Sort (KeyT(&keys)[ITEMS_PER_THREAD], int begin_bit=0, int end_bit=sizeof(KeyT) *8) |
| Performs an ascending block-wide radix sort over a blocked arrangement of keys. More...
|
|
__device__ __forceinline__ void | Sort (KeyT(&keys)[ITEMS_PER_THREAD], ValueT(&values)[ITEMS_PER_THREAD], int begin_bit=0, int end_bit=sizeof(KeyT) *8) |
| Performs an ascending block-wide radix sort across a blocked arrangement of keys and values. More...
|
|
__device__ __forceinline__ void | SortDescending (KeyT(&keys)[ITEMS_PER_THREAD], int begin_bit=0, int end_bit=sizeof(KeyT) *8) |
| Performs a descending block-wide radix sort over a blocked arrangement of keys. More...
|
|
__device__ __forceinline__ void | SortDescending (KeyT(&keys)[ITEMS_PER_THREAD], ValueT(&values)[ITEMS_PER_THREAD], int begin_bit=0, int end_bit=sizeof(KeyT) *8) |
| Performs a descending block-wide radix sort across a blocked arrangement of keys and values. More...
|
|
|
__device__ __forceinline__ void | SortBlockedToStriped (KeyT(&keys)[ITEMS_PER_THREAD], int begin_bit=0, int end_bit=sizeof(KeyT) *8) |
| Performs an ascending radix sort across a blocked arrangement of keys, leaving them in a striped arrangement. More...
|
|
__device__ __forceinline__ void | SortBlockedToStriped (KeyT(&keys)[ITEMS_PER_THREAD], ValueT(&values)[ITEMS_PER_THREAD], int begin_bit=0, int end_bit=sizeof(KeyT) *8) |
| Performs an ascending radix sort across a blocked arrangement of keys and values, leaving them in a striped arrangement. More...
|
|
__device__ __forceinline__ void | SortDescendingBlockedToStriped (KeyT(&keys)[ITEMS_PER_THREAD], int begin_bit=0, int end_bit=sizeof(KeyT) *8) |
| Performs a descending radix sort across a blocked arrangement of keys, leaving them in a striped arrangement. More...
|
|
__device__ __forceinline__ void | SortDescendingBlockedToStriped (KeyT(&keys)[ITEMS_PER_THREAD], ValueT(&values)[ITEMS_PER_THREAD], int begin_bit=0, int end_bit=sizeof(KeyT) *8) |
| Performs a descending radix sort across a blocked arrangement of keys and values, leaving them in a striped arrangement. More...
|
|
|
enum | { BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
KEYS_ONLY = Equals<ValueT, NullType>::VALUE
} |
|
typedef Traits< KeyT > | KeyTraits |
|
typedef KeyTraits::UnsignedBits | UnsignedBits |
|
typedef BlockRadixRank< BLOCK_DIM_X, RADIX_BITS, false, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM, SMEM_CONFIG, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH > | AscendingBlockRadixRank |
| Ascending BlockRadixRank utility type.
|
|
typedef BlockRadixRank< BLOCK_DIM_X, RADIX_BITS, true, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM, SMEM_CONFIG, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH > | DescendingBlockRadixRank |
| Descending BlockRadixRank utility type.
|
|
typedef BlockExchange< KeyT, BLOCK_DIM_X, ITEMS_PER_THREAD, false, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH > | BlockExchangeKeys |
| BlockExchange utility type for keys.
|
|
typedef BlockExchange< ValueT, BLOCK_DIM_X, ITEMS_PER_THREAD, false, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH > | BlockExchangeValues |
| BlockExchange utility type for values.
|
|
|
__device__ __forceinline__ _TempStorage & | PrivateStorage () |
| Internal storage allocator.
|
|
__device__ __forceinline__ void | RankKeys (UnsignedBits(&unsigned_keys)[ITEMS_PER_THREAD], int(&ranks)[ITEMS_PER_THREAD], int begin_bit, int pass_bits, Int2Type< false >) |
| Rank keys (specialized for ascending sort)
|
|
__device__ __forceinline__ void | RankKeys (UnsignedBits(&unsigned_keys)[ITEMS_PER_THREAD], int(&ranks)[ITEMS_PER_THREAD], int begin_bit, int pass_bits, Int2Type< true >) |
| Rank keys (specialized for descending sort)
|
|
__device__ __forceinline__ void | ExchangeValues (ValueT(&values)[ITEMS_PER_THREAD], int(&ranks)[ITEMS_PER_THREAD], Int2Type< false >, Int2Type< true >) |
| ExchangeValues (specialized for key-value sort, to-blocked arrangement)
|
|
__device__ __forceinline__ void | ExchangeValues (ValueT(&values)[ITEMS_PER_THREAD], int(&ranks)[ITEMS_PER_THREAD], Int2Type< false >, Int2Type< false >) |
| ExchangeValues (specialized for key-value sort, to-striped arrangement)
|
|
template<int IS_BLOCKED> |
__device__ __forceinline__ void | ExchangeValues (ValueT(&)[ITEMS_PER_THREAD], int(&)[ITEMS_PER_THREAD], Int2Type< true >, Int2Type< IS_BLOCKED >) |
| ExchangeValues (specialized for keys-only sort)
|
|
template<int DESCENDING, int KEYS_ONLY> |
__device__ __forceinline__ void | SortBlocked (KeyT(&keys)[ITEMS_PER_THREAD], ValueT(&values)[ITEMS_PER_THREAD], int begin_bit, int end_bit, Int2Type< DESCENDING > is_descending, Int2Type< KEYS_ONLY > is_keys_only) |
| Sort blocked arrangement. More...
|
|
template<typename KeyT , int BLOCK_DIM_X, int ITEMS_PER_THREAD, typename ValueT = NullType, int RADIX_BITS = 4, bool MEMOIZE_OUTER_SCAN = (CUB_PTX_ARCH >= 350) ? true : false, BlockScanAlgorithm INNER_SCAN_ALGORITHM = BLOCK_SCAN_WARP_SCANS, cudaSharedMemConfig SMEM_CONFIG = cudaSharedMemBankSizeFourByte, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>
__device__ __forceinline__ cub::BlockRadixSort< KeyT, BLOCK_DIM_X, ITEMS_PER_THREAD, ValueT, RADIX_BITS, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM, SMEM_CONFIG, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::BlockRadixSort |
( |
TempStorage & |
temp_storage | ) |
|
|
inline |
Collective constructor using the specified memory allocation as temporary storage.
- Parameters
-
[in] | temp_storage | Reference to memory allocation having layout type TempStorage |
Definition at line 423 of file block_radix_sort.cuh.
template<typename KeyT , int BLOCK_DIM_X, int ITEMS_PER_THREAD, typename ValueT = NullType, int RADIX_BITS = 4, bool MEMOIZE_OUTER_SCAN = (CUB_PTX_ARCH >= 350) ? true : false, BlockScanAlgorithm INNER_SCAN_ALGORITHM = BLOCK_SCAN_WARP_SCANS, cudaSharedMemConfig SMEM_CONFIG = cudaSharedMemBankSizeFourByte, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>
__device__ __forceinline__ void cub::BlockRadixSort< KeyT, BLOCK_DIM_X, ITEMS_PER_THREAD, ValueT, RADIX_BITS, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM, SMEM_CONFIG, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::Sort |
( |
KeyT(&) |
keys[ITEMS_PER_THREAD], |
|
|
int |
begin_bit = 0 , |
|
|
int |
end_bit = sizeof(KeyT) * 8 |
|
) |
| |
|
inline |
Performs an ascending block-wide radix sort over a blocked arrangement of keys.
-
- Snippet
- The code snippet below illustrates a sort of 512 integer keys that are partitioned in a blocked arrangement across 128 threads where each thread owns 4 consecutive keys.
#include <cub/cub.cuh>
__global__ void ExampleKernel(...)
{
__shared__
typename BlockRadixSort::TempStorage
temp_storage;
int thread_keys[4];
...
- Suppose the set of input
thread_keys
across the block of threads is { [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }
. The corresponding output thread_keys
in those threads will be { [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }
.
- Parameters
-
[in,out] | keys | Keys to sort |
[in] | begin_bit | [optional] The beginning (least-significant) bit index needed for key comparison |
[in] | end_bit | [optional] The past-the-end (most-significant) bit index needed for key comparison |
Definition at line 474 of file block_radix_sort.cuh.
template<typename KeyT , int BLOCK_DIM_X, int ITEMS_PER_THREAD, typename ValueT = NullType, int RADIX_BITS = 4, bool MEMOIZE_OUTER_SCAN = (CUB_PTX_ARCH >= 350) ? true : false, BlockScanAlgorithm INNER_SCAN_ALGORITHM = BLOCK_SCAN_WARP_SCANS, cudaSharedMemConfig SMEM_CONFIG = cudaSharedMemBankSizeFourByte, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>
__device__ __forceinline__ void cub::BlockRadixSort< KeyT, BLOCK_DIM_X, ITEMS_PER_THREAD, ValueT, RADIX_BITS, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM, SMEM_CONFIG, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::Sort |
( |
KeyT(&) |
keys[ITEMS_PER_THREAD], |
|
|
ValueT(&) |
values[ITEMS_PER_THREAD], |
|
|
int |
begin_bit = 0 , |
|
|
int |
end_bit = sizeof(KeyT) * 8 |
|
) |
| |
|
inline |
Performs an ascending block-wide radix sort across a blocked arrangement of keys and values.
- BlockRadixSort can only accommodate one associated tile of values. To "truck along" more than one tile of values, simply perform a key-value sort of the keys paired with a temporary value array that enumerates the key indices. The reordered indices can then be used as a gather-vector for exchanging other associated tile data through shared memory.
- \granularity
- \smemreuse
- Snippet
- The code snippet below illustrates a sort of 512 integer keys and values that are partitioned in a blocked arrangement across 128 threads where each thread owns 4 consecutive pairs.
#include <cub/cub.cuh>
__global__ void ExampleKernel(...)
{
__shared__
typename BlockRadixSort::TempStorage
temp_storage;
int thread_keys[4];
int thread_values[4];
...
- Suppose the set of input
thread_keys
across the block of threads is { [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }
. The corresponding output thread_keys
in those threads will be { [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }
.
- Parameters
-
[in,out] | keys | Keys to sort |
[in,out] | values | Values to sort |
[in] | begin_bit | [optional] The beginning (least-significant) bit index needed for key comparison |
[in] | end_bit | [optional] The past-the-end (most-significant) bit index needed for key comparison |
Definition at line 529 of file block_radix_sort.cuh.
template<typename KeyT , int BLOCK_DIM_X, int ITEMS_PER_THREAD, typename ValueT = NullType, int RADIX_BITS = 4, bool MEMOIZE_OUTER_SCAN = (CUB_PTX_ARCH >= 350) ? true : false, BlockScanAlgorithm INNER_SCAN_ALGORITHM = BLOCK_SCAN_WARP_SCANS, cudaSharedMemConfig SMEM_CONFIG = cudaSharedMemBankSizeFourByte, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>
template<int DESCENDING, int KEYS_ONLY>
__device__ __forceinline__ void cub::BlockRadixSort< KeyT, BLOCK_DIM_X, ITEMS_PER_THREAD, ValueT, RADIX_BITS, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM, SMEM_CONFIG, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::SortBlocked |
( |
KeyT(&) |
keys[ITEMS_PER_THREAD], |
|
|
ValueT(&) |
values[ITEMS_PER_THREAD], |
|
|
int |
begin_bit, |
|
|
int |
end_bit, |
|
|
Int2Type< DESCENDING > |
is_descending, |
|
|
Int2Type< KEYS_ONLY > |
is_keys_only |
|
) |
| |
|
inlineprivate |
Sort blocked arrangement.
- Parameters
-
keys | Keys to sort |
values | Values to sort |
begin_bit | The beginning (least-significant) bit index needed for key comparison |
end_bit | The past-the-end (most-significant) bit index needed for key comparison |
is_descending | Tag whether is a descending-order sort |
is_keys_only | Tag whether is keys-only sort |
Definition at line 283 of file block_radix_sort.cuh.
template<typename KeyT , int BLOCK_DIM_X, int ITEMS_PER_THREAD, typename ValueT = NullType, int RADIX_BITS = 4, bool MEMOIZE_OUTER_SCAN = (CUB_PTX_ARCH >= 350) ? true : false, BlockScanAlgorithm INNER_SCAN_ALGORITHM = BLOCK_SCAN_WARP_SCANS, cudaSharedMemConfig SMEM_CONFIG = cudaSharedMemBankSizeFourByte, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>
template<int DESCENDING, int KEYS_ONLY>
__device__ __forceinline__ void cub::BlockRadixSort< KeyT, BLOCK_DIM_X, ITEMS_PER_THREAD, ValueT, RADIX_BITS, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM, SMEM_CONFIG, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::SortBlockedToStriped |
( |
KeyT(&) |
keys[ITEMS_PER_THREAD], |
|
|
ValueT(&) |
values[ITEMS_PER_THREAD], |
|
|
int |
begin_bit, |
|
|
int |
end_bit, |
|
|
Int2Type< DESCENDING > |
is_descending, |
|
|
Int2Type< KEYS_ONLY > |
is_keys_only |
|
) |
| |
|
inline |
Sort blocked -> striped arrangement.
- Parameters
-
keys | Keys to sort |
values | Values to sort |
begin_bit | The beginning (least-significant) bit index needed for key comparison |
end_bit | The past-the-end (most-significant) bit index needed for key comparison |
is_descending | Tag whether is a descending-order sort |
is_keys_only | Tag whether is keys-only sort |
Definition at line 339 of file block_radix_sort.cuh.
template<typename KeyT , int BLOCK_DIM_X, int ITEMS_PER_THREAD, typename ValueT = NullType, int RADIX_BITS = 4, bool MEMOIZE_OUTER_SCAN = (CUB_PTX_ARCH >= 350) ? true : false, BlockScanAlgorithm INNER_SCAN_ALGORITHM = BLOCK_SCAN_WARP_SCANS, cudaSharedMemConfig SMEM_CONFIG = cudaSharedMemBankSizeFourByte, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>
__device__ __forceinline__ void cub::BlockRadixSort< KeyT, BLOCK_DIM_X, ITEMS_PER_THREAD, ValueT, RADIX_BITS, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM, SMEM_CONFIG, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::SortBlockedToStriped |
( |
KeyT(&) |
keys[ITEMS_PER_THREAD], |
|
|
int |
begin_bit = 0 , |
|
|
int |
end_bit = sizeof(KeyT) * 8 |
|
) |
| |
|
inline |
Performs an ascending radix sort across a blocked arrangement of keys, leaving them in a striped arrangement.
-
- Snippet
- The code snippet below illustrates a sort of 512 integer keys that are initially partitioned in a blocked arrangement across 128 threads where each thread owns 4 consecutive keys. The final partitioning is striped.
#include <cub/cub.cuh>
__global__ void ExampleKernel(...)
{
__shared__
typename BlockRadixSort::TempStorage
temp_storage;
int thread_keys[4];
...
- Suppose the set of input
thread_keys
across the block of threads is { [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }
. The corresponding output thread_keys
in those threads will be { [0,128,256,384], [1,129,257,385], [2,130,258,386], ..., [127,255,383,511] }
.
- Parameters
-
[in,out] | keys | Keys to sort |
[in] | begin_bit | [optional] The beginning (least-significant) bit index needed for key comparison |
[in] | end_bit | [optional] The past-the-end (most-significant) bit index needed for key comparison |
Definition at line 685 of file block_radix_sort.cuh.
template<typename KeyT , int BLOCK_DIM_X, int ITEMS_PER_THREAD, typename ValueT = NullType, int RADIX_BITS = 4, bool MEMOIZE_OUTER_SCAN = (CUB_PTX_ARCH >= 350) ? true : false, BlockScanAlgorithm INNER_SCAN_ALGORITHM = BLOCK_SCAN_WARP_SCANS, cudaSharedMemConfig SMEM_CONFIG = cudaSharedMemBankSizeFourByte, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>
__device__ __forceinline__ void cub::BlockRadixSort< KeyT, BLOCK_DIM_X, ITEMS_PER_THREAD, ValueT, RADIX_BITS, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM, SMEM_CONFIG, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::SortBlockedToStriped |
( |
KeyT(&) |
keys[ITEMS_PER_THREAD], |
|
|
ValueT(&) |
values[ITEMS_PER_THREAD], |
|
|
int |
begin_bit = 0 , |
|
|
int |
end_bit = sizeof(KeyT) * 8 |
|
) |
| |
|
inline |
Performs an ascending radix sort across a blocked arrangement of keys and values, leaving them in a striped arrangement.
- BlockRadixSort can only accommodate one associated tile of values. To "truck along" more than one tile of values, simply perform a key-value sort of the keys paired with a temporary value array that enumerates the key indices. The reordered indices can then be used as a gather-vector for exchanging other associated tile data through shared memory.
- \granularity
- \smemreuse
- Snippet
- The code snippet below illustrates a sort of 512 integer keys and values that are initially partitioned in a blocked arrangement across 128 threads where each thread owns 4 consecutive pairs. The final partitioning is striped.
#include <cub/cub.cuh>
__global__ void ExampleKernel(...)
{
__shared__
typename BlockRadixSort::TempStorage
temp_storage;
int thread_keys[4];
int thread_values[4];
...
- Suppose the set of input
thread_keys
across the block of threads is { [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }
. The corresponding output thread_keys
in those threads will be { [0,128,256,384], [1,129,257,385], [2,130,258,386], ..., [127,255,383,511] }
.
- Parameters
-
[in,out] | keys | Keys to sort |
[in,out] | values | Values to sort |
[in] | begin_bit | [optional] The beginning (least-significant) bit index needed for key comparison |
[in] | end_bit | [optional] The past-the-end (most-significant) bit index needed for key comparison |
Definition at line 740 of file block_radix_sort.cuh.
template<typename KeyT , int BLOCK_DIM_X, int ITEMS_PER_THREAD, typename ValueT = NullType, int RADIX_BITS = 4, bool MEMOIZE_OUTER_SCAN = (CUB_PTX_ARCH >= 350) ? true : false, BlockScanAlgorithm INNER_SCAN_ALGORITHM = BLOCK_SCAN_WARP_SCANS, cudaSharedMemConfig SMEM_CONFIG = cudaSharedMemBankSizeFourByte, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>
__device__ __forceinline__ void cub::BlockRadixSort< KeyT, BLOCK_DIM_X, ITEMS_PER_THREAD, ValueT, RADIX_BITS, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM, SMEM_CONFIG, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::SortDescending |
( |
KeyT(&) |
keys[ITEMS_PER_THREAD], |
|
|
int |
begin_bit = 0 , |
|
|
int |
end_bit = sizeof(KeyT) * 8 |
|
) |
| |
|
inline |
Performs a descending block-wide radix sort over a blocked arrangement of keys.
-
- Snippet
- The code snippet below illustrates a sort of 512 integer keys that are partitioned in a blocked arrangement across 128 threads where each thread owns 4 consecutive keys.
#include <cub/cub.cuh>
__global__ void ExampleKernel(...)
{
__shared__
typename BlockRadixSort::TempStorage
temp_storage;
int thread_keys[4];
...
- Suppose the set of input
thread_keys
across the block of threads is { [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }
. The corresponding output thread_keys
in those threads will be { [511,510,509,508], [11,10,9,8], [7,6,5,4], ..., [3,2,1,0] }
.
- Parameters
-
[in,out] | keys | Keys to sort |
[in] | begin_bit | [optional] The beginning (least-significant) bit index needed for key comparison |
[in] | end_bit | [optional] The past-the-end (most-significant) bit index needed for key comparison |
Definition at line 575 of file block_radix_sort.cuh.
template<typename KeyT , int BLOCK_DIM_X, int ITEMS_PER_THREAD, typename ValueT = NullType, int RADIX_BITS = 4, bool MEMOIZE_OUTER_SCAN = (CUB_PTX_ARCH >= 350) ? true : false, BlockScanAlgorithm INNER_SCAN_ALGORITHM = BLOCK_SCAN_WARP_SCANS, cudaSharedMemConfig SMEM_CONFIG = cudaSharedMemBankSizeFourByte, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>
__device__ __forceinline__ void cub::BlockRadixSort< KeyT, BLOCK_DIM_X, ITEMS_PER_THREAD, ValueT, RADIX_BITS, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM, SMEM_CONFIG, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::SortDescending |
( |
KeyT(&) |
keys[ITEMS_PER_THREAD], |
|
|
ValueT(&) |
values[ITEMS_PER_THREAD], |
|
|
int |
begin_bit = 0 , |
|
|
int |
end_bit = sizeof(KeyT) * 8 |
|
) |
| |
|
inline |
Performs a descending block-wide radix sort across a blocked arrangement of keys and values.
- BlockRadixSort can only accommodate one associated tile of values. To "truck along" more than one tile of values, simply perform a key-value sort of the keys paired with a temporary value array that enumerates the key indices. The reordered indices can then be used as a gather-vector for exchanging other associated tile data through shared memory.
- \granularity
- \smemreuse
- Snippet
- The code snippet below illustrates a sort of 512 integer keys and values that are partitioned in a blocked arrangement across 128 threads where each thread owns 4 consecutive pairs.
#include <cub/cub.cuh>
__global__ void ExampleKernel(...)
{
__shared__
typename BlockRadixSort::TempStorage
temp_storage;
int thread_keys[4];
int thread_values[4];
...
- Suppose the set of input
thread_keys
across the block of threads is { [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }
. The corresponding output thread_keys
in those threads will be { [511,510,509,508], [11,10,9,8], [7,6,5,4], ..., [3,2,1,0] }
.
- Parameters
-
[in,out] | keys | Keys to sort |
[in,out] | values | Values to sort |
[in] | begin_bit | [optional] The beginning (least-significant) bit index needed for key comparison |
[in] | end_bit | [optional] The past-the-end (most-significant) bit index needed for key comparison |
Definition at line 630 of file block_radix_sort.cuh.
template<typename KeyT , int BLOCK_DIM_X, int ITEMS_PER_THREAD, typename ValueT = NullType, int RADIX_BITS = 4, bool MEMOIZE_OUTER_SCAN = (CUB_PTX_ARCH >= 350) ? true : false, BlockScanAlgorithm INNER_SCAN_ALGORITHM = BLOCK_SCAN_WARP_SCANS, cudaSharedMemConfig SMEM_CONFIG = cudaSharedMemBankSizeFourByte, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>
__device__ __forceinline__ void cub::BlockRadixSort< KeyT, BLOCK_DIM_X, ITEMS_PER_THREAD, ValueT, RADIX_BITS, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM, SMEM_CONFIG, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::SortDescendingBlockedToStriped |
( |
KeyT(&) |
keys[ITEMS_PER_THREAD], |
|
|
int |
begin_bit = 0 , |
|
|
int |
end_bit = sizeof(KeyT) * 8 |
|
) |
| |
|
inline |
Performs a descending radix sort across a blocked arrangement of keys, leaving them in a striped arrangement.
-
- Snippet
- The code snippet below illustrates a sort of 512 integer keys that are initially partitioned in a blocked arrangement across 128 threads where each thread owns 4 consecutive keys. The final partitioning is striped.
#include <cub/cub.cuh>
__global__ void ExampleKernel(...)
{
__shared__
typename BlockRadixSort::TempStorage
temp_storage;
int thread_keys[4];
...
- Suppose the set of input
thread_keys
across the block of threads is { [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }
. The corresponding output thread_keys
in those threads will be { [511,383,255,127], [386,258,130,2], [385,257,128,1], ..., [384,256,128,0] }
.
- Parameters
-
[in,out] | keys | Keys to sort |
[in] | begin_bit | [optional] The beginning (least-significant) bit index needed for key comparison |
[in] | end_bit | [optional] The past-the-end (most-significant) bit index needed for key comparison |
Definition at line 788 of file block_radix_sort.cuh.
template<typename KeyT , int BLOCK_DIM_X, int ITEMS_PER_THREAD, typename ValueT = NullType, int RADIX_BITS = 4, bool MEMOIZE_OUTER_SCAN = (CUB_PTX_ARCH >= 350) ? true : false, BlockScanAlgorithm INNER_SCAN_ALGORITHM = BLOCK_SCAN_WARP_SCANS, cudaSharedMemConfig SMEM_CONFIG = cudaSharedMemBankSizeFourByte, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>
__device__ __forceinline__ void cub::BlockRadixSort< KeyT, BLOCK_DIM_X, ITEMS_PER_THREAD, ValueT, RADIX_BITS, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM, SMEM_CONFIG, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::SortDescendingBlockedToStriped |
( |
KeyT(&) |
keys[ITEMS_PER_THREAD], |
|
|
ValueT(&) |
values[ITEMS_PER_THREAD], |
|
|
int |
begin_bit = 0 , |
|
|
int |
end_bit = sizeof(KeyT) * 8 |
|
) |
| |
|
inline |
Performs a descending radix sort across a blocked arrangement of keys and values, leaving them in a striped arrangement.
- BlockRadixSort can only accommodate one associated tile of values. To "truck along" more than one tile of values, simply perform a key-value sort of the keys paired with a temporary value array that enumerates the key indices. The reordered indices can then be used as a gather-vector for exchanging other associated tile data through shared memory.
- \granularity
- \smemreuse
- Snippet
- The code snippet below illustrates a sort of 512 integer keys and values that are initially partitioned in a blocked arrangement across 128 threads where each thread owns 4 consecutive pairs. The final partitioning is striped.
#include <cub/cub.cuh>
__global__ void ExampleKernel(...)
{
__shared__
typename BlockRadixSort::TempStorage
temp_storage;
int thread_keys[4];
int thread_values[4];
...
- Suppose the set of input
thread_keys
across the block of threads is { [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }
. The corresponding output thread_keys
in those threads will be { [511,383,255,127], [386,258,130,2], [385,257,128,1], ..., [384,256,128,0] }
.
- Parameters
-
[in,out] | keys | Keys to sort |
[in,out] | values | Values to sort |
[in] | begin_bit | [optional] The beginning (least-significant) bit index needed for key comparison |
[in] | end_bit | [optional] The past-the-end (most-significant) bit index needed for key comparison |
Definition at line 843 of file block_radix_sort.cuh.