template<
int BLOCK_DIM_X,
int RADIX_BITS, bool IS_DESCENDING, bool MEMOIZE_OUTER_SCAN = (CUB_PTX_ARCH >= 350) ? true : false,
BlockScanAlgorithm INNER_SCAN_ALGORITHM = BLOCK_SCAN_WARP_SCANS, cudaSharedMemConfig SMEM_CONFIG = cudaSharedMemBankSizeFourByte,
int BLOCK_DIM_Y = 1,
int BLOCK_DIM_Z = 1,
int PTX_ARCH = CUB_PTX_ARCH>
class cub::BlockRadixRank< BLOCK_DIM_X, RADIX_BITS, IS_DESCENDING, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM, SMEM_CONFIG, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >
BlockRadixRank provides operations for ranking unsigned integer types within a CUDA thread block.
- Template Parameters
-
BLOCK_DIM_X | The thread block length in threads along the X dimension |
RADIX_BITS | The number of radix bits per digit place |
IS_DESCENDING | Whether or not the sorted-order is high-to-low |
MEMOIZE_OUTER_SCAN | [optional] Whether or not to buffer outer raking scan partials to incur fewer shared memory reads at the expense of higher register pressure (default: true for architectures SM35 and newer, false otherwise). See BlockScanAlgorithm::BLOCK_SCAN_RAKING_MEMOIZE for more details. |
INNER_SCAN_ALGORITHM | [optional] The cub::BlockScanAlgorithm algorithm to use (default: cub::BLOCK_SCAN_WARP_SCANS) |
SMEM_CONFIG | [optional] Shared memory bank mode (default: cudaSharedMemBankSizeFourByte ) |
BLOCK_DIM_Y | [optional] The thread block length in threads along the Y dimension (default: 1) |
BLOCK_DIM_Z | [optional] The thread block length in threads along the Z dimension (default: 1) |
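PTX_ARCH | [optional] The PTX compute capability for which to specialize this collective, formatted as per the __CUDA_ARCH__ macro (e.g., 350 for sm_35) (default: CUB_PTX_ARCH) |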
- Overview
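- Each call to RankKeys ranks the thread block's keys by the radix digit made up of num_bits bits starting at bit position current_bit; IS_DESCENDING selects high-to-low ordering.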
- Keys must be in a form suitable for radix ranking (i.e., unsigned bits).
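- Assumes a blocked arrangement of keys across the thread block, where each thread owns a contiguous range of KEYS_PER_THREAD keys.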
- Performance Considerations
-
- Examples
- Example 1: Simple radix rank of 32-bit integer keys
#include <cub/cub.cuh>
template <int BLOCK_THREADS>
__global__ void ExampleKernel(...)
{
Definition at line 97 of file block_radix_rank.cuh.
|
|
__device__ __forceinline__ | BlockRadixRank () |
| Collective constructor using a private static allocation of shared memory as temporary storage.
|
|
__device__ __forceinline__ | BlockRadixRank (TempStorage &temp_storage) |
| Collective constructor using the specified memory allocation as temporary storage.
|
|
|
template<typename UnsignedBits , int KEYS_PER_THREAD> |
__device__ __forceinline__ void | RankKeys (UnsignedBits(&keys)[KEYS_PER_THREAD], int(&ranks)[KEYS_PER_THREAD], int current_bit, int num_bits) |
| Rank keys.
|
|
template<typename UnsignedBits , int KEYS_PER_THREAD> |
__device__ __forceinline__ void | RankKeys (UnsignedBits(&keys)[KEYS_PER_THREAD], int(&ranks)[KEYS_PER_THREAD], int current_bit, int num_bits, int(&exclusive_digit_prefix)[BINS_TRACKED_PER_THREAD]) |
| Rank keys. For the lower RADIX_DIGITS threads, the exclusive prefix of the digit counts for the digits tracked by the corresponding thread is also returned in exclusive_digit_prefix.
|
|
|
enum | {
BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z
, RADIX_DIGITS = 1 << RADIX_BITS
, LOG_WARP_THREADS = CUB_LOG_WARP_THREADS(PTX_ARCH)
, WARP_THREADS = 1 << LOG_WARP_THREADS
,
WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS
, BYTES_PER_COUNTER = sizeof(DigitCounter)
, LOG_BYTES_PER_COUNTER = Log2<BYTES_PER_COUNTER>::VALUE
, PACKING_RATIO = sizeof(PackedCounter) / sizeof(DigitCounter)
,
LOG_PACKING_RATIO = Log2<PACKING_RATIO>::VALUE
, LOG_COUNTER_LANES = CUB_MAX((RADIX_BITS - LOG_PACKING_RATIO), 0)
, COUNTER_LANES = 1 << LOG_COUNTER_LANES
, PADDED_COUNTER_LANES = COUNTER_LANES + 1
,
RAKING_SEGMENT = PADDED_COUNTER_LANES
} |
|
typedef unsigned short | DigitCounter |
|
typedef If<(SMEM_CONFIG==cudaSharedMemBankSizeEightByte), unsigned long long, unsigned int >::Type | PackedCounter |
|
typedef BlockScan< PackedCounter, BLOCK_DIM_X, INNER_SCAN_ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH > | BlockScan |
| BlockScan type.
|
|