OpenFPM_pdata  3.0.0
Project that contain the implementation of distributed structures
cub::BlockRadixRank< BLOCK_DIM_X, RADIX_BITS, IS_DESCENDING, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM, SMEM_CONFIG, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH > Class Template Reference

BlockRadixRank provides operations for ranking unsigned integer types within a CUDA thread block. More...

Detailed Description

template<int BLOCK_DIM_X, int RADIX_BITS, bool IS_DESCENDING, bool MEMOIZE_OUTER_SCAN = (CUB_PTX_ARCH >= 350) ? true : false, BlockScanAlgorithm INNER_SCAN_ALGORITHM = BLOCK_SCAN_WARP_SCANS, cudaSharedMemConfig SMEM_CONFIG = cudaSharedMemBankSizeFourByte, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>
class cub::BlockRadixRank< BLOCK_DIM_X, RADIX_BITS, IS_DESCENDING, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM, SMEM_CONFIG, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >

BlockRadixRank provides operations for ranking unsigned integer types within a CUDA thread block.

Template Parameters
BLOCK_DIM_XThe thread block length in threads along the X dimension
RADIX_BITSThe number of radix bits per digit place
IS_DESCENDINGWhether or not the sorted-order is high-to-low
MEMOIZE_OUTER_SCAN[optional] Whether or not to buffer outer raking scan partials to incur fewer shared memory reads at the expense of higher register pressure (default: true for architectures SM35 and newer, false otherwise). See BlockScanAlgorithm::BLOCK_SCAN_RAKING_MEMOIZE for more details.
INNER_SCAN_ALGORITHM[optional] The cub::BlockScanAlgorithm algorithm to use (default: cub::BLOCK_SCAN_WARP_SCANS)
SMEM_CONFIG[optional] Shared memory bank mode (default: cudaSharedMemBankSizeFourByte)
BLOCK_DIM_Y[optional] The thread block length in threads along the Y dimension (default: 1)
BLOCK_DIM_Z[optional] The thread block length in threads along the Z dimension (default: 1)
PTX_ARCH[optional] \ptxversion
Overview
Blah...
  • Keys must be in a form suitable for radix ranking (i.e., unsigned bits).
  • \blocked
Performance Considerations
  • \granularity
Examples
  • Example 1: Simple radix rank of 32-bit integer keys
    #include <cub/cub.cuh>
    template <int BLOCK_THREADS>
    __global__ void ExampleKernel(...)
    {

Definition at line 97 of file block_radix_rank.cuh.

Data Structures

struct  PrefixCallBack
 
struct  TempStorage
 \smemstorage{BlockScan} More...
 

Public Types

enum  { BINS_TRACKED_PER_THREAD = CUB_MAX(1, (RADIX_DIGITS + BLOCK_THREADS - 1) / BLOCK_THREADS) }
 

Public Member Functions

Collective constructors
__device__ __forceinline__ BlockRadixRank ()
 Collective constructor using a private static allocation of shared memory as temporary storage.
 
__device__ __forceinline__ BlockRadixRank (TempStorage &temp_storage)
 Collective constructor using the specified memory allocation as temporary storage. More...
 
Raking
template<typename UnsignedBits , int KEYS_PER_THREAD>
__device__ __forceinline__ void RankKeys (UnsignedBits(&keys)[KEYS_PER_THREAD], int(&ranks)[KEYS_PER_THREAD], int current_bit, int num_bits)
 Rank keys. More...
 
template<typename UnsignedBits , int KEYS_PER_THREAD>
__device__ __forceinline__ void RankKeys (UnsignedBits(&keys)[KEYS_PER_THREAD], int(&ranks)[KEYS_PER_THREAD], int current_bit, int num_bits, int(&exclusive_digit_prefix)[BINS_TRACKED_PER_THREAD])
 Rank keys. For the lower RADIX_DIGITS threads, digit counts for each digit are provided for the corresponding thread. More...
 

Private Types

enum  {
  BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, RADIX_DIGITS = 1 << RADIX_BITS, LOG_WARP_THREADS = CUB_LOG_WARP_THREADS(PTX_ARCH), WARP_THREADS = 1 << LOG_WARP_THREADS,
  WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS, BYTES_PER_COUNTER = sizeof(DigitCounter), LOG_BYTES_PER_COUNTER = Log2<BYTES_PER_COUNTER>::VALUE, PACKING_RATIO = sizeof(PackedCounter) / sizeof(DigitCounter),
  LOG_PACKING_RATIO = Log2<PACKING_RATIO>::VALUE, LOG_COUNTER_LANES = CUB_MAX((RADIX_BITS - LOG_PACKING_RATIO), 0), COUNTER_LANES = 1 << LOG_COUNTER_LANES, PADDED_COUNTER_LANES = COUNTER_LANES + 1,
  RAKING_SEGMENT = PADDED_COUNTER_LANES
}
 
typedef unsigned short DigitCounter
 
typedef If<(SMEM_CONFIG==cudaSharedMemBankSizeEightByte), unsigned long long, unsigned int >::Type PackedCounter
 
typedef BlockScan< PackedCounter, BLOCK_DIM_X, INNER_SCAN_ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH > BlockScan
 BlockScan type.
 

Private Member Functions

struct __align__ (16) _TempStorage
 Shared memory storage layout type for BlockRadixRank.
 
__device__ __forceinline__ _TempStorage & PrivateStorage ()
 
__device__ __forceinline__ PackedCounter Upsweep ()
 
__device__ __forceinline__ void ExclusiveDownsweep (PackedCounter raking_partial)
 Performs exclusive downsweep raking scan.
 
__device__ __forceinline__ void ResetCounters ()
 
__device__ __forceinline__ void ScanCounters ()
 

Private Attributes

_TempStorage & temp_storage
 Shared storage reference.
 
unsigned int linear_tid
 Linear thread-id.
 
PackedCounter cached_segment [RAKING_SEGMENT]
 Copy of raking segment, promoted to registers.
 

Member Enumeration Documentation

◆ anonymous enum

template<int BLOCK_DIM_X, int RADIX_BITS, bool IS_DESCENDING, bool MEMOIZE_OUTER_SCAN = (CUB_PTX_ARCH >= 350) ? true : false, BlockScanAlgorithm INNER_SCAN_ALGORITHM = BLOCK_SCAN_WARP_SCANS, cudaSharedMemConfig SMEM_CONFIG = cudaSharedMemBankSizeFourByte, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>
anonymous enum
Enumerator
BINS_TRACKED_PER_THREAD 

Number of bin-starting offsets tracked per thread.

Definition at line 140 of file block_radix_rank.cuh.

Constructor & Destructor Documentation

◆ BlockRadixRank()

template<int BLOCK_DIM_X, int RADIX_BITS, bool IS_DESCENDING, bool MEMOIZE_OUTER_SCAN = (CUB_PTX_ARCH >= 350) ? true : false, BlockScanAlgorithm INNER_SCAN_ALGORITHM = BLOCK_SCAN_WARP_SCANS, cudaSharedMemConfig SMEM_CONFIG = cudaSharedMemBankSizeFourByte, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>
__device__ __forceinline__ cub::BlockRadixRank< BLOCK_DIM_X, RADIX_BITS, IS_DESCENDING, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM, SMEM_CONFIG, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::BlockRadixRank ( TempStorage temp_storage)
inline

Collective constructor using the specified memory allocation as temporary storage.

Parameters
[in]temp_storageReference to memory allocation having layout type TempStorage

Definition at line 331 of file block_radix_rank.cuh.

Member Function Documentation

◆ PrivateStorage()

template<int BLOCK_DIM_X, int RADIX_BITS, bool IS_DESCENDING, bool MEMOIZE_OUTER_SCAN = (CUB_PTX_ARCH >= 350) ? true : false, BlockScanAlgorithm INNER_SCAN_ALGORITHM = BLOCK_SCAN_WARP_SCANS, cudaSharedMemConfig SMEM_CONFIG = cudaSharedMemBankSizeFourByte, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>
__device__ __forceinline__ _TempStorage& cub::BlockRadixRank< BLOCK_DIM_X, RADIX_BITS, IS_DESCENDING, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM, SMEM_CONFIG, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::PrivateStorage ( )
inlineprivate

Internal storage allocator

Definition at line 196 of file block_radix_rank.cuh.

◆ RankKeys() [1/2]

template<int BLOCK_DIM_X, int RADIX_BITS, bool IS_DESCENDING, bool MEMOIZE_OUTER_SCAN = (CUB_PTX_ARCH >= 350) ? true : false, BlockScanAlgorithm INNER_SCAN_ALGORITHM = BLOCK_SCAN_WARP_SCANS, cudaSharedMemConfig SMEM_CONFIG = cudaSharedMemBankSizeFourByte, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>
template<typename UnsignedBits , int KEYS_PER_THREAD>
__device__ __forceinline__ void cub::BlockRadixRank< BLOCK_DIM_X, RADIX_BITS, IS_DESCENDING, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM, SMEM_CONFIG, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::RankKeys ( UnsignedBits(&)  keys[KEYS_PER_THREAD],
int(&)  ranks[KEYS_PER_THREAD],
int  current_bit,
int  num_bits 
)
inline

Rank keys.

Parameters
[in]keysKeys for this tile
[out]ranksFor each key, the local rank within the tile
[in]current_bitThe least-significant bit position of the current digit to extract
[in]num_bitsThe number of bits in the current digit

Definition at line 351 of file block_radix_rank.cuh.

◆ RankKeys() [2/2]

template<int BLOCK_DIM_X, int RADIX_BITS, bool IS_DESCENDING, bool MEMOIZE_OUTER_SCAN = (CUB_PTX_ARCH >= 350) ? true : false, BlockScanAlgorithm INNER_SCAN_ALGORITHM = BLOCK_SCAN_WARP_SCANS, cudaSharedMemConfig SMEM_CONFIG = cudaSharedMemBankSizeFourByte, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>
template<typename UnsignedBits , int KEYS_PER_THREAD>
__device__ __forceinline__ void cub::BlockRadixRank< BLOCK_DIM_X, RADIX_BITS, IS_DESCENDING, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM, SMEM_CONFIG, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::RankKeys ( UnsignedBits(&)  keys[KEYS_PER_THREAD],
int(&)  ranks[KEYS_PER_THREAD],
int  current_bit,
int  num_bits,
int(&)  exclusive_digit_prefix[BINS_TRACKED_PER_THREAD] 
)
inline

Rank keys. For the lower RADIX_DIGITS threads, digit counts for each digit are provided for the corresponding thread.

Parameters
[in]keysKeys for this tile
[out]ranksFor each key, the local rank within the tile (out parameter)
[in]current_bitThe least-significant bit position of the current digit to extract
[in]num_bitsThe number of bits in the current digit
[out]exclusive_digit_prefixThe exclusive prefix sum for the digits [(threadIdx.x * BINS_TRACKED_PER_THREAD) ... (threadIdx.x * BINS_TRACKED_PER_THREAD) + BINS_TRACKED_PER_THREAD - 1]

Definition at line 413 of file block_radix_rank.cuh.

◆ ResetCounters()

template<int BLOCK_DIM_X, int RADIX_BITS, bool IS_DESCENDING, bool MEMOIZE_OUTER_SCAN = (CUB_PTX_ARCH >= 350) ? true : false, BlockScanAlgorithm INNER_SCAN_ALGORITHM = BLOCK_SCAN_WARP_SCANS, cudaSharedMemConfig SMEM_CONFIG = cudaSharedMemBankSizeFourByte, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>
__device__ __forceinline__ void cub::BlockRadixRank< BLOCK_DIM_X, RADIX_BITS, IS_DESCENDING, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM, SMEM_CONFIG, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::ResetCounters ( )
inlineprivate

Reset shared memory digit counters

Definition at line 258 of file block_radix_rank.cuh.

◆ ScanCounters()

template<int BLOCK_DIM_X, int RADIX_BITS, bool IS_DESCENDING, bool MEMOIZE_OUTER_SCAN = (CUB_PTX_ARCH >= 350) ? true : false, BlockScanAlgorithm INNER_SCAN_ALGORITHM = BLOCK_SCAN_WARP_SCANS, cudaSharedMemConfig SMEM_CONFIG = cudaSharedMemBankSizeFourByte, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>
__device__ __forceinline__ void cub::BlockRadixRank< BLOCK_DIM_X, RADIX_BITS, IS_DESCENDING, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM, SMEM_CONFIG, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::ScanCounters ( )
inlineprivate

Scan shared memory digit counters.

Definition at line 293 of file block_radix_rank.cuh.

◆ Upsweep()

template<int BLOCK_DIM_X, int RADIX_BITS, bool IS_DESCENDING, bool MEMOIZE_OUTER_SCAN = (CUB_PTX_ARCH >= 350) ? true : false, BlockScanAlgorithm INNER_SCAN_ALGORITHM = BLOCK_SCAN_WARP_SCANS, cudaSharedMemConfig SMEM_CONFIG = cudaSharedMemBankSizeFourByte, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>
__device__ __forceinline__ PackedCounter cub::BlockRadixRank< BLOCK_DIM_X, RADIX_BITS, IS_DESCENDING, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM, SMEM_CONFIG, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::Upsweep ( )
inlineprivate

Performs upsweep raking reduction, returning the aggregate

Definition at line 206 of file block_radix_rank.cuh.


The documentation for this class was generated from the following file: