OpenFPM_pdata  3.0.0
Project that contains the implementation of distributed structures
UtilPtx

Macros

#define __CUB_LP64__   0
 
#define _CUB_ASM_PTR_   "r"
 
#define _CUB_ASM_PTR_SIZE_   "u32"
 

Functions

__device__ __forceinline__ unsigned int cub::SHR_ADD (unsigned int x, unsigned int shift, unsigned int addend)
 Shift-right then add. Returns (x >> shift) + addend.
 
__device__ __forceinline__ unsigned int cub::SHL_ADD (unsigned int x, unsigned int shift, unsigned int addend)
 Shift-left then add. Returns (x << shift) + addend.
 
template<typename UnsignedBits , int BYTE_LEN>
__device__ __forceinline__ unsigned int cub::BFE (UnsignedBits source, unsigned int bit_start, unsigned int num_bits, Int2Type< BYTE_LEN >)
 
template<typename UnsignedBits >
__device__ __forceinline__ unsigned int cub::BFE (UnsignedBits source, unsigned int bit_start, unsigned int num_bits, Int2Type< 8 >)
 
template<typename UnsignedBits >
__device__ __forceinline__ unsigned int cub::BFE (UnsignedBits source, unsigned int bit_start, unsigned int num_bits)
 Bitfield-extract. Extracts num_bits from source starting at bit-offset bit_start. The input source may be an 8b, 16b, 32b, or 64b unsigned integer type.
 
__device__ __forceinline__ void cub::BFI (unsigned int &ret, unsigned int x, unsigned int y, unsigned int bit_start, unsigned int num_bits)
 Bitfield insert. Inserts the num_bits least significant bits of y into x at bit-offset bit_start.
 
__device__ __forceinline__ unsigned int cub::IADD3 (unsigned int x, unsigned int y, unsigned int z)
 Three-operand add. Returns x + y + z.
 
__device__ __forceinline__ int cub::PRMT (unsigned int a, unsigned int b, unsigned int index)
 Byte-permute. Pick four arbitrary bytes from two 32-bit registers, and reassemble them into a 32-bit destination register. For SM2.0 or later. More...
 
__device__ __forceinline__ void cub::BAR (int count)
 
__device__ __forceinline__ void cub::CTA_SYNC ()
 
__device__ __forceinline__ int cub::CTA_SYNC_AND (int p)
 
__device__ __forceinline__ void cub::WARP_SYNC (unsigned int member_mask)
 
__device__ __forceinline__ int cub::WARP_ANY (int predicate, unsigned int member_mask)
 
__device__ __forceinline__ int cub::WARP_ALL (int predicate, unsigned int member_mask)
 
__device__ __forceinline__ int cub::WARP_BALLOT (int predicate, unsigned int member_mask)
 
__device__ __forceinline__ unsigned int cub::SHFL_UP_SYNC (unsigned int word, int src_offset, int flags, unsigned int member_mask)
 
__device__ __forceinline__ unsigned int cub::SHFL_DOWN_SYNC (unsigned int word, int src_offset, int flags, unsigned int member_mask)
 
__device__ __forceinline__ unsigned int cub::SHFL_IDX_SYNC (unsigned int word, int src_lane, int flags, unsigned int member_mask)
 
__device__ __forceinline__ float cub::FMUL_RZ (float a, float b)
 
__device__ __forceinline__ float cub::FFMA_RZ (float a, float b, float c)
 
__device__ __forceinline__ void cub::ThreadExit ()
 Terminates the calling thread.
 
__device__ __forceinline__ void cub::ThreadTrap ()
 Abort execution and generate an interrupt to the host CPU.
 
__device__ __forceinline__ int cub::RowMajorTid (int block_dim_x, int block_dim_y, int block_dim_z)
 Returns the row-major linear thread identifier for a multidimensional thread block.
 
__device__ __forceinline__ unsigned int cub::LaneId ()
 Returns the warp lane ID of the calling thread.
 
__device__ __forceinline__ unsigned int cub::WarpId ()
 Returns the warp ID of the calling thread. Warp ID is guaranteed to be unique among warps, but may not correspond to a zero-based ranking within the thread block.
 
__device__ __forceinline__ unsigned int cub::LaneMaskLt ()
 Returns the warp lane mask of all lanes less than the calling thread.
 
__device__ __forceinline__ unsigned int cub::LaneMaskLe ()
 Returns the warp lane mask of all lanes less than or equal to the calling thread.
 
__device__ __forceinline__ unsigned int cub::LaneMaskGt ()
 Returns the warp lane mask of all lanes greater than the calling thread.
 
__device__ __forceinline__ unsigned int cub::LaneMaskGe ()
 Returns the warp lane mask of all lanes greater than or equal to the calling thread.
 

Detailed Description

Macro Definition Documentation

◆ __CUB_LP64__

#define __CUB_LP64__   0

Register modifier for pointer-types (for inlining PTX assembly)

Definition at line 71 of file util_ptx.cuh.

Function Documentation

◆ BAR()

__device__ __forceinline__ void cub::BAR ( int  count)

Sync-threads barrier.

Definition at line 247 of file util_ptx.cuh.

◆ BFE() [1/2]

template<typename UnsignedBits , int BYTE_LEN>
__device__ __forceinline__ unsigned int cub::BFE ( UnsignedBits  source,
unsigned int  bit_start,
unsigned int  num_bits,
Int2Type< BYTE_LEN >   
)

Bitfield-extract.

Definition at line 127 of file util_ptx.cuh.

◆ BFE() [2/2]

template<typename UnsignedBits >
__device__ __forceinline__ unsigned int cub::BFE ( UnsignedBits  source,
unsigned int  bit_start,
unsigned int  num_bits,
Int2Type< 8 >   
)

Bitfield-extract for 64-bit types.

Definition at line 148 of file util_ptx.cuh.

◆ CTA_SYNC()

__device__ __forceinline__ void cub::CTA_SYNC ( )

CTA barrier

Definition at line 255 of file util_ptx.cuh.

◆ CTA_SYNC_AND()

__device__ __forceinline__ int cub::CTA_SYNC_AND ( int  p)

CTA barrier with predicate

Definition at line 264 of file util_ptx.cuh.

◆ FFMA_RZ()

__device__ __forceinline__ float cub::FFMA_RZ ( float  a,
float  b,
float  c 
)

Floating point multiply-add. (Mantissa LSB rounds towards zero.)

Definition at line 381 of file util_ptx.cuh.

◆ FMUL_RZ()

__device__ __forceinline__ float cub::FMUL_RZ ( float  a,
float  b 
)

Floating point multiply. (Mantissa LSB rounds towards zero.)

Definition at line 370 of file util_ptx.cuh.

◆ PRMT()

__device__ __forceinline__ int cub::PRMT ( unsigned int  a,
unsigned int  b,
unsigned int  index 
)

Byte-permute. Pick four arbitrary bytes from two 32-bit registers, and reassemble them into a 32-bit destination register. For SM2.0 or later.

The bytes in the two source registers a and b are numbered from 0 to 7: {b, a} = {{b7, b6, b5, b4}, {b3, b2, b1, b0}}. For each of the four bytes {b3, b2, b1, b0} selected in the return value, a 4-bit selector is defined within the four lower "nibbles" of index: {index } = {n7, n6, n5, n4, n3, n2, n1, n0}
Snippet
The code snippet below illustrates byte-permute.
#include <cub/cub.cuh>
__global__ void ExampleKernel(...)
{
int a = 0x03020100;
int b = 0x07060504;
int index = 0x00007531;
int selected = PRMT(a, b, index); // 0x07050301
}

Definition at line 235 of file util_ptx.cuh.

◆ SHFL_DOWN_SYNC()

__device__ __forceinline__ unsigned int cub::SHFL_DOWN_SYNC ( unsigned int  word,
int  src_offset,
int  flags,
unsigned int  member_mask 
)

Warp synchronous shfl_down

Definition at line 339 of file util_ptx.cuh.

◆ SHFL_IDX_SYNC()

__device__ __forceinline__ unsigned int cub::SHFL_IDX_SYNC ( unsigned int  word,
int  src_lane,
int  flags,
unsigned int  member_mask 
)

Warp synchronous shfl_idx

Definition at line 355 of file util_ptx.cuh.

◆ SHFL_UP_SYNC()

__device__ __forceinline__ unsigned int cub::SHFL_UP_SYNC ( unsigned int  word,
int  src_offset,
int  flags,
unsigned int  member_mask 
)

Warp synchronous shfl_up

Definition at line 323 of file util_ptx.cuh.

◆ WARP_ALL()

__device__ __forceinline__ int cub::WARP_ALL ( int  predicate,
unsigned int  member_mask 
)

Warp all

Definition at line 297 of file util_ptx.cuh.

◆ WARP_ANY()

__device__ __forceinline__ int cub::WARP_ANY ( int  predicate,
unsigned int  member_mask 
)

Warp any

Definition at line 284 of file util_ptx.cuh.

◆ WARP_BALLOT()

__device__ __forceinline__ int cub::WARP_BALLOT ( int  predicate,
unsigned int  member_mask 
)

Warp ballot

Definition at line 310 of file util_ptx.cuh.

◆ WARP_SYNC()

__device__ __forceinline__ void cub::WARP_SYNC ( unsigned int  member_mask)

Warp barrier

Definition at line 273 of file util_ptx.cuh.