Macros | |
#define | __CUB_LP64__ 0 |
#define | _CUB_ASM_PTR_ "r" |
#define | _CUB_ASM_PTR_SIZE_ "u32" |
Functions | |
__device__ __forceinline__ unsigned int | cub::SHR_ADD (unsigned int x, unsigned int shift, unsigned int addend) |
Shift-right then add. Returns (x >> shift) + addend. | |
__device__ __forceinline__ unsigned int | cub::SHL_ADD (unsigned int x, unsigned int shift, unsigned int addend) |
Shift-left then add. Returns (x << shift) + addend. | |
template<typename UnsignedBits , int BYTE_LEN> | |
__device__ __forceinline__ unsigned int | cub::BFE (UnsignedBits source, unsigned int bit_start, unsigned int num_bits, Int2Type< BYTE_LEN >) |
template<typename UnsignedBits > | |
__device__ __forceinline__ unsigned int | cub::BFE (UnsignedBits source, unsigned int bit_start, unsigned int num_bits, Int2Type< 8 >) |
template<typename UnsignedBits > | |
__device__ __forceinline__ unsigned int | cub::BFE (UnsignedBits source, unsigned int bit_start, unsigned int num_bits) |
Bitfield-extract. Extracts num_bits from source starting at bit-offset bit_start. The input source may be an 8b, 16b, 32b, or 64b unsigned integer type. | |
__device__ __forceinline__ void | cub::BFI (unsigned int &ret, unsigned int x, unsigned int y, unsigned int bit_start, unsigned int num_bits) |
Bitfield insert. Inserts the num_bits least significant bits of y into x at bit-offset bit_start. | |
__device__ __forceinline__ unsigned int | cub::IADD3 (unsigned int x, unsigned int y, unsigned int z) |
Three-operand add. Returns x + y + z. | |
__device__ __forceinline__ int | cub::PRMT (unsigned int a, unsigned int b, unsigned int index) |
Byte-permute. Pick four arbitrary bytes from two 32-bit registers, and reassemble them into a 32-bit destination register. For SM2.0 or later. | |
__device__ __forceinline__ void | cub::BAR (int count) |
__device__ __forceinline__ void | cub::CTA_SYNC () |
__device__ __forceinline__ int | cub::CTA_SYNC_AND (int p) |
__device__ __forceinline__ void | cub::WARP_SYNC (unsigned int member_mask) |
__device__ __forceinline__ int | cub::WARP_ANY (int predicate, unsigned int member_mask) |
__device__ __forceinline__ int | cub::WARP_ALL (int predicate, unsigned int member_mask) |
__device__ __forceinline__ int | cub::WARP_BALLOT (int predicate, unsigned int member_mask) |
__device__ __forceinline__ unsigned int | cub::SHFL_UP_SYNC (unsigned int word, int src_offset, int flags, unsigned int member_mask) |
__device__ __forceinline__ unsigned int | cub::SHFL_DOWN_SYNC (unsigned int word, int src_offset, int flags, unsigned int member_mask) |
__device__ __forceinline__ unsigned int | cub::SHFL_IDX_SYNC (unsigned int word, int src_lane, int flags, unsigned int member_mask) |
__device__ __forceinline__ float | cub::FMUL_RZ (float a, float b) |
__device__ __forceinline__ float | cub::FFMA_RZ (float a, float b, float c) |
__device__ __forceinline__ void | cub::ThreadExit () |
Terminates the calling thread. | |
__device__ __forceinline__ void | cub::ThreadTrap () |
Abort execution and generate an interrupt to the host CPU. | |
__device__ __forceinline__ int | cub::RowMajorTid (int block_dim_x, int block_dim_y, int block_dim_z) |
Returns the row-major linear thread identifier for a multidimensional thread block. | |
__device__ __forceinline__ unsigned int | cub::LaneId () |
Returns the warp lane ID of the calling thread. | |
__device__ __forceinline__ unsigned int | cub::WarpId () |
Returns the warp ID of the calling thread. Warp ID is guaranteed to be unique among warps, but may not correspond to a zero-based ranking within the thread block. | |
__device__ __forceinline__ unsigned int | cub::LaneMaskLt () |
Returns the warp lane mask of all lanes less than the calling thread. | |
__device__ __forceinline__ unsigned int | cub::LaneMaskLe () |
Returns the warp lane mask of all lanes less than or equal to the calling thread. | |
__device__ __forceinline__ unsigned int | cub::LaneMaskGt () |
Returns the warp lane mask of all lanes greater than the calling thread. | |
__device__ __forceinline__ unsigned int | cub::LaneMaskGe () |
Returns the warp lane mask of all lanes greater than or equal to the calling thread. | |
#define __CUB_LP64__ 0 |
Register modifier for pointer-types (for inlining PTX assembly)
Definition at line 71 of file util_ptx.cuh.
#define _CUB_ASM_PTR_ "r" |
Definition at line 73 of file util_ptx.cuh.
#define _CUB_ASM_PTR_SIZE_ "u32" |
Definition at line 74 of file util_ptx.cuh.
__device__ __forceinline__ void cub::BAR | ( | int | count | ) |
Sync-threads barrier.
Definition at line 247 of file util_ptx.cuh.
__device__ __forceinline__ unsigned int cub::BFE | ( | UnsignedBits | source, |
unsigned int | bit_start, | ||
unsigned int | num_bits | ||
) |
Bitfield-extract. Extracts num_bits from source starting at bit-offset bit_start. The input source may be an 8b, 16b, 32b, or 64b unsigned integer type.
Definition at line 164 of file util_ptx.cuh.
__device__ __forceinline__ unsigned int cub::BFE | ( | UnsignedBits | source, |
unsigned int | bit_start, | ||
unsigned int | num_bits, | ||
Int2Type< 8 > | |||
) |
Bitfield-extract for 64-bit types.
Definition at line 148 of file util_ptx.cuh.
__device__ __forceinline__ unsigned int cub::BFE | ( | UnsignedBits | source, |
unsigned int | bit_start, | ||
unsigned int | num_bits, | ||
Int2Type< BYTE_LEN > | |||
) |
Bitfield-extract.
Definition at line 127 of file util_ptx.cuh.
__device__ __forceinline__ void cub::BFI | ( | unsigned int & | ret, |
unsigned int | x, | ||
unsigned int | y, | ||
unsigned int | bit_start, | ||
unsigned int | num_bits | ||
) |
Bitfield insert. Inserts the num_bits least significant bits of y into x at bit-offset bit_start.
Definition at line 176 of file util_ptx.cuh.
__device__ __forceinline__ void cub::CTA_SYNC | ( | ) |
CTA barrier
Definition at line 255 of file util_ptx.cuh.
CTA barrier with predicate (cub::CTA_SYNC_AND)
Definition at line 264 of file util_ptx.cuh.
__device__ __forceinline__ float cub::FFMA_RZ | ( | float | a, |
float | b, | ||
float | c | ||
) |
Floating point multiply-add. (Mantissa LSB rounds towards zero.)
Definition at line 381 of file util_ptx.cuh.
__device__ __forceinline__ float cub::FMUL_RZ | ( | float | a, |
float | b | ||
) |
Floating point multiply. (Mantissa LSB rounds towards zero.)
Definition at line 370 of file util_ptx.cuh.
__device__ __forceinline__ unsigned int cub::IADD3 | ( | unsigned int | x, |
unsigned int | y, | ||
unsigned int | z | ||
) |
Three-operand add. Returns x + y + z.
Definition at line 198 of file util_ptx.cuh.
__device__ __forceinline__ unsigned int cub::LaneId | ( | ) |
Returns the warp lane ID of the calling thread.
Definition at line 420 of file util_ptx.cuh.
__device__ __forceinline__ unsigned int cub::LaneMaskGe | ( | ) |
Returns the warp lane mask of all lanes greater than or equal to the calling thread.
Definition at line 471 of file util_ptx.cuh.
__device__ __forceinline__ unsigned int cub::LaneMaskGt | ( | ) |
Returns the warp lane mask of all lanes greater than the calling thread.
Definition at line 461 of file util_ptx.cuh.
__device__ __forceinline__ unsigned int cub::LaneMaskLe | ( | ) |
Returns the warp lane mask of all lanes less than or equal to the calling thread.
Definition at line 451 of file util_ptx.cuh.
__device__ __forceinline__ unsigned int cub::LaneMaskLt | ( | ) |
Returns the warp lane mask of all lanes less than the calling thread.
Definition at line 441 of file util_ptx.cuh.
Byte-permute. Pick four arbitrary bytes from two 32-bit registers, and reassemble them into a 32-bit destination register. For SM2.0 or later.
The bytes in the two source registers a and b are numbered from 0 to 7: {b, a} = {{b7, b6, b5, b4}, {b3, b2, b1, b0}}. For each of the four bytes {b3, b2, b1, b0} selected in the return value, a 4-bit selector is defined within the four lower "nibbles" of index: {index} = {n7, n6, n5, n4, n3, n2, n1, n0}
Definition at line 235 of file util_ptx.cuh.
__device__ __forceinline__ int cub::RowMajorTid | ( | int | block_dim_x, |
int | block_dim_y, | ||
int | block_dim_z | ||
) |
Returns the row-major linear thread identifier for a multidimensional thread block.
Definition at line 409 of file util_ptx.cuh.
__device__ __forceinline__ unsigned int cub::SHFL_DOWN_SYNC | ( | unsigned int | word, |
int | src_offset, | ||
int | flags, | ||
unsigned int | member_mask | ||
) |
Warp synchronous shfl_down
Definition at line 339 of file util_ptx.cuh.
__device__ __forceinline__ unsigned int cub::SHFL_IDX_SYNC | ( | unsigned int | word, |
int | src_lane, | ||
int | flags, | ||
unsigned int | member_mask | ||
) |
Warp synchronous shfl_idx
Definition at line 355 of file util_ptx.cuh.
__device__ __forceinline__ unsigned int cub::SHFL_UP_SYNC | ( | unsigned int | word, |
int | src_offset, | ||
int | flags, | ||
unsigned int | member_mask | ||
) |
Warp synchronous shfl_up
Definition at line 323 of file util_ptx.cuh.
__device__ __forceinline__ unsigned int cub::SHL_ADD | ( | unsigned int | x, |
unsigned int | shift, | ||
unsigned int | addend | ||
) |
Shift-left then add. Returns (x << shift) + addend.
Definition at line 106 of file util_ptx.cuh.
__device__ __forceinline__ unsigned int cub::SHR_ADD | ( | unsigned int | x, |
unsigned int | shift, | ||
unsigned int | addend | ||
) |
Shift-right then add. Returns (x >> shift) + addend.
Definition at line 87 of file util_ptx.cuh.
__device__ __forceinline__ void cub::ThreadExit | ( | ) |
Terminates the calling thread.
Definition at line 393 of file util_ptx.cuh.
__device__ __forceinline__ void cub::ThreadTrap | ( | ) |
Abort execution and generate an interrupt to the host CPU.
Definition at line 401 of file util_ptx.cuh.
Warp all (cub::WARP_ALL)
Definition at line 297 of file util_ptx.cuh.
Warp any (cub::WARP_ANY)
Definition at line 284 of file util_ptx.cuh.
Warp ballot (cub::WARP_BALLOT)
Definition at line 310 of file util_ptx.cuh.
__device__ __forceinline__ void cub::WARP_SYNC | ( | unsigned int | member_mask | ) |
Warp barrier
Definition at line 273 of file util_ptx.cuh.
__device__ __forceinline__ unsigned int cub::WarpId | ( | ) |
Returns the warp ID of the calling thread. Warp ID is guaranteed to be unique among warps, but may not correspond to a zero-based ranking within the thread block.
Definition at line 431 of file util_ptx.cuh.