#include "util_type.cuh"
#include "util_arch.cuh"
#include "util_namespace.cuh"
#include "util_debug.cuh"
Go to the source code of this file.
Namespaces | |
namespace | cub |
Optional outer namespace(s) | |
Macros | |
#define | __CUB_LP64__ 0 |
#define | _CUB_ASM_PTR_ "r" |
#define | _CUB_ASM_PTR_SIZE_ "u32" |
Functions | |
__device__ __forceinline__ unsigned int | cub::SHR_ADD (unsigned int x, unsigned int shift, unsigned int addend) |
Shift-right then add. Returns (x >> shift) + addend. | |
__device__ __forceinline__ unsigned int | cub::SHL_ADD (unsigned int x, unsigned int shift, unsigned int addend) |
Shift-left then add. Returns (x << shift) + addend. | |
template<typename UnsignedBits , int BYTE_LEN> | |
__device__ __forceinline__ unsigned int | cub::BFE (UnsignedBits source, unsigned int bit_start, unsigned int num_bits, Int2Type< BYTE_LEN >) |
template<typename UnsignedBits > | |
__device__ __forceinline__ unsigned int | cub::BFE (UnsignedBits source, unsigned int bit_start, unsigned int num_bits, Int2Type< 8 >) |
template<typename UnsignedBits > | |
__device__ __forceinline__ unsigned int | cub::BFE (UnsignedBits source, unsigned int bit_start, unsigned int num_bits) |
Bitfield-extract. Extracts num_bits from source starting at bit-offset bit_start. The input source may be an 8-bit, 16-bit, 32-bit, or 64-bit unsigned integer type. | |
__device__ __forceinline__ void | cub::BFI (unsigned int &ret, unsigned int x, unsigned int y, unsigned int bit_start, unsigned int num_bits) |
Bitfield insert. Inserts the num_bits least significant bits of y into x at bit-offset bit_start. | |
__device__ __forceinline__ unsigned int | cub::IADD3 (unsigned int x, unsigned int y, unsigned int z) |
Three-operand add. Returns x + y + z. | |
__device__ __forceinline__ int | cub::PRMT (unsigned int a, unsigned int b, unsigned int index) |
Byte-permute. Pick four arbitrary bytes from two 32-bit registers, and reassemble them into a 32-bit destination register. For SM2.0 or later. | |
__device__ __forceinline__ void | cub::BAR (int count) |
__device__ __forceinline__ void | cub::CTA_SYNC () |
__device__ __forceinline__ int | cub::CTA_SYNC_AND (int p) |
__device__ __forceinline__ void | cub::WARP_SYNC (unsigned int member_mask) |
__device__ __forceinline__ int | cub::WARP_ANY (int predicate, unsigned int member_mask) |
__device__ __forceinline__ int | cub::WARP_ALL (int predicate, unsigned int member_mask) |
__device__ __forceinline__ int | cub::WARP_BALLOT (int predicate, unsigned int member_mask) |
__device__ __forceinline__ unsigned int | cub::SHFL_UP_SYNC (unsigned int word, int src_offset, int flags, unsigned int member_mask) |
__device__ __forceinline__ unsigned int | cub::SHFL_DOWN_SYNC (unsigned int word, int src_offset, int flags, unsigned int member_mask) |
__device__ __forceinline__ unsigned int | cub::SHFL_IDX_SYNC (unsigned int word, int src_lane, int flags, unsigned int member_mask) |
__device__ __forceinline__ float | cub::FMUL_RZ (float a, float b) |
__device__ __forceinline__ float | cub::FFMA_RZ (float a, float b, float c) |
__device__ __forceinline__ void | cub::ThreadExit () |
Terminates the calling thread. | |
__device__ __forceinline__ void | cub::ThreadTrap () |
Abort execution and generate an interrupt to the host CPU. | |
__device__ __forceinline__ int | cub::RowMajorTid (int block_dim_x, int block_dim_y, int block_dim_z) |
Returns the row-major linear thread identifier for a multidimensional thread block. | |
__device__ __forceinline__ unsigned int | cub::LaneId () |
Returns the warp lane ID of the calling thread. | |
__device__ __forceinline__ unsigned int | cub::WarpId () |
Returns the warp ID of the calling thread. Warp ID is guaranteed to be unique among warps, but may not correspond to a zero-based ranking within the thread block. | |
__device__ __forceinline__ unsigned int | cub::LaneMaskLt () |
Returns the warp lane mask of all lanes less than the calling thread. | |
__device__ __forceinline__ unsigned int | cub::LaneMaskLe () |
Returns the warp lane mask of all lanes less than or equal to the calling thread. | |
__device__ __forceinline__ unsigned int | cub::LaneMaskGt () |
Returns the warp lane mask of all lanes greater than the calling thread. | |
__device__ __forceinline__ unsigned int | cub::LaneMaskGe () |
Returns the warp lane mask of all lanes greater than or equal to the calling thread. | |
template<int LOGICAL_WARP_THREADS, typename T > | |
__device__ __forceinline__ T | cub::ShuffleUp (T input, int src_offset, int first_thread, unsigned int member_mask) |
Shuffle-up for any data type. Each warp-lane$_{i}$ obtains the value input contributed by warp-lane$_{i - src\_offset}$. For thread lanes i < src_offset, the thread's own input is returned to the thread. | |
template<int LOGICAL_WARP_THREADS, typename T > | |
__device__ __forceinline__ T | cub::ShuffleDown (T input, int src_offset, int last_thread, unsigned int member_mask) |
Shuffle-down for any data type. Each warp-lane$_{i}$ obtains the value input contributed by warp-lane$_{i + src\_offset}$. For thread lanes i >= WARP_THREADS, the thread's own input is returned to the thread. | |
template<int LOGICAL_WARP_THREADS, typename T > | |
__device__ __forceinline__ T | cub::ShuffleIndex (T input, int src_lane, unsigned int member_mask) |
Shuffle-broadcast for any data type. Each warp-lane$_{i}$ obtains the value input contributed by warp-lane$_{src\_lane}$. If src_lane < 0 or src_lane >= WARP_THREADS, the thread's own input is returned to the thread. | |
template<int LABEL_BITS> | |
__device__ unsigned int | cub::MatchAny (unsigned int label) |
PTX intrinsics
Definition in file util_ptx.cuh.