#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document

// Pointer-size inline-PTX constraints: 64-bit platforms pass pointers in
// 64-bit registers ("l"/"u64"), 32-bit platforms in 32-bit ("r"/"u32").
#if defined(_WIN64) || defined(__LP64__)
    #define __CUB_LP64__ 1
    #define _CUB_ASM_PTR_ "l"
    #define _CUB_ASM_PTR_SIZE_ "u64"
#else
    #define __CUB_LP64__ 0
    #define _CUB_ASM_PTR_ "r"
    #define _CUB_ASM_PTR_SIZE_ "u32"
#endif

#endif // DOXYGEN_SHOULD_SKIP_THIS


/**
 * \brief Shift-right then add.  Returns (x >> shift) + addend.
 */
__device__ __forceinline__ unsigned int SHR_ADD(
    unsigned int x,         ///< [in] Value to shift
    unsigned int shift,     ///< [in] Shift amount
    unsigned int addend)    ///< [in] Value to add
{
    unsigned int ret;
#if CUB_PTX_ARCH >= 200
    // SM20+: fused shift-and-add in a single vshr instruction
    asm ("vshr.u32.u32.u32.clamp.add %0, %1, %2, %3;" :
        "=r"(ret) : "r"(x), "r"(shift), "r"(addend));
#else
    ret = (x >> shift) + addend;
#endif
    return ret;
}
/**
 * \brief Shift-left then add.  Returns (x << shift) + addend.
 */
__device__ __forceinline__ unsigned int SHL_ADD(
    unsigned int x,         ///< [in] Value to shift
    unsigned int shift,     ///< [in] Shift amount
    unsigned int addend)    ///< [in] Value to add
{
    unsigned int ret;
#if CUB_PTX_ARCH >= 200
    // SM20+: fused shift-and-add in a single vshl instruction
    asm ("vshl.u32.u32.u32.clamp.add %0, %1, %2, %3;" :
        "=r"(ret) : "r"(x), "r"(shift), "r"(addend));
#else
    ret = (x << shift) + addend;
#endif
    return ret;
}
#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document

/**
 * \brief Bitfield-extract for source types of up to 4 bytes.  Extracts
 * \p num_bits bits of \p source starting at \p bit_start.
 *
 * The Int2Type<BYTE_LEN> tag selects this overload for sizeof(source) <= 4
 * (the 8-byte overload below handles larger types).
 */
template <typename UnsignedBits, int BYTE_LEN>
__device__ __forceinline__ unsigned int BFE(
    UnsignedBits            source,         ///< [in] Source word to extract from
    unsigned int            bit_start,      ///< [in] Starting bit position
    unsigned int            num_bits,       ///< [in] Number of bits to extract
    Int2Type<BYTE_LEN>      /*byte_len*/)   ///< [in] Tag: byte-length of source type
{
    unsigned int bits;
#if CUB_PTX_ARCH >= 200
    // SM20+: single bfe instruction
    asm ("bfe.u32 %0, %1, %2, %3;" :
        "=r"(bits) : "r"((unsigned int) source), "r"(bit_start), "r"(num_bits));
#else
    // Portable fallback: mask-and-shift
    const unsigned int MASK = (1 << num_bits) - 1;
    bits = (source >> bit_start) & MASK;
#endif
    return bits;
}
/**
 * \brief Bitfield-extract for 8-byte source types.  Extracts \p num_bits bits
 * of \p source starting at \p bit_start.
 *
 * No 64-bit bfe PTX form is used here; a mask-and-shift is performed instead.
 */
template <typename UnsignedBits>
__device__ __forceinline__ unsigned int BFE(
    UnsignedBits            source,         ///< [in] Source word to extract from
    unsigned int            bit_start,      ///< [in] Starting bit position
    unsigned int            num_bits,       ///< [in] Number of bits to extract
    Int2Type<8>             /*byte_len*/)   ///< [in] Tag: source type is 8 bytes
{
    const unsigned long long MASK = (1ull << num_bits) - 1;
    return (source >> bit_start) & MASK;
}

#endif // DOXYGEN_SHOULD_SKIP_THIS
/**
 * \brief Bitfield-extract.  Extracts \p num_bits bits of \p source starting at
 * \p bit_start, returning them in the least-significant bits of the result.
 *
 * Dispatches on sizeof(UnsignedBits) to the tagged overloads above.
 */
template <typename UnsignedBits>
__device__ __forceinline__ unsigned int BFE(
    UnsignedBits source,        ///< [in] Source word to extract from
    unsigned int bit_start,     ///< [in] Starting bit position
    unsigned int num_bits)      ///< [in] Number of bits to extract
{
    return BFE(source, bit_start, num_bits, Int2Type<sizeof(UnsignedBits)>());
}
/**
 * \brief Bitfield insert.  Inserts the \p num_bits least-significant bits of
 * \p y into \p x at bit-offset \p bit_start, writing the result to \p ret.
 */
__device__ __forceinline__ void BFI(
    unsigned int &ret,          ///< [out] Result word
    unsigned int x,             ///< [in] Word to insert into
    unsigned int y,             ///< [in] Word supplying the inserted bits
    unsigned int bit_start,     ///< [in] Starting bit position for insertion
    unsigned int num_bits)      ///< [in] Number of bits to insert
{
#if CUB_PTX_ARCH >= 200
    // SM20+: single bfi instruction
    asm ("bfi.b32 %0, %1, %2, %3, %4;" :
        "=r"(ret) : "r"(y), "r"(x), "r"(bit_start), "r"(num_bits));
#else
    // Fallback: align the inserted bits, then merge under a mask
    // (reconstructed per upstream CUB; the shift line was lost in extraction)
    x <<= bit_start;
    unsigned int MASK_X = ((1 << num_bits) - 1) << bit_start;
    unsigned int MASK_Y = ~MASK_X;
    ret = (y & MASK_Y) | (x & MASK_X);
#endif
}
/**
 * \brief Three-operand add.  Returns x + y + z.
 */
__device__ __forceinline__ unsigned int IADD3(
    unsigned int x,
    unsigned int y,
    unsigned int z)
{
#if CUB_PTX_ARCH >= 200
    // SM20+: fused two-add vadd instruction
    asm ("vadd.u32.u32.u32.add %0, %1, %2, %3;" :
        "=r"(x) : "r"(x), "r"(y), "r"(z));
#else
    x = x + y + z;
#endif
    return x;
}
/**
 * \brief Byte-permute.  Picks four arbitrary bytes from two 32-bit registers
 * (selected by \p index) and reassembles them into a 32-bit result.
 */
__device__ __forceinline__ int PRMT(
    unsigned int a,         ///< [in] First source word (bytes 0-3)
    unsigned int b,         ///< [in] Second source word (bytes 4-7)
    unsigned int index)     ///< [in] Per-nibble byte selectors
{
    int ret;
    asm ("prmt.b32 %0, %1, %2, %3;" :
        "=r"(ret) : "r"(a), "r"(b), "r"(index));
    return ret;
}
#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document

/**
 * \brief Named barrier: synchronize \p count threads at barrier #1.
 */
__device__ __forceinline__ void BAR(int count)
{
    asm volatile("bar.sync 1, %0;" : : "r"(count));
}

/**
 * \brief CTA barrier with predicate reduction.  Returns nonzero iff \p p is
 * nonzero for all threads in the thread block.
 */
__device__ __forceinline__ int CTA_SYNC_AND(int p)
{
    return __syncthreads_and(p);
}
/**
 * \brief Warp barrier over the lanes in \p member_mask.
 *
 * A no-op when built without cooperative-groups support (pre-Volta warp
 * synchrony is assumed in that configuration).
 */
__device__ __forceinline__ void WARP_SYNC(unsigned int member_mask)
{
#ifdef CUB_USE_COOPERATIVE_GROUPS
    __syncwarp(member_mask);
#endif
}
/**
 * \brief Warp vote: nonzero iff \p predicate is nonzero for any lane in
 * \p member_mask (mask ignored by the legacy fallback).
 */
__device__ __forceinline__ int WARP_ANY(int predicate, unsigned int member_mask)
{
#ifdef CUB_USE_COOPERATIVE_GROUPS
    return __any_sync(member_mask, predicate);
#else
    // Legacy mask-less vote (pre-Volta toolchains)
    return ::__any(predicate);
#endif
}
/**
 * \brief Warp vote: nonzero iff \p predicate is nonzero for all lanes in
 * \p member_mask (mask ignored by the legacy fallback).
 */
__device__ __forceinline__ int WARP_ALL(int predicate, unsigned int member_mask)
{
#ifdef CUB_USE_COOPERATIVE_GROUPS
    return __all_sync(member_mask, predicate);
#else
    // Legacy mask-less vote (pre-Volta toolchains)
    return ::__all(predicate);
#endif
}
/**
 * \brief Warp ballot: returns a 32-bit mask with one bit per lane in
 * \p member_mask whose \p predicate is nonzero (mask ignored by the legacy
 * fallback).
 */
__device__ __forceinline__ int WARP_BALLOT(int predicate, unsigned int member_mask)
{
#ifdef CUB_USE_COOPERATIVE_GROUPS
    return __ballot_sync(member_mask, predicate);
#else
    // Legacy mask-less ballot (pre-Volta toolchains)
    return __ballot(predicate);
#endif
}
/**
 * \brief Warp synchronous shfl_up: lane i receives \p word from lane
 * i - \p src_offset, subject to the bounds encoded in \p flags.
 */
__device__ __forceinline__
unsigned int SHFL_UP_SYNC(unsigned int word, int src_offset, int flags, unsigned int member_mask)
{
#ifdef CUB_USE_COOPERATIVE_GROUPS
    asm volatile("shfl.sync.up.b32 %0, %1, %2, %3, %4;"
        : "=r"(word) : "r"(word), "r"(src_offset), "r"(flags), "r"(member_mask));
#else
    // Legacy mask-less shuffle (pre-Volta toolchains)
    asm volatile("shfl.up.b32 %0, %1, %2, %3;"
        : "=r"(word) : "r"(word), "r"(src_offset), "r"(flags));
#endif
    return word;
}
/**
 * \brief Warp synchronous shfl_down: lane i receives \p word from lane
 * i + \p src_offset, subject to the bounds encoded in \p flags.
 */
__device__ __forceinline__
unsigned int SHFL_DOWN_SYNC(unsigned int word, int src_offset, int flags, unsigned int member_mask)
{
#ifdef CUB_USE_COOPERATIVE_GROUPS
    asm volatile("shfl.sync.down.b32 %0, %1, %2, %3, %4;"
        : "=r"(word) : "r"(word), "r"(src_offset), "r"(flags), "r"(member_mask));
#else
    // Legacy mask-less shuffle (pre-Volta toolchains)
    asm volatile("shfl.down.b32 %0, %1, %2, %3;"
        : "=r"(word) : "r"(word), "r"(src_offset), "r"(flags));
#endif
    return word;
}
/**
 * \brief Warp synchronous shfl_idx: every lane receives \p word from lane
 * \p src_lane, subject to the bounds encoded in \p flags.
 */
__device__ __forceinline__
unsigned int SHFL_IDX_SYNC(unsigned int word, int src_lane, int flags, unsigned int member_mask)
{
#ifdef CUB_USE_COOPERATIVE_GROUPS
    asm volatile("shfl.sync.idx.b32 %0, %1, %2, %3, %4;"
        : "=r"(word) : "r"(word), "r"(src_lane), "r"(flags), "r"(member_mask));
#else
    // Legacy mask-less shuffle (pre-Volta toolchains)
    asm volatile("shfl.idx.b32 %0, %1, %2, %3;"
        : "=r"(word) : "r"(word), "r"(src_lane), "r"(flags));
#endif
    return word;
}
/**
 * \brief Floating point multiply with round-toward-zero.  Returns a * b,
 * preventing contraction into an fma.
 */
__device__ __forceinline__ float FMUL_RZ(float a, float b)
{
    float d;
    asm ("mul.rz.f32 %0, %1, %2;" :
        "=f"(d) : "f"(a), "f"(b));
    return d;
}
/**
 * \brief Floating point multiply-add with round-toward-zero.
 * Returns a * b + c.
 */
__device__ __forceinline__ float FFMA_RZ(float a, float b, float c)
{
    float d;
    asm ("fma.rz.f32 %0, %1, %2, %3;" :
        "=f"(d) : "f"(a), "f"(b), "f"(c));
    return d;
}
#endif // DOXYGEN_SHOULD_SKIP_THIS

/**
 * \brief Terminates the calling thread.
 */
__device__ __forceinline__ void ThreadExit()
{
    asm volatile("exit;");
}

/**
 * \brief Aborts execution and generates an interrupt to the host CPU.
 */
__device__ __forceinline__ void ThreadTrap()
{
    asm volatile("trap;");
}
/**
 * \brief Returns the row-major linear thread identifier for a
 * multidimensional thread block (z-major, then y, then x).
 */
__device__ __forceinline__ int RowMajorTid(int block_dim_x, int block_dim_y, int block_dim_z)
{
    // Skip the multiplies entirely for unit dimensions
    return ((block_dim_z == 1) ? 0 : (threadIdx.z * block_dim_x * block_dim_y)) +
           ((block_dim_y == 1) ? 0 : (threadIdx.y * block_dim_x)) +
           threadIdx.x;
}
/**
 * \brief Returns the warp lane ID of the calling thread (0..31).
 */
__device__ __forceinline__ unsigned int LaneId()
{
    unsigned int ret;
    asm ("mov.u32 %0, %%laneid;" : "=r"(ret) );
    return ret;
}
/**
 * \brief Returns the warp ID of the calling thread.  Warp ID is guaranteed to
 * be unique among warps of the block, but may not correspond to a zero-based
 * ranking within the block.
 */
__device__ __forceinline__ unsigned int WarpId()
{
    unsigned int ret;
    asm ("mov.u32 %0, %%warpid;" : "=r"(ret) );
    return ret;
}
/**
 * \brief Returns the warp lane mask of all lanes less than the calling thread.
 */
__device__ __forceinline__ unsigned int LaneMaskLt()
{
    unsigned int ret;
    asm ("mov.u32 %0, %%lanemask_lt;" : "=r"(ret) );
    return ret;
}

/**
 * \brief Returns the warp lane mask of all lanes less than or equal to the
 * calling thread.
 */
__device__ __forceinline__ unsigned int LaneMaskLe()
{
    unsigned int ret;
    asm ("mov.u32 %0, %%lanemask_le;" : "=r"(ret) );
    return ret;
}

/**
 * \brief Returns the warp lane mask of all lanes greater than the calling
 * thread.
 */
__device__ __forceinline__ unsigned int LaneMaskGt()
{
    unsigned int ret;
    asm ("mov.u32 %0, %%lanemask_gt;" : "=r"(ret) );
    return ret;
}

/**
 * \brief Returns the warp lane mask of all lanes greater than or equal to the
 * calling thread.
 */
__device__ __forceinline__ unsigned int LaneMaskGe()
{
    unsigned int ret;
    asm ("mov.u32 %0, %%lanemask_ge;" : "=r"(ret) );
    return ret;
}
/**
 * \brief Shuffle-up for any data type.  Each warp-lane_i obtains the value of
 * \p input contributed by warp-lane_(i - src_offset).  Lanes below
 * \p first_thread retain their own \p input.
 *
 * The value is moved word-by-word through SHFL_UP_SYNC, so T may be any
 * trivially-copyable size.
 */
template <
    int         LOGICAL_WARP_THREADS,   ///< Number of threads per logical warp
    typename    T>
__device__ __forceinline__ T ShuffleUp(
    T               input,          ///< [in] The value to broadcast
    int             src_offset,     ///< [in] The relative down-offset of the peer to read from
    int             first_thread,   ///< [in] Index of first lane in logical warp (clamp bound)
    unsigned int    member_mask)    ///< [in] 32-bit mask of participating warp lanes
{
    enum {
        // Bits [8..12] of the shfl control word: segment width
        SHFL_C = (32 - LOGICAL_WARP_THREADS) << 8
    };

    typedef typename UnitWord<T>::ShuffleWord ShuffleWord;

    const int WORDS = (sizeof(T) + sizeof(ShuffleWord) - 1) / sizeof(ShuffleWord);

    T output;
    ShuffleWord *output_alias   = reinterpret_cast<ShuffleWord *>(&output);
    ShuffleWord *input_alias    = reinterpret_cast<ShuffleWord *>(&input);

    unsigned int shuffle_word;
    shuffle_word = SHFL_UP_SYNC((unsigned int)input_alias[0], src_offset, first_thread | SHFL_C, member_mask);
    output_alias[0] = shuffle_word;

    #pragma unroll
    for (int WORD = 1; WORD < WORDS; ++WORD)
    {
        shuffle_word = SHFL_UP_SYNC((unsigned int)input_alias[WORD], src_offset, first_thread | SHFL_C, member_mask);
        output_alias[WORD] = shuffle_word;
    }

    return output;
}
/**
 * \brief Shuffle-down for any data type.  Each warp-lane_i obtains the value
 * of \p input contributed by warp-lane_(i + src_offset).  Lanes above
 * \p last_thread retain their own \p input.
 *
 * The value is moved word-by-word through SHFL_DOWN_SYNC, so T may be any
 * trivially-copyable size.
 */
template <
    int         LOGICAL_WARP_THREADS,   ///< Number of threads per logical warp
    typename    T>
__device__ __forceinline__ T ShuffleDown(
    T               input,          ///< [in] The value to broadcast
    int             src_offset,     ///< [in] The relative up-offset of the peer to read from
    int             last_thread,    ///< [in] Index of last lane in logical warp (clamp bound)
    unsigned int    member_mask)    ///< [in] 32-bit mask of participating warp lanes
{
    enum {
        // Bits [8..12] of the shfl control word: segment width
        SHFL_C = (32 - LOGICAL_WARP_THREADS) << 8
    };

    typedef typename UnitWord<T>::ShuffleWord ShuffleWord;

    const int WORDS = (sizeof(T) + sizeof(ShuffleWord) - 1) / sizeof(ShuffleWord);

    T output;
    ShuffleWord *output_alias   = reinterpret_cast<ShuffleWord *>(&output);
    ShuffleWord *input_alias    = reinterpret_cast<ShuffleWord *>(&input);

    unsigned int shuffle_word;
    shuffle_word = SHFL_DOWN_SYNC((unsigned int)input_alias[0], src_offset, last_thread | SHFL_C, member_mask);
    output_alias[0] = shuffle_word;

    #pragma unroll
    for (int WORD = 1; WORD < WORDS; ++WORD)
    {
        shuffle_word = SHFL_DOWN_SYNC((unsigned int)input_alias[WORD], src_offset, last_thread | SHFL_C, member_mask);
        output_alias[WORD] = shuffle_word;
    }

    return output;
}
/**
 * \brief Shuffle-broadcast for any data type.  Each warp-lane_i obtains the
 * value of \p input contributed by warp-lane_(src_lane).
 *
 * The value is moved word-by-word through SHFL_IDX_SYNC, so T may be any
 * trivially-copyable size.
 */
template <
    int         LOGICAL_WARP_THREADS,   ///< Number of threads per logical warp
    typename    T>
__device__ __forceinline__ T ShuffleIndex(
    T               input,          ///< [in] The value to broadcast
    int             src_lane,       ///< [in] The warp lane to read from
    unsigned int    member_mask)    ///< [in] 32-bit mask of participating warp lanes
{
    enum {
        // Segment width in bits [8..12], max-lane clamp in bits [0..4]
        SHFL_C = ((32 - LOGICAL_WARP_THREADS) << 8) | (LOGICAL_WARP_THREADS - 1)
    };

    typedef typename UnitWord<T>::ShuffleWord ShuffleWord;

    const int WORDS = (sizeof(T) + sizeof(ShuffleWord) - 1) / sizeof(ShuffleWord);

    T output;
    ShuffleWord *output_alias   = reinterpret_cast<ShuffleWord *>(&output);
    ShuffleWord *input_alias    = reinterpret_cast<ShuffleWord *>(&input);

    unsigned int shuffle_word;
    shuffle_word = SHFL_IDX_SYNC((unsigned int)input_alias[0],
                                 src_lane,
                                 SHFL_C,
                                 member_mask);
    output_alias[0] = shuffle_word;

    #pragma unroll
    for (int WORD = 1; WORD < WORDS; ++WORD)
    {
        shuffle_word = SHFL_IDX_SYNC((unsigned int)input_alias[WORD],
                                     src_lane,
                                     SHFL_C,
                                     member_mask);
        output_alias[WORD] = shuffle_word;
    }

    return output;
}
/**
 * \brief Computes a 32-bit mask of the warp lanes whose least-significant
 * \p LABEL_BITS bits of \p label match the calling lane's.
 */
template <int LABEL_BITS>
inline __device__ unsigned int MatchAny(unsigned int label)
{
    unsigned int retval;

    // Intersect the per-bit peer masks, one label bit at a time
    #pragma unroll
    for (int BIT = 0; BIT < LABEL_BITS; ++BIT)
    {
        unsigned int mask;
        unsigned int current_bit = 1 << BIT;
        asm ("{\n"
            "    .reg .pred p;\n"
            "    and.b32 %0, %1, %2;\n"
            "    setp.eq.u32 p, %0, %2;\n"
#ifdef CUB_USE_COOPERATIVE_GROUPS
            "    vote.ballot.sync.b32 %0, p, 0xffffffff;\n"
#else
            "    vote.ballot.b32 %0, p;\n"
#endif
            // Lanes with a 0 in this bit match the ballot's complement
            "    @!p not.b32 %0, %0;\n"
            "}\n" : "=r"(mask) : "r"(label), "r"(current_bit));

        // Remove peers who differ
        retval = (BIT == 0) ? mask : retval & mask;
    }

    return retval;
}
__device__ __forceinline__ unsigned int LaneMaskLt()
Returns the warp lane mask of all lanes less than the calling thread.
__device__ __forceinline__ int CTA_SYNC_AND(int p)
__device__ __forceinline__ T ShuffleDown(T input, int src_offset, int last_thread, unsigned int member_mask)
Shuffle-down for any data type. Each warp-lane_i obtains the value of input contributed by warp-lane_(i+src_offset).
__device__ __forceinline__ int PRMT(unsigned int a, unsigned int b, unsigned int index)
Byte-permute. Pick four arbitrary bytes from two 32-bit registers, and reassemble them into a 32-bit destination register.
__device__ __forceinline__ unsigned int SHFL_UP_SYNC(unsigned int word, int src_offset, int flags, unsigned int member_mask)
Optional outer namespace(s)
__device__ __forceinline__ float FFMA_RZ(float a, float b, float c)
__device__ __forceinline__ T ShuffleIndex(T input, int src_lane, unsigned int member_mask)
Shuffle-broadcast for any data type. Each warp-lane_i obtains the value of input contributed by warp-lane_(src_lane).
OffsetT int current_bit
[in] Bit position of current radix digit
__device__ __forceinline__ int WARP_BALLOT(int predicate, unsigned int member_mask)
__device__ __forceinline__ int WARP_ANY(int predicate, unsigned int member_mask)
OffsetT int int num_bits
[in] Number of bits of current radix digit
__device__ __forceinline__ unsigned int LaneId()
Returns the warp lane ID of the calling thread.
__device__ __forceinline__ int WARP_ALL(int predicate, unsigned int member_mask)
__device__ __forceinline__ unsigned int LaneMaskLe()
Returns the warp lane mask of all lanes less than or equal to the calling thread.
__device__ __forceinline__ unsigned int LaneMaskGe()
Returns the warp lane mask of all lanes greater than or equal to the calling thread.
__device__ __forceinline__ void ThreadTrap()
Abort execution and generate an interrupt to the host CPU.
BlockRadixRank provides operations for ranking unsigned integer types within a CUDA thread block.
__device__ unsigned int MatchAny(unsigned int label)
__device__ __forceinline__ unsigned int SHFL_DOWN_SYNC(unsigned int word, int src_offset, int flags, unsigned int member_mask)
__device__ __forceinline__ void ThreadExit()
Terminates the calling thread.
__device__ __forceinline__ unsigned int SHL_ADD(unsigned int x, unsigned int shift, unsigned int addend)
Shift-left then add. Returns (x << shift) + addend.
Allows for the treatment of an integral constant as a type at compile-time (e.g., to achieve static call dispatch based on constant integral values).
__device__ __forceinline__ int RowMajorTid(int block_dim_x, int block_dim_y, int block_dim_z)
Returns the row-major linear thread identifier for a multidimensional thread block.
__device__ __forceinline__ T ShuffleUp(T input, int src_offset, int first_thread, unsigned int member_mask)
Shuffle-up for any data type. Each warp-lane_i obtains the value of input contributed by warp-lane_(i-src_offset).
__device__ __forceinline__ unsigned int WarpId()
Returns the warp ID of the calling thread. Warp ID is guaranteed to be unique among warps,...
__device__ __forceinline__ unsigned int SHR_ADD(unsigned int x, unsigned int shift, unsigned int addend)
Shift-right then add. Returns (x >> shift) + addend.
__device__ __forceinline__ unsigned int SHFL_IDX_SYNC(unsigned int word, int src_lane, int flags, unsigned int member_mask)
__device__ __forceinline__ void BAR(int count)
__device__ __forceinline__ unsigned int IADD3(unsigned int x, unsigned int y, unsigned int z)
Three-operand add. Returns x + y + z.
__device__ __forceinline__ float FMUL_RZ(float a, float b)
__device__ __forceinline__ unsigned int LaneMaskGt()
Returns the warp lane mask of all lanes greater than the calling thread.
__device__ __forceinline__ void WARP_SYNC(unsigned int member_mask)
__device__ __forceinline__ void BFI(unsigned int &ret, unsigned int x, unsigned int y, unsigned int bit_start, unsigned int num_bits)
Bitfield insert. Inserts the num_bits least significant bits of y into x at bit-offset bit_start.
__device__ __forceinline__ unsigned int BFE(UnsignedBits source, unsigned int bit_start, unsigned int num_bits, Int2Type< BYTE_LEN >)