#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document

/**
 * Register modifiers for pointer types when inlining PTX assembly:
 * "l"/u64 on 64-bit platforms, "r"/u32 on 32-bit platforms.
 */
#if defined(_WIN64) || defined(__LP64__)
    #define __CUB_LP64__ 1
    // 64-bit register modifier for inlined asm
    #define _CUB_ASM_PTR_ "l"
    #define _CUB_ASM_PTR_SIZE_ "u64"
#else
    #define __CUB_LP64__ 0
    // 32-bit register modifier for inlined asm
    #define _CUB_ASM_PTR_ "r"
    #define _CUB_ASM_PTR_SIZE_ "u32"
#endif

#endif // DOXYGEN_SHOULD_SKIP_THIS
/**
 * \brief Shift-right then add.  Returns (\p x >> \p shift) + \p addend.
 */
__device__ __forceinline__ unsigned int SHR_ADD(
    unsigned int x,         ///< [in] value to shift
    unsigned int shift,     ///< [in] right-shift amount
    unsigned int addend)    ///< [in] value added after the shift
{
    unsigned int ret;
#if CUB_PTX_ARCH >= 200
    // Fused video-instruction form on SM20+
    asm ("vshr.u32.u32.u32.clamp.add %0, %1, %2, %3;" :
        "=r"(ret) : "r"(x), "r"(shift), "r"(addend));
#else
    ret = (x >> shift) + addend;
#endif
    return ret;
}
/**
 * \brief Shift-left then add.  Returns (\p x << \p shift) + \p addend.
 */
__device__ __forceinline__ unsigned int SHL_ADD(
    unsigned int x,         ///< [in] value to shift
    unsigned int shift,     ///< [in] left-shift amount
    unsigned int addend)    ///< [in] value added after the shift
{
    unsigned int ret;
#if CUB_PTX_ARCH >= 200
    // Fused video-instruction form on SM20+
    asm ("vshl.u32.u32.u32.clamp.add %0, %1, %2, %3;" :
        "=r"(ret) : "r"(x), "r"(shift), "r"(addend));
#else
    ret = (x << shift) + addend;
#endif
    return ret;
}
#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document

/**
 * Bitfield-extract (generic byte length).  Extracts \p num_bits from
 * \p source starting at bit-offset \p bit_start, returned in the
 * least-significant bits of the result.
 */
template <typename UnsignedBits, int BYTE_LEN>
__device__ __forceinline__ unsigned int BFE(
    UnsignedBits        source,         ///< [in] word to extract from
    unsigned int        bit_start,      ///< [in] bit offset of the field
    unsigned int        num_bits,       ///< [in] width of the field
    Int2Type<BYTE_LEN>  /*byte_len*/)   ///< [in] compile-time dispatch on sizeof(UnsignedBits)
{
    unsigned int bits;
#if CUB_PTX_ARCH >= 200
    asm ("bfe.u32 %0, %1, %2, %3;" :
        "=r"(bits) : "r"((unsigned int) source), "r"(bit_start), "r"(num_bits));
#else
    // Portable fallback: mask out the field after shifting it down
    const unsigned int MASK = (1 << num_bits) - 1;
    bits = (source >> bit_start) & MASK;
#endif
    return bits;
}
/**
 * Bitfield-extract for 64-bit types.  Extracts \p num_bits from \p source
 * starting at bit-offset \p bit_start; bfe.u32 only handles 32-bit words,
 * so this is done with a shift and a 64-bit mask.
 */
template <typename UnsignedBits>
__device__ __forceinline__ unsigned int BFE(
    UnsignedBits    source,         ///< [in] word to extract from
    unsigned int    bit_start,      ///< [in] bit offset of the field
    unsigned int    num_bits,       ///< [in] width of the field
    Int2Type<8>     /*byte_len*/)   ///< [in] dispatch tag: sizeof(UnsignedBits) == 8
{
    const unsigned long long MASK = (1ull << num_bits) - 1;
    return (source >> bit_start) & MASK;
}

#endif // DOXYGEN_SHOULD_SKIP_THIS
/**
 * \brief Bitfield-extract.  Extracts \p num_bits from \p source starting at
 * bit-offset \p bit_start, returned in the least-significant bits of the result.
 */
template <typename UnsignedBits>
__device__ __forceinline__ unsigned int BFE(
    UnsignedBits    source,     ///< [in] word to extract from
    unsigned int    bit_start,  ///< [in] bit offset of the field
    unsigned int    num_bits)   ///< [in] width of the field
{
    // Dispatch on the byte-length of UnsignedBits (32-bit vs. 64-bit paths)
    return BFE(source, bit_start, num_bits, Int2Type<sizeof(UnsignedBits)>());
}
/**
 * \brief Bitfield insert.  Inserts the \p num_bits least significant bits of
 * \p y into \p x at bit-offset \p bit_start; the result is written to \p ret.
 */
__device__ __forceinline__ void BFI(
    unsigned int &ret,          ///< [out] result word
    unsigned int x,             ///< [in] base word
    unsigned int y,             ///< [in] word supplying the inserted bits
    unsigned int bit_start,     ///< [in] bit offset of the field
    unsigned int num_bits)      ///< [in] width of the field
{
#if CUB_PTX_ARCH >= 200
    // bfi.b32 d,a,b,c,d inserts the low num_bits of operand a (= y) into
    // operand b (= x) at position bit_start
    asm ("bfi.b32 %0, %1, %2, %3, %4;" :
        "=r"(ret) : "r"(y), "r"(x), "r"(bit_start), "r"(num_bits));
#else
    // Fixed: the previous fallback combined the operands the wrong way
    // around ((y & ~MASK) | (x & MASK)), disagreeing with both the
    // documented contract and the bfi.b32 path above.
    unsigned int MASK_X = ((1 << num_bits) - 1) << bit_start;
    unsigned int MASK_Y = ~MASK_X;
    ret = (x & MASK_Y) | ((y << bit_start) & MASK_X);
#endif
}
/**
 * \brief Three-operand add.  Returns \p x + \p y + \p z.
 */
__device__ __forceinline__ unsigned int IADD3(
    unsigned int x,
    unsigned int y,
    unsigned int z)
{
#if CUB_PTX_ARCH >= 200
    // Fused video-instruction form on SM20+ (accumulates into x)
    asm ("vadd.u32.u32.u32.add %0, %1, %2, %3;" :
        "=r"(x) : "r"(x), "r"(y), "r"(z));
#else
    x = x + y + z;
#endif
    return x;
}
/**
 * \brief Byte-permute.  Picks four arbitrary bytes from two 32-bit registers
 * (selected by the nibbles of \p index) and reassembles them into a 32-bit
 * destination register.
 */
__device__ __forceinline__ int PRMT(
    unsigned int a,         ///< [in] low source word (bytes 0-3)
    unsigned int b,         ///< [in] high source word (bytes 4-7)
    unsigned int index)     ///< [in] per-nibble byte selectors
{
    int ret;
    asm ("prmt.b32 %0, %1, %2, %3;" :
        "=r"(ret) : "r"(a), "r"(b), "r"(index));
    return ret;
}
#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document

/**
 * Barrier at named barrier #1 for \p count threads.
 */
__device__ __forceinline__ void BAR(int count)
{
    asm volatile("bar.sync 1, %0;" : : "r"(count));
}
/**
 * CTA barrier with predicate reduction: returns nonzero iff \p p is
 * nonzero for all threads in the block.
 */
__device__ __forceinline__ int CTA_SYNC_AND(int p)
{
    return __syncthreads_and(p);
}
/**
 * Warp barrier for the lanes in \p member_mask.  Compiles to a no-op when
 * CUB_USE_COOPERATIVE_GROUPS is not defined (pre-Volta implicit warp synchrony).
 */
__device__ __forceinline__ void WARP_SYNC(unsigned int member_mask)
{
#ifdef CUB_USE_COOPERATIVE_GROUPS
    __syncwarp(member_mask);
#endif
}
/**
 * Warp any-vote: returns nonzero iff \p predicate is nonzero for any
 * participating lane.  \p member_mask is ignored on the legacy path.
 */
__device__ __forceinline__ int WARP_ANY(int predicate, unsigned int member_mask)
{
#ifdef CUB_USE_COOPERATIVE_GROUPS
    return __any_sync(member_mask, predicate);
#else
    return ::__any(predicate);
#endif
}
/**
 * Warp all-vote: returns nonzero iff \p predicate is nonzero for every
 * participating lane.  \p member_mask is ignored on the legacy path.
 */
__device__ __forceinline__ int WARP_ALL(int predicate, unsigned int member_mask)
{
#ifdef CUB_USE_COOPERATIVE_GROUPS
    return __all_sync(member_mask, predicate);
#else
    return ::__all(predicate);
#endif
}
/**
 * Warp ballot: returns a 32-bit mask with bit i set iff \p predicate is
 * nonzero for lane i.  \p member_mask is ignored on the legacy path.
 */
__device__ __forceinline__ int WARP_BALLOT(int predicate, unsigned int member_mask)
{
#ifdef CUB_USE_COOPERATIVE_GROUPS
    return __ballot_sync(member_mask, predicate);
#else
    return __ballot(predicate);
#endif
}
/**
 * Warp synchronous shfl_up: copies \p word from the lane \p src_offset
 * below the caller.  \p flags carries the packed clamp/segment bound
 * (operand c of shfl); \p member_mask is unused on the legacy path.
 */
__device__ __forceinline__
unsigned int SHFL_UP_SYNC(unsigned int word, int src_offset, int flags, unsigned int member_mask)
{
#ifdef CUB_USE_COOPERATIVE_GROUPS
    asm volatile("shfl.sync.up.b32 %0, %1, %2, %3, %4;"
        : "=r"(word) : "r"(word), "r"(src_offset), "r"(flags), "r"(member_mask));
#else
    asm volatile("shfl.up.b32 %0, %1, %2, %3;"
        : "=r"(word) : "r"(word), "r"(src_offset), "r"(flags));
#endif
    return word;
}
/**
 * Warp synchronous shfl_down: copies \p word from the lane \p src_offset
 * above the caller.  \p flags carries the packed clamp/segment bound
 * (operand c of shfl); \p member_mask is unused on the legacy path.
 */
__device__ __forceinline__
unsigned int SHFL_DOWN_SYNC(unsigned int word, int src_offset, int flags, unsigned int member_mask)
{
#ifdef CUB_USE_COOPERATIVE_GROUPS
    asm volatile("shfl.sync.down.b32 %0, %1, %2, %3, %4;"
        : "=r"(word) : "r"(word), "r"(src_offset), "r"(flags), "r"(member_mask));
#else
    asm volatile("shfl.down.b32 %0, %1, %2, %3;"
        : "=r"(word) : "r"(word), "r"(src_offset), "r"(flags));
#endif
    return word;
}
/**
 * Warp synchronous shfl_idx: copies \p word from lane \p src_lane.
 * \p flags carries the packed clamp/segment bound (operand c of shfl);
 * \p member_mask is unused on the legacy path.
 */
__device__ __forceinline__
unsigned int SHFL_IDX_SYNC(unsigned int word, int src_lane, int flags, unsigned int member_mask)
{
#ifdef CUB_USE_COOPERATIVE_GROUPS
    asm volatile("shfl.sync.idx.b32 %0, %1, %2, %3, %4;"
        : "=r"(word) : "r"(word), "r"(src_lane), "r"(flags), "r"(member_mask));
#else
    asm volatile("shfl.idx.b32 %0, %1, %2, %3;"
        : "=r"(word) : "r"(word), "r"(src_lane), "r"(flags));
#endif
    return word;
}
/**
 * Floating-point multiply with round-toward-zero (prevents contraction
 * into fused multiply-add).
 */
__device__ __forceinline__ float FMUL_RZ(float a, float b)
{
    float d;
    asm ("mul.rz.f32 %0, %1, %2;" :
        "=f"(d) : "f"(a), "f"(b));
    return d;
}
/**
 * Floating-point fused multiply-add (a * b + c) with round-toward-zero.
 */
__device__ __forceinline__ float FFMA_RZ(float a, float b, float c)
{
    float d;
    asm ("fma.rz.f32 %0, %1, %2, %3;" :
        "=f"(d) : "f"(a), "f"(b), "f"(c));
    return d;
}
/**
 * \brief Terminates the calling thread.
 */
__device__ __forceinline__ void ThreadExit()
{
    asm volatile("exit;");
}
/**
 * \brief Aborts execution and generates an interrupt to the host CPU.
 */
__device__ __forceinline__ void ThreadTrap()
{
    asm volatile("trap;");
}
/**
 * \brief Returns the row-major linear thread identifier for a
 * multidimensional thread block.  Dimensions of extent 1 contribute
 * nothing (their threadIdx component is necessarily 0).
 */
__device__ __forceinline__ int RowMajorTid(int block_dim_x, int block_dim_y, int block_dim_z)
{
    return ((block_dim_z == 1) ? 0 : (threadIdx.z * block_dim_x * block_dim_y)) +
           ((block_dim_y == 1) ? 0 : (threadIdx.y * block_dim_x)) +
           threadIdx.x;
}
/**
 * \brief Returns the warp lane ID of the calling thread (%laneid).
 */
__device__ __forceinline__ unsigned int LaneId()
{
    unsigned int ret;
    asm ("mov.u32 %0, %%laneid;" : "=r"(ret) );
    return ret;
}
/**
 * \brief Returns the warp ID of the calling thread (%warpid).  Unique among
 * concurrently resident warps, but may change as warps are rescheduled.
 */
__device__ __forceinline__ unsigned int WarpId()
{
    unsigned int ret;
    asm ("mov.u32 %0, %%warpid;" : "=r"(ret) );
    return ret;
}
/**
 * \brief Returns the warp lane mask of all lanes less than the calling thread.
 */
__device__ __forceinline__ unsigned int LaneMaskLt()
{
    unsigned int ret;
    asm ("mov.u32 %0, %%lanemask_lt;" : "=r"(ret) );
    return ret;
}
/**
 * \brief Returns the warp lane mask of all lanes less than or equal to the calling thread.
 */
__device__ __forceinline__ unsigned int LaneMaskLe()
{
    unsigned int ret;
    asm ("mov.u32 %0, %%lanemask_le;" : "=r"(ret) );
    return ret;
}
/**
 * \brief Returns the warp lane mask of all lanes greater than the calling thread.
 */
__device__ __forceinline__ unsigned int LaneMaskGt()
{
    unsigned int ret;
    asm ("mov.u32 %0, %%lanemask_gt;" : "=r"(ret) );
    return ret;
}
/**
 * \brief Returns the warp lane mask of all lanes greater than or equal to the calling thread.
 */
__device__ __forceinline__ unsigned int LaneMaskGe()
{
    unsigned int ret;
    asm ("mov.u32 %0, %%lanemask_ge;" : "=r"(ret) );
    return ret;
}
/**
 * \brief Shuffle-up for any data type.  Each warp-lane i obtains the value
 * \p input contributed by warp-lane i - \p src_offset.  Lanes below
 * \p first_thread keep their own \p input words.
 *
 * The value is moved word-by-word through 32-bit shuffles, so T may be of
 * arbitrary size.
 */
template <int LOGICAL_WARP_THREADS, typename T>
__device__ __forceinline__ T ShuffleUp(
    T               input,          ///< [in] the value to shuffle
    int             src_offset,     ///< [in] relative down-offset of the peer to read from
    int             first_thread,   ///< [in] index of the first lane in the logical warp (typically 0)
    unsigned int    member_mask)    ///< [in] 32-bit mask of participating warp lanes
{
    enum {
        // Segment bound packed into bits [8..12] of the shfl c-operand
        SHFL_C = (32 - LOGICAL_WARP_THREADS) << 8
    };

    // NOTE(review): typedef reconstructed — confirm against UnitWord<T>
    typedef typename UnitWord<T>::ShuffleWord ShuffleWord;

    const int WORDS = (sizeof(T) + sizeof(ShuffleWord) - 1) / sizeof(ShuffleWord);

    T output;
    ShuffleWord *output_alias = reinterpret_cast<ShuffleWord *>(&output);
    ShuffleWord *input_alias  = reinterpret_cast<ShuffleWord *>(&input);

    unsigned int shuffle_word;
    shuffle_word = SHFL_UP_SYNC((unsigned int)input_alias[0], src_offset, first_thread | SHFL_C, member_mask);
    output_alias[0] = shuffle_word;

    #pragma unroll
    for (int WORD = 1; WORD < WORDS; ++WORD)
    {
        shuffle_word = SHFL_UP_SYNC((unsigned int)input_alias[WORD], src_offset, first_thread | SHFL_C, member_mask);
        output_alias[WORD] = shuffle_word;
    }

    return output;
}
/**
 * \brief Shuffle-down for any data type.  Each warp-lane i obtains the value
 * \p input contributed by warp-lane i + \p src_offset.  Lanes above
 * \p last_thread keep their own \p input words.
 *
 * The value is moved word-by-word through 32-bit shuffles, so T may be of
 * arbitrary size.
 */
template <int LOGICAL_WARP_THREADS, typename T>
__device__ __forceinline__ T ShuffleDown(
    T               input,          ///< [in] the value to shuffle
    int             src_offset,     ///< [in] relative up-offset of the peer to read from
    int             last_thread,    ///< [in] index of the last lane in the logical warp
    unsigned int    member_mask)    ///< [in] 32-bit mask of participating warp lanes
{
    enum {
        // Segment bound packed into bits [8..12] of the shfl c-operand
        SHFL_C = (32 - LOGICAL_WARP_THREADS) << 8
    };

    // NOTE(review): typedef reconstructed — confirm against UnitWord<T>
    typedef typename UnitWord<T>::ShuffleWord ShuffleWord;

    const int WORDS = (sizeof(T) + sizeof(ShuffleWord) - 1) / sizeof(ShuffleWord);

    T output;
    ShuffleWord *output_alias = reinterpret_cast<ShuffleWord *>(&output);
    ShuffleWord *input_alias  = reinterpret_cast<ShuffleWord *>(&input);

    unsigned int shuffle_word;
    shuffle_word = SHFL_DOWN_SYNC((unsigned int)input_alias[0], src_offset, last_thread | SHFL_C, member_mask);
    output_alias[0] = shuffle_word;

    #pragma unroll
    for (int WORD = 1; WORD < WORDS; ++WORD)
    {
        shuffle_word = SHFL_DOWN_SYNC((unsigned int)input_alias[WORD], src_offset, last_thread | SHFL_C, member_mask);
        output_alias[WORD] = shuffle_word;
    }

    return output;
}
/**
 * \brief Shuffle-broadcast for any data type.  Each warp-lane i obtains the
 * value \p input contributed by warp-lane \p src_lane.
 *
 * The value is moved word-by-word through 32-bit shuffles, so T may be of
 * arbitrary size.
 */
template <int LOGICAL_WARP_THREADS, typename T>
__device__ __forceinline__ T ShuffleIndex(
    T               input,          ///< [in] the value to broadcast
    int             src_lane,       ///< [in] lane to read from
    unsigned int    member_mask)    ///< [in] 32-bit mask of participating warp lanes
{
    enum {
        // Segment bound in bits [8..12], lane clamp in bits [0..4] of the shfl c-operand
        SHFL_C = ((32 - LOGICAL_WARP_THREADS) << 8) | (LOGICAL_WARP_THREADS - 1)
    };

    // NOTE(review): typedef reconstructed — confirm against UnitWord<T>
    typedef typename UnitWord<T>::ShuffleWord ShuffleWord;

    const int WORDS = (sizeof(T) + sizeof(ShuffleWord) - 1) / sizeof(ShuffleWord);

    T output;
    ShuffleWord *output_alias = reinterpret_cast<ShuffleWord *>(&output);
    ShuffleWord *input_alias  = reinterpret_cast<ShuffleWord *>(&input);

    unsigned int shuffle_word;
    shuffle_word = SHFL_IDX_SYNC((unsigned int)input_alias[0],
                                 src_lane,
                                 SHFL_C,
                                 member_mask);
    output_alias[0] = shuffle_word;

    #pragma unroll
    for (int WORD = 1; WORD < WORDS; ++WORD)
    {
        shuffle_word = SHFL_IDX_SYNC((unsigned int)input_alias[WORD],
                                     src_lane,
                                     SHFL_C,
                                     member_mask);
        output_alias[WORD] = shuffle_word;
    }

    return output;
}
/**
 * Computes a 32-bit mask of the warp lanes whose low LABEL_BITS bits of
 * \p label match those of the calling thread, by ballot-voting each bit
 * position in turn and intersecting the per-bit peer masks.
 */
template <int LABEL_BITS>
inline __device__ unsigned int MatchAny(unsigned int label)
{
    unsigned int retval;

    // Extract masks of common threads for each bit
    #pragma unroll
    for (int BIT = 0; BIT < LABEL_BITS; ++BIT)
    {
        unsigned int mask;
        unsigned int current_bit = 1 << BIT;
        asm ("{\n"
            "    .reg .pred p;\n"
            "    and.b32 %0, %1, %2;"
            "    setp.eq.u32 p, %0, %2;\n"
#ifdef CUB_USE_COOPERATIVE_GROUPS
            "    vote.ballot.sync.b32 %0, p, 0xffffffff;\n"
#else
            "    vote.ballot.b32 %0, p;\n"
#endif
            // Lanes with the bit clear take the complement of the ballot
            "    @!p not.b32 %0, %0;\n"
            "}\n" : "=r"(mask) : "r"(label), "r"(current_bit));

        // Remove peers who differ at this bit position
        retval = (BIT == 0) ? mask : retval & mask;
    }

    return retval;
}
BlockRadixRank provides operations for ranking unsigned integer types within a CUDA thread block.
__device__ __forceinline__ unsigned int SHFL_UP_SYNC(unsigned int word, int src_offset, int flags, unsigned int member_mask)
__device__ __forceinline__ void BFI(unsigned int &ret, unsigned int x, unsigned int y, unsigned int bit_start, unsigned int num_bits)
Bitfield insert. Inserts the num_bits least significant bits of y into x at bit-offset bit_start.
__device__ __forceinline__ float FFMA_RZ(float a, float b, float c)
__device__ __forceinline__ void BAR(int count)
__device__ __forceinline__ unsigned int LaneMaskLt()
Returns the warp lane mask of all lanes less than the calling thread.
__device__ __forceinline__ unsigned int SHL_ADD(unsigned int x, unsigned int shift, unsigned int addend)
Shift-left then add. Returns (x << shift) + addend.
__device__ __forceinline__ int PRMT(unsigned int a, unsigned int b, unsigned int index)
Byte-permute. Pick four arbitrary bytes from two 32-bit registers, and reassemble them into a 32-bit destination register.
__device__ __forceinline__ void WARP_SYNC(unsigned int member_mask)
__device__ __forceinline__ unsigned int IADD3(unsigned int x, unsigned int y, unsigned int z)
Three-operand add. Returns x + y + z.
__device__ __forceinline__ int WARP_ANY(int predicate, unsigned int member_mask)
__device__ __forceinline__ unsigned int LaneMaskGt()
Returns the warp lane mask of all lanes greater than the calling thread.
__device__ __forceinline__ int WARP_BALLOT(int predicate, unsigned int member_mask)
__device__ __forceinline__ int RowMajorTid(int block_dim_x, int block_dim_y, int block_dim_z)
Returns the row-major linear thread identifier for a multidimensional thread block.
__device__ __forceinline__ unsigned int SHR_ADD(unsigned int x, unsigned int shift, unsigned int addend)
Shift-right then add. Returns (x >> shift) + addend.
__device__ __forceinline__ int CTA_SYNC_AND(int p)
__device__ __forceinline__ void ThreadExit()
Terminates the calling thread.
__device__ __forceinline__ unsigned int SHFL_IDX_SYNC(unsigned int word, int src_lane, int flags, unsigned int member_mask)
__device__ __forceinline__ unsigned int LaneMaskLe()
Returns the warp lane mask of all lanes less than or equal to the calling thread.
__device__ __forceinline__ void ThreadTrap()
Abort execution and generate an interrupt to the host CPU.
__device__ __forceinline__ unsigned int LaneId()
Returns the warp lane ID of the calling thread.
__device__ __forceinline__ unsigned int SHFL_DOWN_SYNC(unsigned int word, int src_offset, int flags, unsigned int member_mask)
__device__ __forceinline__ unsigned int LaneMaskGe()
Returns the warp lane mask of all lanes greater than or equal to the calling thread.
__device__ __forceinline__ unsigned int WarpId()
Returns the warp ID of the calling thread. Warp ID is guaranteed to be unique among warps,...
__device__ __forceinline__ float FMUL_RZ(float a, float b)
__device__ __forceinline__ int WARP_ALL(int predicate, unsigned int member_mask)
__device__ __forceinline__ unsigned int BFE(UnsignedBits source, unsigned int bit_start, unsigned int num_bits, Int2Type< BYTE_LEN >)
__device__ __forceinline__ T ShuffleIndex(T input, int src_lane, unsigned int member_mask)
Shuffle-broadcast for any data type. Each warp-lane i obtains the value input contributed by warp-lane src_lane.
__device__ __forceinline__ T ShuffleUp(T input, int src_offset, int first_thread, unsigned int member_mask)
Shuffle-up for any data type. Each warp-lane i obtains the value input contributed by warp-lane i-src_offset.
__device__ __forceinline__ T ShuffleDown(T input, int src_offset, int last_thread, unsigned int member_mask)
Shuffle-down for any data type. Each warp-lane i obtains the value input contributed by warp-lane i+src_offset.
Optional outer namespace(s)
KeyT const ValueT ValueT OffsetIteratorT OffsetIteratorT int
[in] The number of segments that comprise the sorting data
__device__ unsigned int MatchAny(unsigned int label)
OffsetT int int num_bits
[in] Number of bits of current radix digit
OffsetT int current_bit
[in] Bit position of current radix digit
Allows for the treatment of an integral constant as a type at compile-time (e.g., to achieve static c...