OpenFPM_pdata  4.1.0
Project that contain the implementation of distributed structures
 
Loading...
Searching...
No Matches
util_ptx.cuh
Go to the documentation of this file.
1/******************************************************************************
2 * Copyright (c) 2011, Duane Merrill. All rights reserved.
3 * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 * * Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * * Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
12 * * Neither the name of the NVIDIA CORPORATION nor the
13 * names of its contributors may be used to endorse or promote products
14 * derived from this software without specific prior written permission.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 *
27 ******************************************************************************/
28
35#pragma once
36
37#include "util_type.cuh"
38#include "util_arch.cuh"
39#include "util_namespace.cuh"
40#include "util_debug.cuh"
41
42
44CUB_NS_PREFIX
45
47namespace cub {
48
49
56/******************************************************************************
57 * PTX helper macros
58 ******************************************************************************/
59
60#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
61
65#if defined(_WIN64) || defined(__LP64__)
66 #define __CUB_LP64__ 1
67 // 64-bit register modifier for inlined asm
68 #define _CUB_ASM_PTR_ "l"
69 #define _CUB_ASM_PTR_SIZE_ "u64"
70#else
71 #define __CUB_LP64__ 0
72 // 32-bit register modifier for inlined asm
73 #define _CUB_ASM_PTR_ "r"
74 #define _CUB_ASM_PTR_SIZE_ "u32"
75#endif
76
77#endif // DOXYGEN_SHOULD_SKIP_THIS
78
79
80/******************************************************************************
81 * Inlined PTX intrinsics
82 ******************************************************************************/
83
87__device__ __forceinline__ unsigned int SHR_ADD(
88 unsigned int x,
89 unsigned int shift,
90 unsigned int addend)
91{
92 unsigned int ret;
93#if CUB_PTX_ARCH >= 200
94 asm ("vshr.u32.u32.u32.clamp.add %0, %1, %2, %3;" :
95 "=r"(ret) : "r"(x), "r"(shift), "r"(addend));
96#else
97 ret = (x >> shift) + addend;
98#endif
99 return ret;
100}
101
102
106__device__ __forceinline__ unsigned int SHL_ADD(
107 unsigned int x,
108 unsigned int shift,
109 unsigned int addend)
110{
111 unsigned int ret;
112#if CUB_PTX_ARCH >= 200
113 asm ("vshl.u32.u32.u32.clamp.add %0, %1, %2, %3;" :
114 "=r"(ret) : "r"(x), "r"(shift), "r"(addend));
115#else
116 ret = (x << shift) + addend;
117#endif
118 return ret;
119}
120
121#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
122
126template <typename UnsignedBits, int BYTE_LEN>
127__device__ __forceinline__ unsigned int BFE(
128 UnsignedBits source,
129 unsigned int bit_start,
130 unsigned int num_bits,
131 Int2Type<BYTE_LEN> /*byte_len*/)
132{
133 unsigned int bits;
134#if CUB_PTX_ARCH >= 200
135 asm ("bfe.u32 %0, %1, %2, %3;" : "=r"(bits) : "r"((unsigned int) source), "r"(bit_start), "r"(num_bits));
136#else
137 const unsigned int MASK = (1 << num_bits) - 1;
138 bits = (source >> bit_start) & MASK;
139#endif
140 return bits;
141}
142
143
147template <typename UnsignedBits>
148__device__ __forceinline__ unsigned int BFE(
149 UnsignedBits source,
150 unsigned int bit_start,
151 unsigned int num_bits,
152 Int2Type<8> /*byte_len*/)
153{
154 const unsigned long long MASK = (1ull << num_bits) - 1;
155 return (source >> bit_start) & MASK;
156}
157
158#endif // DOXYGEN_SHOULD_SKIP_THIS
159
163template <typename UnsignedBits>
164__device__ __forceinline__ unsigned int BFE(
165 UnsignedBits source,
166 unsigned int bit_start,
167 unsigned int num_bits)
168{
169 return BFE(source, bit_start, num_bits, Int2Type<sizeof(UnsignedBits)>());
170}
171
172
176__device__ __forceinline__ void BFI(
177 unsigned int &ret,
178 unsigned int x,
179 unsigned int y,
180 unsigned int bit_start,
181 unsigned int num_bits)
182{
183#if CUB_PTX_ARCH >= 200
184 asm ("bfi.b32 %0, %1, %2, %3, %4;" :
185 "=r"(ret) : "r"(y), "r"(x), "r"(bit_start), "r"(num_bits));
186#else
187 x <<= bit_start;
188 unsigned int MASK_X = ((1 << num_bits) - 1) << bit_start;
189 unsigned int MASK_Y = ~MASK_X;
190 ret = (y & MASK_Y) | (x & MASK_X);
191#endif
192}
193
194
198__device__ __forceinline__ unsigned int IADD3(unsigned int x, unsigned int y, unsigned int z)
199{
200#if CUB_PTX_ARCH >= 200
201 asm ("vadd.u32.u32.u32.add %0, %1, %2, %3;" : "=r"(x) : "r"(x), "r"(y), "r"(z));
202#else
203 x = x + y + z;
204#endif
205 return x;
206}
207
208
235__device__ __forceinline__ int PRMT(unsigned int a, unsigned int b, unsigned int index)
236{
237 int ret;
238 asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(ret) : "r"(a), "r"(b), "r"(index));
239 return ret;
240}
241
242#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
243
247__device__ __forceinline__ void BAR(int count)
248{
249 asm volatile("bar.sync 1, %0;" : : "r"(count));
250}
251
255__device__ __forceinline__ void CTA_SYNC()
256{
257 __syncthreads();
258}
259
260
264__device__ __forceinline__ int CTA_SYNC_AND(int p)
265{
266 return __syncthreads_and(p);
267}
268
269
273__device__ __forceinline__ void WARP_SYNC(unsigned int member_mask)
274{
275#ifdef CUB_USE_COOPERATIVE_GROUPS
276 __syncwarp(member_mask);
277#endif
278}
279
280
284__device__ __forceinline__ int WARP_ANY(int predicate, unsigned int member_mask)
285{
286#ifdef CUB_USE_COOPERATIVE_GROUPS
287 return __any_sync(member_mask, predicate);
288#else
289 return ::__any(predicate);
290#endif
291}
292
293
297__device__ __forceinline__ int WARP_ALL(int predicate, unsigned int member_mask)
298{
299#ifdef CUB_USE_COOPERATIVE_GROUPS
300 return __all_sync(member_mask, predicate);
301#else
302 return ::__all(predicate);
303#endif
304}
305
306
310__device__ __forceinline__ int WARP_BALLOT(int predicate, unsigned int member_mask)
311{
312#ifdef CUB_USE_COOPERATIVE_GROUPS
313 return __ballot_sync(member_mask, predicate);
314#else
315 return __ballot(predicate);
316#endif
317}
318
322__device__ __forceinline__
323unsigned int SHFL_UP_SYNC(unsigned int word, int src_offset, int flags, unsigned int member_mask)
324{
325#ifdef CUB_USE_COOPERATIVE_GROUPS
326 asm volatile("shfl.sync.up.b32 %0, %1, %2, %3, %4;"
327 : "=r"(word) : "r"(word), "r"(src_offset), "r"(flags), "r"(member_mask));
328#else
329 asm volatile("shfl.up.b32 %0, %1, %2, %3;"
330 : "=r"(word) : "r"(word), "r"(src_offset), "r"(flags));
331#endif
332 return word;
333}
334
338__device__ __forceinline__
339unsigned int SHFL_DOWN_SYNC(unsigned int word, int src_offset, int flags, unsigned int member_mask)
340{
341#ifdef CUB_USE_COOPERATIVE_GROUPS
342 asm volatile("shfl.sync.down.b32 %0, %1, %2, %3, %4;"
343 : "=r"(word) : "r"(word), "r"(src_offset), "r"(flags), "r"(member_mask));
344#else
345 asm volatile("shfl.down.b32 %0, %1, %2, %3;"
346 : "=r"(word) : "r"(word), "r"(src_offset), "r"(flags));
347#endif
348 return word;
349}
350
354__device__ __forceinline__
355unsigned int SHFL_IDX_SYNC(unsigned int word, int src_lane, int flags, unsigned int member_mask)
356{
357#ifdef CUB_USE_COOPERATIVE_GROUPS
358 asm volatile("shfl.sync.idx.b32 %0, %1, %2, %3, %4;"
359 : "=r"(word) : "r"(word), "r"(src_lane), "r"(flags), "r"(member_mask));
360#else
361 asm volatile("shfl.idx.b32 %0, %1, %2, %3;"
362 : "=r"(word) : "r"(word), "r"(src_lane), "r"(flags));
363#endif
364 return word;
365}
366
370__device__ __forceinline__ float FMUL_RZ(float a, float b)
371{
372 float d;
373 asm ("mul.rz.f32 %0, %1, %2;" : "=f"(d) : "f"(a), "f"(b));
374 return d;
375}
376
377
381__device__ __forceinline__ float FFMA_RZ(float a, float b, float c)
382{
383 float d;
384 asm ("fma.rz.f32 %0, %1, %2, %3;" : "=f"(d) : "f"(a), "f"(b), "f"(c));
385 return d;
386}
387
388#endif // DOXYGEN_SHOULD_SKIP_THIS
389
393__device__ __forceinline__ void ThreadExit() {
394 asm volatile("exit;");
395}
396
397
401__device__ __forceinline__ void ThreadTrap() {
402 asm volatile("trap;");
403}
404
405
409__device__ __forceinline__ int RowMajorTid(int block_dim_x, int block_dim_y, int block_dim_z)
410{
411 return ((block_dim_z == 1) ? 0 : (threadIdx.z * block_dim_x * block_dim_y)) +
412 ((block_dim_y == 1) ? 0 : (threadIdx.y * block_dim_x)) +
413 threadIdx.x;
414}
415
416
420__device__ __forceinline__ unsigned int LaneId()
421{
422 unsigned int ret;
423 asm ("mov.u32 %0, %%laneid;" : "=r"(ret) );
424 return ret;
425}
426
427
431__device__ __forceinline__ unsigned int WarpId()
432{
433 unsigned int ret;
434 asm ("mov.u32 %0, %%warpid;" : "=r"(ret) );
435 return ret;
436}
437
441__device__ __forceinline__ unsigned int LaneMaskLt()
442{
443 unsigned int ret;
444 asm ("mov.u32 %0, %%lanemask_lt;" : "=r"(ret) );
445 return ret;
446}
447
451__device__ __forceinline__ unsigned int LaneMaskLe()
452{
453 unsigned int ret;
454 asm ("mov.u32 %0, %%lanemask_le;" : "=r"(ret) );
455 return ret;
456}
457
461__device__ __forceinline__ unsigned int LaneMaskGt()
462{
463 unsigned int ret;
464 asm ("mov.u32 %0, %%lanemask_gt;" : "=r"(ret) );
465 return ret;
466}
467
471__device__ __forceinline__ unsigned int LaneMaskGe()
472{
473 unsigned int ret;
474 asm ("mov.u32 %0, %%lanemask_ge;" : "=r"(ret) );
475 return ret;
476}
477 // end group UtilPtx
479
480
481
482
514template <
515 int LOGICAL_WARP_THREADS,
516 typename T>
517__device__ __forceinline__ T ShuffleUp(
518 T input,
519 int src_offset,
520 int first_thread,
521 unsigned int member_mask)
522{
524 enum {
525 SHFL_C = (32 - LOGICAL_WARP_THREADS) << 8
526 };
527
528 typedef typename UnitWord<T>::ShuffleWord ShuffleWord;
529
530 const int WORDS = (sizeof(T) + sizeof(ShuffleWord) - 1) / sizeof(ShuffleWord);
531
532 T output;
533 ShuffleWord *output_alias = reinterpret_cast<ShuffleWord *>(&output);
534 ShuffleWord *input_alias = reinterpret_cast<ShuffleWord *>(&input);
535
536 unsigned int shuffle_word;
537 shuffle_word = SHFL_UP_SYNC((unsigned int)input_alias[0], src_offset, first_thread | SHFL_C, member_mask);
538 output_alias[0] = shuffle_word;
539
540 #pragma unroll
541 for (int WORD = 1; WORD < WORDS; ++WORD)
542 {
543 shuffle_word = SHFL_UP_SYNC((unsigned int)input_alias[WORD], src_offset, first_thread | SHFL_C, member_mask);
544 output_alias[WORD] = shuffle_word;
545 }
546
547 return output;
548}
549
550
582template <
583 int LOGICAL_WARP_THREADS,
584 typename T>
585__device__ __forceinline__ T ShuffleDown(
586 T input,
587 int src_offset,
588 int last_thread,
589 unsigned int member_mask)
590{
592 enum {
593 SHFL_C = (32 - LOGICAL_WARP_THREADS) << 8
594 };
595
596 typedef typename UnitWord<T>::ShuffleWord ShuffleWord;
597
598 const int WORDS = (sizeof(T) + sizeof(ShuffleWord) - 1) / sizeof(ShuffleWord);
599
600 T output;
601 ShuffleWord *output_alias = reinterpret_cast<ShuffleWord *>(&output);
602 ShuffleWord *input_alias = reinterpret_cast<ShuffleWord *>(&input);
603
604 unsigned int shuffle_word;
605 shuffle_word = SHFL_DOWN_SYNC((unsigned int)input_alias[0], src_offset, last_thread | SHFL_C, member_mask);
606 output_alias[0] = shuffle_word;
607
608 #pragma unroll
609 for (int WORD = 1; WORD < WORDS; ++WORD)
610 {
611 shuffle_word = SHFL_DOWN_SYNC((unsigned int)input_alias[WORD], src_offset, last_thread | SHFL_C, member_mask);
612 output_alias[WORD] = shuffle_word;
613 }
614
615 return output;
616}
617
618
653template <
654 int LOGICAL_WARP_THREADS,
655 typename T>
656__device__ __forceinline__ T ShuffleIndex(
657 T input,
658 int src_lane,
659 unsigned int member_mask)
660{
662 enum {
663 SHFL_C = ((32 - LOGICAL_WARP_THREADS) << 8) | (LOGICAL_WARP_THREADS - 1)
664 };
665
666 typedef typename UnitWord<T>::ShuffleWord ShuffleWord;
667
668 const int WORDS = (sizeof(T) + sizeof(ShuffleWord) - 1) / sizeof(ShuffleWord);
669
670 T output;
671 ShuffleWord *output_alias = reinterpret_cast<ShuffleWord *>(&output);
672 ShuffleWord *input_alias = reinterpret_cast<ShuffleWord *>(&input);
673
674 unsigned int shuffle_word;
675 shuffle_word = SHFL_IDX_SYNC((unsigned int)input_alias[0],
676 src_lane,
677 SHFL_C,
678 member_mask);
679
680 output_alias[0] = shuffle_word;
681
682 #pragma unroll
683 for (int WORD = 1; WORD < WORDS; ++WORD)
684 {
685 shuffle_word = SHFL_IDX_SYNC((unsigned int)input_alias[WORD],
686 src_lane,
687 SHFL_C,
688 member_mask);
689
690 output_alias[WORD] = shuffle_word;
691 }
692
693 return output;
694}
695
696
697
702template <int LABEL_BITS>
703inline __device__ unsigned int MatchAny(unsigned int label)
704{
705 unsigned int retval;
706
707 // Extract masks of common threads for each bit
708 #pragma unroll
709 for (int BIT = 0; BIT < LABEL_BITS; ++BIT)
710 {
711 unsigned int mask;
712 unsigned int current_bit = 1 << BIT;
713 asm ("{\n"
714 " .reg .pred p;\n"
715 " and.b32 %0, %1, %2;"
716 " setp.eq.u32 p, %0, %2;\n"
717#ifdef CUB_USE_COOPERATIVE_GROUPS
718 " vote.ballot.sync.b32 %0, p, 0xffffffff;\n"
719#else
720 " vote.ballot.b32 %0, p;\n"
721#endif
722 " @!p not.b32 %0, %0;\n"
723 "}\n" : "=r"(mask) : "r"(label), "r"(current_bit));
724
725 // Remove peers who differ
726 retval = (BIT == 0) ? mask : retval & mask;
727 }
728
729 return retval;
730
731// // VOLTA match
732// unsigned int retval;
733// asm ("{\n"
734// " match.any.sync.b32 %0, %1, 0xffffffff;\n"
735// "}\n" : "=r"(retval) : "r"(label));
736// return retval;
737
738}
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757} // CUB namespace
758CUB_NS_POSTFIX // Optional outer namespace(s)
BlockRadixRank provides operations for ranking unsigned integer types within a CUDA thread block.
__device__ __forceinline__ unsigned int SHFL_UP_SYNC(unsigned int word, int src_offset, int flags, unsigned int member_mask)
Definition util_ptx.cuh:323
__device__ __forceinline__ void BFI(unsigned int &ret, unsigned int x, unsigned int y, unsigned int bit_start, unsigned int num_bits)
Bitfield insert. Inserts the num_bits least significant bits of y into x at bit-offset bit_start.
Definition util_ptx.cuh:176
__device__ __forceinline__ float FFMA_RZ(float a, float b, float c)
Definition util_ptx.cuh:381
__device__ __forceinline__ void BAR(int count)
Definition util_ptx.cuh:247
__device__ __forceinline__ unsigned int LaneMaskLt()
Returns the warp lane mask of all lanes less than the calling thread.
Definition util_ptx.cuh:441
__device__ __forceinline__ unsigned int SHL_ADD(unsigned int x, unsigned int shift, unsigned int addend)
Shift-left then add. Returns (x << shift) + addend.
Definition util_ptx.cuh:106
__device__ __forceinline__ int PRMT(unsigned int a, unsigned int b, unsigned int index)
Byte-permute. Pick four arbitrary bytes from two 32-bit registers, and reassemble them into a 32-bit ...
Definition util_ptx.cuh:235
__device__ __forceinline__ void WARP_SYNC(unsigned int member_mask)
Definition util_ptx.cuh:273
__device__ __forceinline__ unsigned int IADD3(unsigned int x, unsigned int y, unsigned int z)
Three-operand add. Returns x + y + z.
Definition util_ptx.cuh:198
__device__ __forceinline__ int WARP_ANY(int predicate, unsigned int member_mask)
Definition util_ptx.cuh:284
__device__ __forceinline__ unsigned int LaneMaskGt()
Returns the warp lane mask of all lanes greater than the calling thread.
Definition util_ptx.cuh:461
__device__ __forceinline__ int WARP_BALLOT(int predicate, unsigned int member_mask)
Definition util_ptx.cuh:310
__device__ __forceinline__ int RowMajorTid(int block_dim_x, int block_dim_y, int block_dim_z)
Returns the row-major linear thread identifier for a multidimensional thread block.
Definition util_ptx.cuh:409
__device__ __forceinline__ unsigned int SHR_ADD(unsigned int x, unsigned int shift, unsigned int addend)
Shift-right then add. Returns (x >> shift) + addend.
Definition util_ptx.cuh:87
__device__ __forceinline__ int CTA_SYNC_AND(int p)
Definition util_ptx.cuh:264
__device__ __forceinline__ void ThreadExit()
Terminates the calling thread.
Definition util_ptx.cuh:393
__device__ __forceinline__ unsigned int SHFL_IDX_SYNC(unsigned int word, int src_lane, int flags, unsigned int member_mask)
Definition util_ptx.cuh:355
__device__ __forceinline__ unsigned int LaneMaskLe()
Returns the warp lane mask of all lanes less than or equal to the calling thread.
Definition util_ptx.cuh:451
__device__ __forceinline__ void ThreadTrap()
Abort execution and generate an interrupt to the host CPU.
Definition util_ptx.cuh:401
__device__ __forceinline__ unsigned int LaneId()
Returns the warp lane ID of the calling thread.
Definition util_ptx.cuh:420
__device__ __forceinline__ unsigned int SHFL_DOWN_SYNC(unsigned int word, int src_offset, int flags, unsigned int member_mask)
Definition util_ptx.cuh:339
__device__ __forceinline__ unsigned int LaneMaskGe()
Returns the warp lane mask of all lanes greater than or equal to the calling thread.
Definition util_ptx.cuh:471
__device__ __forceinline__ unsigned int WarpId()
Returns the warp ID of the calling thread. Warp ID is guaranteed to be unique among warps,...
Definition util_ptx.cuh:431
__device__ __forceinline__ float FMUL_RZ(float a, float b)
Definition util_ptx.cuh:370
CTA_SYNC()
Definition util_ptx.cuh:255
__device__ __forceinline__ int WARP_ALL(int predicate, unsigned int member_mask)
Definition util_ptx.cuh:297
__device__ __forceinline__ unsigned int BFE(UnsignedBits source, unsigned int bit_start, unsigned int num_bits, Int2Type< BYTE_LEN >)
Definition util_ptx.cuh:127
__device__ __forceinline__ T ShuffleIndex(T input, int src_lane, unsigned int member_mask)
Shuffle-broadcast for any data type. Each warp-lanei obtains the value input contributed by warp-lane...
Definition util_ptx.cuh:656
__device__ __forceinline__ T ShuffleUp(T input, int src_offset, int first_thread, unsigned int member_mask)
Shuffle-up for any data type. Each warp-lanei obtains the value input contributed by warp-lanei-src_o...
Definition util_ptx.cuh:517
__device__ __forceinline__ T ShuffleDown(T input, int src_offset, int last_thread, unsigned int member_mask)
Shuffle-down for any data type. Each warp-lanei obtains the value input contributed by warp-lanei+src...
Definition util_ptx.cuh:585
Optional outer namespace(s)
KeyT const ValueT ValueT OffsetIteratorT OffsetIteratorT int
[in] The number of segments that comprise the sorting data
__device__ unsigned int MatchAny(unsigned int label)
Definition util_ptx.cuh:703
OffsetT int int num_bits
[in] Number of bits of current radix digit
OffsetT int current_bit
[in] Bit position of current radix digit
Allows for the treatment of an integral constant as a type at compile-time (e.g., to achieve static c...