OpenFPM_pdata  4.1.0
Project that contains the implementation of distributed structures
util_ptx.cuh
Go to the documentation of this file.
1 /******************************************************************************
2  * Copyright (c) 2011, Duane Merrill. All rights reserved.
3  * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  * * Redistributions of source code must retain the above copyright
8  * notice, this list of conditions and the following disclaimer.
9  * * Redistributions in binary form must reproduce the above copyright
10  * notice, this list of conditions and the following disclaimer in the
11  * documentation and/or other materials provided with the distribution.
12  * * Neither the name of the NVIDIA CORPORATION nor the
13  * names of its contributors may be used to endorse or promote products
14  * derived from this software without specific prior written permission.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  *
27  ******************************************************************************/
28 
35 #pragma once
36 
37 #include "util_type.cuh"
38 #include "util_arch.cuh"
39 #include "util_namespace.cuh"
40 #include "util_debug.cuh"
41 
42 
44 CUB_NS_PREFIX
45 
47 namespace cub {
48 
49 
56 /******************************************************************************
57  * PTX helper macros
58  ******************************************************************************/
59 
60 #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
61 
// Select pointer-size register modifiers for inline PTX asm:
// on LP64/Win64 targets pointers are 64-bit ("l" registers), otherwise 32-bit ("r").
#if defined(_WIN64) || defined(__LP64__)
    #define __CUB_LP64__ 1
    // 64-bit register modifier for inlined asm
    #define _CUB_ASM_PTR_ "l"
    #define _CUB_ASM_PTR_SIZE_ "u64"
#else
    #define __CUB_LP64__ 0
    // 32-bit register modifier for inlined asm
    #define _CUB_ASM_PTR_ "r"
    #define _CUB_ASM_PTR_SIZE_ "u32"
#endif
76 
77 #endif // DOXYGEN_SHOULD_SKIP_THIS
78 
79 
80 /******************************************************************************
81  * Inlined PTX intrinsics
82  ******************************************************************************/
83 
/**
 * \brief Shift-right then add.  Returns (x >> shift) + addend.
 */
__device__ __forceinline__ unsigned int SHR_ADD(
    unsigned int x,
    unsigned int shift,
    unsigned int addend)
{
    unsigned int ret;
#if CUB_PTX_ARCH >= 200
    // Fused video-instruction form on SM 2.0+: one vshr.add instead of shift + add
    asm ("vshr.u32.u32.u32.clamp.add %0, %1, %2, %3;" :
        "=r"(ret) : "r"(x), "r"(shift), "r"(addend));
#else
    ret = (x >> shift) + addend;
#endif
    return ret;
}
101 
102 
/**
 * \brief Shift-left then add.  Returns (x << shift) + addend.
 */
__device__ __forceinline__ unsigned int SHL_ADD(
    unsigned int x,
    unsigned int shift,
    unsigned int addend)
{
    unsigned int ret;
#if CUB_PTX_ARCH >= 200
    // Fused video-instruction form on SM 2.0+: one vshl.add instead of shift + add
    asm ("vshl.u32.u32.u32.clamp.add %0, %1, %2, %3;" :
        "=r"(ret) : "r"(x), "r"(shift), "r"(addend));
#else
    ret = (x << shift) + addend;
#endif
    return ret;
}
120 
121 #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
122 
/**
 * \brief Bitfield-extract.  Extracts \p num_bits bits from \p source
 * starting at \p bit_start, zero-extending the result.
 *
 * Generic overload for source types up to 4 bytes wide (the BYTE_LEN
 * dispatch tag); an Int2Type<8> overload handles 8-byte sources.
 */
template <typename UnsignedBits, int BYTE_LEN>
__device__ __forceinline__ unsigned int BFE(
    UnsignedBits source,
    unsigned int bit_start,
    unsigned int num_bits,
    Int2Type<BYTE_LEN> /*byte_len*/)
{
    unsigned int bits;
#if CUB_PTX_ARCH >= 200
    asm ("bfe.u32 %0, %1, %2, %3;" : "=r"(bits) : "r"((unsigned int) source), "r"(bit_start), "r"(num_bits));
#else
    // Use an unsigned literal: with the original (1 << num_bits), num_bits == 31
    // shifts into the sign bit of a signed int, which is undefined behavior.
    const unsigned int MASK = (1u << num_bits) - 1;
    bits = (source >> bit_start) & MASK;
#endif
    return bits;
}
142 
143 
/**
 * \brief Bitfield-extract for 8-byte source types (Int2Type<8> tag).
 *
 * Pure-C++ path: builds a 64-bit mask of \p num_bits ones and extracts
 * the field starting at \p bit_start, truncating to unsigned int on return.
 * NOTE(review): num_bits == 64 would make the shift UB — presumably callers
 * never request a full-width extract; confirm against call sites.
 */
template <typename UnsignedBits>
__device__ __forceinline__ unsigned int BFE(
    UnsignedBits source,
    unsigned int bit_start,
    unsigned int num_bits,
    Int2Type<8> /*byte_len*/)
{
    const unsigned long long MASK = (1ull << num_bits) - 1;
    return (source >> bit_start) & MASK;
}
157 
158 #endif // DOXYGEN_SHOULD_SKIP_THIS
159 
/**
 * \brief Bitfield-extract.  Extracts \p num_bits bits from \p source
 * starting at \p bit_start, zero-extending the result.
 *
 * Dispatches on sizeof(UnsignedBits) so 8-byte sources take the 64-bit
 * overload and everything else takes the 32-bit (bfe.u32) overload.
 */
template <typename UnsignedBits>
__device__ __forceinline__ unsigned int BFE(
    UnsignedBits source,
    unsigned int bit_start,
    unsigned int num_bits)
{
    return BFE(source, bit_start, num_bits, Int2Type<sizeof(UnsignedBits)>());
}
171 
172 
/**
 * \brief Bitfield insert.  Inserts the \p num_bits least significant bits
 * of \p y into \p x at bit-offset \p bit_start, writing the result to \p ret.
 */
__device__ __forceinline__ void BFI(
    unsigned int &ret,
    unsigned int x,
    unsigned int y,
    unsigned int bit_start,
    unsigned int num_bits)
{
#if CUB_PTX_ARCH >= 200
    // PTX bfi.b32 d,a,b,c,e inserts the low e bits of a into b at offset c,
    // so with a=y, b=x this inserts y into x, matching the contract above.
    asm ("bfi.b32 %0, %1, %2, %3, %4;" :
        "=r"(ret) : "r"(y), "r"(x), "r"(bit_start), "r"(num_bits));
#else
    // Fixed to match the bfi.b32 path: the original fallback inserted x's
    // bits into y (operands swapped relative to the asm and the docs).
    // Also uses an unsigned literal so num_bits == 31 does not shift into
    // the sign bit of a signed int (undefined behavior).
    y <<= bit_start;
    unsigned int MASK = ((1u << num_bits) - 1) << bit_start;
    ret = (x & ~MASK) | (y & MASK);
#endif
}
193 
194 
/**
 * \brief Three-operand add.  Returns x + y + z.
 */
__device__ __forceinline__ unsigned int IADD3(unsigned int x, unsigned int y, unsigned int z)
{
#if CUB_PTX_ARCH >= 200
    // Single vadd instruction on SM 2.0+ instead of two dependent adds
    asm ("vadd.u32.u32.u32.add %0, %1, %2, %3;" : "=r"(x) : "r"(x), "r"(y), "r"(z));
#else
    x = x + y + z;
#endif
    return x;
}
207 
208 
/**
 * \brief Byte-permute.  Picks four arbitrary bytes from the two 32-bit
 * registers \p a and \p b, selected per 4-bit nibble of \p index, and
 * reassembles them into a single 32-bit result (PTX prmt.b32).
 */
__device__ __forceinline__ int PRMT(unsigned int a, unsigned int b, unsigned int index)
{
    int ret;
    asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(ret) : "r"(a), "r"(b), "r"(index));
    return ret;
}
241 
242 #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
243 
/**
 * \brief Barrier on named barrier 1, synchronizing \p count threads
 * (PTX bar.sync with an explicit thread count).
 */
__device__ __forceinline__ void BAR(int count)
{
    asm volatile("bar.sync 1, %0;" : : "r"(count));
}
251 
/**
 * \brief CTA-wide barrier: all threads of the block synchronize
 * (wraps __syncthreads()).
 */
__device__ __forceinline__ void CTA_SYNC()
{
    __syncthreads();
}
259 
260 
/**
 * \brief CTA-wide barrier with predicate reduction: returns non-zero iff
 * \p p is non-zero for all threads of the block (wraps __syncthreads_and()).
 */
__device__ __forceinline__ int CTA_SYNC_AND(int p)
{
    return __syncthreads_and(p);
}
268 
269 
/**
 * \brief Warp barrier over the lanes in \p member_mask.
 *
 * Without CUB_USE_COOPERATIVE_GROUPS this compiles to nothing — the
 * pre-Volta implicit warp-synchronous execution model is assumed.
 */
__device__ __forceinline__ void WARP_SYNC(unsigned int member_mask)
{
#ifdef CUB_USE_COOPERATIVE_GROUPS
    __syncwarp(member_mask);
#endif
}
279 
280 
/**
 * \brief Warp any-vote: returns non-zero iff \p predicate is non-zero for
 * any participating lane.  Uses the _sync variant (with \p member_mask)
 * when cooperative groups are enabled; the legacy mask-less __any otherwise.
 */
__device__ __forceinline__ int WARP_ANY(int predicate, unsigned int member_mask)
{
#ifdef CUB_USE_COOPERATIVE_GROUPS
    return __any_sync(member_mask, predicate);
#else
    return ::__any(predicate);
#endif
}
292 
293 
/**
 * \brief Warp all-vote: returns non-zero iff \p predicate is non-zero for
 * all participating lanes.  Uses the _sync variant (with \p member_mask)
 * when cooperative groups are enabled; the legacy mask-less __all otherwise.
 */
__device__ __forceinline__ int WARP_ALL(int predicate, unsigned int member_mask)
{
#ifdef CUB_USE_COOPERATIVE_GROUPS
    return __all_sync(member_mask, predicate);
#else
    return ::__all(predicate);
#endif
}
305 
306 
/**
 * \brief Warp ballot: returns a 32-bit mask with one bit set per
 * participating lane whose \p predicate is non-zero.  Uses the _sync
 * variant (with \p member_mask) when cooperative groups are enabled.
 */
__device__ __forceinline__ int WARP_BALLOT(int predicate, unsigned int member_mask)
{
#ifdef CUB_USE_COOPERATIVE_GROUPS
    return __ballot_sync(member_mask, predicate);
#else
    return __ballot(predicate);
#endif
}
318 
/**
 * \brief Warp shuffle-up of a single 32-bit word.
 *
 * \p flags is the packed PTX shfl "c" operand (segment mask / clamp value),
 * typically first_lane | SHFL_C as built by ShuffleUp below.  The .sync form
 * (with \p member_mask) is used when cooperative groups are enabled.
 */
__device__ __forceinline__
unsigned int SHFL_UP_SYNC(unsigned int word, int src_offset, int flags, unsigned int member_mask)
{
#ifdef CUB_USE_COOPERATIVE_GROUPS
    asm volatile("shfl.sync.up.b32 %0, %1, %2, %3, %4;"
        : "=r"(word) : "r"(word), "r"(src_offset), "r"(flags), "r"(member_mask));
#else
    asm volatile("shfl.up.b32 %0, %1, %2, %3;"
        : "=r"(word) : "r"(word), "r"(src_offset), "r"(flags));
#endif
    return word;
}
334 
/**
 * \brief Warp shuffle-down of a single 32-bit word.
 *
 * \p flags is the packed PTX shfl "c" operand (segment mask / clamp value),
 * typically last_lane | SHFL_C as built by ShuffleDown below.  The .sync form
 * (with \p member_mask) is used when cooperative groups are enabled.
 */
__device__ __forceinline__
unsigned int SHFL_DOWN_SYNC(unsigned int word, int src_offset, int flags, unsigned int member_mask)
{
#ifdef CUB_USE_COOPERATIVE_GROUPS
    asm volatile("shfl.sync.down.b32 %0, %1, %2, %3, %4;"
        : "=r"(word) : "r"(word), "r"(src_offset), "r"(flags), "r"(member_mask));
#else
    asm volatile("shfl.down.b32 %0, %1, %2, %3;"
        : "=r"(word) : "r"(word), "r"(src_offset), "r"(flags));
#endif
    return word;
}
350 
/**
 * \brief Warp shuffle-broadcast of a single 32-bit word from \p src_lane.
 *
 * \p flags is the packed PTX shfl "c" operand (segment mask / clamp value),
 * as built by ShuffleIndex below.  The .sync form (with \p member_mask) is
 * used when cooperative groups are enabled.
 */
__device__ __forceinline__
unsigned int SHFL_IDX_SYNC(unsigned int word, int src_lane, int flags, unsigned int member_mask)
{
#ifdef CUB_USE_COOPERATIVE_GROUPS
    asm volatile("shfl.sync.idx.b32 %0, %1, %2, %3, %4;"
        : "=r"(word) : "r"(word), "r"(src_lane), "r"(flags), "r"(member_mask));
#else
    asm volatile("shfl.idx.b32 %0, %1, %2, %3;"
        : "=r"(word) : "r"(word), "r"(src_lane), "r"(flags));
#endif
    return word;
}
366 
/**
 * \brief Floating point multiply with explicit round-toward-zero
 * (PTX mul.rz.f32), bypassing the compiler's default rounding mode.
 */
__device__ __forceinline__ float FMUL_RZ(float a, float b)
{
    float d;
    asm ("mul.rz.f32 %0, %1, %2;" : "=f"(d) : "f"(a), "f"(b));
    return d;
}
376 
377 
/**
 * \brief Fused multiply-add (a * b + c) with explicit round-toward-zero
 * (PTX fma.rz.f32), bypassing the compiler's default rounding mode.
 */
__device__ __forceinline__ float FFMA_RZ(float a, float b, float c)
{
    float d;
    asm ("fma.rz.f32 %0, %1, %2, %3;" : "=f"(d) : "f"(a), "f"(b), "f"(c));
    return d;
}
387 
388 #endif // DOXYGEN_SHOULD_SKIP_THIS
389 
/**
 * \brief Terminates the calling thread (PTX exit instruction).
 */
__device__ __forceinline__ void ThreadExit() {
    asm volatile("exit;");
}
396 
397 
/**
 * \brief Aborts execution and generates an interrupt to the host CPU
 * (PTX trap instruction).
 */
__device__ __forceinline__ void ThreadTrap() {
    asm volatile("trap;");
}
404 
405 
/**
 * \brief Returns the row-major linear thread identifier of the calling
 * thread within a (possibly multidimensional) thread block.
 *
 * The y- and z-dimension contributions are skipped entirely when the
 * corresponding block dimension is 1, so 1D/2D launches pay nothing extra.
 */
__device__ __forceinline__ int RowMajorTid(int block_dim_x, int block_dim_y, int block_dim_z)
{
    int tid = threadIdx.x;
    if (block_dim_y != 1)
        tid += threadIdx.y * block_dim_x;
    if (block_dim_z != 1)
        tid += threadIdx.z * block_dim_x * block_dim_y;
    return tid;
}
415 
416 
/**
 * \brief Returns the warp lane ID of the calling thread
 * (PTX special register %laneid, in [0, 31]).
 */
__device__ __forceinline__ unsigned int LaneId()
{
    unsigned int ret;
    asm ("mov.u32 %0, %%laneid;" : "=r"(ret) );
    return ret;
}
426 
427 
/**
 * \brief Returns the warp ID of the calling thread (PTX special register
 * %warpid).  Unique among the block's resident warps, but not necessarily
 * a zero-based ranking within the block.
 */
__device__ __forceinline__ unsigned int WarpId()
{
    unsigned int ret;
    asm ("mov.u32 %0, %%warpid;" : "=r"(ret) );
    return ret;
}
437 
/**
 * \brief Returns the warp lane mask of all lanes less than the calling
 * thread (PTX special register %lanemask_lt).
 */
__device__ __forceinline__ unsigned int LaneMaskLt()
{
    unsigned int ret;
    asm ("mov.u32 %0, %%lanemask_lt;" : "=r"(ret) );
    return ret;
}
447 
/**
 * \brief Returns the warp lane mask of all lanes less than or equal to the
 * calling thread (PTX special register %lanemask_le).
 */
__device__ __forceinline__ unsigned int LaneMaskLe()
{
    unsigned int ret;
    asm ("mov.u32 %0, %%lanemask_le;" : "=r"(ret) );
    return ret;
}
457 
/**
 * \brief Returns the warp lane mask of all lanes greater than the calling
 * thread (PTX special register %lanemask_gt).
 */
__device__ __forceinline__ unsigned int LaneMaskGt()
{
    unsigned int ret;
    asm ("mov.u32 %0, %%lanemask_gt;" : "=r"(ret) );
    return ret;
}
467 
/**
 * \brief Returns the warp lane mask of all lanes greater than or equal to
 * the calling thread (PTX special register %lanemask_ge).
 */
__device__ __forceinline__ unsigned int LaneMaskGe()
{
    unsigned int ret;
    asm ("mov.u32 %0, %%lanemask_ge;" : "=r"(ret) );
    return ret;
}
477  // end group UtilPtx
479 
480 
481 
482 
/**
 * \brief Shuffle-up for any data type T.  Each lane obtains the \p input
 * contributed by the lane \p src_offset below it; lanes below
 * \p first_thread keep their own value.
 *
 * T is shuffled word-by-word: it is reinterpreted as an array of
 * ShuffleWord (UnitWord<T>::ShuffleWord) and each word is moved with a
 * separate SHFL_UP_SYNC.
 */
template <
    int LOGICAL_WARP_THREADS,
    typename T>
__device__ __forceinline__ T ShuffleUp(
    T input,
    int src_offset,
    int first_thread,
    unsigned int member_mask)
{
    // Upper byte of the PTX shfl "c" operand: segment mask for a logical
    // warp of LOGICAL_WARP_THREADS lanes; OR'd with first_thread (clamp).
    enum {
        SHFL_C = (32 - LOGICAL_WARP_THREADS) << 8
    };

    typedef typename UnitWord<T>::ShuffleWord ShuffleWord;

    // Number of ShuffleWord words needed to cover T (rounded up)
    const int WORDS = (sizeof(T) + sizeof(ShuffleWord) - 1) / sizeof(ShuffleWord);

    T output;
    // Alias input/output as word arrays so arbitrary T can be shuffled
    ShuffleWord *output_alias = reinterpret_cast<ShuffleWord *>(&output);
    ShuffleWord *input_alias = reinterpret_cast<ShuffleWord *>(&input);

    unsigned int shuffle_word;
    shuffle_word = SHFL_UP_SYNC((unsigned int)input_alias[0], src_offset, first_thread | SHFL_C, member_mask);
    output_alias[0] = shuffle_word;

    // Remaining words (loop is fully unrolled; WORDS is a compile-time value)
    #pragma unroll
    for (int WORD = 1; WORD < WORDS; ++WORD)
    {
        shuffle_word = SHFL_UP_SYNC((unsigned int)input_alias[WORD], src_offset, first_thread | SHFL_C, member_mask);
        output_alias[WORD] = shuffle_word;
    }

    return output;
}
549 
550 
/**
 * \brief Shuffle-down for any data type T.  Each lane obtains the \p input
 * contributed by the lane \p src_offset above it; lanes above
 * \p last_thread keep their own value.
 *
 * T is shuffled word-by-word: it is reinterpreted as an array of
 * ShuffleWord (UnitWord<T>::ShuffleWord) and each word is moved with a
 * separate SHFL_DOWN_SYNC.
 */
template <
    int LOGICAL_WARP_THREADS,
    typename T>
__device__ __forceinline__ T ShuffleDown(
    T input,
    int src_offset,
    int last_thread,
    unsigned int member_mask)
{
    // Upper byte of the PTX shfl "c" operand: segment mask for a logical
    // warp of LOGICAL_WARP_THREADS lanes; OR'd with last_thread (clamp).
    enum {
        SHFL_C = (32 - LOGICAL_WARP_THREADS) << 8
    };

    typedef typename UnitWord<T>::ShuffleWord ShuffleWord;

    // Number of ShuffleWord words needed to cover T (rounded up)
    const int WORDS = (sizeof(T) + sizeof(ShuffleWord) - 1) / sizeof(ShuffleWord);

    T output;
    // Alias input/output as word arrays so arbitrary T can be shuffled
    ShuffleWord *output_alias = reinterpret_cast<ShuffleWord *>(&output);
    ShuffleWord *input_alias = reinterpret_cast<ShuffleWord *>(&input);

    unsigned int shuffle_word;
    shuffle_word = SHFL_DOWN_SYNC((unsigned int)input_alias[0], src_offset, last_thread | SHFL_C, member_mask);
    output_alias[0] = shuffle_word;

    // Remaining words (loop is fully unrolled; WORDS is a compile-time value)
    #pragma unroll
    for (int WORD = 1; WORD < WORDS; ++WORD)
    {
        shuffle_word = SHFL_DOWN_SYNC((unsigned int)input_alias[WORD], src_offset, last_thread | SHFL_C, member_mask);
        output_alias[WORD] = shuffle_word;
    }

    return output;
}
617 
618 
/**
 * \brief Shuffle-broadcast for any data type T.  Each lane obtains the
 * \p input contributed by lane \p src_lane of its logical warp.
 *
 * T is shuffled word-by-word: it is reinterpreted as an array of
 * ShuffleWord (UnitWord<T>::ShuffleWord) and each word is moved with a
 * separate SHFL_IDX_SYNC.
 */
template <
    int LOGICAL_WARP_THREADS,
    typename T>
__device__ __forceinline__ T ShuffleIndex(
    T input,
    int src_lane,
    unsigned int member_mask)
{
    // Packed PTX shfl "c" operand: segment mask for a logical warp of
    // LOGICAL_WARP_THREADS lanes in bits [12:8], lane clamp in bits [4:0]
    enum {
        SHFL_C = ((32 - LOGICAL_WARP_THREADS) << 8) | (LOGICAL_WARP_THREADS - 1)
    };

    typedef typename UnitWord<T>::ShuffleWord ShuffleWord;

    // Number of ShuffleWord words needed to cover T (rounded up)
    const int WORDS = (sizeof(T) + sizeof(ShuffleWord) - 1) / sizeof(ShuffleWord);

    T output;
    // Alias input/output as word arrays so arbitrary T can be shuffled
    ShuffleWord *output_alias = reinterpret_cast<ShuffleWord *>(&output);
    ShuffleWord *input_alias = reinterpret_cast<ShuffleWord *>(&input);

    unsigned int shuffle_word;
    shuffle_word = SHFL_IDX_SYNC((unsigned int)input_alias[0],
                                 src_lane,
                                 SHFL_C,
                                 member_mask);

    output_alias[0] = shuffle_word;

    // Remaining words (loop is fully unrolled; WORDS is a compile-time value)
    #pragma unroll
    for (int WORD = 1; WORD < WORDS; ++WORD)
    {
        shuffle_word = SHFL_IDX_SYNC((unsigned int)input_alias[WORD],
                                     src_lane,
                                     SHFL_C,
                                     member_mask);

        output_alias[WORD] = shuffle_word;
    }

    return output;
}
695 
696 
697 
/**
 * \brief Returns the warp mask of peer lanes whose low LABEL_BITS bits of
 * \p label match the calling lane's.
 *
 * For each label bit, a ballot collects the lanes where that bit is set;
 * the ballot (or its complement, for lanes where the bit is clear) is a
 * mask of lanes agreeing on that bit.  AND-ing across all LABEL_BITS bits
 * leaves exactly the lanes with an identical label.
 */
template <int LABEL_BITS>
inline __device__ unsigned int MatchAny(unsigned int label)
{
    unsigned int retval;

    // Extract masks of common threads for each bit
    #pragma unroll
    for (int BIT = 0; BIT < LABEL_BITS; ++BIT)
    {
        unsigned int mask;
        unsigned int current_bit = 1 << BIT;
        // p = (this lane's BIT is set); mask = ballot of p; if p is false,
        // complement the ballot so mask always holds the lanes agreeing
        // with this lane on BIT
        asm ("{\n"
            " .reg .pred p;\n"
            " and.b32 %0, %1, %2;"
            " setp.eq.u32 p, %0, %2;\n"
#ifdef CUB_USE_COOPERATIVE_GROUPS
            " vote.ballot.sync.b32 %0, p, 0xffffffff;\n"
#else
            " vote.ballot.b32 %0, p;\n"
#endif
            " @!p not.b32 %0, %0;\n"
            "}\n" : "=r"(mask) : "r"(label), "r"(current_bit));

        // Remove peers who differ
        retval = (BIT == 0) ? mask : retval & mask;
    }

    return retval;

// // VOLTA match
// unsigned int retval;
// asm ("{\n"
// " match.any.sync.b32 %0, %1, 0xffffffff;\n"
// "}\n" : "=r"(retval) : "r"(label));
// return retval;

}
739 
740 
741 
742 
743 
744 
745 
746 
747 
748 
749 
750 
751 
752 
753 
754 
755 
756 
757 } // CUB namespace
758 CUB_NS_POSTFIX // Optional outer namespace(s)
__device__ __forceinline__ unsigned int LaneMaskLt()
Returns the warp lane mask of all lanes less than the calling thread.
Definition: util_ptx.cuh:441
__device__ __forceinline__ int CTA_SYNC_AND(int p)
Definition: util_ptx.cuh:264
__device__ __forceinline__ T ShuffleDown(T input, int src_offset, int last_thread, unsigned int member_mask)
Shuffle-down for any data type. Each warp lane i obtains the value input contributed by warp lane i+src_offset.
Definition: util_ptx.cuh:585
__device__ __forceinline__ int PRMT(unsigned int a, unsigned int b, unsigned int index)
Byte-permute. Pick four arbitrary bytes from two 32-bit registers, and reassemble them into a 32-bit ...
Definition: util_ptx.cuh:235
__device__ __forceinline__ unsigned int SHFL_UP_SYNC(unsigned int word, int src_offset, int flags, unsigned int member_mask)
Definition: util_ptx.cuh:323
Optional outer namespace(s)
__device__ __forceinline__ float FFMA_RZ(float a, float b, float c)
Definition: util_ptx.cuh:381
__device__ __forceinline__ T ShuffleIndex(T input, int src_lane, unsigned int member_mask)
Shuffle-broadcast for any data type. Each warp lane obtains the value input contributed by warp lane src_lane.
Definition: util_ptx.cuh:656
OffsetT int current_bit
[in] Bit position of current radix digit
__device__ __forceinline__ int WARP_BALLOT(int predicate, unsigned int member_mask)
Definition: util_ptx.cuh:310
__device__ __forceinline__ int WARP_ANY(int predicate, unsigned int member_mask)
Definition: util_ptx.cuh:284
OffsetT int int num_bits
[in] Number of bits of current radix digit
CTA_SYNC()
Definition: util_ptx.cuh:255
__device__ __forceinline__ unsigned int LaneId()
Returns the warp lane ID of the calling thread.
Definition: util_ptx.cuh:420
__device__ __forceinline__ int WARP_ALL(int predicate, unsigned int member_mask)
Definition: util_ptx.cuh:297
__device__ __forceinline__ unsigned int LaneMaskLe()
Returns the warp lane mask of all lanes less than or equal to the calling thread.
Definition: util_ptx.cuh:451
__device__ __forceinline__ unsigned int LaneMaskGe()
Returns the warp lane mask of all lanes greater than or equal to the calling thread.
Definition: util_ptx.cuh:471
__device__ __forceinline__ void ThreadTrap()
Abort execution and generate an interrupt to the host CPU.
Definition: util_ptx.cuh:401
BlockRadixRank provides operations for ranking unsigned integer types within a CUDA thread block.
__device__ unsigned int MatchAny(unsigned int label)
Definition: util_ptx.cuh:703
__device__ __forceinline__ unsigned int SHFL_DOWN_SYNC(unsigned int word, int src_offset, int flags, unsigned int member_mask)
Definition: util_ptx.cuh:339
__device__ __forceinline__ void ThreadExit()
Terminates the calling thread.
Definition: util_ptx.cuh:393
__device__ __forceinline__ unsigned int SHL_ADD(unsigned int x, unsigned int shift, unsigned int addend)
Shift-left then add. Returns (x << shift) + addend.
Definition: util_ptx.cuh:106
Allows for the treatment of an integral constant as a type at compile-time (e.g., to achieve static c...
Definition: util_type.cuh:275
__device__ __forceinline__ int RowMajorTid(int block_dim_x, int block_dim_y, int block_dim_z)
Returns the row-major linear thread identifier for a multidimensional thread block.
Definition: util_ptx.cuh:409
__device__ __forceinline__ T ShuffleUp(T input, int src_offset, int first_thread, unsigned int member_mask)
Shuffle-up for any data type. Each warp lane i obtains the value input contributed by warp lane i-src_offset.
Definition: util_ptx.cuh:517
__device__ __forceinline__ unsigned int WarpId()
Returns the warp ID of the calling thread. Warp ID is guaranteed to be unique among warps,...
Definition: util_ptx.cuh:431
__device__ __forceinline__ unsigned int SHR_ADD(unsigned int x, unsigned int shift, unsigned int addend)
Shift-right then add. Returns (x >> shift) + addend.
Definition: util_ptx.cuh:87
__device__ __forceinline__ unsigned int SHFL_IDX_SYNC(unsigned int word, int src_lane, int flags, unsigned int member_mask)
Definition: util_ptx.cuh:355
__device__ __forceinline__ void BAR(int count)
Definition: util_ptx.cuh:247
__device__ __forceinline__ unsigned int IADD3(unsigned int x, unsigned int y, unsigned int z)
Three-operand add. Returns x + y + z.
Definition: util_ptx.cuh:198
__device__ __forceinline__ float FMUL_RZ(float a, float b)
Definition: util_ptx.cuh:370
__device__ __forceinline__ unsigned int LaneMaskGt()
Returns the warp lane mask of all lanes greater than the calling thread.
Definition: util_ptx.cuh:461
__device__ __forceinline__ void WARP_SYNC(unsigned int member_mask)
Definition: util_ptx.cuh:273
__device__ __forceinline__ void BFI(unsigned int &ret, unsigned int x, unsigned int y, unsigned int bit_start, unsigned int num_bits)
Bitfield insert. Inserts the num_bits least significant bits of y into x at bit-offset bit_start.
Definition: util_ptx.cuh:176
__device__ __forceinline__ unsigned int BFE(UnsignedBits source, unsigned int bit_start, unsigned int num_bits, Int2Type< BYTE_LEN >)
Definition: util_ptx.cuh:127