OpenFPM_pdata  4.1.0
Project that contains the implementation of distributed structures
warp_reduce_shfl.cuh
1 /******************************************************************************
2  * Copyright (c) 2011, Duane Merrill. All rights reserved.
3  * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  * * Redistributions of source code must retain the above copyright
8  * notice, this list of conditions and the following disclaimer.
9  * * Redistributions in binary form must reproduce the above copyright
10  * notice, this list of conditions and the following disclaimer in the
11  * documentation and/or other materials provided with the distribution.
12  * * Neither the name of the NVIDIA CORPORATION nor the
13  * names of its contributors may be used to endorse or promote products
14  * derived from this software without specific prior written permission.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  *
27  ******************************************************************************/
28 
34 #pragma once
35 
36 #include "../../thread/thread_operators.cuh"
37 #include "../../util_ptx.cuh"
38 #include "../../util_type.cuh"
39 #include "../../util_macro.cuh"
40 #include "../../util_namespace.cuh"
41 
43 CUB_NS_PREFIX
44 
46 namespace cub {
47 
48 
54 template <
55  typename T,
56  int LOGICAL_WARP_THREADS,
57  int PTX_ARCH>
58 struct WarpReduceShfl
59 {
60  //---------------------------------------------------------------------
61  // Constants and type definitions
62  //---------------------------------------------------------------------
63 
64  enum
65  {
67  IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)),
68 
71  STEPS = Log2<LOGICAL_WARP_THREADS>::VALUE,
73  LOGICAL_WARPS = CUB_WARP_THREADS(PTX_ARCH) / LOGICAL_WARP_THREADS,
74 
76  SHFL_C = (CUB_WARP_THREADS(PTX_ARCH) - LOGICAL_WARP_THREADS) << 8
77 
78  };
79 
80  template <typename S>
81  struct IsInteger
82  {
83  enum {
85  IS_SMALL_UNSIGNED = (Traits<S>::CATEGORY == UNSIGNED_INTEGER) && (sizeof(S) <= sizeof(unsigned int))
86  };
87  };
88 
89 
92  typedef NullType TempStorage;
93 
94  //---------------------------------------------------------------------
95  // Thread fields
96  //---------------------------------------------------------------------
97 
99  unsigned int lane_id;
100 
102  unsigned int warp_id;
103 
105  unsigned int member_mask;
106 
107 
108  //---------------------------------------------------------------------
109  // Construction
110  //---------------------------------------------------------------------
111 
113  __device__ __forceinline__ WarpReduceShfl(
114  TempStorage &/*temp_storage*/)
115  {
116  lane_id = LaneId();
117  warp_id = 0;
118  member_mask = 0xffffffffu >> (CUB_WARP_THREADS(PTX_ARCH) - LOGICAL_WARP_THREADS);
119 
120  if (!IS_ARCH_WARP)
121  {
122  warp_id = lane_id / LOGICAL_WARP_THREADS;
123  lane_id = lane_id % LOGICAL_WARP_THREADS;
124  member_mask = member_mask << (warp_id * LOGICAL_WARP_THREADS);
125  }
126  }
127 
128 
129  //---------------------------------------------------------------------
130  // Reduction steps
131  //---------------------------------------------------------------------
132 
134  __device__ __forceinline__ unsigned int ReduceStep(
135  unsigned int input,
136  cub::Sum /*reduction_op*/,
137  int last_lane,
138  int offset)
139  {
140  unsigned int output;
141  int shfl_c = last_lane | SHFL_C; // Shuffle control (mask and last_lane)
142 
143  // Use predicate set from SHFL to guard against invalid peers
144 #ifdef CUB_USE_COOPERATIVE_GROUPS
145  asm volatile(
146  "{"
147  " .reg .u32 r0;"
148  " .reg .pred p;"
149  " shfl.sync.down.b32 r0|p, %1, %2, %3, %5;"
150  " @p add.u32 r0, r0, %4;"
151  " mov.u32 %0, r0;"
152  "}"
153  : "=r"(output) : "r"(input), "r"(offset), "r"(shfl_c), "r"(input), "r"(member_mask));
154 #else
155  asm volatile(
156  "{"
157  " .reg .u32 r0;"
158  " .reg .pred p;"
159  " shfl.down.b32 r0|p, %1, %2, %3;"
160  " @p add.u32 r0, r0, %4;"
161  " mov.u32 %0, r0;"
162  "}"
163  : "=r"(output) : "r"(input), "r"(offset), "r"(shfl_c), "r"(input));
164 #endif
165 
166  return output;
167  }
168 
169 
171  __device__ __forceinline__ float ReduceStep(
172  float input,
173  cub::Sum /*reduction_op*/,
174  int last_lane,
175  int offset)
176  {
177  float output;
178  int shfl_c = last_lane | SHFL_C; // Shuffle control (mask and last_lane)
179 
180  // Use predicate set from SHFL to guard against invalid peers
181 #ifdef CUB_USE_COOPERATIVE_GROUPS
182  asm volatile(
183  "{"
184  " .reg .f32 r0;"
185  " .reg .pred p;"
186  " shfl.sync.down.b32 r0|p, %1, %2, %3, %5;"
187  " @p add.f32 r0, r0, %4;"
188  " mov.f32 %0, r0;"
189  "}"
190  : "=f"(output) : "f"(input), "r"(offset), "r"(shfl_c), "f"(input), "r"(member_mask));
191 #else
192  asm volatile(
193  "{"
194  " .reg .f32 r0;"
195  " .reg .pred p;"
196  " shfl.down.b32 r0|p, %1, %2, %3;"
197  " @p add.f32 r0, r0, %4;"
198  " mov.f32 %0, r0;"
199  "}"
200  : "=f"(output) : "f"(input), "r"(offset), "r"(shfl_c), "f"(input));
201 #endif
202 
203  return output;
204  }
205 
206 
208  __device__ __forceinline__ unsigned long long ReduceStep(
209  unsigned long long input,
210  cub::Sum /*reduction_op*/,
211  int last_lane,
212  int offset)
213  {
214  unsigned long long output;
215  int shfl_c = last_lane | SHFL_C; // Shuffle control (mask and last_lane)
216 
217 #ifdef CUB_USE_COOPERATIVE_GROUPS
218  asm volatile(
219  "{"
220  " .reg .u32 lo;"
221  " .reg .u32 hi;"
222  " .reg .pred p;"
223  " mov.b64 {lo, hi}, %1;"
224  " shfl.sync.down.b32 lo|p, lo, %2, %3, %4;"
225  " shfl.sync.down.b32 hi|p, hi, %2, %3, %4;"
226  " mov.b64 %0, {lo, hi};"
227  " @p add.u64 %0, %0, %1;"
228  "}"
229  : "=l"(output) : "l"(input), "r"(offset), "r"(shfl_c), "r"(member_mask));
230 #else
231  asm volatile(
232  "{"
233  " .reg .u32 lo;"
234  " .reg .u32 hi;"
235  " .reg .pred p;"
236  " mov.b64 {lo, hi}, %1;"
237  " shfl.down.b32 lo|p, lo, %2, %3;"
238  " shfl.down.b32 hi|p, hi, %2, %3;"
239  " mov.b64 %0, {lo, hi};"
240  " @p add.u64 %0, %0, %1;"
241  "}"
242  : "=l"(output) : "l"(input), "r"(offset), "r"(shfl_c));
243 #endif
244 
245  return output;
246  }
247 
248 
250  __device__ __forceinline__ long long ReduceStep(
251  long long input,
252  cub::Sum /*reduction_op*/,
253  int last_lane,
254  int offset)
255  {
256  long long output;
257  int shfl_c = last_lane | SHFL_C; // Shuffle control (mask and last_lane)
258 
259  // Use predicate set from SHFL to guard against invalid peers
260 #ifdef CUB_USE_COOPERATIVE_GROUPS
261  asm volatile(
262  "{"
263  " .reg .u32 lo;"
264  " .reg .u32 hi;"
265  " .reg .pred p;"
266  " mov.b64 {lo, hi}, %1;"
267  " shfl.sync.down.b32 lo|p, lo, %2, %3, %4;"
268  " shfl.sync.down.b32 hi|p, hi, %2, %3, %4;"
269  " mov.b64 %0, {lo, hi};"
270  " @p add.s64 %0, %0, %1;"
271  "}"
272  : "=l"(output) : "l"(input), "r"(offset), "r"(shfl_c), "r"(member_mask));
273 #else
274  asm volatile(
275  "{"
276  " .reg .u32 lo;"
277  " .reg .u32 hi;"
278  " .reg .pred p;"
279  " mov.b64 {lo, hi}, %1;"
280  " shfl.down.b32 lo|p, lo, %2, %3;"
281  " shfl.down.b32 hi|p, hi, %2, %3;"
282  " mov.b64 %0, {lo, hi};"
283  " @p add.s64 %0, %0, %1;"
284  "}"
285  : "=l"(output) : "l"(input), "r"(offset), "r"(shfl_c));
286 #endif
287 
288  return output;
289  }
290 
291 
293  __device__ __forceinline__ double ReduceStep(
294  double input,
295  cub::Sum /*reduction_op*/,
296  int last_lane,
297  int offset)
298  {
299  double output;
300  int shfl_c = last_lane | SHFL_C; // Shuffle control (mask and last_lane)
301 
302  // Use predicate set from SHFL to guard against invalid peers
303 #ifdef CUB_USE_COOPERATIVE_GROUPS
304  asm volatile(
305  "{"
306  " .reg .u32 lo;"
307  " .reg .u32 hi;"
308  " .reg .pred p;"
309  " .reg .f64 r0;"
310  " mov.b64 %0, %1;"
311  " mov.b64 {lo, hi}, %1;"
312  " shfl.sync.down.b32 lo|p, lo, %2, %3, %4;"
313  " shfl.sync.down.b32 hi|p, hi, %2, %3, %4;"
314  " mov.b64 r0, {lo, hi};"
315  " @p add.f64 %0, %0, r0;"
316  "}"
317  : "=d"(output) : "d"(input), "r"(offset), "r"(shfl_c), "r"(member_mask));
318 #else
319  asm volatile(
320  "{"
321  " .reg .u32 lo;"
322  " .reg .u32 hi;"
323  " .reg .pred p;"
324  " .reg .f64 r0;"
325  " mov.b64 %0, %1;"
326  " mov.b64 {lo, hi}, %1;"
327  " shfl.down.b32 lo|p, lo, %2, %3;"
328  " shfl.down.b32 hi|p, hi, %2, %3;"
329  " mov.b64 r0, {lo, hi};"
330  " @p add.f64 %0, %0, r0;"
331  "}"
332  : "=d"(output) : "d"(input), "r"(offset), "r"(shfl_c));
333 #endif
334 
335  return output;
336  }
337 
338 
340  template <typename ValueT, typename KeyT>
341  __device__ __forceinline__ KeyValuePair<KeyT, ValueT> ReduceStep(
342  KeyValuePair<KeyT, ValueT> input,
343  SwizzleScanOp<ReduceByKeyOp<cub::Sum> > /*reduction_op*/,
344  int last_lane,
345  int offset)
346  {
347  KeyValuePair<KeyT, ValueT> output;
348 
349  KeyT other_key = ShuffleDown<LOGICAL_WARP_THREADS>(input.key, offset, last_lane, member_mask);
350 
351  output.key = input.key;
352  output.value = ReduceStep(
353  input.value,
354  cub::Sum(),
355  last_lane,
356  offset,
357  Int2Type<IsInteger<ValueT>::IS_SMALL_UNSIGNED>());
358 
359  if (input.key != other_key)
360  output.value = input.value;
361 
362  return output;
363  }
364 
365 
366 
368  template <typename ValueT, typename OffsetT>
369  __device__ __forceinline__ KeyValuePair<OffsetT, ValueT> ReduceStep(
370  KeyValuePair<OffsetT, ValueT> input,
371  SwizzleScanOp<ReduceBySegmentOp<cub::Sum> > /*reduction_op*/,
372  int last_lane,
373  int offset)
374  {
375  KeyValuePair<OffsetT, ValueT> output;
376 
377  output.value = ReduceStep(input.value, cub::Sum(), last_lane, offset, Int2Type<IsInteger<ValueT>::IS_SMALL_UNSIGNED>());
378  output.key = ReduceStep(input.key, cub::Sum(), last_lane, offset, Int2Type<IsInteger<OffsetT>::IS_SMALL_UNSIGNED>());
379 
380  if (input.key > 0)
381  output.value = input.value;
382 
383  return output;
384  }
385 
386 
388  template <typename _T, typename ReductionOp>
389  __device__ __forceinline__ _T ReduceStep(
390  _T input,
391  ReductionOp reduction_op,
392  int last_lane,
393  int offset)
394  {
395  _T output = input;
396 
397  _T temp = ShuffleDown<LOGICAL_WARP_THREADS>(output, offset, last_lane, member_mask);
398 
399  // Perform reduction op if valid
400  if (offset + lane_id <= last_lane)
401  output = reduction_op(input, temp);
402 
403  return output;
404  }
405 
406 
408  template <typename _T, typename ReductionOp>
409  __device__ __forceinline__ _T ReduceStep(
410  _T input,
411  ReductionOp reduction_op,
412  int last_lane,
413  int offset,
414  Int2Type<true> /*is_small_unsigned*/)
415  {
416  return ReduceStep(input, reduction_op, last_lane, offset);
417  }
418 
419 
421  template <typename _T, typename ReductionOp>
422  __device__ __forceinline__ _T ReduceStep(
423  _T input,
424  ReductionOp reduction_op,
425  int last_lane,
426  int offset,
427  Int2Type<false> /*is_small_unsigned*/)
428  {
429  return ReduceStep(input, reduction_op, last_lane, offset);
430  }
431 
432 
433  //---------------------------------------------------------------------
434  // Templated inclusive scan iteration
435  //---------------------------------------------------------------------
436 
437  template <typename ReductionOp, int STEP>
438  __device__ __forceinline__ void ReduceStep(
439  T& input,
440  ReductionOp reduction_op,
441  int last_lane,
442  Int2Type<STEP> /*step*/)
443  {
444  input = ReduceStep(input, reduction_op, last_lane, 1 << STEP, Int2Type<IsInteger<T>::IS_SMALL_UNSIGNED>());
445 
446  ReduceStep(input, reduction_op, last_lane, Int2Type<STEP + 1>());
447  }
448 
449  template <typename ReductionOp>
450  __device__ __forceinline__ void ReduceStep(
451  T& /*input*/,
452  ReductionOp /*reduction_op*/,
453  int /*last_lane*/,
454  Int2Type<STEPS> /*step*/)
455  {}
456 
457 
458  //---------------------------------------------------------------------
459  // Reduction operations
460  //---------------------------------------------------------------------
461 
463  template <
464  bool ALL_LANES_VALID,
465  typename ReductionOp>
466  __device__ __forceinline__ T Reduce(
467  T input,
468  int valid_items,
469  ReductionOp reduction_op)
470  {
471  int last_lane = (ALL_LANES_VALID) ?
472  LOGICAL_WARP_THREADS - 1 :
473  valid_items - 1;
474 
475  T output = input;
476 
477 // // Iterate reduction steps
478 // #pragma unroll
479 // for (int STEP = 0; STEP < STEPS; STEP++)
480 // {
481 // output = ReduceStep(output, reduction_op, last_lane, 1 << STEP, Int2Type<IsInteger<T>::IS_SMALL_UNSIGNED>());
482 // }
483 
484  // Template-iterate reduction steps
485  ReduceStep(output, reduction_op, last_lane, Int2Type<0>());
486 
487  return output;
488  }
489 
490 
492  template <
493  bool HEAD_SEGMENTED,
494  typename FlagT,
495  typename ReductionOp>
496  __device__ __forceinline__ T SegmentedReduce(
497  T input,
498  FlagT flag,
499  ReductionOp reduction_op)
500  {
501  // Get the start flags for each thread in the warp.
502  int warp_flags = WARP_BALLOT(flag, member_mask);
503 
504  // Convert to tail-segmented
505  if (HEAD_SEGMENTED)
506  warp_flags >>= 1;
507 
508  // Mask out the bits below the current thread
509  warp_flags &= LaneMaskGe();
510 
511  // Mask of physical lanes outside the logical warp and convert to logical lanemask
512  if (!IS_ARCH_WARP)
513  {
514  warp_flags = (warp_flags & member_mask) >> (warp_id * LOGICAL_WARP_THREADS);
515  }
516 
517  // Mask in the last lane of logical warp
518  warp_flags |= 1u << (LOGICAL_WARP_THREADS - 1);
519 
520  // Find the next set flag
521  int last_lane = __clz(__brev(warp_flags));
522 
523  T output = input;
524 
525 // // Iterate reduction steps
526 // #pragma unroll
527 // for (int STEP = 0; STEP < STEPS; STEP++)
528 // {
529 // output = ReduceStep(output, reduction_op, last_lane, 1 << STEP, Int2Type<IsInteger<T>::IS_SMALL_UNSIGNED>());
530 // }
531 
532  // Template-iterate reduction steps
533  ReduceStep(output, reduction_op, last_lane, Int2Type<0>());
534 
535  return output;
536  }
537 };
538 
539 
540 } // CUB namespace
541 CUB_NS_POSTFIX // Optional outer namespace(s)
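
For reference, the enum constants and the constructor in the listing reduce to simple bit arithmetic when the logical warp is narrower than the physical warp. The snippet below is an illustrative worked example, not part of the file: it assumes a logical warp of 8 threads on a 32-thread physical warp, and the trailing-underscore names are local to the sketch.

// Illustrative only: values produced by the constants and constructor above
// for LOGICAL_WARP_THREADS = 8 on a 32-thread physical warp.
constexpr unsigned int WARP_THREADS_          = 32;
constexpr unsigned int LOGICAL_WARP_THREADS_  = 8;

// SHFL_C places (32 - 8) = 24 in the upper (segment) bits of the shuffle
// control word, so shfl.down never crosses an 8-lane logical-warp boundary:
//   SHFL_C = 24 << 8 = 0x1800
constexpr unsigned int SHFL_C_ = (WARP_THREADS_ - LOGICAL_WARP_THREADS_) << 8;

// Constructor logic for a thread whose physical lane ID is 13:
//   warp_id     = 13 / 8 = 1
//   lane_id     = 13 % 8 = 5
//   member_mask = (0xffffffffu >> (32 - 8)) << (1 * 8) = 0x0000ff00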