OpenFPM_pdata  4.1.0
Project that contain the implementation of distributed structures
warp_scan_shfl.cuh
Go to the documentation of this file.
1 /******************************************************************************
2  * Copyright (c) 2011, Duane Merrill. All rights reserved.
3  * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  * * Redistributions of source code must retain the above copyright
8  * notice, this list of conditions and the following disclaimer.
9  * * Redistributions in binary form must reproduce the above copyright
10  * notice, this list of conditions and the following disclaimer in the
11  * documentation and/or other materials provided with the distribution.
12  * * Neither the name of the NVIDIA CORPORATION nor the
13  * names of its contributors may be used to endorse or promote products
14  * derived from this software without specific prior written permission.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  *
27  ******************************************************************************/
28 
34 #pragma once
35 
36 #include "../../thread/thread_operators.cuh"
37 #include "../../util_type.cuh"
38 #include "../../util_ptx.cuh"
39 #include "../../util_namespace.cuh"
40 
42 CUB_NS_PREFIX
43 
45 namespace cub {
46 
52 template <
53  typename T,
54  int LOGICAL_WARP_THREADS,
55  int PTX_ARCH>
57 {
58  //---------------------------------------------------------------------
59  // Constants and type definitions
60  //---------------------------------------------------------------------
61 
62  enum
63  {
65  IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)),
66 
69 
71  SHFL_C = (CUB_WARP_THREADS(PTX_ARCH) - LOGICAL_WARP_THREADS) << 8
72  };
73 
74  template <typename S>
76  {
77  enum {
79  IS_SMALL_UNSIGNED = (Traits<S>::CATEGORY == UNSIGNED_INTEGER) && (sizeof(S) <= sizeof(unsigned int))
80  };
81  };
82 
84  struct TempStorage {};
85 
86 
87  //---------------------------------------------------------------------
88  // Thread fields
89  //---------------------------------------------------------------------
90 
92  unsigned int lane_id;
93 
95  unsigned int warp_id;
96 
98  unsigned int member_mask;
99 
100  //---------------------------------------------------------------------
101  // Construction
102  //---------------------------------------------------------------------
103 
105  __device__ __forceinline__ WarpScanShfl(
106  TempStorage &/*temp_storage*/)
107  {
108  lane_id = LaneId();
109  warp_id = 0;
110  member_mask = 0xffffffffu >> (CUB_WARP_THREADS(PTX_ARCH) - LOGICAL_WARP_THREADS);
111 
112  if (!IS_ARCH_WARP)
113  {
114  warp_id = lane_id / LOGICAL_WARP_THREADS;
115  lane_id = lane_id % LOGICAL_WARP_THREADS;
116  member_mask = member_mask << (warp_id * LOGICAL_WARP_THREADS);
117  }
118  }
119 
120 
121  //---------------------------------------------------------------------
122  // Inclusive scan steps
123  //---------------------------------------------------------------------
124 
126  __device__ __forceinline__ int InclusiveScanStep(
127  int input,
128  cub::Sum /*scan_op*/,
129  int first_lane,
130  int offset)
131  {
132  int output;
133  int shfl_c = first_lane | SHFL_C; // Shuffle control (mask and first-lane)
134 
135  // Use predicate set from SHFL to guard against invalid peers
136 #ifdef CUB_USE_COOPERATIVE_GROUPS
137  asm volatile(
138  "{"
139  " .reg .s32 r0;"
140  " .reg .pred p;"
141  " shfl.sync.up.b32 r0|p, %1, %2, %3, %5;"
142  " @p add.s32 r0, r0, %4;"
143  " mov.s32 %0, r0;"
144  "}"
145  : "=r"(output) : "r"(input), "r"(offset), "r"(shfl_c), "r"(input), "r"(member_mask));
146 #else
147  asm volatile(
148  "{"
149  " .reg .s32 r0;"
150  " .reg .pred p;"
151  " shfl.up.b32 r0|p, %1, %2, %3;"
152  " @p add.s32 r0, r0, %4;"
153  " mov.s32 %0, r0;"
154  "}"
155  : "=r"(output) : "r"(input), "r"(offset), "r"(shfl_c), "r"(input));
156 #endif
157 
158  return output;
159  }
160 
162  __device__ __forceinline__ unsigned int InclusiveScanStep(
163  unsigned int input,
164  cub::Sum /*scan_op*/,
165  int first_lane,
166  int offset)
167  {
168  unsigned int output;
169  int shfl_c = first_lane | SHFL_C; // Shuffle control (mask and first-lane)
170 
171  // Use predicate set from SHFL to guard against invalid peers
172 #ifdef CUB_USE_COOPERATIVE_GROUPS
173  asm volatile(
174  "{"
175  " .reg .u32 r0;"
176  " .reg .pred p;"
177  " shfl.sync.up.b32 r0|p, %1, %2, %3, %5;"
178  " @p add.u32 r0, r0, %4;"
179  " mov.u32 %0, r0;"
180  "}"
181  : "=r"(output) : "r"(input), "r"(offset), "r"(shfl_c), "r"(input), "r"(member_mask));
182 #else
183  asm volatile(
184  "{"
185  " .reg .u32 r0;"
186  " .reg .pred p;"
187  " shfl.up.b32 r0|p, %1, %2, %3;"
188  " @p add.u32 r0, r0, %4;"
189  " mov.u32 %0, r0;"
190  "}"
191  : "=r"(output) : "r"(input), "r"(offset), "r"(shfl_c), "r"(input));
192 #endif
193 
194  return output;
195  }
196 
197 
199  __device__ __forceinline__ float InclusiveScanStep(
200  float input,
201  cub::Sum /*scan_op*/,
202  int first_lane,
203  int offset)
204  {
205  float output;
206  int shfl_c = first_lane | SHFL_C; // Shuffle control (mask and first-lane)
207 
208  // Use predicate set from SHFL to guard against invalid peers
209 #ifdef CUB_USE_COOPERATIVE_GROUPS
210  asm volatile(
211  "{"
212  " .reg .f32 r0;"
213  " .reg .pred p;"
214  " shfl.sync.up.b32 r0|p, %1, %2, %3, %5;"
215  " @p add.f32 r0, r0, %4;"
216  " mov.f32 %0, r0;"
217  "}"
218  : "=f"(output) : "f"(input), "r"(offset), "r"(shfl_c), "f"(input), "r"(member_mask));
219 #else
220  asm volatile(
221  "{"
222  " .reg .f32 r0;"
223  " .reg .pred p;"
224  " shfl.up.b32 r0|p, %1, %2, %3;"
225  " @p add.f32 r0, r0, %4;"
226  " mov.f32 %0, r0;"
227  "}"
228  : "=f"(output) : "f"(input), "r"(offset), "r"(shfl_c), "f"(input));
229 #endif
230 
231  return output;
232  }
233 
234 
236  __device__ __forceinline__ unsigned long long InclusiveScanStep(
237  unsigned long long input,
238  cub::Sum /*scan_op*/,
239  int first_lane,
240  int offset)
241  {
242  unsigned long long output;
243  int shfl_c = first_lane | SHFL_C; // Shuffle control (mask and first-lane)
244 
245  // Use predicate set from SHFL to guard against invalid peers
246 #ifdef CUB_USE_COOPERATIVE_GROUPS
247  asm volatile(
248  "{"
249  " .reg .u64 r0;"
250  " .reg .u32 lo;"
251  " .reg .u32 hi;"
252  " .reg .pred p;"
253  " mov.b64 {lo, hi}, %1;"
254  " shfl.sync.up.b32 lo|p, lo, %2, %3, %5;"
255  " shfl.sync.up.b32 hi|p, hi, %2, %3, %5;"
256  " mov.b64 r0, {lo, hi};"
257  " @p add.u64 r0, r0, %4;"
258  " mov.u64 %0, r0;"
259  "}"
260  : "=l"(output) : "l"(input), "r"(offset), "r"(shfl_c), "l"(input), "r"(member_mask));
261 #else
262  asm volatile(
263  "{"
264  " .reg .u64 r0;"
265  " .reg .u32 lo;"
266  " .reg .u32 hi;"
267  " .reg .pred p;"
268  " mov.b64 {lo, hi}, %1;"
269  " shfl.up.b32 lo|p, lo, %2, %3;"
270  " shfl.up.b32 hi|p, hi, %2, %3;"
271  " mov.b64 r0, {lo, hi};"
272  " @p add.u64 r0, r0, %4;"
273  " mov.u64 %0, r0;"
274  "}"
275  : "=l"(output) : "l"(input), "r"(offset), "r"(shfl_c), "l"(input));
276 #endif
277 
278  return output;
279  }
280 
281 
283  __device__ __forceinline__ long long InclusiveScanStep(
284  long long input,
285  cub::Sum /*scan_op*/,
286  int first_lane,
287  int offset)
288  {
289  long long output;
290  int shfl_c = first_lane | SHFL_C; // Shuffle control (mask and first-lane)
291 
292  // Use predicate set from SHFL to guard against invalid peers
293 #ifdef CUB_USE_COOPERATIVE_GROUPS
294  asm volatile(
295  "{"
296  " .reg .s64 r0;"
297  " .reg .u32 lo;"
298  " .reg .u32 hi;"
299  " .reg .pred p;"
300  " mov.b64 {lo, hi}, %1;"
301  " shfl.sync.up.b32 lo|p, lo, %2, %3, %5;"
302  " shfl.sync.up.b32 hi|p, hi, %2, %3, %5;"
303  " mov.b64 r0, {lo, hi};"
304  " @p add.s64 r0, r0, %4;"
305  " mov.s64 %0, r0;"
306  "}"
307  : "=l"(output) : "l"(input), "r"(offset), "r"(shfl_c), "l"(input), "r"(member_mask));
308 #else
309  asm volatile(
310  "{"
311  " .reg .s64 r0;"
312  " .reg .u32 lo;"
313  " .reg .u32 hi;"
314  " .reg .pred p;"
315  " mov.b64 {lo, hi}, %1;"
316  " shfl.up.b32 lo|p, lo, %2, %3;"
317  " shfl.up.b32 hi|p, hi, %2, %3;"
318  " mov.b64 r0, {lo, hi};"
319  " @p add.s64 r0, r0, %4;"
320  " mov.s64 %0, r0;"
321  "}"
322  : "=l"(output) : "l"(input), "r"(offset), "r"(shfl_c), "l"(input));
323 #endif
324 
325  return output;
326  }
327 
328 
330  __device__ __forceinline__ double InclusiveScanStep(
331  double input,
332  cub::Sum /*scan_op*/,
333  int first_lane,
334  int offset)
335  {
336  double output;
337  int shfl_c = first_lane | SHFL_C; // Shuffle control (mask and first-lane)
338 
339  // Use predicate set from SHFL to guard against invalid peers
340 #ifdef CUB_USE_COOPERATIVE_GROUPS
341  asm volatile(
342  "{"
343  " .reg .u32 lo;"
344  " .reg .u32 hi;"
345  " .reg .pred p;"
346  " .reg .f64 r0;"
347  " mov.b64 %0, %1;"
348  " mov.b64 {lo, hi}, %1;"
349  " shfl.sync.up.b32 lo|p, lo, %2, %3, %4;"
350  " shfl.sync.up.b32 hi|p, hi, %2, %3, %4;"
351  " mov.b64 r0, {lo, hi};"
352  " @p add.f64 %0, %0, r0;"
353  "}"
354  : "=d"(output) : "d"(input), "r"(offset), "r"(shfl_c), "r"(member_mask));
355 #else
356  asm volatile(
357  "{"
358  " .reg .u32 lo;"
359  " .reg .u32 hi;"
360  " .reg .pred p;"
361  " .reg .f64 r0;"
362  " mov.b64 %0, %1;"
363  " mov.b64 {lo, hi}, %1;"
364  " shfl.up.b32 lo|p, lo, %2, %3;"
365  " shfl.up.b32 hi|p, hi, %2, %3;"
366  " mov.b64 r0, {lo, hi};"
367  " @p add.f64 %0, %0, r0;"
368  "}"
369  : "=d"(output) : "d"(input), "r"(offset), "r"(shfl_c));
370 #endif
371 
372  return output;
373  }
374 
375 
376 /*
378  template <typename Value, typename OffsetT>
379  __device__ __forceinline__ KeyValuePair<OffsetT, Value>InclusiveScanStep(
380  KeyValuePair<OffsetT, Value> input, ///< [in] Calling thread's input item.
381  ReduceBySegmentOp<cub::Sum> scan_op, ///< [in] Binary scan operator
382  int first_lane, ///< [in] Index of first lane in segment
383  int offset) ///< [in] Up-offset to pull from
384  {
385  KeyValuePair<OffsetT, Value> output;
386 
387  output.value = InclusiveScanStep(input.value, cub::Sum(), first_lane, offset, Int2Type<IntegerTraits<Value>::IS_SMALL_UNSIGNED>());
388  output.key = InclusiveScanStep(input.key, cub::Sum(), first_lane, offset, Int2Type<IntegerTraits<OffsetT>::IS_SMALL_UNSIGNED>());
389 
390  if (input.key > 0)
391  output.value = input.value;
392 
393  return output;
394  }
395 */
396 
398  template <typename _T, typename ScanOpT>
399  __device__ __forceinline__ _T InclusiveScanStep(
400  _T input,
401  ScanOpT scan_op,
402  int first_lane,
403  int offset)
404  {
405  _T temp = ShuffleUp<LOGICAL_WARP_THREADS>(input, offset, first_lane, member_mask);
406 
407  // Perform scan op if from a valid peer
408  _T output = scan_op(temp, input);
409  if (static_cast<int>(lane_id) < first_lane + offset)
410  output = input;
411 
412  return output;
413  }
414 
415 
417  template <typename _T, typename ScanOpT>
418  __device__ __forceinline__ _T InclusiveScanStep(
419  _T input,
420  ScanOpT scan_op,
421  int first_lane,
422  int offset,
423  Int2Type<true> /*is_small_unsigned*/)
424  {
425  return InclusiveScanStep(input, scan_op, first_lane, offset);
426  }
427 
428 
430  template <typename _T, typename ScanOpT>
431  __device__ __forceinline__ _T InclusiveScanStep(
432  _T input,
433  ScanOpT scan_op,
434  int first_lane,
435  int offset,
436  Int2Type<false> /*is_small_unsigned*/)
437  {
438  return InclusiveScanStep(input, scan_op, first_lane, offset);
439  }
440 
441 
442  /******************************************************************************
443  * Interface
444  ******************************************************************************/
445 
446  //---------------------------------------------------------------------
447  // Broadcast
448  //---------------------------------------------------------------------
449 
451  __device__ __forceinline__ T Broadcast(
452  T input,
453  int src_lane)
454  {
455  return ShuffleIndex<LOGICAL_WARP_THREADS>(input, src_lane, member_mask);
456  }
457 
458 
459  //---------------------------------------------------------------------
460  // Inclusive operations
461  //---------------------------------------------------------------------
462 
464  template <typename _T, typename ScanOpT>
465  __device__ __forceinline__ void InclusiveScan(
466  _T input,
467  _T &inclusive_output,
468  ScanOpT scan_op)
469  {
470  inclusive_output = input;
471 
472  // Iterate scan steps
473  int segment_first_lane = 0;
474 
475  // Iterate scan steps
476  #pragma unroll
477  for (int STEP = 0; STEP < STEPS; STEP++)
478  {
479  inclusive_output = InclusiveScanStep(
480  inclusive_output,
481  scan_op,
482  segment_first_lane,
483  (1 << STEP),
485  }
486 
487  }
488 
490  template <typename KeyT, typename ValueT, typename ReductionOpT>
491  __device__ __forceinline__ void InclusiveScan(
493  KeyValuePair<KeyT, ValueT> &inclusive_output,
495  {
496  inclusive_output = input;
497 
498  KeyT pred_key = ShuffleUp<LOGICAL_WARP_THREADS>(inclusive_output.key, 1, 0, member_mask);
499 
500  unsigned int ballot = WARP_BALLOT((pred_key != inclusive_output.key), member_mask);
501 
502  // Mask away all lanes greater than ours
503  ballot = ballot & LaneMaskLe();
504 
505  // Find index of first set bit
506  int segment_first_lane = CUB_MAX(0, 31 - __clz(ballot));
507 
508  // Iterate scan steps
509  #pragma unroll
510  for (int STEP = 0; STEP < STEPS; STEP++)
511  {
512  inclusive_output.value = InclusiveScanStep(
513  inclusive_output.value,
514  scan_op.op,
515  segment_first_lane,
516  (1 << STEP),
518  }
519  }
520 
521 
523  template <typename ScanOpT>
524  __device__ __forceinline__ void InclusiveScan(
525  T input,
526  T &inclusive_output,
527  ScanOpT scan_op,
528  T &warp_aggregate)
529  {
530  InclusiveScan(input, inclusive_output, scan_op);
531 
532  // Grab aggregate from last warp lane
533  warp_aggregate = ShuffleIndex<LOGICAL_WARP_THREADS>(inclusive_output, LOGICAL_WARP_THREADS - 1, member_mask);
534  }
535 
536 
537  //---------------------------------------------------------------------
538  // Get exclusive from inclusive
539  //---------------------------------------------------------------------
540 
542  template <typename ScanOpT, typename IsIntegerT>
543  __device__ __forceinline__ void Update(
544  T /*input*/,
545  T &inclusive,
546  T &exclusive,
547  ScanOpT /*scan_op*/,
548  IsIntegerT /*is_integer*/)
549  {
550  // initial value unknown
551  exclusive = ShuffleUp<LOGICAL_WARP_THREADS>(inclusive, 1, 0, member_mask);
552  }
553 
555  __device__ __forceinline__ void Update(
556  T input,
557  T &inclusive,
558  T &exclusive,
559  cub::Sum /*scan_op*/,
560  Int2Type<true> /*is_integer*/)
561  {
562  // initial value presumed 0
563  exclusive = inclusive - input;
564  }
565 
567  template <typename ScanOpT, typename IsIntegerT>
568  __device__ __forceinline__ void Update (
569  T /*input*/,
570  T &inclusive,
571  T &exclusive,
572  ScanOpT scan_op,
573  T initial_value,
574  IsIntegerT /*is_integer*/)
575  {
576  inclusive = scan_op(initial_value, inclusive);
577  exclusive = ShuffleUp<LOGICAL_WARP_THREADS>(inclusive, 1, 0, member_mask);
578 
579  if (lane_id == 0)
580  exclusive = initial_value;
581  }
582 
584  __device__ __forceinline__ void Update (
585  T input,
586  T &inclusive,
587  T &exclusive,
589  T initial_value,
590  Int2Type<true> /*is_integer*/)
591  {
592  inclusive = scan_op(initial_value, inclusive);
593  exclusive = inclusive - input;
594  }
595 
596 
598  template <typename ScanOpT, typename IsIntegerT>
599  __device__ __forceinline__ void Update (
600  T input,
601  T &inclusive,
602  T &exclusive,
603  T &warp_aggregate,
604  ScanOpT scan_op,
605  IsIntegerT is_integer)
606  {
607  warp_aggregate = ShuffleIndex<LOGICAL_WARP_THREADS>(inclusive, LOGICAL_WARP_THREADS - 1, member_mask);
608  Update(input, inclusive, exclusive, scan_op, is_integer);
609  }
610 
612  template <typename ScanOpT, typename IsIntegerT>
613  __device__ __forceinline__ void Update (
614  T input,
615  T &inclusive,
616  T &exclusive,
617  T &warp_aggregate,
618  ScanOpT scan_op,
619  T initial_value,
620  IsIntegerT is_integer)
621  {
622  warp_aggregate = ShuffleIndex<LOGICAL_WARP_THREADS>(inclusive, LOGICAL_WARP_THREADS - 1, member_mask);
623  Update(input, inclusive, exclusive, scan_op, initial_value, is_integer);
624  }
625 
626 
627 
628 };
629 
630 
631 } // CUB namespace
632 CUB_NS_POSTFIX // Optional outer namespace(s)
__device__ __forceinline__ void Update(T input, T &inclusive, T &exclusive, cub::Sum scan_op, T initial_value, Int2Type< true >)
Update inclusive and exclusive using initial value using input and inclusive (specialized for summati...
unsigned int lane_id
Lane index in logical warp.
The 5-bit SHFL mask for logically splitting warps into sub-segments starts 8-bits up.
Key key
Item key.
Definition: util_type.cuh:671
< Binary reduction operator to apply to values
__device__ __forceinline__ void InclusiveScan(KeyValuePair< KeyT, ValueT > input, KeyValuePair< KeyT, ValueT > &inclusive_output, ReduceByKeyOp< ReductionOpT > scan_op)
Inclusive scan, specialized for reduce-value-by-key.
WarpScanShfl provides SHFL-based variants of parallel prefix scan of items partitioned across a CUDA ...
Type traits.
Definition: util_type.cuh:1158
Whether the data type is a small (32b or less) integer for which we can use a single SFHL instruction...
Value value
Item value.
Definition: util_type.cuh:672
Optional outer namespace(s)
__device__ __forceinline__ long long InclusiveScanStep(long long input, cub::Sum, int first_lane, int offset)
Inclusive prefix scan step (specialized for summation across long long types)
__device__ __forceinline__ _T InclusiveScanStep(_T input, ScanOpT scan_op, int first_lane, int offset, Int2Type< false >)
Inclusive prefix scan step (specialized for types other than small integers size 32b or less)
__device__ __forceinline__ void Update(T input, T &inclusive, T &exclusive, cub::Sum, Int2Type< true >)
Update inclusive and exclusive using input and inclusive (specialized for summation of integer types)
__device__ __forceinline__ int WARP_BALLOT(int predicate, unsigned int member_mask)
Definition: util_ptx.cuh:310
A key identifier paired with a corresponding value.
Definition: util_type.cuh:666
__device__ __forceinline__ unsigned long long InclusiveScanStep(unsigned long long input, cub::Sum, int first_lane, int offset)
Inclusive prefix scan step (specialized for summation across unsigned long long types)
__device__ __forceinline__ unsigned int LaneId()
Returns the warp lane ID of the calling thread.
Definition: util_ptx.cuh:420
__device__ __forceinline__ void Update(T input, T &inclusive, T &exclusive, T &warp_aggregate, ScanOpT scan_op, T initial_value, IsIntegerT is_integer)
Update inclusive, exclusive, and warp aggregate using input, inclusive, and initial value.
__device__ __forceinline__ unsigned int LaneMaskLe()
Returns the warp lane mask of all lanes less than or equal to the calling thread.
Definition: util_ptx.cuh:451
__device__ __forceinline__ void Update(T input, T &inclusive, T &exclusive, T &warp_aggregate, ScanOpT scan_op, IsIntegerT is_integer)
Update inclusive, exclusive, and warp aggregate using input and inclusive.
Whether the logical warp size and the PTX warp size coincide.
OutputIteratorT ScanTileStateT int ScanOpT scan_op
Binary scan functor.
__device__ __forceinline__ void Update(T, T &inclusive, T &exclusive, ScanOpT scan_op, T initial_value, IsIntegerT)
Update inclusive and exclusive using initial value using input, inclusive, and initial value.
__device__ __forceinline__ WarpScanShfl(TempStorage &)
Constructor.
unsigned int member_mask
32-thread physical warp member mask of logical warp
Allows for the treatment of an integral constant as a type at compile-time (e.g., to achieve static c...
Definition: util_type.cuh:275
The number of warp scan steps.
Shared memory storage layout type.
KeyT const ValueT ValueT OffsetIteratorT OffsetIteratorT int
[in] The number of segments that comprise the sorting data
__device__ __forceinline__ void InclusiveScan(_T input, _T &inclusive_output, ScanOpT scan_op)
Inclusive scan.
__device__ __forceinline__ _T InclusiveScanStep(_T input, ScanOpT scan_op, int first_lane, int offset)
Inclusive prefix scan step (generic)
__device__ __forceinline__ unsigned int InclusiveScanStep(unsigned int input, cub::Sum, int first_lane, int offset)
Inclusive prefix scan step (specialized for summation across uint32 types)
__device__ __forceinline__ int InclusiveScanStep(int input, cub::Sum, int first_lane, int offset)
Inclusive prefix scan step (specialized for summation across int32 types)
Statically determine log2(N), rounded up.
Definition: util_type.cuh:132
unsigned int warp_id
Logical warp index in 32-thread physical warp.
__device__ __forceinline__ float InclusiveScanStep(float input, cub::Sum, int first_lane, int offset)
Inclusive prefix scan step (specialized for summation across fp32 types)
__device__ __forceinline__ void InclusiveScan(T input, T &inclusive_output, ScanOpT scan_op, T &warp_aggregate)
Inclusive scan with aggregate.
__device__ __forceinline__ T Broadcast(T input, int src_lane)
Broadcast.
__device__ __forceinline__ void Update(T, T &inclusive, T &exclusive, ScanOpT, IsIntegerT)
Update inclusive and exclusive using input and inclusive.
__device__ __forceinline__ double InclusiveScanStep(double input, cub::Sum, int first_lane, int offset)
Inclusive prefix scan step (specialized for summation across fp64 types)
Default sum functor.
#define CUB_MAX(a, b)
Select maximum(a, b)
Definition: util_macro.cuh:61
__device__ __forceinline__ _T InclusiveScanStep(_T input, ScanOpT scan_op, int first_lane, int offset, Int2Type< true >)
Inclusive prefix scan step (specialized for small integers size 32b or less)