OpenFPM_pdata 4.1.0
Project that contains the implementation of distributed structures

warp_scan_shfl.cuh — documentation listing of this file.
/******************************************************************************
 * Copyright (c) 2011, Duane Merrill.  All rights reserved.
 * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

#pragma once

#include "../../thread/thread_operators.cuh"
#include "../../util_type.cuh"
#include "../../util_ptx.cuh"
#include "../../util_namespace.cuh"

/// Optional outer namespace(s)
CUB_NS_PREFIX

/// CUB namespace
namespace cub {
52template <
53 typename T,
54 int LOGICAL_WARP_THREADS,
55 int PTX_ARCH>
57{
58 //---------------------------------------------------------------------
59 // Constants and type definitions
60 //---------------------------------------------------------------------
61
62 enum
63 {
65 IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)),
66
69
71 SHFL_C = (CUB_WARP_THREADS(PTX_ARCH) - LOGICAL_WARP_THREADS) << 8
72 };
73
74 template <typename S>
76 {
77 enum {
79 IS_SMALL_UNSIGNED = (Traits<S>::CATEGORY == UNSIGNED_INTEGER) && (sizeof(S) <= sizeof(unsigned int))
80 };
81 };
82
84 struct TempStorage {};
85
86
87 //---------------------------------------------------------------------
88 // Thread fields
89 //---------------------------------------------------------------------
90
92 unsigned int lane_id;
93
95 unsigned int warp_id;
96
98 unsigned int member_mask;
99
100 //---------------------------------------------------------------------
101 // Construction
102 //---------------------------------------------------------------------
103
105 __device__ __forceinline__ WarpScanShfl(
106 TempStorage &/*temp_storage*/)
107 {
108 lane_id = LaneId();
109 warp_id = 0;
110 member_mask = 0xffffffffu >> (CUB_WARP_THREADS(PTX_ARCH) - LOGICAL_WARP_THREADS);
111
112 if (!IS_ARCH_WARP)
113 {
114 warp_id = lane_id / LOGICAL_WARP_THREADS;
115 lane_id = lane_id % LOGICAL_WARP_THREADS;
116 member_mask = member_mask << (warp_id * LOGICAL_WARP_THREADS);
117 }
118 }
119
120
121 //---------------------------------------------------------------------
122 // Inclusive scan steps
123 //---------------------------------------------------------------------
124
126 __device__ __forceinline__ int InclusiveScanStep(
127 int input,
128 cub::Sum /*scan_op*/,
129 int first_lane,
130 int offset)
131 {
132 int output;
133 int shfl_c = first_lane | SHFL_C; // Shuffle control (mask and first-lane)
134
135 // Use predicate set from SHFL to guard against invalid peers
136#ifdef CUB_USE_COOPERATIVE_GROUPS
137 asm volatile(
138 "{"
139 " .reg .s32 r0;"
140 " .reg .pred p;"
141 " shfl.sync.up.b32 r0|p, %1, %2, %3, %5;"
142 " @p add.s32 r0, r0, %4;"
143 " mov.s32 %0, r0;"
144 "}"
145 : "=r"(output) : "r"(input), "r"(offset), "r"(shfl_c), "r"(input), "r"(member_mask));
146#else
147 asm volatile(
148 "{"
149 " .reg .s32 r0;"
150 " .reg .pred p;"
151 " shfl.up.b32 r0|p, %1, %2, %3;"
152 " @p add.s32 r0, r0, %4;"
153 " mov.s32 %0, r0;"
154 "}"
155 : "=r"(output) : "r"(input), "r"(offset), "r"(shfl_c), "r"(input));
156#endif
157
158 return output;
159 }
160
162 __device__ __forceinline__ unsigned int InclusiveScanStep(
163 unsigned int input,
164 cub::Sum /*scan_op*/,
165 int first_lane,
166 int offset)
167 {
168 unsigned int output;
169 int shfl_c = first_lane | SHFL_C; // Shuffle control (mask and first-lane)
170
171 // Use predicate set from SHFL to guard against invalid peers
172#ifdef CUB_USE_COOPERATIVE_GROUPS
173 asm volatile(
174 "{"
175 " .reg .u32 r0;"
176 " .reg .pred p;"
177 " shfl.sync.up.b32 r0|p, %1, %2, %3, %5;"
178 " @p add.u32 r0, r0, %4;"
179 " mov.u32 %0, r0;"
180 "}"
181 : "=r"(output) : "r"(input), "r"(offset), "r"(shfl_c), "r"(input), "r"(member_mask));
182#else
183 asm volatile(
184 "{"
185 " .reg .u32 r0;"
186 " .reg .pred p;"
187 " shfl.up.b32 r0|p, %1, %2, %3;"
188 " @p add.u32 r0, r0, %4;"
189 " mov.u32 %0, r0;"
190 "}"
191 : "=r"(output) : "r"(input), "r"(offset), "r"(shfl_c), "r"(input));
192#endif
193
194 return output;
195 }
196
197
199 __device__ __forceinline__ float InclusiveScanStep(
200 float input,
201 cub::Sum /*scan_op*/,
202 int first_lane,
203 int offset)
204 {
205 float output;
206 int shfl_c = first_lane | SHFL_C; // Shuffle control (mask and first-lane)
207
208 // Use predicate set from SHFL to guard against invalid peers
209#ifdef CUB_USE_COOPERATIVE_GROUPS
210 asm volatile(
211 "{"
212 " .reg .f32 r0;"
213 " .reg .pred p;"
214 " shfl.sync.up.b32 r0|p, %1, %2, %3, %5;"
215 " @p add.f32 r0, r0, %4;"
216 " mov.f32 %0, r0;"
217 "}"
218 : "=f"(output) : "f"(input), "r"(offset), "r"(shfl_c), "f"(input), "r"(member_mask));
219#else
220 asm volatile(
221 "{"
222 " .reg .f32 r0;"
223 " .reg .pred p;"
224 " shfl.up.b32 r0|p, %1, %2, %3;"
225 " @p add.f32 r0, r0, %4;"
226 " mov.f32 %0, r0;"
227 "}"
228 : "=f"(output) : "f"(input), "r"(offset), "r"(shfl_c), "f"(input));
229#endif
230
231 return output;
232 }
233
234
236 __device__ __forceinline__ unsigned long long InclusiveScanStep(
237 unsigned long long input,
238 cub::Sum /*scan_op*/,
239 int first_lane,
240 int offset)
241 {
242 unsigned long long output;
243 int shfl_c = first_lane | SHFL_C; // Shuffle control (mask and first-lane)
244
245 // Use predicate set from SHFL to guard against invalid peers
246#ifdef CUB_USE_COOPERATIVE_GROUPS
247 asm volatile(
248 "{"
249 " .reg .u64 r0;"
250 " .reg .u32 lo;"
251 " .reg .u32 hi;"
252 " .reg .pred p;"
253 " mov.b64 {lo, hi}, %1;"
254 " shfl.sync.up.b32 lo|p, lo, %2, %3, %5;"
255 " shfl.sync.up.b32 hi|p, hi, %2, %3, %5;"
256 " mov.b64 r0, {lo, hi};"
257 " @p add.u64 r0, r0, %4;"
258 " mov.u64 %0, r0;"
259 "}"
260 : "=l"(output) : "l"(input), "r"(offset), "r"(shfl_c), "l"(input), "r"(member_mask));
261#else
262 asm volatile(
263 "{"
264 " .reg .u64 r0;"
265 " .reg .u32 lo;"
266 " .reg .u32 hi;"
267 " .reg .pred p;"
268 " mov.b64 {lo, hi}, %1;"
269 " shfl.up.b32 lo|p, lo, %2, %3;"
270 " shfl.up.b32 hi|p, hi, %2, %3;"
271 " mov.b64 r0, {lo, hi};"
272 " @p add.u64 r0, r0, %4;"
273 " mov.u64 %0, r0;"
274 "}"
275 : "=l"(output) : "l"(input), "r"(offset), "r"(shfl_c), "l"(input));
276#endif
277
278 return output;
279 }
280
281
283 __device__ __forceinline__ long long InclusiveScanStep(
284 long long input,
285 cub::Sum /*scan_op*/,
286 int first_lane,
287 int offset)
288 {
289 long long output;
290 int shfl_c = first_lane | SHFL_C; // Shuffle control (mask and first-lane)
291
292 // Use predicate set from SHFL to guard against invalid peers
293#ifdef CUB_USE_COOPERATIVE_GROUPS
294 asm volatile(
295 "{"
296 " .reg .s64 r0;"
297 " .reg .u32 lo;"
298 " .reg .u32 hi;"
299 " .reg .pred p;"
300 " mov.b64 {lo, hi}, %1;"
301 " shfl.sync.up.b32 lo|p, lo, %2, %3, %5;"
302 " shfl.sync.up.b32 hi|p, hi, %2, %3, %5;"
303 " mov.b64 r0, {lo, hi};"
304 " @p add.s64 r0, r0, %4;"
305 " mov.s64 %0, r0;"
306 "}"
307 : "=l"(output) : "l"(input), "r"(offset), "r"(shfl_c), "l"(input), "r"(member_mask));
308#else
309 asm volatile(
310 "{"
311 " .reg .s64 r0;"
312 " .reg .u32 lo;"
313 " .reg .u32 hi;"
314 " .reg .pred p;"
315 " mov.b64 {lo, hi}, %1;"
316 " shfl.up.b32 lo|p, lo, %2, %3;"
317 " shfl.up.b32 hi|p, hi, %2, %3;"
318 " mov.b64 r0, {lo, hi};"
319 " @p add.s64 r0, r0, %4;"
320 " mov.s64 %0, r0;"
321 "}"
322 : "=l"(output) : "l"(input), "r"(offset), "r"(shfl_c), "l"(input));
323#endif
324
325 return output;
326 }
327
328
330 __device__ __forceinline__ double InclusiveScanStep(
331 double input,
332 cub::Sum /*scan_op*/,
333 int first_lane,
334 int offset)
335 {
336 double output;
337 int shfl_c = first_lane | SHFL_C; // Shuffle control (mask and first-lane)
338
339 // Use predicate set from SHFL to guard against invalid peers
340#ifdef CUB_USE_COOPERATIVE_GROUPS
341 asm volatile(
342 "{"
343 " .reg .u32 lo;"
344 " .reg .u32 hi;"
345 " .reg .pred p;"
346 " .reg .f64 r0;"
347 " mov.b64 %0, %1;"
348 " mov.b64 {lo, hi}, %1;"
349 " shfl.sync.up.b32 lo|p, lo, %2, %3, %4;"
350 " shfl.sync.up.b32 hi|p, hi, %2, %3, %4;"
351 " mov.b64 r0, {lo, hi};"
352 " @p add.f64 %0, %0, r0;"
353 "}"
354 : "=d"(output) : "d"(input), "r"(offset), "r"(shfl_c), "r"(member_mask));
355#else
356 asm volatile(
357 "{"
358 " .reg .u32 lo;"
359 " .reg .u32 hi;"
360 " .reg .pred p;"
361 " .reg .f64 r0;"
362 " mov.b64 %0, %1;"
363 " mov.b64 {lo, hi}, %1;"
364 " shfl.up.b32 lo|p, lo, %2, %3;"
365 " shfl.up.b32 hi|p, hi, %2, %3;"
366 " mov.b64 r0, {lo, hi};"
367 " @p add.f64 %0, %0, r0;"
368 "}"
369 : "=d"(output) : "d"(input), "r"(offset), "r"(shfl_c));
370#endif
371
372 return output;
373 }
374
375
376/*
378 template <typename Value, typename OffsetT>
379 __device__ __forceinline__ KeyValuePair<OffsetT, Value>InclusiveScanStep(
380 KeyValuePair<OffsetT, Value> input, ///< [in] Calling thread's input item.
381 ReduceBySegmentOp<cub::Sum> scan_op, ///< [in] Binary scan operator
382 int first_lane, ///< [in] Index of first lane in segment
383 int offset) ///< [in] Up-offset to pull from
384 {
385 KeyValuePair<OffsetT, Value> output;
386
387 output.value = InclusiveScanStep(input.value, cub::Sum(), first_lane, offset, Int2Type<IntegerTraits<Value>::IS_SMALL_UNSIGNED>());
388 output.key = InclusiveScanStep(input.key, cub::Sum(), first_lane, offset, Int2Type<IntegerTraits<OffsetT>::IS_SMALL_UNSIGNED>());
389
390 if (input.key > 0)
391 output.value = input.value;
392
393 return output;
394 }
395*/
396
398 template <typename _T, typename ScanOpT>
399 __device__ __forceinline__ _T InclusiveScanStep(
400 _T input,
401 ScanOpT scan_op,
402 int first_lane,
403 int offset)
404 {
405 _T temp = ShuffleUp<LOGICAL_WARP_THREADS>(input, offset, first_lane, member_mask);
406
407 // Perform scan op if from a valid peer
408 _T output = scan_op(temp, input);
409 if (static_cast<int>(lane_id) < first_lane + offset)
410 output = input;
411
412 return output;
413 }
414
415
417 template <typename _T, typename ScanOpT>
418 __device__ __forceinline__ _T InclusiveScanStep(
419 _T input,
420 ScanOpT scan_op,
421 int first_lane,
422 int offset,
423 Int2Type<true> /*is_small_unsigned*/)
424 {
425 return InclusiveScanStep(input, scan_op, first_lane, offset);
426 }
427
428
430 template <typename _T, typename ScanOpT>
431 __device__ __forceinline__ _T InclusiveScanStep(
432 _T input,
433 ScanOpT scan_op,
434 int first_lane,
435 int offset,
436 Int2Type<false> /*is_small_unsigned*/)
437 {
438 return InclusiveScanStep(input, scan_op, first_lane, offset);
439 }
440
441
442 /******************************************************************************
443 * Interface
444 ******************************************************************************/
445
446 //---------------------------------------------------------------------
447 // Broadcast
448 //---------------------------------------------------------------------
449
451 __device__ __forceinline__ T Broadcast(
452 T input,
453 int src_lane)
454 {
455 return ShuffleIndex<LOGICAL_WARP_THREADS>(input, src_lane, member_mask);
456 }
457
458
459 //---------------------------------------------------------------------
460 // Inclusive operations
461 //---------------------------------------------------------------------
462
464 template <typename _T, typename ScanOpT>
465 __device__ __forceinline__ void InclusiveScan(
466 _T input,
467 _T &inclusive_output,
468 ScanOpT scan_op)
469 {
470 inclusive_output = input;
471
472 // Iterate scan steps
473 int segment_first_lane = 0;
474
475 // Iterate scan steps
476 #pragma unroll
477 for (int STEP = 0; STEP < STEPS; STEP++)
478 {
479 inclusive_output = InclusiveScanStep(
480 inclusive_output,
481 scan_op,
482 segment_first_lane,
483 (1 << STEP),
485 }
486
487 }
488
490 template <typename KeyT, typename ValueT, typename ReductionOpT>
491 __device__ __forceinline__ void InclusiveScan(
493 KeyValuePair<KeyT, ValueT> &inclusive_output,
495 {
496 inclusive_output = input;
497
498 KeyT pred_key = ShuffleUp<LOGICAL_WARP_THREADS>(inclusive_output.key, 1, 0, member_mask);
499
500 unsigned int ballot = WARP_BALLOT((pred_key != inclusive_output.key), member_mask);
501
502 // Mask away all lanes greater than ours
503 ballot = ballot & LaneMaskLe();
504
505 // Find index of first set bit
506 int segment_first_lane = CUB_MAX(0, 31 - __clz(ballot));
507
508 // Iterate scan steps
509 #pragma unroll
510 for (int STEP = 0; STEP < STEPS; STEP++)
511 {
512 inclusive_output.value = InclusiveScanStep(
513 inclusive_output.value,
514 scan_op.op,
515 segment_first_lane,
516 (1 << STEP),
518 }
519 }
520
521
523 template <typename ScanOpT>
524 __device__ __forceinline__ void InclusiveScan(
525 T input,
526 T &inclusive_output,
527 ScanOpT scan_op,
528 T &warp_aggregate)
529 {
530 InclusiveScan(input, inclusive_output, scan_op);
531
532 // Grab aggregate from last warp lane
533 warp_aggregate = ShuffleIndex<LOGICAL_WARP_THREADS>(inclusive_output, LOGICAL_WARP_THREADS - 1, member_mask);
534 }
535
536
537 //---------------------------------------------------------------------
538 // Get exclusive from inclusive
539 //---------------------------------------------------------------------
540
542 template <typename ScanOpT, typename IsIntegerT>
543 __device__ __forceinline__ void Update(
544 T /*input*/,
545 T &inclusive,
546 T &exclusive,
547 ScanOpT /*scan_op*/,
548 IsIntegerT /*is_integer*/)
549 {
550 // initial value unknown
551 exclusive = ShuffleUp<LOGICAL_WARP_THREADS>(inclusive, 1, 0, member_mask);
552 }
553
555 __device__ __forceinline__ void Update(
556 T input,
557 T &inclusive,
558 T &exclusive,
559 cub::Sum /*scan_op*/,
560 Int2Type<true> /*is_integer*/)
561 {
562 // initial value presumed 0
563 exclusive = inclusive - input;
564 }
565
567 template <typename ScanOpT, typename IsIntegerT>
568 __device__ __forceinline__ void Update (
569 T /*input*/,
570 T &inclusive,
571 T &exclusive,
572 ScanOpT scan_op,
573 T initial_value,
574 IsIntegerT /*is_integer*/)
575 {
576 inclusive = scan_op(initial_value, inclusive);
577 exclusive = ShuffleUp<LOGICAL_WARP_THREADS>(inclusive, 1, 0, member_mask);
578
579 if (lane_id == 0)
580 exclusive = initial_value;
581 }
582
584 __device__ __forceinline__ void Update (
585 T input,
586 T &inclusive,
587 T &exclusive,
589 T initial_value,
590 Int2Type<true> /*is_integer*/)
591 {
592 inclusive = scan_op(initial_value, inclusive);
593 exclusive = inclusive - input;
594 }
595
596
598 template <typename ScanOpT, typename IsIntegerT>
599 __device__ __forceinline__ void Update (
600 T input,
601 T &inclusive,
602 T &exclusive,
603 T &warp_aggregate,
604 ScanOpT scan_op,
605 IsIntegerT is_integer)
606 {
607 warp_aggregate = ShuffleIndex<LOGICAL_WARP_THREADS>(inclusive, LOGICAL_WARP_THREADS - 1, member_mask);
608 Update(input, inclusive, exclusive, scan_op, is_integer);
609 }
610
612 template <typename ScanOpT, typename IsIntegerT>
613 __device__ __forceinline__ void Update (
614 T input,
615 T &inclusive,
616 T &exclusive,
617 T &warp_aggregate,
618 ScanOpT scan_op,
619 T initial_value,
620 IsIntegerT is_integer)
621 {
622 warp_aggregate = ShuffleIndex<LOGICAL_WARP_THREADS>(inclusive, LOGICAL_WARP_THREADS - 1, member_mask);
623 Update(input, inclusive, exclusive, scan_op, initial_value, is_integer);
624 }
625
626
627
628};

}               // CUB namespace
CUB_NS_POSTFIX  // Optional outer namespace(s)
#define CUB_MAX(a, b)
Select maximum(a, b)
__device__ __forceinline__ int WARP_BALLOT(int predicate, unsigned int member_mask)
Definition util_ptx.cuh:310
__device__ __forceinline__ unsigned int LaneMaskLe()
Returns the warp lane mask of all lanes less than or equal to the calling thread.
Definition util_ptx.cuh:451
__device__ __forceinline__ unsigned int LaneId()
Returns the warp lane ID of the calling thread.
Definition util_ptx.cuh:420
Optional outer namespace(s)
KeyT const ValueT ValueT OffsetIteratorT OffsetIteratorT int
[in] The number of segments that comprise the sorting data
OutputIteratorT ScanTileStateT int ScanOpT scan_op
Binary scan functor.
Allows for the treatment of an integral constant as a type at compile-time (e.g., to achieve static c...
A key identifier paired with a corresponding value.
Value value
Item value.
Key key
Item key.
Statically determine log2(N), rounded up.
< Binary reduction operator to apply to values
Default sum functor.
Type traits.
@ IS_SMALL_UNSIGNED
Whether the data type is a small (32b or less) integer for which we can use a single SHFL instruction...
Shared memory storage layout type.
WarpScanShfl provides SHFL-based variants of parallel prefix scan of items partitioned across a CUDA ...
unsigned int member_mask
32-thread physical warp member mask of logical warp
__device__ __forceinline__ WarpScanShfl(TempStorage &)
Constructor.
__device__ __forceinline__ void Update(T, T &inclusive, T &exclusive, ScanOpT, IsIntegerT)
Update inclusive and exclusive using input and inclusive.
__device__ __forceinline__ _T InclusiveScanStep(_T input, ScanOpT scan_op, int first_lane, int offset)
Inclusive prefix scan step (generic)
__device__ __forceinline__ _T InclusiveScanStep(_T input, ScanOpT scan_op, int first_lane, int offset, Int2Type< false >)
Inclusive prefix scan step (specialized for types other than small integers size 32b or less)
__device__ __forceinline__ T Broadcast(T input, int src_lane)
Broadcast.
__device__ __forceinline__ void InclusiveScan(KeyValuePair< KeyT, ValueT > input, KeyValuePair< KeyT, ValueT > &inclusive_output, ReduceByKeyOp< ReductionOpT > scan_op)
Inclusive scan, specialized for reduce-value-by-key.
unsigned int lane_id
Lane index in logical warp.
__device__ __forceinline__ void InclusiveScan(T input, T &inclusive_output, ScanOpT scan_op, T &warp_aggregate)
Inclusive scan with aggregate.
@ IS_ARCH_WARP
Whether the logical warp size and the PTX warp size coincide.
@ STEPS
The number of warp scan steps.
@ SHFL_C
The 5-bit SHFL mask for logically splitting warps into sub-segments starts 8-bits up.
__device__ __forceinline__ void InclusiveScan(_T input, _T &inclusive_output, ScanOpT scan_op)
Inclusive scan.
__device__ __forceinline__ void Update(T input, T &inclusive, T &exclusive, cub::Sum scan_op, T initial_value, Int2Type< true >)
Update inclusive and exclusive using initial value using input and inclusive (specialized for summati...
unsigned int warp_id
Logical warp index in 32-thread physical warp.
__device__ __forceinline__ void Update(T input, T &inclusive, T &exclusive, cub::Sum, Int2Type< true >)
Update inclusive and exclusive using input and inclusive (specialized for summation of integer types)
__device__ __forceinline__ float InclusiveScanStep(float input, cub::Sum, int first_lane, int offset)
Inclusive prefix scan step (specialized for summation across fp32 types)
__device__ __forceinline__ _T InclusiveScanStep(_T input, ScanOpT scan_op, int first_lane, int offset, Int2Type< true >)
Inclusive prefix scan step (specialized for small integers size 32b or less)
__device__ __forceinline__ void Update(T input, T &inclusive, T &exclusive, T &warp_aggregate, ScanOpT scan_op, T initial_value, IsIntegerT is_integer)
Update inclusive, exclusive, and warp aggregate using input, inclusive, and initial value.
__device__ __forceinline__ int InclusiveScanStep(int input, cub::Sum, int first_lane, int offset)
Inclusive prefix scan step (specialized for summation across int32 types)
__device__ __forceinline__ void Update(T, T &inclusive, T &exclusive, ScanOpT scan_op, T initial_value, IsIntegerT)
Update inclusive and exclusive using initial value using input, inclusive, and initial value.
__device__ __forceinline__ unsigned long long InclusiveScanStep(unsigned long long input, cub::Sum, int first_lane, int offset)
Inclusive prefix scan step (specialized for summation across unsigned long long types)
__device__ __forceinline__ unsigned int InclusiveScanStep(unsigned int input, cub::Sum, int first_lane, int offset)
Inclusive prefix scan step (specialized for summation across uint32 types)
__device__ __forceinline__ void Update(T input, T &inclusive, T &exclusive, T &warp_aggregate, ScanOpT scan_op, IsIntegerT is_integer)
Update inclusive, exclusive, and warp aggregate using input and inclusive.
__device__ __forceinline__ long long InclusiveScanStep(long long input, cub::Sum, int first_lane, int offset)
Inclusive prefix scan step (specialized for summation across long long types)
__device__ __forceinline__ double InclusiveScanStep(double input, cub::Sum, int first_lane, int offset)
Inclusive prefix scan step (specialized for summation across fp64 types)