OpenFPM_pdata  4.1.0
Project that contains the implementation of distributed structures
 
block_scan_raking.cuh
Go to the documentation of this file.
1/******************************************************************************
2 * Copyright (c) 2011, Duane Merrill. All rights reserved.
3 * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 * * Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * * Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
12 * * Neither the name of the NVIDIA CORPORATION nor the
13 * names of its contributors may be used to endorse or promote products
14 * derived from this software without specific prior written permission.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 *
27 ******************************************************************************/
28
29
35#pragma once
36
37#include "../../util_ptx.cuh"
38#include "../../util_arch.cuh"
39#include "../../block/block_raking_layout.cuh"
40#include "../../thread/thread_reduce.cuh"
41#include "../../thread/thread_scan.cuh"
42#include "../../warp/warp_scan.cuh"
43#include "../../util_namespace.cuh"
44
46CUB_NS_PREFIX
47
49namespace cub {
50
51
55template <
56 typename T,
57 int BLOCK_DIM_X,
58 int BLOCK_DIM_Y,
59 int BLOCK_DIM_Z,
60 bool MEMOIZE,
61 int PTX_ARCH>
62struct BlockScanRaking
63{
64 //---------------------------------------------------------------------
65 // Types and constants
66 //---------------------------------------------------------------------
67
69 enum
70 {
72 BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
73 };
74
76 typedef BlockRakingLayout<T, BLOCK_THREADS, PTX_ARCH> BlockRakingLayout;
77
79 enum
80 {
82 RAKING_THREADS = BlockRakingLayout::RAKING_THREADS,
83
85 SEGMENT_LENGTH = BlockRakingLayout::SEGMENT_LENGTH,
86
88 WARP_SYNCHRONOUS = (BLOCK_THREADS == RAKING_THREADS),
89 };
90
92 typedef WarpScan<T, RAKING_THREADS, PTX_ARCH> WarpScan;
93
95 struct _TempStorage
96 {
97 typename WarpScan::TempStorage warp_scan;
98 typename BlockRakingLayout::TempStorage raking_grid;
99 T block_aggregate;
100 };
101
102
104 struct TempStorage : Uninitialized<_TempStorage> {};
105
106
107 //---------------------------------------------------------------------
108 // Per-thread fields
109 //---------------------------------------------------------------------
110
111 // Thread fields
112 _TempStorage &temp_storage;
113 unsigned int linear_tid;
114 T cached_segment[SEGMENT_LENGTH];
115
116
117 //---------------------------------------------------------------------
118 // Utility methods
119 //---------------------------------------------------------------------
120
122 template <int ITERATION, typename ScanOp>
123 __device__ __forceinline__ T GuardedReduce(
124 T* raking_ptr,
125 ScanOp scan_op,
126 T raking_partial,
127 Int2Type<ITERATION> /*iteration*/)
128 {
129 if ((BlockRakingLayout::UNGUARDED) || (((linear_tid * SEGMENT_LENGTH) + ITERATION) < BLOCK_THREADS))
130 {
131 T addend = raking_ptr[ITERATION];
132 raking_partial = scan_op(raking_partial, addend);
133 }
134
135 return GuardedReduce(raking_ptr, scan_op, raking_partial, Int2Type<ITERATION + 1>());
136 }
137
138
140 template <typename ScanOp>
141 __device__ __forceinline__ T GuardedReduce(
142 T* /*raking_ptr*/,
143 ScanOp /*scan_op*/,
144 T raking_partial,
145 Int2Type<SEGMENT_LENGTH> /*iteration*/)
146 {
147 return raking_partial;
148 }
149
150
152 template <int ITERATION>
153 __device__ __forceinline__ void CopySegment(
154 T* out,
155 T* in,
156 Int2Type<ITERATION> /*iteration*/)
157 {
158 out[ITERATION] = in[ITERATION];
159 CopySegment(out, in, Int2Type<ITERATION + 1>());
160 }
161
162
164 __device__ __forceinline__ void CopySegment(
165 T* /*out*/,
166 T* /*in*/,
167 Int2Type<SEGMENT_LENGTH> /*iteration*/)
168 {}
169
170
172 template <typename ScanOp>
173 __device__ __forceinline__ T Upsweep(
174 ScanOp scan_op)
175 {
176 T *smem_raking_ptr = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid);
177
178 // Read data into registers
179 CopySegment(cached_segment, smem_raking_ptr, Int2Type<0>());
180
181 T raking_partial = cached_segment[0];
182
183 return GuardedReduce(cached_segment, scan_op, raking_partial, Int2Type<1>());
184 }
185
186
188 template <typename ScanOp>
189 __device__ __forceinline__ void ExclusiveDownsweep(
190 ScanOp scan_op,
191 T raking_partial,
192 bool apply_prefix = true)
193 {
194 T *smem_raking_ptr = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid);
195
196 // Read data back into registers
197 if (!MEMOIZE)
198 {
199 CopySegment(cached_segment, smem_raking_ptr, Int2Type<0>());
200 }
201
202 internal::ThreadScanExclusive(cached_segment, cached_segment, scan_op, raking_partial, apply_prefix);
203
204 // Write data back to smem
205 CopySegment(smem_raking_ptr, cached_segment, Int2Type<0>());
206 }
207
208
210 template <typename ScanOp>
211 __device__ __forceinline__ void InclusiveDownsweep(
212 ScanOp scan_op,
213 T raking_partial,
214 bool apply_prefix = true)
215 {
216 T *smem_raking_ptr = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid);
217
218 // Read data back into registers
219 if (!MEMOIZE)
220 {
221 CopySegment(cached_segment, smem_raking_ptr, Int2Type<0>());
222 }
223
224 internal::ThreadScanInclusive(cached_segment, cached_segment, scan_op, raking_partial, apply_prefix);
225
226 // Write data back to smem
227 CopySegment(smem_raking_ptr, cached_segment, Int2Type<0>());
228 }
229
230
231 //---------------------------------------------------------------------
232 // Constructors
233 //---------------------------------------------------------------------
234
236 __device__ __forceinline__ BlockScanRaking(
237 TempStorage &temp_storage)
238 :
239 temp_storage(temp_storage.Alias()),
240 linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
241 {}
242
243
244 //---------------------------------------------------------------------
245 // Exclusive scans
246 //---------------------------------------------------------------------
247
249 template <typename ScanOp>
250 __device__ __forceinline__ void ExclusiveScan(
251 T input,
252 T &exclusive_output,
253 ScanOp scan_op)
254 {
255 if (WARP_SYNCHRONOUS)
256 {
257 // Short-circuit directly to warp-synchronous scan
258 WarpScan(temp_storage.warp_scan).ExclusiveScan(input, exclusive_output, scan_op);
259 }
260 else
261 {
262 // Place thread partial into shared memory raking grid
263 T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid);
264 *placement_ptr = input;
265
266 CTA_SYNC();
267
268 // Reduce parallelism down to just raking threads
269 if (linear_tid < RAKING_THREADS)
270 {
271 // Raking upsweep reduction across shared partials
272 T upsweep_partial = Upsweep(scan_op);
273
274 // Warp-synchronous scan
275 T exclusive_partial;
276 WarpScan(temp_storage.warp_scan).ExclusiveScan(upsweep_partial, exclusive_partial, scan_op);
277
278 // Exclusive raking downsweep scan
279 ExclusiveDownsweep(scan_op, exclusive_partial, (linear_tid != 0));
280 }
281
282 CTA_SYNC();
283
284 // Grab thread prefix from shared memory
285 exclusive_output = *placement_ptr;
286 }
287 }
288
290 template <typename ScanOp>
291 __device__ __forceinline__ void ExclusiveScan(
292 T input,
293 T &output,
294 const T &initial_value,
295 ScanOp scan_op)
296 {
297 if (WARP_SYNCHRONOUS)
298 {
299 // Short-circuit directly to warp-synchronous scan
300 WarpScan(temp_storage.warp_scan).ExclusiveScan(input, output, initial_value, scan_op);
301 }
302 else
303 {
304 // Place thread partial into shared memory raking grid
305 T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid);
306 *placement_ptr = input;
307
308 CTA_SYNC();
309
310 // Reduce parallelism down to just raking threads
311 if (linear_tid < RAKING_THREADS)
312 {
313 // Raking upsweep reduction across shared partials
314 T upsweep_partial = Upsweep(scan_op);
315
316 // Exclusive Warp-synchronous scan
317 T exclusive_partial;
318 WarpScan(temp_storage.warp_scan).ExclusiveScan(upsweep_partial, exclusive_partial, initial_value, scan_op);
319
320 // Exclusive raking downsweep scan
321 ExclusiveDownsweep(scan_op, exclusive_partial);
322 }
323
324 CTA_SYNC();
325
326 // Grab exclusive partial from shared memory
327 output = *placement_ptr;
328 }
329 }
330
331
333 template <typename ScanOp>
334 __device__ __forceinline__ void ExclusiveScan(
335 T input,
336 T &output,
337 ScanOp scan_op,
338 T &block_aggregate)
339 {
340 if (WARP_SYNCHRONOUS)
341 {
342 // Short-circuit directly to warp-synchronous scan
343 WarpScan(temp_storage.warp_scan).ExclusiveScan(input, output, scan_op, block_aggregate);
344 }
345 else
346 {
347 // Place thread partial into shared memory raking grid
348 T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid);
349 *placement_ptr = input;
350
351 CTA_SYNC();
352
353 // Reduce parallelism down to just raking threads
354 if (linear_tid < RAKING_THREADS)
355 {
356 // Raking upsweep reduction across shared partials
357 T upsweep_partial = Upsweep(scan_op);
358
359 // Warp-synchronous scan
360 T inclusive_partial;
361 T exclusive_partial;
362 WarpScan(temp_storage.warp_scan).Scan(upsweep_partial, inclusive_partial, exclusive_partial, scan_op);
363
364 // Exclusive raking downsweep scan
365 ExclusiveDownsweep(scan_op, exclusive_partial, (linear_tid != 0));
366
367 // Broadcast aggregate to all threads
368 if (linear_tid == RAKING_THREADS - 1)
369 temp_storage.block_aggregate = inclusive_partial;
370 }
371
372 CTA_SYNC();
373
374 // Grab thread prefix from shared memory
375 output = *placement_ptr;
376
377 // Retrieve block aggregate
378 block_aggregate = temp_storage.block_aggregate;
379 }
380 }
381
382
384 template <typename ScanOp>
385 __device__ __forceinline__ void ExclusiveScan(
386 T input,
387 T &output,
388 const T &initial_value,
389 ScanOp scan_op,
390 T &block_aggregate)
391 {
392 if (WARP_SYNCHRONOUS)
393 {
394 // Short-circuit directly to warp-synchronous scan
395 WarpScan(temp_storage.warp_scan).ExclusiveScan(input, output, initial_value, scan_op, block_aggregate);
396 }
397 else
398 {
399 // Place thread partial into shared memory raking grid
400 T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid);
401 *placement_ptr = input;
402
403 CTA_SYNC();
404
405 // Reduce parallelism down to just raking threads
406 if (linear_tid < RAKING_THREADS)
407 {
408 // Raking upsweep reduction across shared partials
409 T upsweep_partial = Upsweep(scan_op);
410
411 // Warp-synchronous scan
412 T exclusive_partial;
413 WarpScan(temp_storage.warp_scan).ExclusiveScan(upsweep_partial, exclusive_partial, initial_value, scan_op, block_aggregate);
414
415 // Exclusive raking downsweep scan
416 ExclusiveDownsweep(scan_op, exclusive_partial);
417
418 // Broadcast aggregate to other threads
419 if (linear_tid == 0)
420 temp_storage.block_aggregate = block_aggregate;
421 }
422
423 CTA_SYNC();
424
425 // Grab exclusive partial from shared memory
426 output = *placement_ptr;
427
428 // Retrieve block aggregate
429 block_aggregate = temp_storage.block_aggregate;
430 }
431 }
432
433
435 template <
436 typename ScanOp,
437 typename BlockPrefixCallbackOp>
438 __device__ __forceinline__ void ExclusiveScan(
439 T input,
440 T &output,
441 ScanOp scan_op,
442 BlockPrefixCallbackOp &block_prefix_callback_op)
443 {
444 if (WARP_SYNCHRONOUS)
445 {
446 // Short-circuit directly to warp-synchronous scan
447 T block_aggregate;
448 WarpScan warp_scan(temp_storage.warp_scan);
449 warp_scan.ExclusiveScan(input, output, scan_op, block_aggregate);
450
451 // Obtain warp-wide prefix in lane0, then broadcast to other lanes
452 T block_prefix = block_prefix_callback_op(block_aggregate);
453 block_prefix = warp_scan.Broadcast(block_prefix, 0);
454
455 output = scan_op(block_prefix, output);
456 if (linear_tid == 0)
457 output = block_prefix;
458 }
459 else
460 {
461 // Place thread partial into shared memory raking grid
462 T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid);
463 *placement_ptr = input;
464
465 CTA_SYNC();
466
467 // Reduce parallelism down to just raking threads
468 if (linear_tid < RAKING_THREADS)
469 {
470 WarpScan warp_scan(temp_storage.warp_scan);
471
472 // Raking upsweep reduction across shared partials
473 T upsweep_partial = Upsweep(scan_op);
474
475 // Warp-synchronous scan
476 T exclusive_partial, block_aggregate;
477 warp_scan.ExclusiveScan(upsweep_partial, exclusive_partial, scan_op, block_aggregate);
478
479 // Obtain block-wide prefix in lane0, then broadcast to other lanes
480 T block_prefix = block_prefix_callback_op(block_aggregate);
481 block_prefix = warp_scan.Broadcast(block_prefix, 0);
482
483 // Update prefix with warpscan exclusive partial
484 T downsweep_prefix = scan_op(block_prefix, exclusive_partial);
485 if (linear_tid == 0)
486 downsweep_prefix = block_prefix;
487
488 // Exclusive raking downsweep scan
489 ExclusiveDownsweep(scan_op, downsweep_prefix);
490 }
491
492 CTA_SYNC();
493
494 // Grab thread prefix from shared memory
495 output = *placement_ptr;
496 }
497 }
498
499
500 //---------------------------------------------------------------------
501 // Inclusive scans
502 //---------------------------------------------------------------------
503
505 template <typename ScanOp>
506 __device__ __forceinline__ void InclusiveScan(
507 T input,
508 T &output,
509 ScanOp scan_op)
510 {
511 if (WARP_SYNCHRONOUS)
512 {
513 // Short-circuit directly to warp-synchronous scan
514 WarpScan(temp_storage.warp_scan).InclusiveScan(input, output, scan_op);
515 }
516 else
517 {
518 // Place thread partial into shared memory raking grid
519 T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid);
520 *placement_ptr = input;
521
522 CTA_SYNC();
523
524 // Reduce parallelism down to just raking threads
525 if (linear_tid < RAKING_THREADS)
526 {
527 // Raking upsweep reduction across shared partials
528 T upsweep_partial = Upsweep(scan_op);
529
530 // Exclusive Warp-synchronous scan
531 T exclusive_partial;
532 WarpScan(temp_storage.warp_scan).ExclusiveScan(upsweep_partial, exclusive_partial, scan_op);
533
534 // Inclusive raking downsweep scan
535 InclusiveDownsweep(scan_op, exclusive_partial, (linear_tid != 0));
536 }
537
538 CTA_SYNC();
539
540 // Grab thread prefix from shared memory
541 output = *placement_ptr;
542 }
543 }
544
545
547 template <typename ScanOp>
548 __device__ __forceinline__ void InclusiveScan(
549 T input,
550 T &output,
551 ScanOp scan_op,
552 T &block_aggregate)
553 {
554 if (WARP_SYNCHRONOUS)
555 {
556 // Short-circuit directly to warp-synchronous scan
557 WarpScan(temp_storage.warp_scan).InclusiveScan(input, output, scan_op, block_aggregate);
558 }
559 else
560 {
561 // Place thread partial into shared memory raking grid
562 T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid);
563 *placement_ptr = input;
564
565 CTA_SYNC();
566
567 // Reduce parallelism down to just raking threads
568 if (linear_tid < RAKING_THREADS)
569 {
570 // Raking upsweep reduction across shared partials
571 T upsweep_partial = Upsweep(scan_op);
572
573 // Warp-synchronous scan
574 T inclusive_partial;
575 T exclusive_partial;
576 WarpScan(temp_storage.warp_scan).Scan(upsweep_partial, inclusive_partial, exclusive_partial, scan_op);
577
578 // Inclusive raking downsweep scan
579 InclusiveDownsweep(scan_op, exclusive_partial, (linear_tid != 0));
580
581 // Broadcast aggregate to all threads
582 if (linear_tid == RAKING_THREADS - 1)
583 temp_storage.block_aggregate = inclusive_partial;
584 }
585
586 CTA_SYNC();
587
588 // Grab thread prefix from shared memory
589 output = *placement_ptr;
590
591 // Retrieve block aggregate
592 block_aggregate = temp_storage.block_aggregate;
593 }
594 }
595
596
598 template <
599 typename ScanOp,
600 typename BlockPrefixCallbackOp>
601 __device__ __forceinline__ void InclusiveScan(
602 T input,
603 T &output,
604 ScanOp scan_op,
605 BlockPrefixCallbackOp &block_prefix_callback_op)
606 {
607 if (WARP_SYNCHRONOUS)
608 {
609 // Short-circuit directly to warp-synchronous scan
610 T block_aggregate;
611 WarpScan warp_scan(temp_storage.warp_scan);
612 warp_scan.InclusiveScan(input, output, scan_op, block_aggregate);
613
614 // Obtain warp-wide prefix in lane0, then broadcast to other lanes
615 T block_prefix = block_prefix_callback_op(block_aggregate);
616 block_prefix = warp_scan.Broadcast(block_prefix, 0);
617
618 // Update prefix with exclusive warpscan partial
619 output = scan_op(block_prefix, output);
620 }
621 else
622 {
623 // Place thread partial into shared memory raking grid
624 T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid);
625 *placement_ptr = input;
626
627 CTA_SYNC();
628
629 // Reduce parallelism down to just raking threads
630 if (linear_tid < RAKING_THREADS)
631 {
632 WarpScan warp_scan(temp_storage.warp_scan);
633
634 // Raking upsweep reduction across shared partials
635 T upsweep_partial = Upsweep(scan_op);
636
637 // Warp-synchronous scan
638 T exclusive_partial, block_aggregate;
639 warp_scan.ExclusiveScan(upsweep_partial, exclusive_partial, scan_op, block_aggregate);
640
641 // Obtain block-wide prefix in lane0, then broadcast to other lanes
642 T block_prefix = block_prefix_callback_op(block_aggregate);
643 block_prefix = warp_scan.Broadcast(block_prefix, 0);
644
645 // Update prefix with warpscan exclusive partial
646 T downsweep_prefix = scan_op(block_prefix, exclusive_partial);
647 if (linear_tid == 0)
648 downsweep_prefix = block_prefix;
649
650 // Inclusive raking downsweep scan
651 InclusiveDownsweep(scan_op, downsweep_prefix);
652 }
653
654 CTA_SYNC();
655
656 // Grab thread prefix from shared memory
657 output = *placement_ptr;
658 }
659 }
660
661};
662
663
664} // CUB namespace
665CUB_NS_POSTFIX // Optional outer namespace(s)
666
__device__ __forceinline__ T ThreadScanExclusive(T inclusive, T exclusive, T *input, T *output, ScanOp scan_op, Int2Type< LENGTH >)
__device__ __forceinline__ T ThreadScanInclusive(T inclusive, T *input, T *output, ScanOp scan_op, Int2Type< LENGTH >)
__device__ __forceinline__ int RowMajorTid(int block_dim_x, int block_dim_y, int block_dim_z)
Returns the row-major linear thread identifier for a multidimensional thread block.
Definition util_ptx.cuh:409
CTA_SYNC()
Definition util_ptx.cuh:255
Optional outer namespace(s)
Alias wrapper allowing storage to be unioned.
@ UNGUARDED
Whether or not we need bounds checking during raking (the number of reduction elements is not a multiple of the number of raking threads)
@ SEGMENT_LENGTH
Number of raking elements per warp-synchronous raking thread (rounded up)
@ RAKING_THREADS
Never use a raking thread that will have no valid data (e.g., when BLOCK_THREADS is 62 and SEGMENT_LENGTH is 2, we should only use 31 raking threads)
static __device__ __forceinline__ T * PlacementPtr(TempStorage &temp_storage, unsigned int linear_tid)
Returns the location for the calling thread to place data into the grid.
static __device__ __forceinline__ T * RakingPtr(TempStorage &temp_storage, unsigned int linear_tid)
Returns the location for the calling thread to begin sequential raking.
Alias wrapper allowing storage to be unioned.
Shared memory storage layout type.
BlockRakingLayout::TempStorage raking_grid
Padded thread block raking grid.
WarpScan::TempStorage warp_scan
Buffer for warp-synchronous scan.
BlockScanRaking provides variants of raking-based parallel prefix scan across a CUDA thread block.
BlockRakingLayout< T, BLOCK_THREADS, PTX_ARCH > BlockRakingLayout
Layout type for padded thread block raking grid.
__device__ __forceinline__ void ExclusiveScan(T input, T &exclusive_output, ScanOp scan_op)
Computes an exclusive thread block-wide prefix scan using the specified binary scan_op functor....
__device__ __forceinline__ T GuardedReduce(T *, ScanOp, T raking_partial, Int2Type< SEGMENT_LENGTH >)
Templated reduction (base case)
__device__ __forceinline__ void InclusiveScan(T input, T &output, ScanOp scan_op, T &block_aggregate)
Computes an inclusive thread block-wide prefix scan using the specified binary scan_op functor....
__device__ __forceinline__ void InclusiveScan(T input, T &output, ScanOp scan_op)
Computes an inclusive thread block-wide prefix scan using the specified binary scan_op functor....
__device__ __forceinline__ BlockScanRaking(TempStorage &temp_storage)
Constructor.
__device__ __forceinline__ void CopySegment(T *out, T *in, Int2Type< ITERATION >)
Templated copy.
__device__ __forceinline__ void ExclusiveScan(T input, T &output, ScanOp scan_op, T &block_aggregate)
Computes an exclusive thread block-wide prefix scan using the specified binary scan_op functor....
__device__ __forceinline__ void ExclusiveScan(T input, T &output, const T &initial_value, ScanOp scan_op)
Computes an exclusive thread block-wide prefix scan using the specified binary scan_op functor....
__device__ __forceinline__ void InclusiveDownsweep(ScanOp scan_op, T raking_partial, bool apply_prefix=true)
Performs inclusive downsweep raking scan.
__device__ __forceinline__ void ExclusiveScan(T input, T &output, const T &initial_value, ScanOp scan_op, T &block_aggregate)
Computes an exclusive thread block-wide prefix scan using the specified binary scan_op functor....
@ BLOCK_THREADS
The thread block size in threads.
__device__ __forceinline__ void InclusiveScan(T input, T &output, ScanOp scan_op, BlockPrefixCallbackOp &block_prefix_callback_op)
Computes an inclusive thread block-wide prefix scan using the specified binary scan_op functor....
@ WARP_SYNCHRONOUS
Cooperative work can be entirely warp synchronous.
@ SEGMENT_LENGTH
Number of raking elements per warp synchronous raking thread.
@ RAKING_THREADS
Number of raking threads.
WarpScan< T, RAKING_THREADS, PTX_ARCH > WarpScan
WarpScan utility type.
__device__ __forceinline__ void CopySegment(T *, T *, Int2Type< SEGMENT_LENGTH >)
Templated copy (base case)
__device__ __forceinline__ void ExclusiveScan(T input, T &output, ScanOp scan_op, BlockPrefixCallbackOp &block_prefix_callback_op)
Computes an exclusive thread block-wide prefix scan using the specified binary scan_op functor....
__device__ __forceinline__ T Upsweep(ScanOp scan_op)
Performs upsweep raking reduction, returning the aggregate.
__device__ __forceinline__ T GuardedReduce(T *raking_ptr, ScanOp scan_op, T raking_partial, Int2Type< ITERATION >)
Templated reduction.
__device__ __forceinline__ void ExclusiveDownsweep(ScanOp scan_op, T raking_partial, bool apply_prefix=true)
Performs exclusive downsweep raking scan.
Allows for the treatment of an integral constant as a type at compile-time (e.g., to achieve static call dispatch based on constant integral values)
A storage-backing wrapper that allows types with non-trivial constructors to be aliased in unions.
\smemstorage{WarpScan}
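The ExclusiveScan/InclusiveScan overloads above that accept a BlockPrefixCallbackOp invoke the callback with the block-wide aggregate and apply the value broadcast from lane 0 of the raking warp as a prefix to every thread's output, which is how CUB chains a scan across multiple tiles of input. The sketch below follows the stateful running-total functor pattern described in CUB's BlockScan documentation; the functor, kernel name, and single-block tiling loop are illustrative assumptions.

#include <cub/cub.cuh>

// Stateful callback functor that carries a running prefix across consecutive tile scans
struct RunningTotalOp
{
    int running_total;

    __device__ RunningTotalOp(int initial) : running_total(initial) {}

    // Called with the aggregate of the current tile; returns the prefix that seeds this tile,
    // then folds the aggregate into the running total for the next tile
    __device__ int operator()(int block_aggregate)
    {
        int old_prefix = running_total;
        running_total += block_aggregate;
        return old_prefix;
    }
};

// A single thread block scans a long array tile by tile (illustrative; assumes a grid of one block)
__global__ void TiledExclusiveSum(int *d_data, int num_items)
{
    typedef cub::BlockScan<int, 128, cub::BLOCK_SCAN_RAKING> BlockScan;
    __shared__ typename BlockScan::TempStorage temp_storage;

    RunningTotalOp prefix_op(0);

    for (int tile_base = 0; tile_base < num_items; tile_base += 128)
    {
        int idx = tile_base + threadIdx.x;
        int thread_data = (idx < num_items) ? d_data[idx] : 0;

        // The callback supplies the running prefix for this tile
        BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data, prefix_op);
        __syncthreads();   // temp_storage is reused across iterations

        if (idx < num_items)
            d_data[idx] = thread_data;
    }
}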