OpenFPM_pdata  4.1.0
Project that contains the implementation of distributed structures
block_scan_raking.cuh
/******************************************************************************
 * Copyright (c) 2011, Duane Merrill. All rights reserved.
 * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

/**
 * \file
 * cub::BlockScanRaking provides variants of raking-based parallel prefix scan across a CUDA thread block.
 */

#pragma once

#include "../../util_ptx.cuh"
#include "../../util_arch.cuh"
#include "../../block/block_raking_layout.cuh"
#include "../../thread/thread_reduce.cuh"
#include "../../thread/thread_scan.cuh"
#include "../../warp/warp_scan.cuh"
#include "../../util_namespace.cuh"

/// Optional outer namespace(s)
CUB_NS_PREFIX

/// CUB namespace
namespace cub {

/**
 * \brief BlockScanRaking provides variants of raking-based parallel prefix scan across a CUDA thread block.
 */
template <
    typename    T,
    int         BLOCK_DIM_X,
    int         BLOCK_DIM_Y,
    int         BLOCK_DIM_Z,
    bool        MEMOIZE,
    int         PTX_ARCH>
struct BlockScanRaking
{
    //---------------------------------------------------------------------
    // Types and constants
    //---------------------------------------------------------------------

    /// Constants
    enum
    {
        /// The thread block size in threads
        BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
    };

    /// Layout type for padded thread block raking grid
    typedef BlockRakingLayout<T, BLOCK_THREADS, PTX_ARCH> BlockRakingLayout;

    /// Constants
    enum
    {
        /// Number of raking threads
        RAKING_THREADS = BlockRakingLayout::RAKING_THREADS,

        /// Number of raking elements per warp-synchronous raking thread
        SEGMENT_LENGTH = BlockRakingLayout::SEGMENT_LENGTH,

        /// Cooperative work can be entirely warp synchronous
        WARP_SYNCHRONOUS = (BLOCK_THREADS == RAKING_THREADS),
    };

    /// WarpScan utility type
    typedef WarpScan<T, RAKING_THREADS, PTX_ARCH> WarpScan;

    /// Shared memory storage layout type
    struct _TempStorage
    {
        typename WarpScan::TempStorage              warp_scan;          ///< Buffer for warp-synchronous scan
        typename BlockRakingLayout::TempStorage     raking_grid;        ///< Padded thread block raking grid
        T                                           block_aggregate;    ///< Block aggregate
    };

    /// Alias wrapper allowing storage to be unioned
    struct TempStorage : Uninitialized<_TempStorage> {};

    //---------------------------------------------------------------------
    // Per-thread fields
    //---------------------------------------------------------------------

    // Thread fields
    _TempStorage    &temp_storage;
    unsigned int    linear_tid;
    T               cached_segment[SEGMENT_LENGTH];


    //---------------------------------------------------------------------
    // Utility methods
    //---------------------------------------------------------------------

    /// Templated reduction
    template <int ITERATION, typename ScanOp>
    __device__ __forceinline__ T GuardedReduce(
        T*                  raking_ptr,
        ScanOp              scan_op,
        T                   raking_partial,
        Int2Type<ITERATION> /*iteration*/)
    {
        if ((BlockRakingLayout::UNGUARDED) || (((linear_tid * SEGMENT_LENGTH) + ITERATION) < BLOCK_THREADS))
        {
            T addend = raking_ptr[ITERATION];
            raking_partial = scan_op(raking_partial, addend);
        }

        return GuardedReduce(raking_ptr, scan_op, raking_partial, Int2Type<ITERATION + 1>());
    }


    /// Templated reduction (base case)
    template <typename ScanOp>
    __device__ __forceinline__ T GuardedReduce(
        T*                          /*raking_ptr*/,
        ScanOp                      /*scan_op*/,
        T                           raking_partial,
        Int2Type<SEGMENT_LENGTH>    /*iteration*/)
    {
        return raking_partial;
    }


    /// Templated copy
    template <int ITERATION>
    __device__ __forceinline__ void CopySegment(
        T*                  out,
        T*                  in,
        Int2Type<ITERATION> /*iteration*/)
    {
        out[ITERATION] = in[ITERATION];
        CopySegment(out, in, Int2Type<ITERATION + 1>());
    }


    /// Templated copy (base case)
    __device__ __forceinline__ void CopySegment(
        T*                          /*out*/,
        T*                          /*in*/,
        Int2Type<SEGMENT_LENGTH>    /*iteration*/)
    {}

    /// Performs upsweep raking reduction, returning the aggregate
    template <typename ScanOp>
    __device__ __forceinline__ T Upsweep(
        ScanOp scan_op)
    {
        T *smem_raking_ptr = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid);

        // Read data into registers
        CopySegment(cached_segment, smem_raking_ptr, Int2Type<0>());

        T raking_partial = cached_segment[0];

        return GuardedReduce(cached_segment, scan_op, raking_partial, Int2Type<1>());
    }


    /// Performs exclusive downsweep raking scan
    template <typename ScanOp>
    __device__ __forceinline__ void ExclusiveDownsweep(
        ScanOp  scan_op,
        T       raking_partial,
        bool    apply_prefix = true)
    {
        T *smem_raking_ptr = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid);

        // Read data back into registers
        if (!MEMOIZE)
        {
            CopySegment(cached_segment, smem_raking_ptr, Int2Type<0>());
        }

        internal::ThreadScanExclusive(cached_segment, cached_segment, scan_op, raking_partial, apply_prefix);

        // Write data back to smem
        CopySegment(smem_raking_ptr, cached_segment, Int2Type<0>());
    }


    /// Performs inclusive downsweep raking scan
    template <typename ScanOp>
    __device__ __forceinline__ void InclusiveDownsweep(
        ScanOp  scan_op,
        T       raking_partial,
        bool    apply_prefix = true)
    {
        T *smem_raking_ptr = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid);

        // Read data back into registers
        if (!MEMOIZE)
        {
            CopySegment(cached_segment, smem_raking_ptr, Int2Type<0>());
        }

        internal::ThreadScanInclusive(cached_segment, cached_segment, scan_op, raking_partial, apply_prefix);

        // Write data back to smem
        CopySegment(smem_raking_ptr, cached_segment, Int2Type<0>());
    }
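
    // The scan methods below share one raking pattern: each thread places its partial
    // into the raking grid; the RAKING_THREADS serially reduce their segments (Upsweep),
    // cooperate in a warp-synchronous scan of the per-segment totals, and then seed and
    // rescan their segments (ExclusiveDownsweep / InclusiveDownsweep) back into shared
    // memory, from which every thread finally reads its prefix.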

    //---------------------------------------------------------------------
    // Constructors
    //---------------------------------------------------------------------

    /// Constructor
    __device__ __forceinline__ BlockScanRaking(
        TempStorage &temp_storage)
    :
        temp_storage(temp_storage.Alias()),
        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
    {}


    //---------------------------------------------------------------------
    // Exclusive scans
    //---------------------------------------------------------------------

    /// Computes an exclusive thread block-wide prefix scan using the specified binary scan_op functor.
    template <typename ScanOp>
    __device__ __forceinline__ void ExclusiveScan(
        T       input,
        T       &exclusive_output,
        ScanOp  scan_op)
    {
        if (WARP_SYNCHRONOUS)
        {
            // Short-circuit directly to warp-synchronous scan
            WarpScan(temp_storage.warp_scan).ExclusiveScan(input, exclusive_output, scan_op);
        }
        else
        {
            // Place thread partial into shared memory raking grid
            T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid);
            *placement_ptr = input;

            CTA_SYNC();

            // Reduce parallelism down to just raking threads
            if (linear_tid < RAKING_THREADS)
            {
                // Raking upsweep reduction across shared partials
                T upsweep_partial = Upsweep(scan_op);

                // Warp-synchronous scan
                T exclusive_partial;
                WarpScan(temp_storage.warp_scan).ExclusiveScan(upsweep_partial, exclusive_partial, scan_op);

                // Exclusive raking downsweep scan
                ExclusiveDownsweep(scan_op, exclusive_partial, (linear_tid != 0));
            }

            CTA_SYNC();

            // Grab thread prefix from shared memory
            exclusive_output = *placement_ptr;
        }
    }

    /// Computes an exclusive thread block-wide prefix scan using the specified binary scan_op functor.
    template <typename ScanOp>
    __device__ __forceinline__ void ExclusiveScan(
        T           input,
        T           &output,
        const T     &initial_value,
        ScanOp      scan_op)
    {
        if (WARP_SYNCHRONOUS)
        {
            // Short-circuit directly to warp-synchronous scan
            WarpScan(temp_storage.warp_scan).ExclusiveScan(input, output, initial_value, scan_op);
        }
        else
        {
            // Place thread partial into shared memory raking grid
            T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid);
            *placement_ptr = input;

            CTA_SYNC();

            // Reduce parallelism down to just raking threads
            if (linear_tid < RAKING_THREADS)
            {
                // Raking upsweep reduction across shared partials
                T upsweep_partial = Upsweep(scan_op);

                // Exclusive warp-synchronous scan
                T exclusive_partial;
                WarpScan(temp_storage.warp_scan).ExclusiveScan(upsweep_partial, exclusive_partial, initial_value, scan_op);

                // Exclusive raking downsweep scan
                ExclusiveDownsweep(scan_op, exclusive_partial);
            }

            CTA_SYNC();

            // Grab exclusive partial from shared memory
            output = *placement_ptr;
        }
    }


    /// Computes an exclusive thread block-wide prefix scan using the specified binary scan_op functor.
    template <typename ScanOp>
    __device__ __forceinline__ void ExclusiveScan(
        T       input,
        T       &output,
        ScanOp  scan_op,
        T       &block_aggregate)
    {
        if (WARP_SYNCHRONOUS)
        {
            // Short-circuit directly to warp-synchronous scan
            WarpScan(temp_storage.warp_scan).ExclusiveScan(input, output, scan_op, block_aggregate);
        }
        else
        {
            // Place thread partial into shared memory raking grid
            T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid);
            *placement_ptr = input;

            CTA_SYNC();

            // Reduce parallelism down to just raking threads
            if (linear_tid < RAKING_THREADS)
            {
                // Raking upsweep reduction across shared partials
                T upsweep_partial = Upsweep(scan_op);

                // Warp-synchronous scan
                T inclusive_partial;
                T exclusive_partial;
                WarpScan(temp_storage.warp_scan).Scan(upsweep_partial, inclusive_partial, exclusive_partial, scan_op);

                // Exclusive raking downsweep scan
                ExclusiveDownsweep(scan_op, exclusive_partial, (linear_tid != 0));

                // Broadcast aggregate to all threads
                if (linear_tid == RAKING_THREADS - 1)
                    temp_storage.block_aggregate = inclusive_partial;
            }

            CTA_SYNC();

            // Grab thread prefix from shared memory
            output = *placement_ptr;

            // Retrieve block aggregate
            block_aggregate = temp_storage.block_aggregate;
        }
    }


    /// Computes an exclusive thread block-wide prefix scan using the specified binary scan_op functor.
    template <typename ScanOp>
    __device__ __forceinline__ void ExclusiveScan(
        T           input,
        T           &output,
        const T     &initial_value,
        ScanOp      scan_op,
        T           &block_aggregate)
    {
        if (WARP_SYNCHRONOUS)
        {
            // Short-circuit directly to warp-synchronous scan
            WarpScan(temp_storage.warp_scan).ExclusiveScan(input, output, initial_value, scan_op, block_aggregate);
        }
        else
        {
            // Place thread partial into shared memory raking grid
            T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid);
            *placement_ptr = input;

            CTA_SYNC();

            // Reduce parallelism down to just raking threads
            if (linear_tid < RAKING_THREADS)
            {
                // Raking upsweep reduction across shared partials
                T upsweep_partial = Upsweep(scan_op);

                // Warp-synchronous scan
                T exclusive_partial;
                WarpScan(temp_storage.warp_scan).ExclusiveScan(upsweep_partial, exclusive_partial, initial_value, scan_op, block_aggregate);

                // Exclusive raking downsweep scan
                ExclusiveDownsweep(scan_op, exclusive_partial);

                // Broadcast aggregate to other threads
                if (linear_tid == 0)
                    temp_storage.block_aggregate = block_aggregate;
            }

            CTA_SYNC();

            // Grab exclusive partial from shared memory
            output = *placement_ptr;

            // Retrieve block aggregate
            block_aggregate = temp_storage.block_aggregate;
        }
    }


    /// Computes an exclusive thread block-wide prefix scan using the specified binary scan_op functor.
    template <
        typename ScanOp,
        typename BlockPrefixCallbackOp>
    __device__ __forceinline__ void ExclusiveScan(
        T                       input,
        T                       &output,
        ScanOp                  scan_op,
        BlockPrefixCallbackOp   &block_prefix_callback_op)
    {
        if (WARP_SYNCHRONOUS)
        {
            // Short-circuit directly to warp-synchronous scan
            T block_aggregate;
            WarpScan warp_scan(temp_storage.warp_scan);
            warp_scan.ExclusiveScan(input, output, scan_op, block_aggregate);

            // Obtain warp-wide prefix in lane0, then broadcast to other lanes
            T block_prefix = block_prefix_callback_op(block_aggregate);
            block_prefix = warp_scan.Broadcast(block_prefix, 0);

            output = scan_op(block_prefix, output);
            if (linear_tid == 0)
                output = block_prefix;
        }
        else
        {
            // Place thread partial into shared memory raking grid
            T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid);
            *placement_ptr = input;

            CTA_SYNC();

            // Reduce parallelism down to just raking threads
            if (linear_tid < RAKING_THREADS)
            {
                WarpScan warp_scan(temp_storage.warp_scan);

                // Raking upsweep reduction across shared partials
                T upsweep_partial = Upsweep(scan_op);

                // Warp-synchronous scan
                T exclusive_partial, block_aggregate;
                warp_scan.ExclusiveScan(upsweep_partial, exclusive_partial, scan_op, block_aggregate);

                // Obtain block-wide prefix in lane0, then broadcast to other lanes
                T block_prefix = block_prefix_callback_op(block_aggregate);
                block_prefix = warp_scan.Broadcast(block_prefix, 0);

                // Update prefix with warpscan exclusive partial
                T downsweep_prefix = scan_op(block_prefix, exclusive_partial);
                if (linear_tid == 0)
                    downsweep_prefix = block_prefix;

                // Exclusive raking downsweep scan
                ExclusiveDownsweep(scan_op, downsweep_prefix);
            }

            CTA_SYNC();

            // Grab thread prefix from shared memory
            output = *placement_ptr;
        }
    }


    //---------------------------------------------------------------------
    // Inclusive scans
    //---------------------------------------------------------------------

    /// Computes an inclusive thread block-wide prefix scan using the specified binary scan_op functor.
    template <typename ScanOp>
    __device__ __forceinline__ void InclusiveScan(
        T       input,
        T       &output,
        ScanOp  scan_op)
    {
        if (WARP_SYNCHRONOUS)
        {
            // Short-circuit directly to warp-synchronous scan
            WarpScan(temp_storage.warp_scan).InclusiveScan(input, output, scan_op);
        }
        else
        {
            // Place thread partial into shared memory raking grid
            T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid);
            *placement_ptr = input;

            CTA_SYNC();

            // Reduce parallelism down to just raking threads
            if (linear_tid < RAKING_THREADS)
            {
                // Raking upsweep reduction across shared partials
                T upsweep_partial = Upsweep(scan_op);

                // Exclusive warp-synchronous scan
                T exclusive_partial;
                WarpScan(temp_storage.warp_scan).ExclusiveScan(upsweep_partial, exclusive_partial, scan_op);

                // Inclusive raking downsweep scan
                InclusiveDownsweep(scan_op, exclusive_partial, (linear_tid != 0));
            }

            CTA_SYNC();

            // Grab thread prefix from shared memory
            output = *placement_ptr;
        }
    }


    /// Computes an inclusive thread block-wide prefix scan using the specified binary scan_op functor.
    template <typename ScanOp>
    __device__ __forceinline__ void InclusiveScan(
        T       input,
        T       &output,
        ScanOp  scan_op,
        T       &block_aggregate)
    {
        if (WARP_SYNCHRONOUS)
        {
            // Short-circuit directly to warp-synchronous scan
            WarpScan(temp_storage.warp_scan).InclusiveScan(input, output, scan_op, block_aggregate);
        }
        else
        {
            // Place thread partial into shared memory raking grid
            T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid);
            *placement_ptr = input;

            CTA_SYNC();

            // Reduce parallelism down to just raking threads
            if (linear_tid < RAKING_THREADS)
            {
                // Raking upsweep reduction across shared partials
                T upsweep_partial = Upsweep(scan_op);

                // Warp-synchronous scan
                T inclusive_partial;
                T exclusive_partial;
                WarpScan(temp_storage.warp_scan).Scan(upsweep_partial, inclusive_partial, exclusive_partial, scan_op);

                // Inclusive raking downsweep scan
                InclusiveDownsweep(scan_op, exclusive_partial, (linear_tid != 0));

                // Broadcast aggregate to all threads
                if (linear_tid == RAKING_THREADS - 1)
                    temp_storage.block_aggregate = inclusive_partial;
            }

            CTA_SYNC();

            // Grab thread prefix from shared memory
            output = *placement_ptr;

            // Retrieve block aggregate
            block_aggregate = temp_storage.block_aggregate;
        }
    }


    /// Computes an inclusive thread block-wide prefix scan using the specified binary scan_op functor.
    template <
        typename ScanOp,
        typename BlockPrefixCallbackOp>
    __device__ __forceinline__ void InclusiveScan(
        T                       input,
        T                       &output,
        ScanOp                  scan_op,
        BlockPrefixCallbackOp   &block_prefix_callback_op)
    {
        if (WARP_SYNCHRONOUS)
        {
            // Short-circuit directly to warp-synchronous scan
            T block_aggregate;
            WarpScan warp_scan(temp_storage.warp_scan);
            warp_scan.InclusiveScan(input, output, scan_op, block_aggregate);

            // Obtain warp-wide prefix in lane0, then broadcast to other lanes
            T block_prefix = block_prefix_callback_op(block_aggregate);
            block_prefix = warp_scan.Broadcast(block_prefix, 0);

            // Update prefix with exclusive warpscan partial
            output = scan_op(block_prefix, output);
        }
        else
        {
            // Place thread partial into shared memory raking grid
            T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid);
            *placement_ptr = input;

            CTA_SYNC();

            // Reduce parallelism down to just raking threads
            if (linear_tid < RAKING_THREADS)
            {
                WarpScan warp_scan(temp_storage.warp_scan);

                // Raking upsweep reduction across shared partials
                T upsweep_partial = Upsweep(scan_op);

                // Warp-synchronous scan
                T exclusive_partial, block_aggregate;
                warp_scan.ExclusiveScan(upsweep_partial, exclusive_partial, scan_op, block_aggregate);

                // Obtain block-wide prefix in lane0, then broadcast to other lanes
                T block_prefix = block_prefix_callback_op(block_aggregate);
                block_prefix = warp_scan.Broadcast(block_prefix, 0);

                // Update prefix with warpscan exclusive partial
                T downsweep_prefix = scan_op(block_prefix, exclusive_partial);
                if (linear_tid == 0)
                    downsweep_prefix = block_prefix;

                // Inclusive raking downsweep scan
                InclusiveDownsweep(scan_op, downsweep_prefix);
            }

            CTA_SYNC();

            // Grab thread prefix from shared memory
            output = *placement_ptr;
        }
    }
};


}               // CUB namespace
CUB_NS_POSTFIX  // Optional outer namespace(s)
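
Usage sketch (not part of the original header): BlockScanRaking is the internal specialization
selected by cub::BlockScan when the BLOCK_SCAN_RAKING (or BLOCK_SCAN_RAKING_MEMOIZE) algorithm
is requested, so user code normally reaches it through cub::BlockScan rather than directly.
A minimal kernel along these lines exercises the raking path; the 128-thread block size and
the ExampleKernel name are illustrative choices, not taken from this file.

#include <cub/block/block_scan.cuh>

__global__ void ExampleKernel(int *d_data)
{
    // Specialize BlockScan for 128 threads and request the raking algorithm,
    // which dispatches to the BlockScanRaking implementation above.
    typedef cub::BlockScan<int, 128, cub::BLOCK_SCAN_RAKING> BlockScan;

    // Shared memory for the collective (warp-scan buffer, raking grid, block aggregate)
    __shared__ typename BlockScan::TempStorage temp_storage;

    // Each thread contributes one element and receives its exclusive prefix sum
    int thread_data = d_data[threadIdx.x];
    BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data);
    d_data[threadIdx.x] = thread_data;
}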