OpenFPM_pdata  4.1.0
Project that contain the implementation of distributed structures
 
Loading...
Searching...
No Matches
block_scan.cuh
Go to the documentation of this file.
1/******************************************************************************
2 * Copyright (c) 2011, Duane Merrill. All rights reserved.
3 * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 * * Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * * Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
12 * * Neither the name of the NVIDIA CORPORATION nor the
13 * names of its contributors may be used to endorse or promote products
14 * derived from this software without specific prior written permission.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 *
27 ******************************************************************************/
28
34#pragma once
35
38#include "../util_arch.cuh"
39#include "../util_type.cuh"
40#include "../util_ptx.cuh"
41#include "../util_namespace.cuh"
42
44CUB_NS_PREFIX
45
47namespace cub {
48
49
50/******************************************************************************
51 * Algorithmic variants
52 ******************************************************************************/
53
58{
59
79
80
89
90
109};
110
111
112/******************************************************************************
113 * Block scan
114 ******************************************************************************/
115
186template <
187 typename T,
188 int BLOCK_DIM_X,
190 int BLOCK_DIM_Y = 1,
191 int BLOCK_DIM_Z = 1,
192 int PTX_ARCH = CUB_PTX_ARCH>
194{
195private:
196
197 /******************************************************************************
198 * Constants and type definitions
199 ******************************************************************************/
200
202 enum
203 {
205 BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
206 };
207
215 ((ALGORITHM == BLOCK_SCAN_WARP_SCANS) && (BLOCK_THREADS % CUB_WARP_THREADS(PTX_ARCH) != 0)) ?
217 ALGORITHM;
218
220 typedef BlockScanRaking<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, (SAFE_ALGORITHM == BLOCK_SCAN_RAKING_MEMOIZE), PTX_ARCH> Raking;
221
223 typedef typename If<(SAFE_ALGORITHM == BLOCK_SCAN_WARP_SCANS),
224 WarpScans,
226
229
230
231 /******************************************************************************
232 * Thread fields
233 ******************************************************************************/
234
237
239 unsigned int linear_tid;
240
241
242 /******************************************************************************
243 * Utility methods
244 ******************************************************************************/
245
/// Internal storage allocator.  Returns a reference to a per-block,
/// statically-allocated shared-memory instance, used when the caller did not
/// hand in an explicit TempStorage.  Returning a reference to __shared__ is
/// safe here: the storage is block-scoped and outlives the call.
__device__ __forceinline__ _TempStorage& PrivateStorage()
{
    // One instance per thread block (function-scope __shared__)
    __shared__ _TempStorage smem_storage;
    return smem_storage;
}
252
253
254 /******************************************************************************
255 * Public types
256 ******************************************************************************/
257public:
258
260 struct TempStorage : Uninitialized<_TempStorage> {};
261
262
263 /******************************************************************/
267
// Collective constructor using a private static allocation of shared memory
// as temporary storage.
// NOTE(review): the `temp_storage(PrivateStorage()),` member-initializer line
// appears to have been elided by the documentation extraction — confirm
// against the original block_scan.cuh header.
 271 __device__ __forceinline__ BlockScan()
 272 :
 274 linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
 275 {}
276
277
// Collective constructor using the specified memory allocation as temporary
// storage (temp_storage.Alias() unwraps the Uninitialized<> wrapper).
// NOTE(review): the `TempStorage &temp_storage` parameter line appears to
// have been elided by the documentation extraction — confirm against the
// original block_scan.cuh header.
 281 __device__ __forceinline__ BlockScan(
 283 :
 284 temp_storage(temp_storage.Alias()),
 285 linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
 286 {}
287
288
289
291 /******************************************************************/
295
296
/// Computes an exclusive block-wide prefix sum over one item per thread,
/// using addition (+) as the scan operator and seeding the scan with the
/// additive identity (0).
__device__ __forceinline__ void ExclusiveSum(
    T input,            ///< [in]  Calling thread's input item
    T &output)          ///< [out] Calling thread's scan output
{
    // An exclusive sum is an exclusive scan with operator + seeded by zero
    T zero = 0;
    ExclusiveScan(input, output, zero, cub::Sum());
}
340
341
/// Computes an exclusive block-wide prefix sum over one item per thread,
/// additionally producing the block-wide aggregate of all inputs.
__device__ __forceinline__ void ExclusiveSum(
    T input,            ///< [in]  Calling thread's input item
    T &output,          ///< [out] Calling thread's scan output
    T &block_aggregate) ///< [out] Block-wide aggregate reduction of all inputs
{
    // Delegate to the general exclusive scan seeded with the additive identity
    T zero = 0;
    ExclusiveScan(input, output, zero, cub::Sum(), block_aggregate);
}
388
389
/// Exclusive block-wide prefix sum where a caller-supplied callback functor
/// provides the running prefix (used when scanning successive tiles of a
/// larger input).  No explicit initial value is passed — the callback seeds
/// the scan.
template <typename BlockPrefixCallbackOp>
__device__ __forceinline__ void ExclusiveSum(
    T input,                                        ///< [in]     Calling thread's input item
    T &output,                                      ///< [out]    Calling thread's scan output
    BlockPrefixCallbackOp &block_prefix_callback_op)///< [in,out] Functor supplying the block-wide prefix
{
    ExclusiveScan(input, output, cub::Sum(), block_prefix_callback_op);
}
472
473
475 /******************************************************************/
479
480
/// Computes an exclusive block-wide prefix sum over ITEMS_PER_THREAD
/// consecutive items per thread, seeded with the additive identity (0).
template <int ITEMS_PER_THREAD>
__device__ __forceinline__ void ExclusiveSum(
    T (&input)[ITEMS_PER_THREAD],   ///< [in]  Calling thread's input items
    T (&output)[ITEMS_PER_THREAD])  ///< [out] Calling thread's scan outputs (may alias input)
{
    // Delegate to the general array exclusive scan with operator +
    T zero = 0;
    ExclusiveScan(input, output, zero, cub::Sum());
}
528
529
/// Computes an exclusive block-wide prefix sum over ITEMS_PER_THREAD items
/// per thread, additionally producing the block-wide aggregate of all inputs.
template <int ITEMS_PER_THREAD>
__device__ __forceinline__ void ExclusiveSum(
    T (&input)[ITEMS_PER_THREAD],   ///< [in]  Calling thread's input items
    T (&output)[ITEMS_PER_THREAD],  ///< [out] Calling thread's scan outputs
    T &block_aggregate)             ///< [out] Block-wide aggregate reduction of all inputs
{
    // Delegate to the general array exclusive scan seeded with zero
    T zero = 0;
    ExclusiveScan(input, output, zero, cub::Sum(), block_aggregate);
}
581
582
/// Exclusive block-wide prefix sum over ITEMS_PER_THREAD items per thread,
/// with a callback functor supplying the running block-wide prefix across
/// successive tiles.
template <
    int ITEMS_PER_THREAD,
    typename BlockPrefixCallbackOp>
__device__ __forceinline__ void ExclusiveSum(
    T (&input)[ITEMS_PER_THREAD],                   ///< [in]     Calling thread's input items
    T (&output)[ITEMS_PER_THREAD],                  ///< [out]    Calling thread's scan outputs
    BlockPrefixCallbackOp &block_prefix_callback_op)///< [in,out] Functor supplying the block-wide prefix
{
    ExclusiveScan(input, output, cub::Sum(), block_prefix_callback_op);
}
680
681
682
684 /******************************************************************/
688
689
/// Computes an exclusive block-wide prefix scan of one item per thread using
/// the supplied binary scan functor, seeded with initial_value.
template <typename ScanOp>
__device__ __forceinline__ void ExclusiveScan(
    T input,            ///< [in]  Calling thread's input item
    T &output,          ///< [out] Calling thread's scan output
    T initial_value,    ///< [in]  Seed value for the scan
    ScanOp scan_op)     ///< [in]  Binary scan functor
{
    // Delegate to the algorithm selected at compile time (warp-scans or raking)
    InternalBlockScan scanner(temp_storage);
    scanner.ExclusiveScan(input, output, initial_value, scan_op);
}
736
737
/// Computes an exclusive block-wide prefix scan of one item per thread,
/// seeded with initial_value, additionally producing the block-wide
/// aggregate of all inputs.
template <typename ScanOp>
__device__ __forceinline__ void ExclusiveScan(
    T input,            ///< [in]  Calling thread's input item
    T &output,          ///< [out] Calling thread's scan output
    T initial_value,    ///< [in]  Seed value for the scan
    ScanOp scan_op,     ///< [in]  Binary scan functor
    T &block_aggregate) ///< [out] Block-wide aggregate reduction of all inputs
{
    // Delegate to the algorithm selected at compile time
    InternalBlockScan scanner(temp_storage);
    scanner.ExclusiveScan(input, output, initial_value, scan_op, block_aggregate);
}
787
788
/// Exclusive block-wide prefix scan of one item per thread where a callback
/// functor supplies the running prefix (no explicit initial value).
template <
    typename ScanOp,
    typename BlockPrefixCallbackOp>
__device__ __forceinline__ void ExclusiveScan(
    T input,                                        ///< [in]     Calling thread's input item
    T &output,                                      ///< [out]    Calling thread's scan output
    ScanOp scan_op,                                 ///< [in]     Binary scan functor
    BlockPrefixCallbackOp &block_prefix_callback_op)///< [in,out] Functor supplying the block-wide prefix
{
    // Delegate to the algorithm selected at compile time
    InternalBlockScan scanner(temp_storage);
    scanner.ExclusiveScan(input, output, scan_op, block_prefix_callback_op);
}
875
876
878 /******************************************************************/
882
883
/// Computes an exclusive block-wide prefix scan over ITEMS_PER_THREAD
/// consecutive items per thread, seeded with initial_value.  Implemented as
/// a per-thread reduction, a block-wide exclusive scan of those partials,
/// and a per-thread exclusive scan seeded by the resulting prefix.
template <
    int ITEMS_PER_THREAD,
    typename ScanOp>
__device__ __forceinline__ void ExclusiveScan(
    T (&input)[ITEMS_PER_THREAD],   ///< [in]  Calling thread's input items
    T (&output)[ITEMS_PER_THREAD],  ///< [out] Calling thread's scan outputs
    T initial_value,                ///< [in]  Seed value for the scan
    ScanOp scan_op)                 ///< [in]  Binary scan functor
{
    // Step 1: reduce this thread's consecutive items in registers
    T running_prefix = internal::ThreadReduce(input, scan_op);

    // Step 2: exclusive block-wide scan of the per-thread partials
    ExclusiveScan(running_prefix, running_prefix, initial_value, scan_op);

    // Step 3: exclusive scan in registers, seeded with this thread's prefix
    internal::ThreadScanExclusive(input, output, scan_op, running_prefix);
}
944
945
/// Computes an exclusive block-wide prefix scan over ITEMS_PER_THREAD items
/// per thread, seeded with initial_value, additionally producing the
/// block-wide aggregate of all inputs.
template <
    int ITEMS_PER_THREAD,
    typename ScanOp>
__device__ __forceinline__ void ExclusiveScan(
    T (&input)[ITEMS_PER_THREAD],   ///< [in]  Calling thread's input items
    T (&output)[ITEMS_PER_THREAD],  ///< [out] Calling thread's scan outputs
    T initial_value,                ///< [in]  Seed value for the scan
    ScanOp scan_op,                 ///< [in]  Binary scan functor
    T &block_aggregate)             ///< [out] Block-wide aggregate reduction of all inputs
{
    // Step 1: reduce this thread's consecutive items in registers
    T running_prefix = internal::ThreadReduce(input, scan_op);

    // Step 2: exclusive block-wide scan of the per-thread partials
    ExclusiveScan(running_prefix, running_prefix, initial_value, scan_op, block_aggregate);

    // Step 3: exclusive scan in registers, seeded with this thread's prefix
    internal::ThreadScanExclusive(input, output, scan_op, running_prefix);
}
1007
1008
/// Exclusive block-wide prefix scan over ITEMS_PER_THREAD items per thread,
/// with a callback functor supplying the running block-wide prefix across
/// successive tiles.
template <
    int ITEMS_PER_THREAD,
    typename ScanOp,
    typename BlockPrefixCallbackOp>
__device__ __forceinline__ void ExclusiveScan(
    T (&input)[ITEMS_PER_THREAD],                   ///< [in]     Calling thread's input items
    T (&output)[ITEMS_PER_THREAD],                  ///< [out]    Calling thread's scan outputs
    ScanOp scan_op,                                 ///< [in]     Binary scan functor
    BlockPrefixCallbackOp &block_prefix_callback_op)///< [in,out] Functor supplying the block-wide prefix
{
    // Step 1: reduce this thread's consecutive items in registers
    T running_prefix = internal::ThreadReduce(input, scan_op);

    // Step 2: exclusive block-wide scan of the partials, seeded by the callback
    ExclusiveScan(running_prefix, running_prefix, scan_op, block_prefix_callback_op);

    // Step 3: exclusive scan in registers, seeded with this thread's prefix
    internal::ThreadScanExclusive(input, output, scan_op, running_prefix);
}
1114
1115
1117#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document no-initial-value scans
1118
1119 /******************************************************************/
1123
1124
/// Exclusive block-wide prefix scan of one item per thread with NO initial
/// value (internal, undocumented variant — see DOXYGEN_SHOULD_SKIP_THIS).
template <typename ScanOp>
__device__ __forceinline__ void ExclusiveScan(
    T input,        ///< [in]  Calling thread's input item
    T &output,      ///< [out] Calling thread's scan output
    ScanOp scan_op) ///< [in]  Binary scan functor
{
    // Delegate to the algorithm selected at compile time
    InternalBlockScan scanner(temp_storage);
    scanner.ExclusiveScan(input, output, scan_op);
}
1143
1144
/// Exclusive block-wide prefix scan of one item per thread with NO initial
/// value, additionally producing the block-wide aggregate (internal variant).
template <typename ScanOp>
__device__ __forceinline__ void ExclusiveScan(
    T input,            ///< [in]  Calling thread's input item
    T &output,          ///< [out] Calling thread's scan output
    ScanOp scan_op,     ///< [in]  Binary scan functor
    T &block_aggregate) ///< [out] Block-wide aggregate reduction of all inputs
{
    // Delegate to the algorithm selected at compile time
    InternalBlockScan scanner(temp_storage);
    scanner.ExclusiveScan(input, output, scan_op, block_aggregate);
}
1164
1166 /******************************************************************/
1170
1171
/// Exclusive block-wide prefix scan over ITEMS_PER_THREAD items per thread
/// with NO initial value (internal variant).  Because there is no seed,
/// thread 0 must not apply its (undefined) prefix — hence the
/// (linear_tid != 0) flag below.
template <
    int ITEMS_PER_THREAD,
    typename ScanOp>
__device__ __forceinline__ void ExclusiveScan(
    T (&input)[ITEMS_PER_THREAD],   ///< [in]  Calling thread's input items
    T (&output)[ITEMS_PER_THREAD],  ///< [out] Calling thread's scan outputs
    ScanOp scan_op)                 ///< [in]  Binary scan functor
{
    // Step 1: reduce this thread's consecutive items in registers
    T partial = internal::ThreadReduce(input, scan_op);

    // Step 2: exclusive block-wide scan of the per-thread partials (no seed)
    ExclusiveScan(partial, partial, scan_op);

    // Step 3: exclusive scan in registers; only non-zero threads seed with prefix
    internal::ThreadScanExclusive(input, output, scan_op, partial, (linear_tid != 0));
}
1201
1202
/// Exclusive block-wide prefix scan over ITEMS_PER_THREAD items per thread
/// with NO initial value, additionally producing the block-wide aggregate
/// (internal variant).  Thread 0 does not apply its undefined prefix.
template <
    int ITEMS_PER_THREAD,
    typename ScanOp>
__device__ __forceinline__ void ExclusiveScan(
    T (&input)[ITEMS_PER_THREAD],   ///< [in]  Calling thread's input items
    T (&output)[ITEMS_PER_THREAD],  ///< [out] Calling thread's scan outputs
    ScanOp scan_op,                 ///< [in]  Binary scan functor
    T &block_aggregate)             ///< [out] Block-wide aggregate reduction of all inputs
{
    // Step 1: reduce this thread's consecutive items in registers
    T partial = internal::ThreadReduce(input, scan_op);

    // Step 2: exclusive block-wide scan of the per-thread partials (no seed)
    ExclusiveScan(partial, partial, scan_op, block_aggregate);

    // Step 3: exclusive scan in registers; only non-zero threads seed with prefix
    internal::ThreadScanExclusive(input, output, scan_op, partial, (linear_tid != 0));
}
1233
1234
1236#endif // DOXYGEN_SHOULD_SKIP_THIS // Do not document no-initial-value scans
1237
1238 /******************************************************************/
1242
1243
/// Computes an inclusive block-wide prefix sum over one item per thread,
/// using addition (+) as the scan operator.
__device__ __forceinline__ void InclusiveSum(
    T input,    ///< [in]  Calling thread's input item
    T &output)  ///< [out] Calling thread's scan output
{
    // An inclusive sum is an inclusive scan with operator +
    InclusiveScan(input, output, cub::Sum());
}
1285
1286
/// Computes an inclusive block-wide prefix sum over one item per thread,
/// additionally producing the block-wide aggregate of all inputs.
__device__ __forceinline__ void InclusiveSum(
    T input,            ///< [in]  Calling thread's input item
    T &output,          ///< [out] Calling thread's scan output
    T &block_aggregate) ///< [out] Block-wide aggregate reduction of all inputs
{
    InclusiveScan(input, output, cub::Sum(), block_aggregate);
}
1331
1332
1333
/// Inclusive block-wide prefix sum where a callback functor supplies the
/// running block-wide prefix across successive tiles.
template <typename BlockPrefixCallbackOp>
__device__ __forceinline__ void InclusiveSum(
    T input,                                        ///< [in]     Calling thread's input item
    T &output,                                      ///< [out]    Calling thread's scan output
    BlockPrefixCallbackOp &block_prefix_callback_op)///< [in,out] Functor supplying the block-wide prefix
{
    InclusiveScan(input, output, cub::Sum(), block_prefix_callback_op);
}
1415
1416
1418 /******************************************************************/
1422
1423
/// Computes an inclusive block-wide prefix sum over ITEMS_PER_THREAD
/// consecutive items per thread.  The single-item case short-circuits to the
/// scalar overload; otherwise: per-thread reduce, block-wide exclusive sum
/// of partials, then a per-thread inclusive scan seeded by the prefix
/// (thread 0 has no predecessor, so it does not apply its prefix).
template <int ITEMS_PER_THREAD>
__device__ __forceinline__ void InclusiveSum(
    T (&input)[ITEMS_PER_THREAD],   ///< [in]  Calling thread's input items
    T (&output)[ITEMS_PER_THREAD])  ///< [out] Calling thread's scan outputs
{
    if (ITEMS_PER_THREAD == 1)
    {
        // Degenerate single-item case: fall back to the scalar overload
        InclusiveSum(input[0], output[0]);
    }
    else
    {
        // Step 1: reduce this thread's consecutive items in registers
        Sum sum_op;
        T partial = internal::ThreadReduce(input, sum_op);

        // Step 2: exclusive block-wide sum of the per-thread partials
        ExclusiveSum(partial, partial);

        // Step 3: inclusive scan in registers; only non-zero threads apply prefix
        internal::ThreadScanInclusive(input, output, sum_op, partial, (linear_tid != 0));
    }
}
1484
1485
/// Computes an inclusive block-wide prefix sum over ITEMS_PER_THREAD items
/// per thread, additionally producing the block-wide aggregate of all
/// inputs.  Thread 0 does not apply its (empty) prefix.
template <int ITEMS_PER_THREAD>
__device__ __forceinline__ void InclusiveSum(
    T (&input)[ITEMS_PER_THREAD],   ///< [in]  Calling thread's input items
    T (&output)[ITEMS_PER_THREAD],  ///< [out] Calling thread's scan outputs
    T &block_aggregate)             ///< [out] Block-wide aggregate reduction of all inputs
{
    if (ITEMS_PER_THREAD == 1)
    {
        // Degenerate single-item case: fall back to the scalar overload
        InclusiveSum(input[0], output[0], block_aggregate);
    }
    else
    {
        // Step 1: reduce this thread's consecutive items in registers
        Sum sum_op;
        T partial = internal::ThreadReduce(input, sum_op);

        // Step 2: exclusive block-wide sum of the per-thread partials
        ExclusiveSum(partial, partial, block_aggregate);

        // Step 3: inclusive scan in registers; only non-zero threads apply prefix
        internal::ThreadScanInclusive(input, output, sum_op, partial, (linear_tid != 0));
    }
}
1552
1553
/// Inclusive block-wide prefix sum over ITEMS_PER_THREAD items per thread,
/// with a callback functor supplying the running block-wide prefix.  Unlike
/// the non-callback overloads, EVERY thread (including thread 0) applies its
/// prefix, since the callback seeds thread 0 as well.
template <
    int ITEMS_PER_THREAD,
    typename BlockPrefixCallbackOp>
__device__ __forceinline__ void InclusiveSum(
    T (&input)[ITEMS_PER_THREAD],                   ///< [in]     Calling thread's input items
    T (&output)[ITEMS_PER_THREAD],                  ///< [out]    Calling thread's scan outputs
    BlockPrefixCallbackOp &block_prefix_callback_op)///< [in,out] Functor supplying the block-wide prefix
{
    if (ITEMS_PER_THREAD == 1)
    {
        // Degenerate single-item case: fall back to the scalar overload
        InclusiveSum(input[0], output[0], block_prefix_callback_op);
    }
    else
    {
        // Step 1: reduce this thread's consecutive items in registers
        Sum sum_op;
        T partial = internal::ThreadReduce(input, sum_op);

        // Step 2: exclusive block-wide sum of the partials, seeded by the callback
        ExclusiveSum(partial, partial, block_prefix_callback_op);

        // Step 3: inclusive scan in registers; prefix applied in all threads
        internal::ThreadScanInclusive(input, output, sum_op, partial);
    }
}
1664
1665
1667 /******************************************************************/
1671
1672
/// Computes an inclusive block-wide prefix scan of one item per thread using
/// the supplied binary scan functor.
template <typename ScanOp>
__device__ __forceinline__ void InclusiveScan(
    T input,        ///< [in]  Calling thread's input item
    T &output,      ///< [out] Calling thread's scan output
    ScanOp scan_op) ///< [in]  Binary scan functor
{
    // Delegate to the algorithm selected at compile time (warp-scans or raking)
    InternalBlockScan scanner(temp_storage);
    scanner.InclusiveScan(input, output, scan_op);
}
1718
1719
/// Computes an inclusive block-wide prefix scan of one item per thread,
/// additionally producing the block-wide aggregate of all inputs.
template <typename ScanOp>
__device__ __forceinline__ void InclusiveScan(
    T input,            ///< [in]  Calling thread's input item
    T &output,          ///< [out] Calling thread's scan output
    ScanOp scan_op,     ///< [in]  Binary scan functor
    T &block_aggregate) ///< [out] Block-wide aggregate reduction of all inputs
{
    // Delegate to the algorithm selected at compile time
    InternalBlockScan scanner(temp_storage);
    scanner.InclusiveScan(input, output, scan_op, block_aggregate);
}
1768
1769
/// Inclusive block-wide prefix scan of one item per thread where a callback
/// functor supplies the running block-wide prefix across successive tiles.
template <
    typename ScanOp,
    typename BlockPrefixCallbackOp>
__device__ __forceinline__ void InclusiveScan(
    T input,                                        ///< [in]     Calling thread's input item
    T &output,                                      ///< [out]    Calling thread's scan output
    ScanOp scan_op,                                 ///< [in]     Binary scan functor
    BlockPrefixCallbackOp &block_prefix_callback_op)///< [in,out] Functor supplying the block-wide prefix
{
    // Delegate to the algorithm selected at compile time
    InternalBlockScan scanner(temp_storage);
    scanner.InclusiveScan(input, output, scan_op, block_prefix_callback_op);
}
1856
1857
1859 /******************************************************************/
1863
1864
/// Computes an inclusive block-wide prefix scan over ITEMS_PER_THREAD
/// consecutive items per thread.  Single-item case short-circuits to the
/// scalar overload; otherwise: per-thread reduce, block-wide exclusive scan
/// (no initial value) of the partials, then a per-thread inclusive scan
/// where thread 0 does not apply its undefined prefix.
template <
    int ITEMS_PER_THREAD,
    typename ScanOp>
__device__ __forceinline__ void InclusiveScan(
    T (&input)[ITEMS_PER_THREAD],   ///< [in]  Calling thread's input items
    T (&output)[ITEMS_PER_THREAD],  ///< [out] Calling thread's scan outputs
    ScanOp scan_op)                 ///< [in]  Binary scan functor
{
    if (ITEMS_PER_THREAD == 1)
    {
        // Degenerate single-item case: fall back to the scalar overload
        InclusiveScan(input[0], output[0], scan_op);
    }
    else
    {
        // Step 1: reduce this thread's consecutive items in registers
        T partial = internal::ThreadReduce(input, scan_op);

        // Step 2: exclusive block-wide scan of the per-thread partials (no seed)
        ExclusiveScan(partial, partial, scan_op);

        // Step 3: inclusive scan in registers; thread 0 does not seed
        internal::ThreadScanInclusive(input, output, scan_op, partial, (linear_tid != 0));
    }
}
1929
1930
/// Computes an inclusive block-wide prefix scan over ITEMS_PER_THREAD items
/// per thread, additionally producing the block-wide aggregate of all
/// inputs.  Thread 0 does not apply its undefined prefix.
template <
    int ITEMS_PER_THREAD,
    typename ScanOp>
__device__ __forceinline__ void InclusiveScan(
    T (&input)[ITEMS_PER_THREAD],   ///< [in]  Calling thread's input items
    T (&output)[ITEMS_PER_THREAD],  ///< [out] Calling thread's scan outputs
    ScanOp scan_op,                 ///< [in]  Binary scan functor
    T &block_aggregate)             ///< [out] Block-wide aggregate reduction of all inputs
{
    if (ITEMS_PER_THREAD == 1)
    {
        // Degenerate single-item case: fall back to the scalar overload
        InclusiveScan(input[0], output[0], scan_op, block_aggregate);
    }
    else
    {
        // Step 1: reduce this thread's consecutive items in registers
        T partial = internal::ThreadReduce(input, scan_op);

        // Step 2: exclusive block-wide scan of the partials (no initial value)
        ExclusiveScan(partial, partial, scan_op, block_aggregate);

        // Step 3: inclusive scan in registers; thread 0 does not seed
        internal::ThreadScanInclusive(input, output, scan_op, partial, (linear_tid != 0));
    }
}
2000
2001
/// Inclusive block-wide prefix scan over ITEMS_PER_THREAD items per thread,
/// with a callback functor supplying the running block-wide prefix.  Unlike
/// the non-callback overloads, EVERY thread (including thread 0) applies its
/// prefix, since the callback seeds thread 0 as well.
template <
    int ITEMS_PER_THREAD,
    typename ScanOp,
    typename BlockPrefixCallbackOp>
__device__ __forceinline__ void InclusiveScan(
    T (&input)[ITEMS_PER_THREAD],                   ///< [in]     Calling thread's input items
    T (&output)[ITEMS_PER_THREAD],                  ///< [out]    Calling thread's scan outputs
    ScanOp scan_op,                                 ///< [in]     Binary scan functor
    BlockPrefixCallbackOp &block_prefix_callback_op)///< [in,out] Functor supplying the block-wide prefix
{
    if (ITEMS_PER_THREAD == 1)
    {
        // Degenerate single-item case: fall back to the scalar overload
        InclusiveScan(input[0], output[0], scan_op, block_prefix_callback_op);
    }
    else
    {
        // Step 1: reduce this thread's consecutive items in registers
        T partial = internal::ThreadReduce(input, scan_op);

        // Step 2: exclusive block-wide scan of the partials, seeded by the callback
        ExclusiveScan(partial, partial, scan_op, block_prefix_callback_op);

        // Step 3: inclusive scan in registers; prefix applied in all threads
        internal::ThreadScanInclusive(input, output, scan_op, partial);
    }
}
2114
2116
2117
2118};
2119
2124} // CUB namespace
2125CUB_NS_POSTFIX // Optional outer namespace(s)
2126
BlockRadixRank provides operations for ranking unsigned integer types within a CUDA thread block.
The BlockScan class provides collective methods for computing a parallel prefix sum/scan of items par...
__device__ __forceinline__ void ExclusiveScan(T(&input)[ITEMS_PER_THREAD], T(&output)[ITEMS_PER_THREAD], T initial_value, ScanOp scan_op)
Computes an exclusive block-wide prefix scan using the specified binary scan_op functor....
__device__ __forceinline__ void InclusiveScan(T input, T &output, ScanOp scan_op, T &block_aggregate)
Computes an inclusive block-wide prefix scan using the specified binary scan_op functor....
__device__ __forceinline__ _TempStorage & PrivateStorage()
Internal storage allocator.
InternalBlockScan::TempStorage _TempStorage
Shared memory storage layout type for BlockScan.
__device__ __forceinline__ void InclusiveSum(T input, T &output, BlockPrefixCallbackOp &block_prefix_callback_op)
Computes an inclusive block-wide prefix scan using addition (+) as the scan operator....
__device__ __forceinline__ void ExclusiveSum(T(&input)[ITEMS_PER_THREAD], T(&output)[ITEMS_PER_THREAD], T &block_aggregate)
Computes an exclusive block-wide prefix scan using addition (+) as the scan operator....
__device__ __forceinline__ void ExclusiveSum(T input, T &output)
Computes an exclusive block-wide prefix scan using addition (+) as the scan operator....
__device__ __forceinline__ void ExclusiveSum(T(&input)[ITEMS_PER_THREAD], T(&output)[ITEMS_PER_THREAD])
Computes an exclusive block-wide prefix scan using addition (+) as the scan operator....
__device__ __forceinline__ void InclusiveScan(T(&input)[ITEMS_PER_THREAD], T(&output)[ITEMS_PER_THREAD], ScanOp scan_op, T &block_aggregate)
Computes an inclusive block-wide prefix scan using the specified binary scan_op functor....
__device__ __forceinline__ void InclusiveScan(T(&input)[ITEMS_PER_THREAD], T(&output)[ITEMS_PER_THREAD], ScanOp scan_op)
Computes an inclusive block-wide prefix scan using the specified binary scan_op functor....
static const BlockScanAlgorithm SAFE_ALGORITHM
unsigned int linear_tid
Linear thread-id.
__device__ __forceinline__ BlockScan()
Collective constructor using a private static allocation of shared memory as temporary storage.
__device__ __forceinline__ void ExclusiveScan(T(&input)[ITEMS_PER_THREAD], T(&output)[ITEMS_PER_THREAD], ScanOp scan_op, BlockPrefixCallbackOp &block_prefix_callback_op)
Computes an exclusive block-wide prefix scan using the specified binary scan_op functor....
__device__ __forceinline__ void ExclusiveSum(T input, T &output, T &block_aggregate)
Computes an exclusive block-wide prefix scan using addition (+) as the scan operator....
__device__ __forceinline__ void InclusiveScan(T input, T &output, ScanOp scan_op, BlockPrefixCallbackOp &block_prefix_callback_op)
Computes an inclusive block-wide prefix scan using the specified binary scan_op functor....
__device__ __forceinline__ void InclusiveSum(T(&input)[ITEMS_PER_THREAD], T(&output)[ITEMS_PER_THREAD])
Computes an inclusive block-wide prefix scan using addition (+) as the scan operator....
__device__ __forceinline__ void ExclusiveSum(T input, T &output, BlockPrefixCallbackOp &block_prefix_callback_op)
Computes an exclusive block-wide prefix scan using addition (+) as the scan operator....
__device__ __forceinline__ void InclusiveSum(T(&input)[ITEMS_PER_THREAD], T(&output)[ITEMS_PER_THREAD], BlockPrefixCallbackOp &block_prefix_callback_op)
Computes an inclusive block-wide prefix scan using addition (+) as the scan operator....
__device__ __forceinline__ void ExclusiveSum(T(&input)[ITEMS_PER_THREAD], T(&output)[ITEMS_PER_THREAD], BlockPrefixCallbackOp &block_prefix_callback_op)
Computes an exclusive block-wide prefix scan using addition (+) as the scan operator....
__device__ __forceinline__ void ExclusiveScan(T input, T &output, ScanOp scan_op, BlockPrefixCallbackOp &block_prefix_callback_op)
Computes an exclusive block-wide prefix scan using the specified binary scan_op functor....
__device__ __forceinline__ void ExclusiveScan(T input, T &output, T initial_value, ScanOp scan_op)
Computes an exclusive block-wide prefix scan using the specified binary scan_op functor....
__device__ __forceinline__ void InclusiveSum(T input, T &output, T &block_aggregate)
Computes an inclusive block-wide prefix scan using addition (+) as the scan operator....
_TempStorage & temp_storage
Shared storage reference.
__device__ __forceinline__ BlockScan(TempStorage &temp_storage)
Collective constructor using the specified memory allocation as temporary storage.
__device__ __forceinline__ void InclusiveSum(T input, T &output)
Computes an inclusive block-wide prefix scan using addition (+) as the scan operator....
__device__ __forceinline__ void ExclusiveScan(T input, T &output, T initial_value, ScanOp scan_op, T &block_aggregate)
Computes an exclusive block-wide prefix scan using the specified binary scan_op functor....
@ BLOCK_THREADS
The thread block size in threads.
__device__ __forceinline__ void ExclusiveScan(T input, T &output, ScanOp scan_op)
Computes an exclusive block-wide prefix scan using the specified binary scan_op functor....
__device__ __forceinline__ void InclusiveSum(T(&input)[ITEMS_PER_THREAD], T(&output)[ITEMS_PER_THREAD], T &block_aggregate)
Computes an inclusive block-wide prefix scan using addition (+) as the scan operator....
__device__ __forceinline__ void ExclusiveScan(T input, T &output, ScanOp scan_op, T &block_aggregate)
Computes an exclusive block-wide prefix scan using the specified binary scan_op functor....
__device__ __forceinline__ void InclusiveScan(T(&input)[ITEMS_PER_THREAD], T(&output)[ITEMS_PER_THREAD], ScanOp scan_op, BlockPrefixCallbackOp &block_prefix_callback_op)
Computes an inclusive block-wide prefix scan using the specified binary scan_op functor....
If<(SAFE_ALGORITHM==BLOCK_SCAN_WARP_SCANS), WarpScans, Raking >::Type InternalBlockScan
Define the delegate type for the desired algorithm.
__device__ __forceinline__ void ExclusiveScan(T(&input)[ITEMS_PER_THREAD], T(&output)[ITEMS_PER_THREAD], ScanOp scan_op, T &block_aggregate)
Computes an exclusive block-wide prefix scan using the specified binary scan_op functor....
__device__ __forceinline__ void ExclusiveScan(T(&input)[ITEMS_PER_THREAD], T(&output)[ITEMS_PER_THREAD], T initial_value, ScanOp scan_op, T &block_aggregate)
Computes an exclusive block-wide prefix scan using the specified binary scan_op functor....
__device__ __forceinline__ void InclusiveScan(T input, T &output, ScanOp scan_op)
Computes an inclusive block-wide prefix scan using the specified binary scan_op functor....
__device__ __forceinline__ void ExclusiveScan(T(&input)[ITEMS_PER_THREAD], T(&output)[ITEMS_PER_THREAD], ScanOp scan_op)
Computes an exclusive block-wide prefix scan using the specified binary scan_op functor....
__device__ __forceinline__ T ThreadScanExclusive(T inclusive, T exclusive, T *input, T *output, ScanOp scan_op, Int2Type< LENGTH >)
__device__ __forceinline__ T ThreadScanInclusive(T inclusive, T *input, T *output, ScanOp scan_op, Int2Type< LENGTH >)
__device__ __forceinline__ int RowMajorTid(int block_dim_x, int block_dim_y, int block_dim_z)
Returns the row-major linear thread identifier for a multidimensional thread block.
Definition util_ptx.cuh:409
__device__ __forceinline__ T ThreadReduce(T *input, ReductionOp reduction_op, T prefix, Int2Type< LENGTH >)
Optional outer namespace(s)
BlockScanAlgorithm
BlockScanAlgorithm enumerates alternative algorithms for cub::BlockScan to compute a parallel prefix ...
@ BLOCK_SCAN_RAKING
@ BLOCK_SCAN_WARP_SCANS
@ BLOCK_SCAN_RAKING_MEMOIZE
OutputIteratorT ScanTileStateT int ScanOpT scan_op
Binary scan functor.
Alias wrapper allowing storage to be unioned.
BlockScanRaking provides variants of raking-based parallel prefix scan across a CUDA thread block.
BlockScanWarpScans provides warpscan-based variants of parallel prefix scan across a CUDA thread bloc...
\smemstorage{BlockScan}
Type selection (IF ? ThenType : ElseType)
Definition util_type.cuh:73
Default sum functor.
A storage-backing wrapper that allows types with non-trivial constructors to be aliased in unions.
#define CUB_PTX_ARCH
CUB_PTX_ARCH reflects the PTX version targeted by the active compiler pass (or zero during the host p...
Definition util_arch.cuh:53