OpenFPM_pdata  4.1.0
Project that contain the implementation of distributed structures
block_scan.cuh
Go to the documentation of this file.
1 /******************************************************************************
2  * Copyright (c) 2011, Duane Merrill. All rights reserved.
3  * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  * * Redistributions of source code must retain the above copyright
8  * notice, this list of conditions and the following disclaimer.
9  * * Redistributions in binary form must reproduce the above copyright
10  * notice, this list of conditions and the following disclaimer in the
11  * documentation and/or other materials provided with the distribution.
12  * * Neither the name of the NVIDIA CORPORATION nor the
13  * names of its contributors may be used to endorse or promote products
14  * derived from this software without specific prior written permission.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  *
27  ******************************************************************************/
28 
34 #pragma once
35 
38 #include "../util_arch.cuh"
39 #include "../util_type.cuh"
40 #include "../util_ptx.cuh"
41 #include "../util_namespace.cuh"
42 
44 CUB_NS_PREFIX
45 
47 namespace cub {
48 
49 
50 /******************************************************************************
51  * Algorithmic variants
52  ******************************************************************************/
53 
58 {
59 
79 
80 
89 
90 
109 };
110 
111 
112 /******************************************************************************
113  * Block scan
114  ******************************************************************************/
115 
186 template <
187  typename T,
188  int BLOCK_DIM_X,
190  int BLOCK_DIM_Y = 1,
191  int BLOCK_DIM_Z = 1,
192  int PTX_ARCH = CUB_PTX_ARCH>
194 {
195 private:
196 
197  /******************************************************************************
198  * Constants and type definitions
199  ******************************************************************************/
200 
202  enum
203  {
205  BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
206  };
207 
215  ((ALGORITHM == BLOCK_SCAN_WARP_SCANS) && (BLOCK_THREADS % CUB_WARP_THREADS(PTX_ARCH) != 0)) ?
217  ALGORITHM;
218 
220  typedef BlockScanRaking<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, (SAFE_ALGORITHM == BLOCK_SCAN_RAKING_MEMOIZE), PTX_ARCH> Raking;
221 
223  typedef typename If<(SAFE_ALGORITHM == BLOCK_SCAN_WARP_SCANS),
224  WarpScans,
226 
228  typedef typename InternalBlockScan::TempStorage _TempStorage;
229 
230 
231  /******************************************************************************
232  * Thread fields
233  ******************************************************************************/
234 
237 
239  unsigned int linear_tid;
240 
241 
242  /******************************************************************************
243  * Utility methods
244  ******************************************************************************/
245 
247  __device__ __forceinline__ _TempStorage& PrivateStorage()
248  {
249  __shared__ _TempStorage private_storage;
250  return private_storage;
251  }
252 
253 
254  /******************************************************************************
255  * Public types
256  ******************************************************************************/
257 public:
258 
260  struct TempStorage : Uninitialized<_TempStorage> {};
261 
262 
263  /******************************************************************/
267 
271  __device__ __forceinline__ BlockScan()
272  :
274  linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
275  {}
276 
277 
281  __device__ __forceinline__ BlockScan(
283  :
284  temp_storage(temp_storage.Alias()),
285  linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
286  {}
287 
288 
289 
291  /******************************************************************/
295 
296 
333  __device__ __forceinline__ void ExclusiveSum(
334  T input,
335  T &output)
336  {
337  T initial_value = 0;
338  ExclusiveScan(input, output, initial_value, cub::Sum());
339  }
340 
341 
380  __device__ __forceinline__ void ExclusiveSum(
381  T input,
382  T &output,
383  T &block_aggregate)
384  {
385  T initial_value = 0;
386  ExclusiveScan(input, output, initial_value, cub::Sum(), block_aggregate);
387  }
388 
389 
464  template <typename BlockPrefixCallbackOp>
465  __device__ __forceinline__ void ExclusiveSum(
466  T input,
467  T &output,
468  BlockPrefixCallbackOp &block_prefix_callback_op)
469  {
470  ExclusiveScan(input, output, cub::Sum(), block_prefix_callback_op);
471  }
472 
473 
475  /******************************************************************/
479 
480 
520  template <int ITEMS_PER_THREAD>
521  __device__ __forceinline__ void ExclusiveSum(
522  T (&input)[ITEMS_PER_THREAD],
523  T (&output)[ITEMS_PER_THREAD])
524  {
525  T initial_value = 0;
526  ExclusiveScan(input, output, initial_value, cub::Sum());
527  }
528 
529 
571  template <int ITEMS_PER_THREAD>
572  __device__ __forceinline__ void ExclusiveSum(
573  T (&input)[ITEMS_PER_THREAD],
574  T (&output)[ITEMS_PER_THREAD],
575  T &block_aggregate)
576  {
577  // Reduce consecutive thread items in registers
578  T initial_value = 0;
579  ExclusiveScan(input, output, initial_value, cub::Sum(), block_aggregate);
580  }
581 
582 
670  template <
671  int ITEMS_PER_THREAD,
672  typename BlockPrefixCallbackOp>
673  __device__ __forceinline__ void ExclusiveSum(
674  T (&input)[ITEMS_PER_THREAD],
675  T (&output)[ITEMS_PER_THREAD],
676  BlockPrefixCallbackOp &block_prefix_callback_op)
677  {
678  ExclusiveScan(input, output, cub::Sum(), block_prefix_callback_op);
679  }
680 
681 
682 
684  /******************************************************************/
688 
689 
727  template <typename ScanOp>
728  __device__ __forceinline__ void ExclusiveScan(
729  T input,
730  T &output,
731  T initial_value,
732  ScanOp scan_op)
733  {
734  InternalBlockScan(temp_storage).ExclusiveScan(input, output, initial_value, scan_op);
735  }
736 
737 
777  template <typename ScanOp>
778  __device__ __forceinline__ void ExclusiveScan(
779  T input,
780  T &output,
781  T initial_value,
782  ScanOp scan_op,
783  T &block_aggregate)
784  {
785  InternalBlockScan(temp_storage).ExclusiveScan(input, output, initial_value, scan_op, block_aggregate);
786  }
787 
788 
864  template <
865  typename ScanOp,
866  typename BlockPrefixCallbackOp>
867  __device__ __forceinline__ void ExclusiveScan(
868  T input,
869  T &output,
870  ScanOp scan_op,
871  BlockPrefixCallbackOp &block_prefix_callback_op)
872  {
873  InternalBlockScan(temp_storage).ExclusiveScan(input, output, scan_op, block_prefix_callback_op);
874  }
875 
876 
878  /******************************************************************/
882 
883 
926  template <
927  int ITEMS_PER_THREAD,
928  typename ScanOp>
929  __device__ __forceinline__ void ExclusiveScan(
930  T (&input)[ITEMS_PER_THREAD],
931  T (&output)[ITEMS_PER_THREAD],
932  T initial_value,
933  ScanOp scan_op)
934  {
935  // Reduce consecutive thread items in registers
936  T thread_prefix = internal::ThreadReduce(input, scan_op);
937 
938  // Exclusive thread block-scan
939  ExclusiveScan(thread_prefix, thread_prefix, initial_value, scan_op);
940 
941  // Exclusive scan in registers with prefix as seed
942  internal::ThreadScanExclusive(input, output, scan_op, thread_prefix);
943  }
944 
945 
988  template <
989  int ITEMS_PER_THREAD,
990  typename ScanOp>
991  __device__ __forceinline__ void ExclusiveScan(
992  T (&input)[ITEMS_PER_THREAD],
993  T (&output)[ITEMS_PER_THREAD],
994  T initial_value,
995  ScanOp scan_op,
996  T &block_aggregate)
997  {
998  // Reduce consecutive thread items in registers
999  T thread_prefix = internal::ThreadReduce(input, scan_op);
1000 
1001  // Exclusive thread block-scan
1002  ExclusiveScan(thread_prefix, thread_prefix, initial_value, scan_op, block_aggregate);
1003 
1004  // Exclusive scan in registers with prefix as seed
1005  internal::ThreadScanExclusive(input, output, scan_op, thread_prefix);
1006  }
1007 
1008 
1095  template <
1096  int ITEMS_PER_THREAD,
1097  typename ScanOp,
1098  typename BlockPrefixCallbackOp>
1099  __device__ __forceinline__ void ExclusiveScan(
1100  T (&input)[ITEMS_PER_THREAD],
1101  T (&output)[ITEMS_PER_THREAD],
1102  ScanOp scan_op,
1103  BlockPrefixCallbackOp &block_prefix_callback_op)
1104  {
1105  // Reduce consecutive thread items in registers
1106  T thread_prefix = internal::ThreadReduce(input, scan_op);
1107 
1108  // Exclusive thread block-scan
1109  ExclusiveScan(thread_prefix, thread_prefix, scan_op, block_prefix_callback_op);
1110 
1111  // Exclusive scan in registers with prefix as seed
1112  internal::ThreadScanExclusive(input, output, scan_op, thread_prefix);
1113  }
1114 
1115 
1117 #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document no-initial-value scans
1118 
1119  /******************************************************************/
1123 
1124 
1135  template <typename ScanOp>
1136  __device__ __forceinline__ void ExclusiveScan(
1137  T input,
1138  T &output,
1139  ScanOp scan_op)
1140  {
1141  InternalBlockScan(temp_storage).ExclusiveScan(input, output, scan_op);
1142  }
1143 
1144 
1155  template <typename ScanOp>
1156  __device__ __forceinline__ void ExclusiveScan(
1157  T input,
1158  T &output,
1159  ScanOp scan_op,
1160  T &block_aggregate)
1161  {
1162  InternalBlockScan(temp_storage).ExclusiveScan(input, output, scan_op, block_aggregate);
1163  }
1164 
1166  /******************************************************************/
1170 
1171 
1184  template <
1185  int ITEMS_PER_THREAD,
1186  typename ScanOp>
1187  __device__ __forceinline__ void ExclusiveScan(
1188  T (&input)[ITEMS_PER_THREAD],
1189  T (&output)[ITEMS_PER_THREAD],
1190  ScanOp scan_op)
1191  {
1192  // Reduce consecutive thread items in registers
1193  T thread_partial = internal::ThreadReduce(input, scan_op);
1194 
1195  // Exclusive thread block-scan
1196  ExclusiveScan(thread_partial, thread_partial, scan_op);
1197 
1198  // Exclusive scan in registers with prefix
1199  internal::ThreadScanExclusive(input, output, scan_op, thread_partial, (linear_tid != 0));
1200  }
1201 
1202 
1215  template <
1216  int ITEMS_PER_THREAD,
1217  typename ScanOp>
1218  __device__ __forceinline__ void ExclusiveScan(
1219  T (&input)[ITEMS_PER_THREAD],
1220  T (&output)[ITEMS_PER_THREAD],
1221  ScanOp scan_op,
1222  T &block_aggregate)
1223  {
1224  // Reduce consecutive thread items in registers
1225  T thread_partial = internal::ThreadReduce(input, scan_op);
1226 
1227  // Exclusive thread block-scan
1228  ExclusiveScan(thread_partial, thread_partial, scan_op, block_aggregate);
1229 
1230  // Exclusive scan in registers with prefix
1231  internal::ThreadScanExclusive(input, output, scan_op, thread_partial, (linear_tid != 0));
1232  }
1233 
1234 
1236 #endif // DOXYGEN_SHOULD_SKIP_THIS // Do not document no-initial-value scans
1237 
1238  /******************************************************************/
1242 
1243 
1279  __device__ __forceinline__ void InclusiveSum(
1280  T input,
1281  T &output)
1282  {
1283  InclusiveScan(input, output, cub::Sum());
1284  }
1285 
1286 
1324  __device__ __forceinline__ void InclusiveSum(
1325  T input,
1326  T &output,
1327  T &block_aggregate)
1328  {
1329  InclusiveScan(input, output, cub::Sum(), block_aggregate);
1330  }
1331 
1332 
1333 
1407  template <typename BlockPrefixCallbackOp>
1408  __device__ __forceinline__ void InclusiveSum(
1409  T input,
1410  T &output,
1411  BlockPrefixCallbackOp &block_prefix_callback_op)
1412  {
1413  InclusiveScan(input, output, cub::Sum(), block_prefix_callback_op);
1414  }
1415 
1416 
1418  /******************************************************************/
1422 
1423 
1462  template <int ITEMS_PER_THREAD>
1463  __device__ __forceinline__ void InclusiveSum(
1464  T (&input)[ITEMS_PER_THREAD],
1465  T (&output)[ITEMS_PER_THREAD])
1466  {
1467  if (ITEMS_PER_THREAD == 1)
1468  {
1469  InclusiveSum(input[0], output[0]);
1470  }
1471  else
1472  {
1473  // Reduce consecutive thread items in registers
1474  Sum scan_op;
1475  T thread_prefix = internal::ThreadReduce(input, scan_op);
1476 
1477  // Exclusive thread block-scan
1478  ExclusiveSum(thread_prefix, thread_prefix);
1479 
1480  // Inclusive scan in registers with prefix as seed
1481  internal::ThreadScanInclusive(input, output, scan_op, thread_prefix, (linear_tid != 0));
1482  }
1483  }
1484 
1485 
1529  template <int ITEMS_PER_THREAD>
1530  __device__ __forceinline__ void InclusiveSum(
1531  T (&input)[ITEMS_PER_THREAD],
1532  T (&output)[ITEMS_PER_THREAD],
1533  T &block_aggregate)
1534  {
1535  if (ITEMS_PER_THREAD == 1)
1536  {
1537  InclusiveSum(input[0], output[0], block_aggregate);
1538  }
1539  else
1540  {
1541  // Reduce consecutive thread items in registers
1542  Sum scan_op;
1543  T thread_prefix = internal::ThreadReduce(input, scan_op);
1544 
1545  // Exclusive thread block-scan
1546  ExclusiveSum(thread_prefix, thread_prefix, block_aggregate);
1547 
1548  // Inclusive scan in registers with prefix as seed
1549  internal::ThreadScanInclusive(input, output, scan_op, thread_prefix, (linear_tid != 0));
1550  }
1551  }
1552 
1553 
1639  template <
1640  int ITEMS_PER_THREAD,
1641  typename BlockPrefixCallbackOp>
1642  __device__ __forceinline__ void InclusiveSum(
1643  T (&input)[ITEMS_PER_THREAD],
1644  T (&output)[ITEMS_PER_THREAD],
1645  BlockPrefixCallbackOp &block_prefix_callback_op)
1646  {
1647  if (ITEMS_PER_THREAD == 1)
1648  {
1649  InclusiveSum(input[0], output[0], block_prefix_callback_op);
1650  }
1651  else
1652  {
1653  // Reduce consecutive thread items in registers
1654  Sum scan_op;
1655  T thread_prefix = internal::ThreadReduce(input, scan_op);
1656 
1657  // Exclusive thread block-scan
1658  ExclusiveSum(thread_prefix, thread_prefix, block_prefix_callback_op);
1659 
1660  // Inclusive scan in registers with prefix as seed
1661  internal::ThreadScanInclusive(input, output, scan_op, thread_prefix);
1662  }
1663  }
1664 
1665 
1667  /******************************************************************/
1671 
1672 
1710  template <typename ScanOp>
1711  __device__ __forceinline__ void InclusiveScan(
1712  T input,
1713  T &output,
1714  ScanOp scan_op)
1715  {
1716  InternalBlockScan(temp_storage).InclusiveScan(input, output, scan_op);
1717  }
1718 
1719 
1759  template <typename ScanOp>
1760  __device__ __forceinline__ void InclusiveScan(
1761  T input,
1762  T &output,
1763  ScanOp scan_op,
1764  T &block_aggregate)
1765  {
1766  InternalBlockScan(temp_storage).InclusiveScan(input, output, scan_op, block_aggregate);
1767  }
1768 
1769 
1845  template <
1846  typename ScanOp,
1847  typename BlockPrefixCallbackOp>
1848  __device__ __forceinline__ void InclusiveScan(
1849  T input,
1850  T &output,
1851  ScanOp scan_op,
1852  BlockPrefixCallbackOp &block_prefix_callback_op)
1853  {
1854  InternalBlockScan(temp_storage).InclusiveScan(input, output, scan_op, block_prefix_callback_op);
1855  }
1856 
1857 
1859  /******************************************************************/
1863 
1864 
1905  template <
1906  int ITEMS_PER_THREAD,
1907  typename ScanOp>
1908  __device__ __forceinline__ void InclusiveScan(
1909  T (&input)[ITEMS_PER_THREAD],
1910  T (&output)[ITEMS_PER_THREAD],
1911  ScanOp scan_op)
1912  {
1913  if (ITEMS_PER_THREAD == 1)
1914  {
1915  InclusiveScan(input[0], output[0], scan_op);
1916  }
1917  else
1918  {
1919  // Reduce consecutive thread items in registers
1920  T thread_prefix = internal::ThreadReduce(input, scan_op);
1921 
1922  // Exclusive thread block-scan
1923  ExclusiveScan(thread_prefix, thread_prefix, scan_op);
1924 
1925  // Inclusive scan in registers with prefix as seed (first thread does not seed)
1926  internal::ThreadScanInclusive(input, output, scan_op, thread_prefix, (linear_tid != 0));
1927  }
1928  }
1929 
1930 
1975  template <
1976  int ITEMS_PER_THREAD,
1977  typename ScanOp>
1978  __device__ __forceinline__ void InclusiveScan(
1979  T (&input)[ITEMS_PER_THREAD],
1980  T (&output)[ITEMS_PER_THREAD],
1981  ScanOp scan_op,
1982  T &block_aggregate)
1983  {
1984  if (ITEMS_PER_THREAD == 1)
1985  {
1986  InclusiveScan(input[0], output[0], scan_op, block_aggregate);
1987  }
1988  else
1989  {
1990  // Reduce consecutive thread items in registers
1991  T thread_prefix = internal::ThreadReduce(input, scan_op);
1992 
1993  // Exclusive thread block-scan (with no initial value)
1994  ExclusiveScan(thread_prefix, thread_prefix, scan_op, block_aggregate);
1995 
1996  // Inclusive scan in registers with prefix as seed (first thread does not seed)
1997  internal::ThreadScanInclusive(input, output, scan_op, thread_prefix, (linear_tid != 0));
1998  }
1999  }
2000 
2001 
2088  template <
2089  int ITEMS_PER_THREAD,
2090  typename ScanOp,
2091  typename BlockPrefixCallbackOp>
2092  __device__ __forceinline__ void InclusiveScan(
2093  T (&input)[ITEMS_PER_THREAD],
2094  T (&output)[ITEMS_PER_THREAD],
2095  ScanOp scan_op,
2096  BlockPrefixCallbackOp &block_prefix_callback_op)
2097  {
2098  if (ITEMS_PER_THREAD == 1)
2099  {
2100  InclusiveScan(input[0], output[0], scan_op, block_prefix_callback_op);
2101  }
2102  else
2103  {
2104  // Reduce consecutive thread items in registers
2105  T thread_prefix = internal::ThreadReduce(input, scan_op);
2106 
2107  // Exclusive thread block-scan
2108  ExclusiveScan(thread_prefix, thread_prefix, scan_op, block_prefix_callback_op);
2109 
2110  // Inclusive scan in registers with prefix as seed
2111  internal::ThreadScanInclusive(input, output, scan_op, thread_prefix);
2112  }
2113  }
2114 
2116 
2117 
2118 };
2119 
2124 } // CUB namespace
2125 CUB_NS_POSTFIX // Optional outer namespace(s)
2126 
__device__ __forceinline__ void ExclusiveSum(T(&input)[ITEMS_PER_THREAD], T(&output)[ITEMS_PER_THREAD], BlockPrefixCallbackOp &block_prefix_callback_op)
Computes an exclusive block-wide prefix scan using addition (+) as the scan operator....
Definition: block_scan.cuh:673
__device__ __forceinline__ void InclusiveScan(T(&input)[ITEMS_PER_THREAD], T(&output)[ITEMS_PER_THREAD], ScanOp scan_op, T &block_aggregate)
Computes an inclusive block-wide prefix scan using the specified binary scan_op functor....
__device__ __forceinline__ void ExclusiveScan(T(&input)[ITEMS_PER_THREAD], T(&output)[ITEMS_PER_THREAD], ScanOp scan_op, T &block_aggregate)
Computes an exclusive block-wide prefix scan using the specified binary scan_op functor....
__device__ __forceinline__ void InclusiveScan(T input, T &output, ScanOp scan_op, BlockPrefixCallbackOp &block_prefix_callback_op)
Computes an inclusive block-wide prefix scan using the specified binary scan_op functor....
__device__ __forceinline__ void ExclusiveSum(T(&input)[ITEMS_PER_THREAD], T(&output)[ITEMS_PER_THREAD], T &block_aggregate)
Computes an exclusive block-wide prefix scan using addition (+) as the scan operator....
Definition: block_scan.cuh:572
__device__ __forceinline__ void ExclusiveSum(T input, T &output, T &block_aggregate)
Computes an exclusive block-wide prefix scan using addition (+) as the scan operator....
Definition: block_scan.cuh:380
__device__ __forceinline__ T ThreadReduce(T *input, ReductionOp reduction_op, T prefix, Int2Type< LENGTH >)
Optional outer namespace(s)
__device__ __forceinline__ void ExclusiveScan(T input, T &output, ScanOp scan_op)
Computes an exclusive block-wide prefix scan using the specified binary scan_op functor....
__device__ __forceinline__ void ExclusiveScan(T input, T &output, ScanOp scan_op, T &block_aggregate)
Computes an exclusive block-wide prefix scan using the specified binary scan_op functor....
#define CUB_PTX_ARCH
CUB_PTX_ARCH reflects the PTX version targeted by the active compiler pass (or zero during the host p...
Definition: util_arch.cuh:53
__device__ __forceinline__ void ExclusiveScan(T(&input)[ITEMS_PER_THREAD], T(&output)[ITEMS_PER_THREAD], ScanOp scan_op)
Computes an exclusive block-wide prefix scan using the specified binary scan_op functor....
__device__ __forceinline__ _TempStorage & PrivateStorage()
Internal storage allocator.
Definition: block_scan.cuh:247
__device__ __forceinline__ void ExclusiveScan(T(&input)[ITEMS_PER_THREAD], T(&output)[ITEMS_PER_THREAD], T initial_value, ScanOp scan_op)
Computes an exclusive block-wide prefix scan using the specified binary scan_op functor....
Definition: block_scan.cuh:929
__device__ __forceinline__ void InclusiveSum(T(&input)[ITEMS_PER_THREAD], T(&output)[ITEMS_PER_THREAD], BlockPrefixCallbackOp &block_prefix_callback_op)
Computes an inclusive block-wide prefix scan using addition (+) as the scan operator....
__device__ __forceinline__ void ExclusiveScan(T input, T &output, ScanOp scan_op, BlockPrefixCallbackOp &block_prefix_callback_op)
Computes an exclusive block-wide prefix scan using the specified binary scan_op functor....
Definition: block_scan.cuh:867
BlockScanWarpScans provides warpscan-based variants of parallel prefix scan across a CUDA thread bloc...
__device__ __forceinline__ void InclusiveScan(T input, T &output, ScanOp scan_op)
Computes an inclusive block-wide prefix scan using the specified binary scan_op functor....
BlockRadixRank provides operations for ranking unsigned integer types within a CUDA thread block.
OutputIteratorT ScanTileStateT int ScanOpT scan_op
Binary scan functor.
InternalBlockScan::TempStorage _TempStorage
Shared memory storage layout type for BlockScan.
Definition: block_scan.cuh:228
__device__ __forceinline__ void InclusiveScan(T(&input)[ITEMS_PER_THREAD], T(&output)[ITEMS_PER_THREAD], ScanOp scan_op, BlockPrefixCallbackOp &block_prefix_callback_op)
Computes an inclusive block-wide prefix scan using the specified binary scan_op functor....
__device__ __forceinline__ void InclusiveSum(T(&input)[ITEMS_PER_THREAD], T(&output)[ITEMS_PER_THREAD])
Computes an inclusive block-wide prefix scan using addition (+) as the scan operator....
__device__ __forceinline__ T ThreadScanExclusive(T inclusive, T exclusive, T *input, T *output, ScanOp scan_op, Int2Type< LENGTH >)
Definition: thread_scan.cuh:63
unsigned int linear_tid
Linear thread-id.
Definition: block_scan.cuh:239
If<(SAFE_ALGORITHM==BLOCK_SCAN_WARP_SCANS), WarpScans, Raking >::Type InternalBlockScan
Define the delegate type for the desired algorithm.
Definition: block_scan.cuh:225
__device__ __forceinline__ int RowMajorTid(int block_dim_x, int block_dim_y, int block_dim_z)
Returns the row-major linear thread identifier for a multidimensional thread block.
Definition: util_ptx.cuh:409
__device__ __forceinline__ void InclusiveSum(T input, T &output, BlockPrefixCallbackOp &block_prefix_callback_op)
Computes an inclusive block-wide prefix scan using addition (+) as the scan operator....
A storage-backing wrapper that allows types with non-trivial constructors to be aliased in unions.
Definition: util_type.cuh:634
__device__ __forceinline__ BlockScan(TempStorage &temp_storage)
Collective constructor using the specified memory allocation as temporary storage.
Definition: block_scan.cuh:281
__device__ __forceinline__ void ExclusiveSum(T input, T &output, BlockPrefixCallbackOp &block_prefix_callback_op)
Computes an exclusive block-wide prefix scan using addition (+) as the scan operator....
Definition: block_scan.cuh:465
__device__ __forceinline__ void InclusiveScan(T(&input)[ITEMS_PER_THREAD], T(&output)[ITEMS_PER_THREAD], ScanOp scan_op)
Computes an inclusive block-wide prefix scan using the specified binary scan_op functor....
__device__ __forceinline__ void ExclusiveScan(T input, T &output, T initial_value, ScanOp scan_op)
Computes an exclusive block-wide prefix scan using the specified binary scan_op functor....
Definition: block_scan.cuh:728
__device__ __forceinline__ BlockScan()
Collective constructor using a private static allocation of shared memory as temporary storage.
Definition: block_scan.cuh:271
Type selection (IF ? ThenType : ElseType)
Definition: util_type.cuh:72
static const BlockScanAlgorithm SAFE_ALGORITHM
Definition: block_scan.cuh:214
__device__ __forceinline__ T ThreadScanInclusive(T inclusive, T *input, T *output, ScanOp scan_op, Int2Type< LENGTH >)
The BlockScan class provides collective methods for computing a parallel prefix sum/scan of items par...
Definition: block_scan.cuh:193
__device__ __forceinline__ void ExclusiveScan(T input, T &output, T initial_value, ScanOp scan_op, T &block_aggregate)
Computes an exclusive block-wide prefix scan using the specified binary scan_op functor....
Definition: block_scan.cuh:778
BlockScanRaking provides variants of raking-based parallel prefix scan across a CUDA thread block.
__device__ __forceinline__ void InclusiveScan(T input, T &output, ScanOp scan_op, T &block_aggregate)
Computes an inclusive block-wide prefix scan using the specified binary scan_op functor....
__device__ __forceinline__ void ExclusiveSum(T(&input)[ITEMS_PER_THREAD], T(&output)[ITEMS_PER_THREAD])
Computes an exclusive block-wide prefix scan using addition (+) as the scan operator....
Definition: block_scan.cuh:521
__device__ __forceinline__ void ExclusiveScan(T(&input)[ITEMS_PER_THREAD], T(&output)[ITEMS_PER_THREAD], ScanOp scan_op, BlockPrefixCallbackOp &block_prefix_callback_op)
Computes an exclusive block-wide prefix scan using the specified binary scan_op functor....
__device__ __forceinline__ void ExclusiveScan(T(&input)[ITEMS_PER_THREAD], T(&output)[ITEMS_PER_THREAD], T initial_value, ScanOp scan_op, T &block_aggregate)
Computes an exclusive block-wide prefix scan using the specified binary scan_op functor....
Definition: block_scan.cuh:991
__device__ __forceinline__ void InclusiveSum(T input, T &output, T &block_aggregate)
Computes an inclusive block-wide prefix scan using addition (+) as the scan operator....
The thread block size in threads.
Definition: block_scan.cuh:205
_TempStorage & temp_storage
Shared storage reference.
Definition: block_scan.cuh:236
Default sum functor.
__device__ __forceinline__ void InclusiveSum(T input, T &output)
Computes an inclusive block-wide prefix scan using addition (+) as the scan operator....
__device__ __forceinline__ void ExclusiveSum(T input, T &output)
Computes an exclusive block-wide prefix scan using addition (+) as the scan operator....
Definition: block_scan.cuh:333
__device__ __forceinline__ void InclusiveSum(T(&input)[ITEMS_PER_THREAD], T(&output)[ITEMS_PER_THREAD], T &block_aggregate)
Computes an inclusive block-wide prefix scan using addition (+) as the scan operator....
\smemstorage{BlockScan}
Definition: block_scan.cuh:260
BlockScanAlgorithm
BlockScanAlgorithm enumerates alternative algorithms for cub::BlockScan to compute a parallel prefix ...
Definition: block_scan.cuh:57