38#include "../util_arch.cuh"
39#include "../util_type.cuh"
40#include "../util_ptx.cuh"
41#include "../util_namespace.cuh"
250 return private_storage;
464 template <
typename BlockPrefixCallbackOp>
468 BlockPrefixCallbackOp &block_prefix_callback_op)
520 template <
int ITEMS_PER_THREAD>
522 T (&input)[ITEMS_PER_THREAD],
523 T (&output)[ITEMS_PER_THREAD])
571 template <
int ITEMS_PER_THREAD>
573 T (&input)[ITEMS_PER_THREAD],
574 T (&output)[ITEMS_PER_THREAD],
671 int ITEMS_PER_THREAD,
672 typename BlockPrefixCallbackOp>
674 T (&input)[ITEMS_PER_THREAD],
675 T (&output)[ITEMS_PER_THREAD],
676 BlockPrefixCallbackOp &block_prefix_callback_op)
727 template <
typename ScanOp>
777 template <
typename ScanOp>
866 typename BlockPrefixCallbackOp>
871 BlockPrefixCallbackOp &block_prefix_callback_op)
927 int ITEMS_PER_THREAD,
930 T (&input)[ITEMS_PER_THREAD],
931 T (&output)[ITEMS_PER_THREAD],
989 int ITEMS_PER_THREAD,
992 T (&input)[ITEMS_PER_THREAD],
993 T (&output)[ITEMS_PER_THREAD],
1096 int ITEMS_PER_THREAD,
1098 typename BlockPrefixCallbackOp>
1100 T (&input)[ITEMS_PER_THREAD],
1101 T (&output)[ITEMS_PER_THREAD],
1103 BlockPrefixCallbackOp &block_prefix_callback_op)
1117#ifndef DOXYGEN_SHOULD_SKIP_THIS
1135 template <
typename ScanOp>
1155 template <
typename ScanOp>
1185 int ITEMS_PER_THREAD,
1188 T (&input)[ITEMS_PER_THREAD],
1189 T (&output)[ITEMS_PER_THREAD],
1216 int ITEMS_PER_THREAD,
1219 T (&input)[ITEMS_PER_THREAD],
1220 T (&output)[ITEMS_PER_THREAD],
1407 template <
typename BlockPrefixCallbackOp>
1411 BlockPrefixCallbackOp &block_prefix_callback_op)
1462 template <
int ITEMS_PER_THREAD>
1464 T (&input)[ITEMS_PER_THREAD],
1465 T (&output)[ITEMS_PER_THREAD])
1467 if (ITEMS_PER_THREAD == 1)
1529 template <
int ITEMS_PER_THREAD>
1531 T (&input)[ITEMS_PER_THREAD],
1532 T (&output)[ITEMS_PER_THREAD],
1535 if (ITEMS_PER_THREAD == 1)
1546 ExclusiveSum(thread_prefix, thread_prefix, block_aggregate);
1640 int ITEMS_PER_THREAD,
1641 typename BlockPrefixCallbackOp>
1643 T (&input)[ITEMS_PER_THREAD],
1644 T (&output)[ITEMS_PER_THREAD],
1645 BlockPrefixCallbackOp &block_prefix_callback_op)
1647 if (ITEMS_PER_THREAD == 1)
1649 InclusiveSum(input[0], output[0], block_prefix_callback_op);
1658 ExclusiveSum(thread_prefix, thread_prefix, block_prefix_callback_op);
1710 template <
typename ScanOp>
1759 template <
typename ScanOp>
1847 typename BlockPrefixCallbackOp>
1852 BlockPrefixCallbackOp &block_prefix_callback_op)
1906 int ITEMS_PER_THREAD,
1909 T (&input)[ITEMS_PER_THREAD],
1910 T (&output)[ITEMS_PER_THREAD],
1913 if (ITEMS_PER_THREAD == 1)
1976 int ITEMS_PER_THREAD,
1979 T (&input)[ITEMS_PER_THREAD],
1980 T (&output)[ITEMS_PER_THREAD],
1984 if (ITEMS_PER_THREAD == 1)
2089 int ITEMS_PER_THREAD,
2091 typename BlockPrefixCallbackOp>
2093 T (&input)[ITEMS_PER_THREAD],
2094 T (&output)[ITEMS_PER_THREAD],
2096 BlockPrefixCallbackOp &block_prefix_callback_op)
2098 if (ITEMS_PER_THREAD == 1)
BlockRadixRank provides operations for ranking unsigned integer types within a CUDA thread block.
The BlockScan class provides collective methods for computing a parallel prefix sum/scan of items par...
__device__ __forceinline__ void ExclusiveScan(T(&input)[ITEMS_PER_THREAD], T(&output)[ITEMS_PER_THREAD], T initial_value, ScanOp scan_op)
Computes an exclusive block-wide prefix scan using the specified binary scan_op functor....
__device__ __forceinline__ void InclusiveScan(T input, T &output, ScanOp scan_op, T &block_aggregate)
Computes an inclusive block-wide prefix scan using the specified binary scan_op functor....
__device__ __forceinline__ _TempStorage & PrivateStorage()
Internal storage allocator.
InternalBlockScan::TempStorage _TempStorage
Shared memory storage layout type for BlockScan.
__device__ __forceinline__ void InclusiveSum(T input, T &output, BlockPrefixCallbackOp &block_prefix_callback_op)
Computes an inclusive block-wide prefix scan using addition (+) as the scan operator....
__device__ __forceinline__ void ExclusiveSum(T(&input)[ITEMS_PER_THREAD], T(&output)[ITEMS_PER_THREAD], T &block_aggregate)
Computes an exclusive block-wide prefix scan using addition (+) as the scan operator....
__device__ __forceinline__ void ExclusiveSum(T input, T &output)
Computes an exclusive block-wide prefix scan using addition (+) as the scan operator....
__device__ __forceinline__ void ExclusiveSum(T(&input)[ITEMS_PER_THREAD], T(&output)[ITEMS_PER_THREAD])
Computes an exclusive block-wide prefix scan using addition (+) as the scan operator....
__device__ __forceinline__ void InclusiveScan(T(&input)[ITEMS_PER_THREAD], T(&output)[ITEMS_PER_THREAD], ScanOp scan_op, T &block_aggregate)
Computes an inclusive block-wide prefix scan using the specified binary scan_op functor....
__device__ __forceinline__ void InclusiveScan(T(&input)[ITEMS_PER_THREAD], T(&output)[ITEMS_PER_THREAD], ScanOp scan_op)
Computes an inclusive block-wide prefix scan using the specified binary scan_op functor....
static const BlockScanAlgorithm SAFE_ALGORITHM
unsigned int linear_tid
Linear thread-id.
__device__ __forceinline__ BlockScan()
Collective constructor using a private static allocation of shared memory as temporary storage.
__device__ __forceinline__ void ExclusiveScan(T(&input)[ITEMS_PER_THREAD], T(&output)[ITEMS_PER_THREAD], ScanOp scan_op, BlockPrefixCallbackOp &block_prefix_callback_op)
Computes an exclusive block-wide prefix scan using the specified binary scan_op functor....
__device__ __forceinline__ void ExclusiveSum(T input, T &output, T &block_aggregate)
Computes an exclusive block-wide prefix scan using addition (+) as the scan operator....
__device__ __forceinline__ void InclusiveScan(T input, T &output, ScanOp scan_op, BlockPrefixCallbackOp &block_prefix_callback_op)
Computes an inclusive block-wide prefix scan using the specified binary scan_op functor....
__device__ __forceinline__ void InclusiveSum(T(&input)[ITEMS_PER_THREAD], T(&output)[ITEMS_PER_THREAD])
Computes an inclusive block-wide prefix scan using addition (+) as the scan operator....
__device__ __forceinline__ void ExclusiveSum(T input, T &output, BlockPrefixCallbackOp &block_prefix_callback_op)
Computes an exclusive block-wide prefix scan using addition (+) as the scan operator....
__device__ __forceinline__ void InclusiveSum(T(&input)[ITEMS_PER_THREAD], T(&output)[ITEMS_PER_THREAD], BlockPrefixCallbackOp &block_prefix_callback_op)
Computes an inclusive block-wide prefix scan using addition (+) as the scan operator....
__device__ __forceinline__ void ExclusiveSum(T(&input)[ITEMS_PER_THREAD], T(&output)[ITEMS_PER_THREAD], BlockPrefixCallbackOp &block_prefix_callback_op)
Computes an exclusive block-wide prefix scan using addition (+) as the scan operator....
__device__ __forceinline__ void ExclusiveScan(T input, T &output, ScanOp scan_op, BlockPrefixCallbackOp &block_prefix_callback_op)
Computes an exclusive block-wide prefix scan using the specified binary scan_op functor....
__device__ __forceinline__ void ExclusiveScan(T input, T &output, T initial_value, ScanOp scan_op)
Computes an exclusive block-wide prefix scan using the specified binary scan_op functor....
__device__ __forceinline__ void InclusiveSum(T input, T &output, T &block_aggregate)
Computes an inclusive block-wide prefix scan using addition (+) as the scan operator....
_TempStorage & temp_storage
Shared storage reference.
__device__ __forceinline__ BlockScan(TempStorage &temp_storage)
Collective constructor using the specified memory allocation as temporary storage.
__device__ __forceinline__ void InclusiveSum(T input, T &output)
Computes an inclusive block-wide prefix scan using addition (+) as the scan operator....
__device__ __forceinline__ void ExclusiveScan(T input, T &output, T initial_value, ScanOp scan_op, T &block_aggregate)
Computes an exclusive block-wide prefix scan using the specified binary scan_op functor....
@ BLOCK_THREADS
The thread block size in threads.
__device__ __forceinline__ void ExclusiveScan(T input, T &output, ScanOp scan_op)
Computes an exclusive block-wide prefix scan using the specified binary scan_op functor....
__device__ __forceinline__ void InclusiveSum(T(&input)[ITEMS_PER_THREAD], T(&output)[ITEMS_PER_THREAD], T &block_aggregate)
Computes an inclusive block-wide prefix scan using addition (+) as the scan operator....
__device__ __forceinline__ void ExclusiveScan(T input, T &output, ScanOp scan_op, T &block_aggregate)
Computes an exclusive block-wide prefix scan using the specified binary scan_op functor....
__device__ __forceinline__ void InclusiveScan(T(&input)[ITEMS_PER_THREAD], T(&output)[ITEMS_PER_THREAD], ScanOp scan_op, BlockPrefixCallbackOp &block_prefix_callback_op)
Computes an inclusive block-wide prefix scan using the specified binary scan_op functor....
If<(SAFE_ALGORITHM==BLOCK_SCAN_WARP_SCANS), WarpScans, Raking >::Type InternalBlockScan
Define the delegate type for the desired algorithm.
__device__ __forceinline__ void ExclusiveScan(T(&input)[ITEMS_PER_THREAD], T(&output)[ITEMS_PER_THREAD], ScanOp scan_op, T &block_aggregate)
Computes an exclusive block-wide prefix scan using the specified binary scan_op functor....
__device__ __forceinline__ void ExclusiveScan(T(&input)[ITEMS_PER_THREAD], T(&output)[ITEMS_PER_THREAD], T initial_value, ScanOp scan_op, T &block_aggregate)
Computes an exclusive block-wide prefix scan using the specified binary scan_op functor....
__device__ __forceinline__ void InclusiveScan(T input, T &output, ScanOp scan_op)
Computes an inclusive block-wide prefix scan using the specified binary scan_op functor....
__device__ __forceinline__ void ExclusiveScan(T(&input)[ITEMS_PER_THREAD], T(&output)[ITEMS_PER_THREAD], ScanOp scan_op)
Computes an exclusive block-wide prefix scan using the specified binary scan_op functor....
__device__ __forceinline__ T ThreadScanExclusive(T inclusive, T exclusive, T *input, T *output, ScanOp scan_op, Int2Type< LENGTH >)
__device__ __forceinline__ T ThreadScanInclusive(T inclusive, T *input, T *output, ScanOp scan_op, Int2Type< LENGTH >)
__device__ __forceinline__ int RowMajorTid(int block_dim_x, int block_dim_y, int block_dim_z)
Returns the row-major linear thread identifier for a multidimensional thread block.
__device__ __forceinline__ T ThreadReduce(T *input, ReductionOp reduction_op, T prefix, Int2Type< LENGTH >)
Optional outer namespace(s)
BlockScanAlgorithm
BlockScanAlgorithm enumerates alternative algorithms for cub::BlockScan to compute a parallel prefix ...
@ BLOCK_SCAN_RAKING_MEMOIZE
OutputIteratorT ScanTileStateT int ScanOpT scan_op
Binary scan functor.
Alias wrapper allowing storage to be unioned.
BlockScanRaking provides variants of raking-based parallel prefix scan across a CUDA thread block.
BlockScanWarpScans provides warpscan-based variants of parallel prefix scan across a CUDA thread bloc...
Type selection (IF ? ThenType : ElseType)
A storage-backing wrapper that allows types with non-trivial constructors to be aliased in unions.
#define CUB_PTX_ARCH
CUB_PTX_ARCH reflects the PTX version targeted by the active compiler pass (or zero during the host p...