38#include "../thread/thread_operators.cuh"
39#include "../util_arch.cuh"
40#include "../util_type.cuh"
41#include "../util_namespace.cuh"
144 int LOGICAL_WARP_THREADS = CUB_PTX_WARP_THREADS,
181 unsigned int lane_id;
209 LaneId() % LOGICAL_WARP_THREADS)
446 template <
typename ScanOp>
496 template <
typename ScanOp>
550 template <
typename ScanOp>
559 internal.InclusiveScan(input, inclusive_output,
scan_op);
606 template <
typename ScanOp>
616 internal.InclusiveScan(input, inclusive_output,
scan_op);
667 template <
typename ScanOp>
677 internal.InclusiveScan(input, inclusive_output,
scan_op);
728 template <
typename ScanOp>
739 internal.InclusiveScan(input, inclusive_output,
scan_op);
798 template <
typename ScanOp>
799 __device__ __forceinline__
void Scan(
807 internal.InclusiveScan(input, inclusive_output,
scan_op);
857 template <
typename ScanOp>
858 __device__ __forceinline__
void Scan(
867 internal.InclusiveScan(input, inclusive_output,
scan_op);
924 unsigned int src_lane)
BlockRadixRank provides operations for ranking unsigned integer types within a CUDA thread block.
The WarpScan class provides collective methods for computing a parallel prefix scan of items partitio...
InternalWarpScan::TempStorage _TempStorage
Shared memory storage layout type for WarpScan.
__device__ __forceinline__ void ExclusiveSum(T input, T &exclusive_output, T &warp_aggregate)
Computes an exclusive prefix sum across the calling warp. The value of 0 is applied as the initial va...
__device__ __forceinline__ void Scan(T input, T &inclusive_output, T &exclusive_output, ScanOp scan_op)
Computes both inclusive and exclusive prefix scans using the specified binary scan functor across the...
__device__ __forceinline__ void InclusiveSum(T input, T &inclusive_output)
Computes an inclusive prefix sum across the calling warp.
__device__ __forceinline__ void ExclusiveScan(T input, T &exclusive_output, ScanOp scan_op)
Computes an exclusive prefix scan using the specified binary scan functor across the calling warp....
__device__ __forceinline__ void ExclusiveScan(T input, T &exclusive_output, ScanOp scan_op, T &warp_aggregate)
Computes an exclusive prefix scan using the specified binary scan functor across the calling warp....
@ IS_ARCH_WARP
Whether the logical warp size and the PTX warp size coincide.
@ IS_POW_OF_TWO
Whether the logical warp size is a power-of-two.
@ IS_INTEGER
Whether the data type is an integer (which has fully-associative addition)
__device__ __forceinline__ void InclusiveScan(T input, T &inclusive_output, ScanOp scan_op, T &warp_aggregate)
Computes an inclusive prefix scan using the specified binary scan functor across the calling warp....
__device__ __forceinline__ void InclusiveScan(T input, T &inclusive_output, ScanOp scan_op)
Computes an inclusive prefix scan using the specified binary scan functor across the calling warp.
__device__ __forceinline__ T Broadcast(T input, unsigned int src_lane)
Broadcast the value input from warp-lane src_lane to all lanes in the warp.
__device__ __forceinline__ WarpScan(TempStorage &temp_storage)
Collective constructor using the specified memory allocation as temporary storage....
__device__ __forceinline__ void ExclusiveSum(T input, T &exclusive_output)
Computes an exclusive prefix sum across the calling warp. The value of 0 is applied as the initial va...
__device__ __forceinline__ void Scan(T input, T &inclusive_output, T &exclusive_output, T initial_value, ScanOp scan_op)
Computes both inclusive and exclusive prefix scans using the specified binary scan functor across the...
__device__ __forceinline__ void ExclusiveScan(T input, T &exclusive_output, T initial_value, ScanOp scan_op)
Computes an exclusive prefix scan using the specified binary scan functor across the calling warp.
_TempStorage & temp_storage
Shared storage reference.
__device__ __forceinline__ void InclusiveSum(T input, T &inclusive_output, T &warp_aggregate)
Computes an inclusive prefix sum across the calling warp. Also provides every thread with the warp-wi...
__device__ __forceinline__ void ExclusiveScan(T input, T &exclusive_output, T initial_value, ScanOp scan_op, T &warp_aggregate)
Computes an exclusive prefix scan using the specified binary scan functor across the calling warp....
If<(PTX_ARCH >=300)&&(IS_POW_OF_TWO), WarpScanShfl< T, LOGICAL_WARP_THREADS, PTX_ARCH >, WarpScanSmem< T, LOGICAL_WARP_THREADS, PTX_ARCH > >::Type InternalWarpScan
Internal specialization. Use SHFL-based scan if (architecture is >= SM30) and (LOGICAL_WARP_THREADS i...
__device__ __forceinline__ unsigned int LaneId()
Returns the warp lane ID of the calling thread.
Optional outer namespace(s)
OutputIteratorT ScanTileStateT int ScanOpT scan_op
Binary scan functor.
Alias wrapper allowing storage to be unioned.
Type selection (IF ? ThenType : ElseType)
Allows for the treatment of an integral constant as a type at compile-time (e.g., to achieve static c...
A storage-backing wrapper that allows types with non-trivial constructors to be aliased in unions.
WarpScanShfl provides SHFL-based variants of parallel prefix scan of items partitioned across a CUDA ...
WarpScanSmem provides smem-based variants of parallel prefix scan of items partitioned across a CUDA ...
#define CUB_PTX_ARCH
CUB_PTX_ARCH reflects the PTX version targeted by the active compiler pass (or zero during the host p...