WarpReduceShfl provides SHFL-based variants of parallel reduction of items partitioned across a CUDA thread warp. More...
WarpReduceShfl provides SHFL-based variants of parallel reduction of items partitioned across a CUDA thread warp.
LOGICAL_WARP_THREADS must be a power-of-two. PTX_ARCH is the PTX compute capability for which to specialize this collective.
Definition at line 58 of file warp_reduce_shfl.cuh.
Data Structures | |
struct | IsInteger |
Public Types | |
enum | { IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)) , STEPS = Log2<LOGICAL_WARP_THREADS>::VALUE , LOGICAL_WARPS = CUB_WARP_THREADS(PTX_ARCH) / LOGICAL_WARP_THREADS , SHFL_C = (CUB_WARP_THREADS(PTX_ARCH) - LOGICAL_WARP_THREADS) << 8 } |
typedef NullType | TempStorage |
Shared memory storage layout type. | |
Public Member Functions | |
__device__ __forceinline__ | WarpReduceShfl (TempStorage &) |
Constructor. | |
__device__ __forceinline__ unsigned int | ReduceStep (unsigned int input, cub::Sum, int last_lane, int offset) |
Reduction (specialized for summation across uint32 types) | |
__device__ __forceinline__ float | ReduceStep (float input, cub::Sum, int last_lane, int offset) |
Reduction (specialized for summation across fp32 types) | |
__device__ __forceinline__ unsigned long long | ReduceStep (unsigned long long input, cub::Sum, int last_lane, int offset) |
Reduction (specialized for summation across unsigned long long types) | |
__device__ __forceinline__ long long | ReduceStep (long long input, cub::Sum, int last_lane, int offset) |
Reduction (specialized for summation across long long types) | |
__device__ __forceinline__ double | ReduceStep (double input, cub::Sum, int last_lane, int offset) |
Reduction (specialized for summation across double types) | |
template<typename ValueT , typename KeyT > | |
__device__ __forceinline__ KeyValuePair< KeyT, ValueT > | ReduceStep (KeyValuePair< KeyT, ValueT > input, SwizzleScanOp< ReduceByKeyOp< cub::Sum > >, int last_lane, int offset) |
Reduction (specialized for swizzled ReduceByKeyOp<cub::Sum> across KeyValuePair<KeyT, ValueT> types) | |
template<typename ValueT , typename OffsetT > | |
__device__ __forceinline__ KeyValuePair< OffsetT, ValueT > | ReduceStep (KeyValuePair< OffsetT, ValueT > input, SwizzleScanOp< ReduceBySegmentOp< cub::Sum > >, int last_lane, int offset) |
Reduction (specialized for swizzled ReduceBySegmentOp<cub::Sum> across KeyValuePair<OffsetT, ValueT> types) | |
template<typename _T , typename ReductionOp > | |
__device__ __forceinline__ _T | ReduceStep (_T input, ReductionOp reduction_op, int last_lane, int offset) |
Reduction step (generic) | |
template<typename _T , typename ReductionOp > | |
__device__ __forceinline__ _T | ReduceStep (_T input, ReductionOp reduction_op, int last_lane, int offset, Int2Type< true >) |
Reduction step (specialized for small unsigned integers of size 32 bits or less) | |
template<typename _T , typename ReductionOp > | |
__device__ __forceinline__ _T | ReduceStep (_T input, ReductionOp reduction_op, int last_lane, int offset, Int2Type< false >) |
Reduction step (specialized for types other than small unsigned integers of size 32 bits or less) | |
template<typename ReductionOp , int STEP> | |
__device__ __forceinline__ void | ReduceStep (T &input, ReductionOp reduction_op, int last_lane, Int2Type< STEP >) |
template<typename ReductionOp > | |
__device__ __forceinline__ void | ReduceStep (T &, ReductionOp, int, Int2Type< STEPS >) |
template<bool ALL_LANES_VALID, typename ReductionOp > | |
__device__ __forceinline__ T | Reduce (T input, int valid_items, ReductionOp reduction_op) |
Reduction. | |
template<bool HEAD_SEGMENTED, typename FlagT , typename ReductionOp > | |
__device__ __forceinline__ T | SegmentedReduce (T input, FlagT flag, ReductionOp reduction_op) |
Segmented reduction. | |
Data Fields | |
unsigned int | lane_id |
Lane index in logical warp. | |
unsigned int | warp_id |
Logical warp index in 32-thread physical warp. | |
unsigned int | member_mask |
32-thread physical warp member mask of logical warp | |
typedef NullType cub::WarpReduceShfl< T, LOGICAL_WARP_THREADS, PTX_ARCH >::TempStorage |
Shared memory storage layout type.
Definition at line 91 of file warp_reduce_shfl.cuh.
Definition at line 64 of file warp_reduce_shfl.cuh.
|
inline |
Constructor.
Definition at line 113 of file warp_reduce_shfl.cuh.
|
inline |
Reduction.
[in] | input | Calling thread's input |
[in] | valid_items | Total number of valid items across the logical warp |
[in] | reduction_op | Binary reduction operator |
Definition at line 466 of file warp_reduce_shfl.cuh.
|
inline |
Reduction step (generic)
[in] | input | Calling thread's input item. |
[in] | reduction_op | Binary reduction operator |
[in] | last_lane | Index of last lane in segment |
[in] | offset | Up-offset to pull from |
Definition at line 389 of file warp_reduce_shfl.cuh.
|
inline |
Reduction step (specialized for types other than small unsigned integers of size 32 bits or less)
[in] | input | Calling thread's input item. |
[in] | reduction_op | Binary reduction operator |
[in] | last_lane | Index of last lane in segment |
[in] | offset | Up-offset to pull from |
Definition at line 422 of file warp_reduce_shfl.cuh.
|
inline |
Reduction step (specialized for small unsigned integers of size 32 bits or less)
[in] | input | Calling thread's input item. |
[in] | reduction_op | Binary reduction operator |
[in] | last_lane | Index of last lane in segment |
[in] | offset | Up-offset to pull from |
Definition at line 409 of file warp_reduce_shfl.cuh.
|
inline |
Reduction (specialized for summation across double types)
[in] | input | Calling thread's input item. |
[in] | last_lane | Index of last lane in segment |
[in] | offset | Up-offset to pull from |
Definition at line 293 of file warp_reduce_shfl.cuh.
|
inline |
Reduction (specialized for summation across fp32 types)
[in] | input | Calling thread's input item. |
[in] | last_lane | Index of last lane in segment |
[in] | offset | Up-offset to pull from |
Definition at line 171 of file warp_reduce_shfl.cuh.
|
inline |
Reduction (specialized for swizzled ReduceByKeyOp<cub::Sum> across KeyValuePair<KeyT, ValueT> types)
[in] | input | Calling thread's input item. |
[in] | last_lane | Index of last lane in segment |
[in] | offset | Up-offset to pull from |
Definition at line 341 of file warp_reduce_shfl.cuh.
|
inline |
Reduction (specialized for swizzled ReduceBySegmentOp<cub::Sum> across KeyValuePair<OffsetT, ValueT> types)
[in] | input | Calling thread's input item. |
[in] | last_lane | Index of last lane in segment |
[in] | offset | Up-offset to pull from |
Definition at line 369 of file warp_reduce_shfl.cuh.
|
inline |
Reduction (specialized for summation across long long types)
[in] | input | Calling thread's input item. |
[in] | last_lane | Index of last lane in segment |
[in] | offset | Up-offset to pull from |
Definition at line 250 of file warp_reduce_shfl.cuh.
|
inline |
Definition at line 450 of file warp_reduce_shfl.cuh.
|
inline |
[in] | input | Calling thread's input item. |
[in] | reduction_op | Binary reduction operator |
[in] | last_lane | Index of last lane in segment |
Definition at line 438 of file warp_reduce_shfl.cuh.
|
inline |
Reduction (specialized for summation across uint32 types)
[in] | input | Calling thread's input item. |
[in] | last_lane | Index of last lane in segment |
[in] | offset | Up-offset to pull from |
Definition at line 134 of file warp_reduce_shfl.cuh.
|
inline |
Reduction (specialized for summation across unsigned long long types)
[in] | input | Calling thread's input item. |
[in] | last_lane | Index of last lane in segment |
[in] | offset | Up-offset to pull from |
Definition at line 208 of file warp_reduce_shfl.cuh.
|
inline |
Segmented reduction.
[in] | input | Calling thread's input |
[in] | flag | Whether or not the current lane is a segment head/tail |
[in] | reduction_op | Binary reduction operator |
Definition at line 496 of file warp_reduce_shfl.cuh.
unsigned int cub::WarpReduceShfl< T, LOGICAL_WARP_THREADS, PTX_ARCH >::lane_id |
Lane index in logical warp.
Definition at line 99 of file warp_reduce_shfl.cuh.
unsigned int cub::WarpReduceShfl< T, LOGICAL_WARP_THREADS, PTX_ARCH >::member_mask |
32-thread physical warp member mask of logical warp
Definition at line 105 of file warp_reduce_shfl.cuh.
unsigned int cub::WarpReduceShfl< T, LOGICAL_WARP_THREADS, PTX_ARCH >::warp_id |
Logical warp index in 32-thread physical warp.
Definition at line 102 of file warp_reduce_shfl.cuh.