OpenFPM_pdata  4.1.0
Project that contains the implementation of distributed structures
cub::WarpReduceShfl< T, LOGICAL_WARP_THREADS, PTX_ARCH > Struct Template Reference

WarpReduceShfl provides SHFL-based variants of parallel reduction of items partitioned across a CUDA thread warp. More...

Detailed Description

template<typename T, int LOGICAL_WARP_THREADS, int PTX_ARCH>
struct cub::WarpReduceShfl< T, LOGICAL_WARP_THREADS, PTX_ARCH >

WarpReduceShfl provides SHFL-based variants of parallel reduction of items partitioned across a CUDA thread warp.

LOGICAL_WARP_THREADS must be a power-of-two. PTX_ARCH is the PTX compute capability for which to specialize this collective.

Definition at line 58 of file warp_reduce_shfl.cuh.

Data Structures

struct  IsInteger
 

Public Types

enum  { IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)), STEPS = Log2<LOGICAL_WARP_THREADS>::VALUE, LOGICAL_WARPS = CUB_WARP_THREADS(PTX_ARCH) / LOGICAL_WARP_THREADS, SHFL_C = (CUB_WARP_THREADS(PTX_ARCH) - LOGICAL_WARP_THREADS) << 8 }
 
typedef NullType TempStorage
 Shared memory storage layout type.
 

Public Member Functions

__device__ __forceinline__ WarpReduceShfl (TempStorage &)
 Constructor.
 
__device__ __forceinline__ unsigned int ReduceStep (unsigned int input, cub::Sum, int last_lane, int offset)
 Reduction (specialized for summation across uint32 types) More...
 
__device__ __forceinline__ float ReduceStep (float input, cub::Sum, int last_lane, int offset)
 Reduction (specialized for summation across fp32 types) More...
 
__device__ __forceinline__ unsigned long long ReduceStep (unsigned long long input, cub::Sum, int last_lane, int offset)
 Reduction (specialized for summation across unsigned long long types) More...
 
__device__ __forceinline__ long long ReduceStep (long long input, cub::Sum, int last_lane, int offset)
 Reduction (specialized for summation across long long types) More...
 
__device__ __forceinline__ double ReduceStep (double input, cub::Sum, int last_lane, int offset)
 Reduction (specialized for summation across double types) More...
 
template<typename ValueT , typename KeyT >
__device__ __forceinline__ KeyValuePair< KeyT, ValueT > ReduceStep (KeyValuePair< KeyT, ValueT > input, SwizzleScanOp< ReduceByKeyOp< cub::Sum > >, int last_lane, int offset)
 Reduction (specialized for swizzled ReduceByKeyOp<cub::Sum> across KeyValuePair<KeyT, ValueT> types) More...
 
template<typename ValueT , typename OffsetT >
__device__ __forceinline__ KeyValuePair< OffsetT, ValueT > ReduceStep (KeyValuePair< OffsetT, ValueT > input, SwizzleScanOp< ReduceBySegmentOp< cub::Sum > >, int last_lane, int offset)
 Reduction (specialized for swizzled ReduceBySegmentOp<cub::Sum> across KeyValuePair<OffsetT, ValueT> types) More...
 
template<typename _T , typename ReductionOp >
__device__ __forceinline__ _T ReduceStep (_T input, ReductionOp reduction_op, int last_lane, int offset)
 Reduction step (generic) More...
 
template<typename _T , typename ReductionOp >
__device__ __forceinline__ _T ReduceStep (_T input, ReductionOp reduction_op, int last_lane, int offset, Int2Type< true >)
 Reduction step (specialized for small unsigned integers size 32b or less) More...
 
template<typename _T , typename ReductionOp >
__device__ __forceinline__ _T ReduceStep (_T input, ReductionOp reduction_op, int last_lane, int offset, Int2Type< false >)
 Reduction step (specialized for types other than small unsigned integers size 32b or less) More...
 
template<typename ReductionOp , int STEP>
__device__ __forceinline__ void ReduceStep (T &input, ReductionOp reduction_op, int last_lane, Int2Type< STEP >)
 
template<typename ReductionOp >
__device__ __forceinline__ void ReduceStep (T &, ReductionOp, int, Int2Type< STEPS >)
 
template<bool ALL_LANES_VALID, typename ReductionOp >
__device__ __forceinline__ T Reduce (T input, int valid_items, ReductionOp reduction_op)
 Reduction. More...
 
template<bool HEAD_SEGMENTED, typename FlagT , typename ReductionOp >
__device__ __forceinline__ T SegmentedReduce (T input, FlagT flag, ReductionOp reduction_op)
 Segmented reduction. More...
 

Data Fields

unsigned int lane_id
 Lane index in logical warp.
 
unsigned int warp_id
 Logical warp index in 32-thread physical warp.
 
unsigned int member_mask
 32-thread physical warp member mask of logical warp
 

Member Enumeration Documentation

◆ anonymous enum

template<typename T , int LOGICAL_WARP_THREADS, int PTX_ARCH>
anonymous enum
Enumerator
IS_ARCH_WARP 

Whether the logical warp size and the PTX warp size coincide.

STEPS 

The number of warp reduction steps.

LOGICAL_WARPS 

Number of logical warps in a PTX warp.

SHFL_C 

The 5-bit SHFL mask for logically splitting warps into sub-segments starts 8-bits up.

Definition at line 64 of file warp_reduce_shfl.cuh.

Member Function Documentation

◆ Reduce()

template<typename T , int LOGICAL_WARP_THREADS, int PTX_ARCH>
template<bool ALL_LANES_VALID, typename ReductionOp >
__device__ __forceinline__ T cub::WarpReduceShfl< T, LOGICAL_WARP_THREADS, PTX_ARCH >::Reduce ( T  input,
int  valid_items,
ReductionOp  reduction_op 
)
inline

Reduction.

Parameters
[in] input: Calling thread's input
[in] valid_items: Total number of valid items across the logical warp
[in] reduction_op: Binary reduction operator

Definition at line 466 of file warp_reduce_shfl.cuh.

◆ ReduceStep() [1/11]

template<typename T , int LOGICAL_WARP_THREADS, int PTX_ARCH>
__device__ __forceinline__ unsigned int cub::WarpReduceShfl< T, LOGICAL_WARP_THREADS, PTX_ARCH >::ReduceStep ( unsigned int  input,
cub::Sum  ,
int  last_lane,
int  offset 
)
inline

Reduction (specialized for summation across uint32 types)

Parameters
[in] input: Calling thread's input item.
[in] last_lane: Index of last lane in segment
[in] offset: Up-offset to pull from

Definition at line 134 of file warp_reduce_shfl.cuh.

◆ ReduceStep() [2/11]

template<typename T , int LOGICAL_WARP_THREADS, int PTX_ARCH>
__device__ __forceinline__ float cub::WarpReduceShfl< T, LOGICAL_WARP_THREADS, PTX_ARCH >::ReduceStep ( float  input,
cub::Sum  ,
int  last_lane,
int  offset 
)
inline

Reduction (specialized for summation across fp32 types)

Parameters
[in] input: Calling thread's input item.
[in] last_lane: Index of last lane in segment
[in] offset: Up-offset to pull from

Definition at line 171 of file warp_reduce_shfl.cuh.

◆ ReduceStep() [3/11]

template<typename T , int LOGICAL_WARP_THREADS, int PTX_ARCH>
__device__ __forceinline__ unsigned long long cub::WarpReduceShfl< T, LOGICAL_WARP_THREADS, PTX_ARCH >::ReduceStep ( unsigned long long  input,
cub::Sum  ,
int  last_lane,
int  offset 
)
inline

Reduction (specialized for summation across unsigned long long types)

Parameters
[in] input: Calling thread's input item.
[in] last_lane: Index of last lane in segment
[in] offset: Up-offset to pull from

Definition at line 208 of file warp_reduce_shfl.cuh.

◆ ReduceStep() [4/11]

template<typename T , int LOGICAL_WARP_THREADS, int PTX_ARCH>
__device__ __forceinline__ long long cub::WarpReduceShfl< T, LOGICAL_WARP_THREADS, PTX_ARCH >::ReduceStep ( long long  input,
cub::Sum  ,
int  last_lane,
int  offset 
)
inline

Reduction (specialized for summation across long long types)

Parameters
[in] input: Calling thread's input item.
[in] last_lane: Index of last lane in segment
[in] offset: Up-offset to pull from

Definition at line 250 of file warp_reduce_shfl.cuh.

◆ ReduceStep() [5/11]

template<typename T , int LOGICAL_WARP_THREADS, int PTX_ARCH>
__device__ __forceinline__ double cub::WarpReduceShfl< T, LOGICAL_WARP_THREADS, PTX_ARCH >::ReduceStep ( double  input,
cub::Sum  ,
int  last_lane,
int  offset 
)
inline

Reduction (specialized for summation across double types)

Parameters
[in] input: Calling thread's input item.
[in] last_lane: Index of last lane in segment
[in] offset: Up-offset to pull from

Definition at line 293 of file warp_reduce_shfl.cuh.

◆ ReduceStep() [6/11]

template<typename T , int LOGICAL_WARP_THREADS, int PTX_ARCH>
template<typename ValueT , typename KeyT >
__device__ __forceinline__ KeyValuePair<KeyT, ValueT> cub::WarpReduceShfl< T, LOGICAL_WARP_THREADS, PTX_ARCH >::ReduceStep ( KeyValuePair< KeyT, ValueT >  input,
SwizzleScanOp< ReduceByKeyOp< cub::Sum > >  ,
int  last_lane,
int  offset 
)
inline

Reduction (specialized for swizzled ReduceByKeyOp<cub::Sum> across KeyValuePair<KeyT, ValueT> types)

Parameters
[in] input: Calling thread's input item.
[in] last_lane: Index of last lane in segment
[in] offset: Up-offset to pull from

Definition at line 341 of file warp_reduce_shfl.cuh.

◆ ReduceStep() [7/11]

template<typename T , int LOGICAL_WARP_THREADS, int PTX_ARCH>
template<typename ValueT , typename OffsetT >
__device__ __forceinline__ KeyValuePair<OffsetT, ValueT> cub::WarpReduceShfl< T, LOGICAL_WARP_THREADS, PTX_ARCH >::ReduceStep ( KeyValuePair< OffsetT, ValueT >  input,
SwizzleScanOp< ReduceBySegmentOp< cub::Sum > >  ,
int  last_lane,
int  offset 
)
inline

Reduction (specialized for swizzled ReduceBySegmentOp<cub::Sum> across KeyValuePair<OffsetT, ValueT> types)

Parameters
[in] input: Calling thread's input item.
[in] last_lane: Index of last lane in segment
[in] offset: Up-offset to pull from

Definition at line 369 of file warp_reduce_shfl.cuh.

◆ ReduceStep() [8/11]

template<typename T , int LOGICAL_WARP_THREADS, int PTX_ARCH>
template<typename _T , typename ReductionOp >
__device__ __forceinline__ _T cub::WarpReduceShfl< T, LOGICAL_WARP_THREADS, PTX_ARCH >::ReduceStep ( _T  input,
ReductionOp  reduction_op,
int  last_lane,
int  offset 
)
inline

Reduction step (generic)

Parameters
[in] input: Calling thread's input item.
[in] reduction_op: Binary reduction operator
[in] last_lane: Index of last lane in segment
[in] offset: Up-offset to pull from

Definition at line 389 of file warp_reduce_shfl.cuh.

◆ ReduceStep() [9/11]

template<typename T , int LOGICAL_WARP_THREADS, int PTX_ARCH>
template<typename _T , typename ReductionOp >
__device__ __forceinline__ _T cub::WarpReduceShfl< T, LOGICAL_WARP_THREADS, PTX_ARCH >::ReduceStep ( _T  input,
ReductionOp  reduction_op,
int  last_lane,
int  offset,
Int2Type< true >   
)
inline

Reduction step (specialized for small unsigned integers size 32b or less)

Parameters
[in] input: Calling thread's input item.
[in] reduction_op: Binary reduction operator
[in] last_lane: Index of last lane in segment
[in] offset: Up-offset to pull from

Definition at line 409 of file warp_reduce_shfl.cuh.

◆ ReduceStep() [10/11]

template<typename T , int LOGICAL_WARP_THREADS, int PTX_ARCH>
template<typename _T , typename ReductionOp >
__device__ __forceinline__ _T cub::WarpReduceShfl< T, LOGICAL_WARP_THREADS, PTX_ARCH >::ReduceStep ( _T  input,
ReductionOp  reduction_op,
int  last_lane,
int  offset,
Int2Type< false >   
)
inline

Reduction step (specialized for types other than small unsigned integers size 32b or less)

Parameters
[in] input: Calling thread's input item.
[in] reduction_op: Binary reduction operator
[in] last_lane: Index of last lane in segment
[in] offset: Up-offset to pull from

Definition at line 422 of file warp_reduce_shfl.cuh.

◆ ReduceStep() [11/11]

template<typename T , int LOGICAL_WARP_THREADS, int PTX_ARCH>
template<typename ReductionOp , int STEP>
__device__ __forceinline__ void cub::WarpReduceShfl< T, LOGICAL_WARP_THREADS, PTX_ARCH >::ReduceStep ( T &  input,
ReductionOp  reduction_op,
int  last_lane,
Int2Type< STEP >   
)
inline
Parameters
[in] input: Calling thread's input item.
[in] reduction_op: Binary reduction operator
[in] last_lane: Index of last lane in segment

Definition at line 438 of file warp_reduce_shfl.cuh.

◆ SegmentedReduce()

template<typename T , int LOGICAL_WARP_THREADS, int PTX_ARCH>
template<bool HEAD_SEGMENTED, typename FlagT , typename ReductionOp >
__device__ __forceinline__ T cub::WarpReduceShfl< T, LOGICAL_WARP_THREADS, PTX_ARCH >::SegmentedReduce ( T  input,
FlagT  flag,
ReductionOp  reduction_op 
)
inline

Segmented reduction.

Parameters
[in] input: Calling thread's input
[in] flag: Whether or not the current lane is a segment head/tail
[in] reduction_op: Binary reduction operator

Definition at line 496 of file warp_reduce_shfl.cuh.


The documentation for this struct was generated from the following file: