OpenFPM_pdata  4.1.0
Project that contains the implementation of distributed structures
 
Loading...
Searching...
No Matches
cub::WarpReduceShfl< T, LOGICAL_WARP_THREADS, PTX_ARCH > Struct Template Reference

WarpReduceShfl provides SHFL-based variants of parallel reduction of items partitioned across a CUDA thread warp. More...

Detailed Description

template<typename T, int LOGICAL_WARP_THREADS, int PTX_ARCH>
struct cub::WarpReduceShfl< T, LOGICAL_WARP_THREADS, PTX_ARCH >

WarpReduceShfl provides SHFL-based variants of parallel reduction of items partitioned across a CUDA thread warp.

LOGICAL_WARP_THREADS must be a power-of-two. PTX_ARCH is the PTX compute capability for which to specialize this collective.

Definition at line 58 of file warp_reduce_shfl.cuh.

Data Structures

struct  IsInteger
 

Public Types

enum  { IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)) , STEPS = Log2<LOGICAL_WARP_THREADS>::VALUE , LOGICAL_WARPS = CUB_WARP_THREADS(PTX_ARCH) / LOGICAL_WARP_THREADS , SHFL_C = (CUB_WARP_THREADS(PTX_ARCH) - LOGICAL_WARP_THREADS) << 8 }
 
typedef NullType TempStorage
 Shared memory storage layout type.
 

Public Member Functions

__device__ __forceinline__ WarpReduceShfl (TempStorage &)
 Constructor.
 
__device__ __forceinline__ unsigned int ReduceStep (unsigned int input, cub::Sum, int last_lane, int offset)
 Reduction (specialized for summation across uint32 types)
 
__device__ __forceinline__ float ReduceStep (float input, cub::Sum, int last_lane, int offset)
 Reduction (specialized for summation across fp32 types)
 
__device__ __forceinline__ unsigned long long ReduceStep (unsigned long long input, cub::Sum, int last_lane, int offset)
 Reduction (specialized for summation across unsigned long long types)
 
__device__ __forceinline__ long long ReduceStep (long long input, cub::Sum, int last_lane, int offset)
 Reduction (specialized for summation across long long types)
 
__device__ __forceinline__ double ReduceStep (double input, cub::Sum, int last_lane, int offset)
 Reduction (specialized for summation across double types)
 
template<typename ValueT , typename KeyT >
__device__ __forceinline__ KeyValuePair< KeyT, ValueT > ReduceStep (KeyValuePair< KeyT, ValueT > input, SwizzleScanOp< ReduceByKeyOp< cub::Sum > >, int last_lane, int offset)
 Reduction (specialized for swizzled ReduceByKeyOp<cub::Sum> across KeyValuePair<KeyT, ValueT> types)
 
template<typename ValueT , typename OffsetT >
__device__ __forceinline__ KeyValuePair< OffsetT, ValueT > ReduceStep (KeyValuePair< OffsetT, ValueT > input, SwizzleScanOp< ReduceBySegmentOp< cub::Sum > >, int last_lane, int offset)
 Reduction (specialized for swizzled ReduceBySegmentOp<cub::Sum> across KeyValuePair<OffsetT, ValueT> types)
 
template<typename _T , typename ReductionOp >
__device__ __forceinline__ _T ReduceStep (_T input, ReductionOp reduction_op, int last_lane, int offset)
 Reduction step (generic)
 
template<typename _T , typename ReductionOp >
__device__ __forceinline__ _T ReduceStep (_T input, ReductionOp reduction_op, int last_lane, int offset, Int2Type< true >)
 Reduction step (specialized for small unsigned integers size 32b or less)
 
template<typename _T , typename ReductionOp >
__device__ __forceinline__ _T ReduceStep (_T input, ReductionOp reduction_op, int last_lane, int offset, Int2Type< false >)
 Reduction step (specialized for types other than small unsigned integers size 32b or less)
 
template<typename ReductionOp , int STEP>
__device__ __forceinline__ void ReduceStep (T &input, ReductionOp reduction_op, int last_lane, Int2Type< STEP >)
 
template<typename ReductionOp >
__device__ __forceinline__ void ReduceStep (T &, ReductionOp, int, Int2Type< STEPS >)
 
template<bool ALL_LANES_VALID, typename ReductionOp >
__device__ __forceinline__ T Reduce (T input, int valid_items, ReductionOp reduction_op)
 Reduction.
 
template<bool HEAD_SEGMENTED, typename FlagT , typename ReductionOp >
__device__ __forceinline__ T SegmentedReduce (T input, FlagT flag, ReductionOp reduction_op)
 Segmented reduction.
 

Data Fields

unsigned int lane_id
 Lane index in logical warp.
 
unsigned int warp_id
 Logical warp index in 32-thread physical warp.
 
unsigned int member_mask
 32-thread physical warp member mask of logical warp
 

Member Typedef Documentation

◆ TempStorage

template<typename T , int LOGICAL_WARP_THREADS, int PTX_ARCH>
typedef NullType cub::WarpReduceShfl< T, LOGICAL_WARP_THREADS, PTX_ARCH >::TempStorage

Shared memory storage layout type.

Definition at line 91 of file warp_reduce_shfl.cuh.

Member Enumeration Documentation

◆ anonymous enum

template<typename T , int LOGICAL_WARP_THREADS, int PTX_ARCH>
anonymous enum
Enumerator
IS_ARCH_WARP 

Whether the logical warp size and the PTX warp size coincide.

STEPS 

The number of warp reduction steps.

LOGICAL_WARPS 

Number of logical warps in a PTX warp.

SHFL_C 

The 5-bit SHFL mask for logically splitting warps into sub-segments starts 8-bits up.

Definition at line 64 of file warp_reduce_shfl.cuh.

Constructor & Destructor Documentation

◆ WarpReduceShfl()

template<typename T , int LOGICAL_WARP_THREADS, int PTX_ARCH>
__device__ __forceinline__ cub::WarpReduceShfl< T, LOGICAL_WARP_THREADS, PTX_ARCH >::WarpReduceShfl ( TempStorage & )
inline

Constructor.

Definition at line 113 of file warp_reduce_shfl.cuh.

Member Function Documentation

◆ Reduce()

template<typename T , int LOGICAL_WARP_THREADS, int PTX_ARCH>
template<bool ALL_LANES_VALID, typename ReductionOp >
__device__ __forceinline__ T cub::WarpReduceShfl< T, LOGICAL_WARP_THREADS, PTX_ARCH >::Reduce ( T  input,
int  valid_items,
ReductionOp  reduction_op 
)
inline

Reduction.

Parameters
[in] input: Calling thread's input
[in] valid_items: Total number of valid items across the logical warp
[in] reduction_op: Binary reduction operator

Definition at line 466 of file warp_reduce_shfl.cuh.

◆ ReduceStep() [1/12]

template<typename T , int LOGICAL_WARP_THREADS, int PTX_ARCH>
template<typename _T , typename ReductionOp >
__device__ __forceinline__ _T cub::WarpReduceShfl< T, LOGICAL_WARP_THREADS, PTX_ARCH >::ReduceStep ( _T  input,
ReductionOp  reduction_op,
int  last_lane,
int  offset 
)
inline

Reduction step (generic)

Parameters
[in] input: Calling thread's input item.
[in] reduction_op: Binary reduction operator
[in] last_lane: Index of last lane in segment
[in] offset: Up-offset to pull from

Definition at line 389 of file warp_reduce_shfl.cuh.

◆ ReduceStep() [2/12]

template<typename T , int LOGICAL_WARP_THREADS, int PTX_ARCH>
template<typename _T , typename ReductionOp >
__device__ __forceinline__ _T cub::WarpReduceShfl< T, LOGICAL_WARP_THREADS, PTX_ARCH >::ReduceStep ( _T  input,
ReductionOp  reduction_op,
int  last_lane,
int  offset,
Int2Type< false >   
)
inline

Reduction step (specialized for types other than small unsigned integers size 32b or less)

Parameters
[in] input: Calling thread's input item.
[in] reduction_op: Binary reduction operator
[in] last_lane: Index of last lane in segment
[in] offset: Up-offset to pull from

Definition at line 422 of file warp_reduce_shfl.cuh.

◆ ReduceStep() [3/12]

template<typename T , int LOGICAL_WARP_THREADS, int PTX_ARCH>
template<typename _T , typename ReductionOp >
__device__ __forceinline__ _T cub::WarpReduceShfl< T, LOGICAL_WARP_THREADS, PTX_ARCH >::ReduceStep ( _T  input,
ReductionOp  reduction_op,
int  last_lane,
int  offset,
Int2Type< true >   
)
inline

Reduction step (specialized for small unsigned integers size 32b or less)

Parameters
[in] input: Calling thread's input item.
[in] reduction_op: Binary reduction operator
[in] last_lane: Index of last lane in segment
[in] offset: Up-offset to pull from

Definition at line 409 of file warp_reduce_shfl.cuh.

◆ ReduceStep() [4/12]

template<typename T , int LOGICAL_WARP_THREADS, int PTX_ARCH>
__device__ __forceinline__ double cub::WarpReduceShfl< T, LOGICAL_WARP_THREADS, PTX_ARCH >::ReduceStep ( double  input,
cub::Sum  ,
int  last_lane,
int  offset 
)
inline

Reduction (specialized for summation across double types)

Parameters
[in] input: Calling thread's input item.
[in] last_lane: Index of last lane in segment
[in] offset: Up-offset to pull from

Definition at line 293 of file warp_reduce_shfl.cuh.

◆ ReduceStep() [5/12]

template<typename T , int LOGICAL_WARP_THREADS, int PTX_ARCH>
__device__ __forceinline__ float cub::WarpReduceShfl< T, LOGICAL_WARP_THREADS, PTX_ARCH >::ReduceStep ( float  input,
cub::Sum  ,
int  last_lane,
int  offset 
)
inline

Reduction (specialized for summation across fp32 types)

Parameters
[in] input: Calling thread's input item.
[in] last_lane: Index of last lane in segment
[in] offset: Up-offset to pull from

Definition at line 171 of file warp_reduce_shfl.cuh.

◆ ReduceStep() [6/12]

template<typename T , int LOGICAL_WARP_THREADS, int PTX_ARCH>
template<typename ValueT , typename KeyT >
__device__ __forceinline__ KeyValuePair< KeyT, ValueT > cub::WarpReduceShfl< T, LOGICAL_WARP_THREADS, PTX_ARCH >::ReduceStep ( KeyValuePair< KeyT, ValueT >  input,
SwizzleScanOp< ReduceByKeyOp< cub::Sum > >  ,
int  last_lane,
int  offset 
)
inline

Reduction (specialized for swizzled ReduceByKeyOp<cub::Sum> across KeyValuePair<KeyT, ValueT> types)

Parameters
[in] input: Calling thread's input item.
[in] last_lane: Index of last lane in segment
[in] offset: Up-offset to pull from

Definition at line 341 of file warp_reduce_shfl.cuh.

◆ ReduceStep() [7/12]

template<typename T , int LOGICAL_WARP_THREADS, int PTX_ARCH>
template<typename ValueT , typename OffsetT >
__device__ __forceinline__ KeyValuePair< OffsetT, ValueT > cub::WarpReduceShfl< T, LOGICAL_WARP_THREADS, PTX_ARCH >::ReduceStep ( KeyValuePair< OffsetT, ValueT >  input,
SwizzleScanOp< ReduceBySegmentOp< cub::Sum > >  ,
int  last_lane,
int  offset 
)
inline

Reduction (specialized for swizzled ReduceBySegmentOp<cub::Sum> across KeyValuePair<OffsetT, ValueT> types)

Parameters
[in] input: Calling thread's input item.
[in] last_lane: Index of last lane in segment
[in] offset: Up-offset to pull from

Definition at line 369 of file warp_reduce_shfl.cuh.

◆ ReduceStep() [8/12]

template<typename T , int LOGICAL_WARP_THREADS, int PTX_ARCH>
__device__ __forceinline__ long long cub::WarpReduceShfl< T, LOGICAL_WARP_THREADS, PTX_ARCH >::ReduceStep ( long long  input,
cub::Sum  ,
int  last_lane,
int  offset 
)
inline

Reduction (specialized for summation across long long types)

Parameters
[in] input: Calling thread's input item.
[in] last_lane: Index of last lane in segment
[in] offset: Up-offset to pull from

Definition at line 250 of file warp_reduce_shfl.cuh.

◆ ReduceStep() [9/12]

template<typename T , int LOGICAL_WARP_THREADS, int PTX_ARCH>
template<typename ReductionOp >
__device__ __forceinline__ void cub::WarpReduceShfl< T, LOGICAL_WARP_THREADS, PTX_ARCH >::ReduceStep ( T &  ,
ReductionOp  ,
int  ,
Int2Type< STEPS >
)
inline

Definition at line 450 of file warp_reduce_shfl.cuh.

◆ ReduceStep() [10/12]

template<typename T , int LOGICAL_WARP_THREADS, int PTX_ARCH>
template<typename ReductionOp , int STEP>
__device__ __forceinline__ void cub::WarpReduceShfl< T, LOGICAL_WARP_THREADS, PTX_ARCH >::ReduceStep ( T &  input,
ReductionOp  reduction_op,
int  last_lane,
Int2Type< STEP >   
)
inline
Parameters
[in] input: Calling thread's input item.
[in] reduction_op: Binary reduction operator
[in] last_lane: Index of last lane in segment

Definition at line 438 of file warp_reduce_shfl.cuh.

◆ ReduceStep() [11/12]

template<typename T , int LOGICAL_WARP_THREADS, int PTX_ARCH>
__device__ __forceinline__ unsigned int cub::WarpReduceShfl< T, LOGICAL_WARP_THREADS, PTX_ARCH >::ReduceStep ( unsigned int  input,
cub::Sum  ,
int  last_lane,
int  offset 
)
inline

Reduction (specialized for summation across uint32 types)

Parameters
[in] input: Calling thread's input item.
[in] last_lane: Index of last lane in segment
[in] offset: Up-offset to pull from

Definition at line 134 of file warp_reduce_shfl.cuh.

◆ ReduceStep() [12/12]

template<typename T , int LOGICAL_WARP_THREADS, int PTX_ARCH>
__device__ __forceinline__ unsigned long long cub::WarpReduceShfl< T, LOGICAL_WARP_THREADS, PTX_ARCH >::ReduceStep ( unsigned long long  input,
cub::Sum  ,
int  last_lane,
int  offset 
)
inline

Reduction (specialized for summation across unsigned long long types)

Parameters
[in] input: Calling thread's input item.
[in] last_lane: Index of last lane in segment
[in] offset: Up-offset to pull from

Definition at line 208 of file warp_reduce_shfl.cuh.

◆ SegmentedReduce()

template<typename T , int LOGICAL_WARP_THREADS, int PTX_ARCH>
template<bool HEAD_SEGMENTED, typename FlagT , typename ReductionOp >
__device__ __forceinline__ T cub::WarpReduceShfl< T, LOGICAL_WARP_THREADS, PTX_ARCH >::SegmentedReduce ( T  input,
FlagT  flag,
ReductionOp  reduction_op 
)
inline

Segmented reduction.

Parameters
[in]inputCalling thread's input
[in]flagWhether or not the current lane is a segment head/tail
[in]reduction_opBinary reduction operator

Definition at line 496 of file warp_reduce_shfl.cuh.

Field Documentation

◆ lane_id

template<typename T , int LOGICAL_WARP_THREADS, int PTX_ARCH>
unsigned int cub::WarpReduceShfl< T, LOGICAL_WARP_THREADS, PTX_ARCH >::lane_id

Lane index in logical warp.

Definition at line 99 of file warp_reduce_shfl.cuh.

◆ member_mask

template<typename T , int LOGICAL_WARP_THREADS, int PTX_ARCH>
unsigned int cub::WarpReduceShfl< T, LOGICAL_WARP_THREADS, PTX_ARCH >::member_mask

32-thread physical warp member mask of logical warp

Definition at line 105 of file warp_reduce_shfl.cuh.

◆ warp_id

template<typename T , int LOGICAL_WARP_THREADS, int PTX_ARCH>
unsigned int cub::WarpReduceShfl< T, LOGICAL_WARP_THREADS, PTX_ARCH >::warp_id

Logical warp index in 32-thread physical warp.

Definition at line 102 of file warp_reduce_shfl.cuh.


The documentation for this struct was generated from the following file: