WarpReduceShfl provides SHFL-based variants of parallel reduction of items partitioned across a CUDA thread warp.
WarpReduceShfl provides SHFL-based variants of parallel reduction of items partitioned across a CUDA thread warp.
LOGICAL_WARP_THREADS must be a power-of-two. PTX_ARCH is the PTX compute capability for which to specialize this collective.
Definition at line 58 of file warp_reduce_shfl.cuh.
Data Structures | |
| struct | IsInteger | 
Public Types | |
| enum | { IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)) , STEPS = Log2<LOGICAL_WARP_THREADS>::VALUE , LOGICAL_WARPS = CUB_WARP_THREADS(PTX_ARCH) / LOGICAL_WARP_THREADS , SHFL_C = (CUB_WARP_THREADS(PTX_ARCH) - LOGICAL_WARP_THREADS) << 8 } | 
| typedef NullType | TempStorage | 
| Shared memory storage layout type.   | |
Public Member Functions | |
| __device__ __forceinline__ | WarpReduceShfl (TempStorage &) | 
| Constructor.   | |
| __device__ __forceinline__ unsigned int | ReduceStep (unsigned int input, cub::Sum, int last_lane, int offset) | 
| Reduction (specialized for summation across uint32 types)   | |
| __device__ __forceinline__ float | ReduceStep (float input, cub::Sum, int last_lane, int offset) | 
| Reduction (specialized for summation across fp32 types)   | |
| __device__ __forceinline__ unsigned long long | ReduceStep (unsigned long long input, cub::Sum, int last_lane, int offset) | 
| Reduction (specialized for summation across unsigned long long types)   | |
| __device__ __forceinline__ long long | ReduceStep (long long input, cub::Sum, int last_lane, int offset) | 
| Reduction (specialized for summation across long long types)   | |
| __device__ __forceinline__ double | ReduceStep (double input, cub::Sum, int last_lane, int offset) | 
| Reduction (specialized for summation across double types)   | |
| template<typename ValueT , typename KeyT > | |
| __device__ __forceinline__ KeyValuePair< KeyT, ValueT > | ReduceStep (KeyValuePair< KeyT, ValueT > input, SwizzleScanOp< ReduceByKeyOp< cub::Sum > >, int last_lane, int offset) | 
| Reduction (specialized for swizzled ReduceByKeyOp<cub::Sum> across KeyValuePair<KeyT, ValueT> types)   | |
| template<typename ValueT , typename OffsetT > | |
| __device__ __forceinline__ KeyValuePair< OffsetT, ValueT > | ReduceStep (KeyValuePair< OffsetT, ValueT > input, SwizzleScanOp< ReduceBySegmentOp< cub::Sum > >, int last_lane, int offset) | 
| Reduction (specialized for swizzled ReduceBySegmentOp<cub::Sum> across KeyValuePair<OffsetT, ValueT> types)   | |
| template<typename _T , typename ReductionOp > | |
| __device__ __forceinline__ _T | ReduceStep (_T input, ReductionOp reduction_op, int last_lane, int offset) | 
| Reduction step (generic)   | |
| template<typename _T , typename ReductionOp > | |
| __device__ __forceinline__ _T | ReduceStep (_T input, ReductionOp reduction_op, int last_lane, int offset, Int2Type< true >) | 
| Reduction step (specialized for small unsigned integers of size 32 bits or less)   | |
| template<typename _T , typename ReductionOp > | |
| __device__ __forceinline__ _T | ReduceStep (_T input, ReductionOp reduction_op, int last_lane, int offset, Int2Type< false >) | 
| Reduction step (specialized for types other than small unsigned integers of size 32 bits or less)   | |
| template<typename ReductionOp , int STEP> | |
| __device__ __forceinline__ void | ReduceStep (T &input, ReductionOp reduction_op, int last_lane, Int2Type< STEP >) | 
| template<typename ReductionOp > | |
| __device__ __forceinline__ void | ReduceStep (T &, ReductionOp, int, Int2Type< STEPS >) | 
| template<bool ALL_LANES_VALID, typename ReductionOp > | |
| __device__ __forceinline__ T | Reduce (T input, int valid_items, ReductionOp reduction_op) | 
| Reduction.   | |
| template<bool HEAD_SEGMENTED, typename FlagT , typename ReductionOp > | |
| __device__ __forceinline__ T | SegmentedReduce (T input, FlagT flag, ReductionOp reduction_op) | 
| Segmented reduction.   | |
Data Fields | |
| unsigned int | lane_id | 
| Lane index in logical warp.   | |
| unsigned int | warp_id | 
| Logical warp index in 32-thread physical warp.   | |
| unsigned int | member_mask | 
| 32-thread physical warp member mask of logical warp   | |
| typedef NullType cub::WarpReduceShfl< T, LOGICAL_WARP_THREADS, PTX_ARCH >::TempStorage | 
Shared memory storage layout type.
Definition at line 91 of file warp_reduce_shfl.cuh.
Definition at line 64 of file warp_reduce_shfl.cuh.
      
  | 
  inline | 
Constructor.
Definition at line 113 of file warp_reduce_shfl.cuh.
      
  | 
  inline | 
Reduction.
| [in] | input | Calling thread's input | 
| [in] | valid_items | Total number of valid items across the logical warp | 
| [in] | reduction_op | Binary reduction operator | 
Definition at line 466 of file warp_reduce_shfl.cuh.
      
  | 
  inline | 
Reduction step (generic)
| [in] | input | Calling thread's input item. | 
| [in] | reduction_op | Binary reduction operator | 
| [in] | last_lane | Index of last lane in segment | 
| [in] | offset | Up-offset to pull from | 
Definition at line 389 of file warp_reduce_shfl.cuh.
      
  | 
  inline | 
Reduction step (specialized for types other than small unsigned integers of size 32 bits or less)
| [in] | input | Calling thread's input item. | 
| [in] | reduction_op | Binary reduction operator | 
| [in] | last_lane | Index of last lane in segment | 
| [in] | offset | Up-offset to pull from | 
Definition at line 422 of file warp_reduce_shfl.cuh.
      
  | 
  inline | 
Reduction step (specialized for small unsigned integers of size 32 bits or less)
| [in] | input | Calling thread's input item. | 
| [in] | reduction_op | Binary reduction operator | 
| [in] | last_lane | Index of last lane in segment | 
| [in] | offset | Up-offset to pull from | 
Definition at line 409 of file warp_reduce_shfl.cuh.
      
  | 
  inline | 
Reduction (specialized for summation across double types)
| [in] | input | Calling thread's input item. | 
| [in] | last_lane | Index of last lane in segment | 
| [in] | offset | Up-offset to pull from | 
Definition at line 293 of file warp_reduce_shfl.cuh.
      
  | 
  inline | 
Reduction (specialized for summation across fp32 types)
| [in] | input | Calling thread's input item. | 
| [in] | last_lane | Index of last lane in segment | 
| [in] | offset | Up-offset to pull from | 
Definition at line 171 of file warp_reduce_shfl.cuh.
      
  | 
  inline | 
Reduction (specialized for swizzled ReduceByKeyOp<cub::Sum> across KeyValuePair<KeyT, ValueT> types)
| [in] | input | Calling thread's input item. | 
| [in] | last_lane | Index of last lane in segment | 
| [in] | offset | Up-offset to pull from | 
Definition at line 341 of file warp_reduce_shfl.cuh.
      
  | 
  inline | 
Reduction (specialized for swizzled ReduceBySegmentOp<cub::Sum> across KeyValuePair<OffsetT, ValueT> types)
| [in] | input | Calling thread's input item. | 
| [in] | last_lane | Index of last lane in segment | 
| [in] | offset | Up-offset to pull from | 
Definition at line 369 of file warp_reduce_shfl.cuh.
      
  | 
  inline | 
Reduction (specialized for summation across long long types)
| [in] | input | Calling thread's input item. | 
| [in] | last_lane | Index of last lane in segment | 
| [in] | offset | Up-offset to pull from | 
Definition at line 250 of file warp_reduce_shfl.cuh.
      
  | 
  inline | 
Definition at line 450 of file warp_reduce_shfl.cuh.
      
  | 
  inline | 
| [in] | input | Calling thread's input item. | 
| [in] | reduction_op | Binary reduction operator | 
| [in] | last_lane | Index of last lane in segment | 
Definition at line 438 of file warp_reduce_shfl.cuh.
      
  | 
  inline | 
Reduction (specialized for summation across uint32 types)
| [in] | input | Calling thread's input item. | 
| [in] | last_lane | Index of last lane in segment | 
| [in] | offset | Up-offset to pull from | 
Definition at line 134 of file warp_reduce_shfl.cuh.
      
  | 
  inline | 
Reduction (specialized for summation across unsigned long long types)
| [in] | input | Calling thread's input item. | 
| [in] | last_lane | Index of last lane in segment | 
| [in] | offset | Up-offset to pull from | 
Definition at line 208 of file warp_reduce_shfl.cuh.
      
  | 
  inline | 
Segmented reduction.
| [in] | input | Calling thread's input | 
| [in] | flag | Whether or not the current lane is a segment head/tail | 
| [in] | reduction_op | Binary reduction operator | 
Definition at line 496 of file warp_reduce_shfl.cuh.
| unsigned int cub::WarpReduceShfl< T, LOGICAL_WARP_THREADS, PTX_ARCH >::lane_id | 
Lane index in logical warp.
Definition at line 99 of file warp_reduce_shfl.cuh.
| unsigned int cub::WarpReduceShfl< T, LOGICAL_WARP_THREADS, PTX_ARCH >::member_mask | 
32-thread physical warp member mask of logical warp
Definition at line 105 of file warp_reduce_shfl.cuh.
| unsigned int cub::WarpReduceShfl< T, LOGICAL_WARP_THREADS, PTX_ARCH >::warp_id | 
Logical warp index in 32-thread physical warp.
Definition at line 102 of file warp_reduce_shfl.cuh.