#include "../thread/thread_operators.cuh"
#include "../util_arch.cuh"
#include "../util_type.cuh"
#include "../util_namespace.cuh"
template <typename T, int LOGICAL_WARP_THREADS = CUB_PTX_WARP_THREADS, int PTX_ARCH = CUB_PTX_ARCH>
class WarpReduce
__device__ __forceinline__ T Sum(T input)
__device__ __forceinline__ T Sum(T input, int valid_items)
template <typename ReductionOp>
__device__ __forceinline__ T Reduce(T input, ReductionOp reduction_op)

template <typename ReductionOp>
__device__ __forceinline__ T Reduce(T input, ReductionOp reduction_op, int valid_items)

template <typename ReductionOp, typename FlagT>
__device__ __forceinline__ T HeadSegmentedReduce(T input, FlagT head_flag, ReductionOp reduction_op)

template <typename ReductionOp, typename FlagT>
__device__ __forceinline__ T TailSegmentedReduce(T input, FlagT tail_flag, ReductionOp reduction_op)
The WarpReduce class provides collective methods for computing a parallel reduction of items partitioned across a CUDA thread warp.
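For illustration, a minimal usage sketch (the kernel name, block shape, and buffer names are assumptions, not part of the header): each of a 128-thread block's four warps sums the items held by its threads.

#include <cub/warp/warp_reduce.cuh>

__global__ void WarpSumKernel(int *d_in, int *d_out)
{
    // Specialize WarpReduce for int across full 32-thread warps
    typedef cub::WarpReduce<int> WarpReduce;

    // One TempStorage per warp in a 128-thread block
    __shared__ typename WarpReduce::TempStorage temp_storage[4];

    int warp_id = threadIdx.x / 32;
    int thread_data = d_in[blockIdx.x * blockDim.x + threadIdx.x];

    // Warp-wide sum; the result is valid only in each warp's lane 0
    int aggregate = WarpReduce(temp_storage[warp_id]).Sum(thread_data);

    if (threadIdx.x % 32 == 0)
        d_out[blockIdx.x * 4 + warp_id] = aggregate;
}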
__device__ __forceinline__ T Reduce(T input, ReductionOp reduction_op)
Computes a warp-wide reduction in the calling warp using the specified binary reduction functor. The output is valid in warp lane 0.
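A short sketch of this generic overload using the cub::Max() functor from thread_operators.cuh (the kernel and buffer names are illustrative, assuming a single-warp launch):

#include <cub/warp/warp_reduce.cuh>

__global__ void WarpMaxKernel(int *d_in, int *d_out)
{
    typedef cub::WarpReduce<int> WarpReduce;
    __shared__ typename WarpReduce::TempStorage temp_storage;

    int thread_data = d_in[threadIdx.x];

    // Generic reduction with a binary functor; result valid only in lane 0
    int warp_max = WarpReduce(temp_storage).Reduce(thread_data, cub::Max());

    if (threadIdx.x == 0)
        *d_out = warp_max;
}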
__device__ __forceinline__ T HeadSegmentedSum(T input, FlagT head_flag)
Computes a segmented sum in the calling warp where segments are defined by head-flags. The sum of each segment is returned to the first lane in that segment (which always includes lane 0).
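Sketch of head-flagged segmentation, assuming a single-warp launch and illustrative buffer names; a flag of 1 marks the first item of each segment:

#include <cub/warp/warp_reduce.cuh>

__global__ void SegmentedSumKernel(int *d_in, int *d_flags, int *d_out)
{
    typedef cub::WarpReduce<int> WarpReduce;
    __shared__ typename WarpReduce::TempStorage temp_storage;

    int thread_data = d_in[threadIdx.x];
    int head_flag   = d_flags[threadIdx.x];  // 1 at each segment head, else 0

    // Each segment's sum is returned to that segment's first lane
    int segment_sum = WarpReduce(temp_storage).HeadSegmentedSum(thread_data, head_flag);

    if (head_flag)
        d_out[threadIdx.x] = segment_sum;
}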
__device__ __forceinline__ T HeadSegmentedReduce(T input, FlagT head_flag, ReductionOp reduction_op)
Computes a segmented reduction in the calling warp where segments are defined by head-flags. The reduction of each segment is returned to the first lane in that segment (which always includes lane 0).
__device__ __forceinline__ T TailSegmentedSum(T input, FlagT tail_flag)
Computes a segmented sum in the calling warp where segments are defined by tail-flags. The sum of each segment is returned to the first lane in that segment (which always includes lane 0).
IS_POW_OF_TWO
Whether the logical warp size is a power-of-two.
IS_ARCH_WARP
Whether the logical warp size and the PTX warp size coincide.
_TempStorage & temp_storage
Shared storage reference.
typedef InternalWarpReduce::TempStorage _TempStorage
Shared memory storage layout type for WarpReduce.
__device__ __forceinline__ T TailSegmentedReduce(T input, FlagT tail_flag, ReductionOp reduction_op)
Computes a segmented reduction in the calling warp where segments are defined by tail-flags. The reduction of each segment is returned to the first lane in that segment (which always includes lane 0).
__device__ __forceinline__ T Sum(T input)
Computes a warp-wide sum in the calling warp. The output is valid in warp lane 0.
__device__ __forceinline__ WarpReduce(TempStorage &temp_storage)
Collective constructor using the specified memory allocation as temporary storage. Logical warp and lane identifiers are constructed from threadIdx.x.
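The constructor also supports sub-warp-sized logical warps via LOGICAL_WARP_THREADS; below is a sketch with four logical warps of 16 threads in a 64-thread block (kernel and buffer names are assumptions). Each logical warp needs its own TempStorage entry, indexed by its warp id.

#include <cub/warp/warp_reduce.cuh>

__global__ void LogicalWarpSumKernel(int *d_in, int *d_out)
{
    // Four independent logical "warps" of 16 threads each
    typedef cub::WarpReduce<int, 16> WarpReduce;
    __shared__ typename WarpReduce::TempStorage temp_storage[4];

    int warp_id = threadIdx.x / 16;

    // The collective constructor binds this thread to its logical warp's storage
    int aggregate = WarpReduce(temp_storage[warp_id]).Sum(d_in[threadIdx.x]);

    if (threadIdx.x % 16 == 0)
        d_out[warp_id] = aggregate;
}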
__device__ __forceinline__ T Reduce(T input, ReductionOp reduction_op, int valid_items)
Computes a partially-full warp-wide reduction in the calling warp using the specified binary reduction functor. The output is valid in warp lane 0.
__device__ __forceinline__ T Sum(T input, int valid_items)
Computes a partially-full warp-wide sum in the calling warp. The output is valid in warp lane 0.
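Sketch of a partially-full warp sum where only the first valid_items lanes carry data (kernel shape and names are illustrative):

#include <cub/warp/warp_reduce.cuh>

__global__ void PartialSumKernel(int *d_in, int *d_out, int valid_items)
{
    typedef cub::WarpReduce<int> WarpReduce;
    __shared__ typename WarpReduce::TempStorage temp_storage;

    // Guarded load: lanes at or past valid_items hold no real data
    int thread_data = (threadIdx.x < valid_items) ? d_in[threadIdx.x] : 0;

    // Only the first valid_items inputs participate; result valid in lane 0
    int aggregate = WarpReduce(temp_storage).Sum(thread_data, valid_items);

    if (threadIdx.x == 0)
        *d_out = aggregate;
}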
typedef typename If<(PTX_ARCH >= 300) && IS_POW_OF_TWO, WarpReduceShfl<T, LOGICAL_WARP_THREADS, PTX_ARCH>, WarpReduceSmem<T, LOGICAL_WARP_THREADS, PTX_ARCH>>::Type InternalWarpReduce
Internal specialization. Use SHFL-based reduction if the architecture is >= SM30 and LOGICAL_WARP_THREADS is a power-of-two; otherwise fall back to the smem-based variant.
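The same If<Condition, ThenType, ElseType>::Type selection, written out for one concrete instantiation (the resulting typedef name is illustrative):

#include <cub/cub.cuh>

// With T = int and LOGICAL_WARP_THREADS = 32 (a power of two), an SM30+
// device pass selects the SHFL path; otherwise the smem path is chosen.
typedef cub::If<
    (CUB_PTX_ARCH >= 300) && cub::PowerOfTwo<32>::VALUE,
    cub::WarpReduceShfl<int, 32, CUB_PTX_ARCH>,
    cub::WarpReduceSmem<int, 32, CUB_PTX_ARCH> >::Type InternalReduce32;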
Optional outer namespace(s)
ReductionOpT reduction_op
[in] Binary reduction functor
Alias wrapper allowing storage to be unioned.
Type selection (IF ? ThenType : ElseType)
Statically determine if N is a power-of-two.
A storage-backing wrapper that allows types with non-trivial constructors to be aliased in unions.
WarpReduceShfl provides SHFL-based variants of parallel reduction of items partitioned across a CUDA thread warp.
WarpReduceSmem provides smem-based variants of parallel reduction of items partitioned across a CUDA thread warp.
#define CUB_PTX_ARCH
CUB_PTX_ARCH reflects the PTX version targeted by the active compiler pass (or zero during the host pass).
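Illustrative use of the macro to gate architecture-specific code at compile time:

#if (CUB_PTX_ARCH >= 300)
    // Device pass for SM30+: warp-shuffle based paths are usable here
#else
    // Host pass (CUB_PTX_ARCH == 0) or a pre-SM30 device target
#endif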