#include "../util_ptx.cuh"
#include "../util_type.cuh"
#include "../thread/thread_operators.cuh"
#include "../util_namespace.cuh"
244 RakingCommutativeOnly,
259 return private_storage;
347 template <
typename ReductionOp>
393 int ITEMS_PER_THREAD,
394 typename ReductionOp>
396 T (&inputs)[ITEMS_PER_THREAD],
439 template <
typename ReductionOp>
497 __device__ __forceinline__ T
Sum(
538 template <
int ITEMS_PER_THREAD>
539 __device__ __forceinline__ T
Sum(
540 T (&inputs)[ITEMS_PER_THREAD])
582 __device__ __forceinline__ T
Sum(
The BlockReduce class provides collective methods for computing a parallel reduction of items partiti...
__device__ __forceinline__ T Reduce(T input, ReductionOp reduction_op)
Computes a block-wide reduction for thread0 using the specified binary reduction functor....
__device__ __forceinline__ T Reduce(T input, ReductionOp reduction_op, int num_valid)
Computes a block-wide reduction for thread0 using the specified binary reduction functor....
unsigned int linear_tid
Linear thread-id.
__device__ __forceinline__ BlockReduce()
Collective constructor using a private static allocation of shared memory as temporary storage.
__device__ __forceinline__ T Sum(T input, int num_valid)
Computes a block-wide reduction for thread0 using addition (+) as the reduction operator....
__device__ __forceinline__ BlockReduce(TempStorage &temp_storage)
Collective constructor using the specified memory allocation as temporary storage.
_TempStorage & temp_storage
Shared storage reference.
__device__ __forceinline__ T Sum(T input)
Computes a block-wide reduction for thread0 using addition (+) as the reduction operator....
__device__ __forceinline__ T Reduce(T(&inputs)[ITEMS_PER_THREAD], ReductionOp reduction_op)
Computes a block-wide reduction for thread0 using the specified binary reduction functor....
__device__ __forceinline__ _TempStorage & PrivateStorage()
Internal storage allocator.
@ BLOCK_THREADS
The thread block size in threads.
InternalBlockReduce::TempStorage _TempStorage
Shared memory storage layout type for BlockReduce.
__device__ __forceinline__ T Sum(T(&inputs)[ITEMS_PER_THREAD])
Computes a block-wide reduction for thread0 using addition (+) as the reduction operator....
If<(ALGORITHM == BLOCK_REDUCE_WARP_REDUCTIONS), WarpReductions, typename If<(ALGORITHM == BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY), RakingCommutativeOnly, Raking>::Type>::Type InternalBlockReduce
Internal specialization type.
__device__ __forceinline__ int RowMajorTid(int block_dim_x, int block_dim_y, int block_dim_z)
Returns the row-major linear thread identifier for a multidimensional thread block.
__device__ __forceinline__ T ThreadReduce(T *input, ReductionOp reduction_op, T prefix, Int2Type< LENGTH >)
Optional outer namespace(s)
OutputIteratorT OffsetT GridEvenShare< OffsetT > ReductionOpT reduction_op
[in] Binary reduction functor
@ BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY
@ BLOCK_REDUCE_WARP_REDUCTIONS
Alias wrapper allowing storage to be unioned.
BlockReduceRakingCommutativeOnly provides raking-based methods of parallel reduction across a CUDA th...
BlockReduceRaking provides raking-based methods of parallel reduction across a CUDA thread block....
BlockReduceWarpReductions provides variants of warp-reduction-based parallel reduction across a CUDA ...
\smemstorage{BlockReduce}
Type selection (IF ? ThenType : ElseType)
A storage-backing wrapper that allows types with non-trivial constructors to be aliased in unions.
#define CUB_PTX_ARCH
CUB_PTX_ARCH reflects the PTX version targeted by the active compiler pass (or zero during the host p...