39 #include "../util_ptx.cuh" 40 #include "../util_type.cuh" 41 #include "../thread/thread_operators.cuh" 42 #include "../util_namespace.cuh" 244 RakingCommutativeOnly,
259 return private_storage;
347 template <
typename ReductionOp>
393 int ITEMS_PER_THREAD,
394 typename ReductionOp>
396 T (&inputs)[ITEMS_PER_THREAD],
439 template <
typename ReductionOp>
497 __device__ __forceinline__ T
Sum(
538 template <
int ITEMS_PER_THREAD>
539 __device__ __forceinline__ T
Sum(
540 T (&inputs)[ITEMS_PER_THREAD])
582 __device__ __forceinline__ T
Sum(
__device__ __forceinline__ BlockReduce(TempStorage &temp_storage)
Collective constructor using the specified memory allocation as temporary storage.
__device__ __forceinline__ T ThreadReduce(T *input, ReductionOp reduction_op, T prefix, Int2Type< LENGTH >)
__device__ __forceinline__ T Reduce(T input, ReductionOp reduction_op)
Computes a block-wide reduction for thread 0 using the specified binary reduction functor....
Optional outer namespace(s)
The BlockReduce class provides collective methods for computing a parallel reduction of items partiti...
BlockReduceWarpReductions provides variants of warp-reduction-based parallel reduction across a CUDA ...
#define CUB_PTX_ARCH
CUB_PTX_ARCH reflects the PTX version targeted by the active compiler pass (or zero during the host p...
BlockReduceRaking provides raking-based methods of parallel reduction across a CUDA thread block....
__device__ __forceinline__ BlockReduce()
Collective constructor using a private static allocation of shared memory as temporary storage.
__device__ __forceinline__ T Sum(T input, int num_valid)
Computes a block-wide reduction for thread 0 using addition (+) as the reduction operator....
The thread block size in threads.
__device__ __forceinline__ T Reduce(T input, ReductionOp reduction_op, int num_valid)
Computes a block-wide reduction for thread 0 using the specified binary reduction functor....
InternalBlockReduce::TempStorage _TempStorage
Shared memory storage layout type for BlockReduce.
unsigned int linear_tid
Linear thread-id.
If<(ALGORITHM==BLOCK_REDUCE_WARP_REDUCTIONS), WarpReductions, typename If<(ALGORITHM==BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY), RakingCommutativeOnly, Raking >::Type >::Type InternalBlockReduce
Internal specialization type.
OutputIteratorT OffsetT GridEvenShare< OffsetT > ReductionOpT reduction_op
[in] Binary reduction functor
__device__ __forceinline__ T Sum(T(&inputs)[ITEMS_PER_THREAD])
Computes a block-wide reduction for thread 0 using addition (+) as the reduction operator....
__device__ __forceinline__ int RowMajorTid(int block_dim_x, int block_dim_y, int block_dim_z)
Returns the row-major linear thread identifier for a multidimensional thread block.
A storage-backing wrapper that allows types with non-trivial constructors to be aliased in unions.
__device__ __forceinline__ T Sum(T input)
Computes a block-wide reduction for thread 0 using addition (+) as the reduction operator....
__device__ __forceinline__ T Reduce(T(&inputs)[ITEMS_PER_THREAD], ReductionOp reduction_op)
Computes a block-wide reduction for thread 0 using the specified binary reduction functor....
__device__ __forceinline__ _TempStorage & PrivateStorage()
Internal storage allocator.
Alias wrapper allowing storage to be unioned.
Type selection (IF ? ThenType : ElseType)
\smemstorage{BlockReduce}
_TempStorage & temp_storage
Shared storage reference.
BlockReduceRakingCommutativeOnly provides raking-based methods of parallel reduction across a CUDA th...