#include "../util_ptx.cuh"
#include "../util_type.cuh"
#include "../thread/thread_operators.cuh"
#include "../util_namespace.cuh"
Reduction interface of cub::BlockReduce:

template <typename ReductionOp>
__device__ __forceinline__ T Reduce(T input, ReductionOp reduction_op)

template <int ITEMS_PER_THREAD, typename ReductionOp>
__device__ __forceinline__ T Reduce(T (&inputs)[ITEMS_PER_THREAD], ReductionOp reduction_op)

template <typename ReductionOp>
__device__ __forceinline__ T Reduce(T input, ReductionOp reduction_op, int num_valid)

__device__ __forceinline__ T Sum(T input)

template <int ITEMS_PER_THREAD>
__device__ __forceinline__ T Sum(T (&inputs)[ITEMS_PER_THREAD])

__device__ __forceinline__ T Sum(T input, int num_valid)
 
The BlockReduce class provides collective methods for computing a parallel reduction of items partitioned across a CUDA thread block.
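
A minimal usage sketch of the collective; the kernel name BlockSumKernel, the 128-thread block size, and the int item type are illustrative assumptions, not part of the header:

#include <cub/block/block_reduce.cuh>

// Each block of 128 threads reduces 128 consecutive ints and writes one partial sum.
__global__ void BlockSumKernel(const int *d_in, int *d_out)
{
    // Specialize BlockReduce for 128 threads owning one int apiece
    typedef cub::BlockReduce<int, 128> BlockReduce;

    // Shared memory required by the collective
    __shared__ typename BlockReduce::TempStorage temp_storage;

    // Each thread contributes one element
    int thread_data = d_in[blockIdx.x * 128 + threadIdx.x];

    // The block-wide aggregate is only valid in thread0
    int block_sum = BlockReduce(temp_storage).Sum(thread_data);

    if (threadIdx.x == 0)
        d_out[blockIdx.x] = block_sum;
}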
 
__device__ __forceinline__ T Reduce(T input, ReductionOp reduction_op)
Computes a block-wide reduction for thread₀ using the specified binary reduction functor. Each thread contributes one input element.
 
__device__ __forceinline__ T Reduce(T input, ReductionOp reduction_op, int num_valid)
Computes a block-wide reduction for thread₀ using the specified binary reduction functor. The first num_valid threads each contribute one input element.
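
A sketch of the functor-based overloads; the kernel name, buffers, and the choice of cub::Max() are illustrative assumptions:

#include <cub/block/block_reduce.cuh>

__global__ void BlockMaxKernel(const int *d_in, int *d_out, int num_valid)
{
    typedef cub::BlockReduce<int, 128> BlockReduce;
    __shared__ typename BlockReduce::TempStorage temp_storage;

    int thread_data = d_in[blockIdx.x * 128 + threadIdx.x];

    // Block-wide maximum using the cub::Max() functor (result valid only in thread0)
    int block_max = BlockReduce(temp_storage).Reduce(thread_data, cub::Max());

    // Overload for partially-full tiles: only the first num_valid threads contribute
    // block_max = BlockReduce(temp_storage).Reduce(thread_data, cub::Max(), num_valid);

    if (threadIdx.x == 0)
        d_out[blockIdx.x] = block_max;
}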
 
unsigned int linear_tid
Linear thread-id.
 
__device__ __forceinline__ BlockReduce()
Collective constructor using a private static allocation of shared memory as temporary storage.
 
__device__ __forceinline__ T Sum(T input, int num_valid)
Computes a block-wide reduction for thread₀ using addition (+) as the reduction operator. The first num_valid threads each contribute one input element.
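
A sketch of guarding the last, partially-full tile with the num_valid overload; the bounds logic and names are assumptions:

#include <cub/block/block_reduce.cuh>

__global__ void PartialTileSumKernel(const int *d_in, int *d_out, int total_items)
{
    typedef cub::BlockReduce<int, 128> BlockReduce;
    __shared__ typename BlockReduce::TempStorage temp_storage;

    int offset    = blockIdx.x * 128 + threadIdx.x;
    int num_valid = min(128, total_items - blockIdx.x * 128);

    // Out-of-range threads still take part in the collective, but only the
    // first num_valid contributions are folded into the result
    int thread_data = (offset < total_items) ? d_in[offset] : 0;

    int block_sum = BlockReduce(temp_storage).Sum(thread_data, num_valid);

    if (threadIdx.x == 0)
        d_out[blockIdx.x] = block_sum;
}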
 
__device__ __forceinline__ BlockReduce(TempStorage &temp_storage)
Collective constructor using the specified memory allocation as temporary storage.
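
The two construction modes side by side, as a sketch (kernel and variable names are illustrative): the default constructor falls back on PrivateStorage(), while the explicit form lets the caller place, alias, or union the allocation.

#include <cub/block/block_reduce.cuh>

__global__ void ConstructionSketchKernel(const int *d_in, int *d_out)
{
    typedef cub::BlockReduce<int, 128> BlockReduce;

    int thread_data = d_in[blockIdx.x * 128 + threadIdx.x];

    // (1) Default constructor: temporary storage comes from a private static
    //     shared memory allocation made by the collective itself
    int sum_private = BlockReduce().Sum(thread_data);

    // (2) Explicit constructor: the caller owns the allocation and may union
    //     it with other shared storage to shrink the block's footprint
    __shared__ typename BlockReduce::TempStorage temp_storage;
    int sum_explicit = BlockReduce(temp_storage).Sum(thread_data);

    if (threadIdx.x == 0)
        d_out[blockIdx.x] = sum_private + sum_explicit;
}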
 
_TempStorage & temp_storage
Shared storage reference.
 
__device__ __forceinline__ T Sum(T input)
Computes a block-wide reduction for thread₀ using addition (+) as the reduction operator. Each thread contributes one input element.
 
__device__ __forceinline__ T Reduce(T (&inputs)[ITEMS_PER_THREAD], ReductionOp reduction_op)
Computes a block-wide reduction for thread₀ using the specified binary reduction functor. Each thread contributes an array of consecutive input elements.
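
A sketch of the array-valued Reduce, assuming a blocked arrangement of four items per thread and cub::Max(); names and tile shape are illustrative:

#include <cub/block/block_reduce.cuh>

__global__ void BlockedMaxKernel(const int *d_in, int *d_out)
{
    enum { BLOCK_THREADS = 128, ITEMS_PER_THREAD = 4 };
    typedef cub::BlockReduce<int, BLOCK_THREADS> BlockReduce;
    __shared__ typename BlockReduce::TempStorage temp_storage;

    // Each thread owns a blocked segment of consecutive items
    int items[ITEMS_PER_THREAD];
    int base = (blockIdx.x * BLOCK_THREADS + threadIdx.x) * ITEMS_PER_THREAD;
    for (int i = 0; i < ITEMS_PER_THREAD; ++i)
        items[i] = d_in[base + i];

    // Maximum across all BLOCK_THREADS * ITEMS_PER_THREAD items (valid in thread0 only)
    int block_max = BlockReduce(temp_storage).Reduce(items, cub::Max());

    if (threadIdx.x == 0)
        d_out[blockIdx.x] = block_max;
}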
 
__device__ __forceinline__ _TempStorage & PrivateStorage()
Internal storage allocator.
 
BLOCK_THREADS
The thread block size in threads.
 
InternalBlockReduce::TempStorage _TempStorage
Shared memory storage layout type for BlockReduce.
 
__device__ __forceinline__ T Sum(T (&inputs)[ITEMS_PER_THREAD])
Computes a block-wide reduction for thread₀ using addition (+) as the reduction operator. Each thread contributes an array of consecutive input elements.
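
The same pattern with Sum over per-thread arrays; computing the items in registers (a sum of squares here) is a sketch-level assumption, not something the header requires:

#include <cub/block/block_reduce.cuh>

__global__ void SumOfSquaresKernel(const float *d_in, float *d_out)
{
    enum { BLOCK_THREADS = 256, ITEMS_PER_THREAD = 2 };
    typedef cub::BlockReduce<float, BLOCK_THREADS> BlockReduce;
    __shared__ typename BlockReduce::TempStorage temp_storage;

    // Square each loaded value before the block-wide addition
    float items[ITEMS_PER_THREAD];
    int base = (blockIdx.x * BLOCK_THREADS + threadIdx.x) * ITEMS_PER_THREAD;
    for (int i = 0; i < ITEMS_PER_THREAD; ++i)
    {
        float x = d_in[base + i];
        items[i] = x * x;
    }

    // Addition folds each thread's array first, then the partials across the block
    float block_sum = BlockReduce(temp_storage).Sum(items);

    if (threadIdx.x == 0)
        d_out[blockIdx.x] = block_sum;
}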
 
If<(ALGORITHM == BLOCK_REDUCE_WARP_REDUCTIONS), WarpReductions, typename If<(ALGORITHM == BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY), RakingCommutativeOnly, Raking>::Type>::Type InternalBlockReduce
Internal specialization type.
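
How the ALGORITHM template parameter drives that selection, as a sketch; the typedef names are illustrative, while the enumerators are the ones listed below:

#include <cub/block/block_reduce.cuh>

// Same collective interface, different internal specialization
typedef cub::BlockReduce<int, 128, cub::BLOCK_REDUCE_WARP_REDUCTIONS>         WarpReductionsReduceT;
typedef cub::BlockReduce<int, 128, cub::BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY> CommutativeRakingReduceT;
typedef cub::BlockReduce<int, 128, cub::BLOCK_REDUCE_RAKING>                  RakingReduceT;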
 
__device__ __forceinline__ int RowMajorTid(int block_dim_x, int block_dim_y, int block_dim_z)
Returns the row-major linear thread identifier for a multidimensional thread block.
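
A sketch of what RowMajorTid computes for a multidimensional block; the kernel and the written-out equivalence are illustrative assumptions:

#include <cub/util_ptx.cuh>

__global__ void LinearTidKernel(int *d_tids)
{
    // x varies fastest, then y, then z
    int linear_tid = cub::RowMajorTid(blockDim.x, blockDim.y, blockDim.z);

    // Written out, this is equivalent to:
    //   threadIdx.z * (blockDim.x * blockDim.y) + threadIdx.y * blockDim.x + threadIdx.x
    int block_size = blockDim.x * blockDim.y * blockDim.z;
    d_tids[blockIdx.x * block_size + linear_tid] = linear_tid;
}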
 
__device__ __forceinline__ T ThreadReduce(T *input, ReductionOp reduction_op, T prefix, Int2Type< LENGTH >)
Sequential reduction over LENGTH array elements, seeded with the prefix value (internal per-thread helper).
 
CUB_NS_PREFIX / CUB_NS_POSTFIX
Optional outer namespace(s).
 
BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY
An efficient raking-based reduction that supports only commutative reduction operators.
 
BLOCK_REDUCE_WARP_REDUCTIONS
A warp-reduction-based algorithm that supports both commutative and non-commutative reduction operators.
 
struct TempStorage (internal reduction specializations)
Alias wrapper allowing storage to be unioned.
 
BlockReduceRakingCommutativeOnly provides raking-based methods of parallel reduction across a CUDA thread block.
 
BlockReduceRaking provides raking-based methods of parallel reduction across a CUDA thread block.
 
BlockReduceWarpReductions provides variants of warp-reduction-based parallel reduction across a CUDA thread block.
 
struct TempStorage
The temporary shared memory allocation type required by BlockReduce operations; it can be declared with __shared__, aliased to externally allocated memory, or union'd with other storage to facilitate reuse.
 
If<IF, ThenType, ElseType>
Type selection (IF ? ThenType : ElseType)
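
A small sketch of the If metafunction; the CounterTraits name and the 64-bit/32-bit choice are illustrative:

#include <cub/util_type.cuh>

template <bool USE_64_BIT>
struct CounterTraits
{
    // Picks long long when USE_64_BIT is true, int otherwise
    typedef typename cub::If<USE_64_BIT, long long, int>::Type CounterT;
};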
 
Uninitialized<T>
A storage-backing wrapper that allows types with non-trivial constructors to be aliased in unions.
 
#define CUB_PTX_ARCH
CUB_PTX_ARCH reflects the PTX version targeted by the active compiler pass (or zero during the host pass).
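
A sketch of compile-time tuning on CUB_PTX_ARCH; the policy struct and the 350 threshold are illustrative assumptions:

#include <cub/cub.cuh>

struct TuningPolicy
{
    enum
    {
        // Larger per-thread tiles when compiling a device pass for sm_35 or newer;
        // CUB_PTX_ARCH is zero during the host pass, so the host sees the fallback
        ITEMS_PER_THREAD = (CUB_PTX_ARCH >= 350) ? 4 : 2
    };
};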