|
template<typename ChainedPolicyT , typename InputIteratorT , typename OutputIteratorT , typename OffsetT , typename ReductionOpT > |
| cub::__launch_bounds__ (int(ChainedPolicyT::ActivePolicy::ReducePolicy::BLOCK_THREADS)) __global__ void DeviceReduceKernel(InputIteratorT d_in |
| Binary reduction functor type having member T operator()(const T &a, const T &b)
|
|
| cub::if (threadIdx.x==0) d_out[blockIdx.x] = reduction_op(init, block_aggregate) |
|
template<typename ChainedPolicyT , typename InputIteratorT , typename OutputIteratorT , typename OffsetT , typename ReductionOpT , typename OuputT > |
| cub::__launch_bounds__ (int(ChainedPolicyT::ActivePolicy::SingleTilePolicy::BLOCK_THREADS), 1) __global__ void DeviceReduceSingleTileKernel(InputIteratorT d_in |
| Data element type that is convertible to the value type of OutputIteratorT
|
|
| cub::if (num_items==0) |
|
template<typename T , typename OffsetT , typename IteratorT > |
__device__ __forceinline__ void | cub::NormalizeReductionOutput (T &, OffsetT, IteratorT) |
| Normalize input iterator to segment offset.
|
|
template<typename KeyValuePairT , typename OffsetT , typename WrappedIteratorT , typename OutputValueT > |
__device__ __forceinline__ void | cub::NormalizeReductionOutput (KeyValuePairT &val, OffsetT base_offset, ArgIndexInputIterator< WrappedIteratorT, OffsetT, OutputValueT >) |
| Normalize input iterator to segment offset (specialized for arg-index)
|
|
template<typename ChainedPolicyT , typename InputIteratorT , typename OutputIteratorT , typename OffsetIteratorT , typename OffsetT , typename ReductionOpT , typename OutputT > |
| cub::__launch_bounds__ (int(ChainedPolicyT::ActivePolicy::ReducePolicy::BLOCK_THREADS)) __global__ void DeviceSegmentedReduceKernel(InputIteratorT d_in |
| Data element type that is convertible to the value type of OutputIteratorT
|
|
| cub::if (segment_begin==segment_end) |
|
| cub::NormalizeReductionOutput (block_aggregate, segment_begin, d_in) |
|
cub::DeviceReduce provides device-wide, parallel operations for computing a reduction across a sequence of data items residing within device-accessible memory.
Definition in file dispatch_reduce.cuh.