|
| template<typename ChainedPolicyT , typename InputIteratorT , typename OutputIteratorT , typename OffsetT , typename ReductionOpT > |
| | cub::__launch_bounds__ (int(ChainedPolicyT::ActivePolicy::ReducePolicy::BLOCK_THREADS)) __global__ void DeviceReduceKernel(InputIteratorT d_in |
| | ReductionOpT: binary reduction functor type having member T operator()(const T &a, const T &b)
|
| |
|
| cub::if (threadIdx.x==0) d_out[blockIdx.x] = reduction_op(init, block_aggregate) |
| |
| template<typename ChainedPolicyT , typename InputIteratorT , typename OutputIteratorT , typename OffsetT , typename ReductionOpT , typename OutputT > |
| | cub::__launch_bounds__ (int(ChainedPolicyT::ActivePolicy::SingleTilePolicy::BLOCK_THREADS), 1) __global__ void DeviceReduceSingleTileKernel(InputIteratorT d_in |
| | Final template parameter: data element type that is convertible to the value type of OutputIteratorT
|
| |
| | cub::if (num_items==0) |
| |
| template<typename T , typename OffsetT , typename IteratorT > |
| __device__ __forceinline__ void | cub::NormalizeReductionOutput (T &, OffsetT, IteratorT) |
| | Normalize input iterator to segment offset.
|
| |
| template<typename KeyValuePairT , typename OffsetT , typename WrappedIteratorT , typename OutputValueT > |
| __device__ __forceinline__ void | cub::NormalizeReductionOutput (KeyValuePairT &val, OffsetT base_offset, ArgIndexInputIterator< WrappedIteratorT, OffsetT, OutputValueT >) |
| | Normalize input iterator to segment offset (specialized for arg-index)
|
| |
| template<typename ChainedPolicyT , typename InputIteratorT , typename OutputIteratorT , typename OffsetIteratorT , typename OffsetT , typename ReductionOpT , typename OutputT > |
| | cub::__launch_bounds__ (int(ChainedPolicyT::ActivePolicy::ReducePolicy::BLOCK_THREADS)) __global__ void DeviceSegmentedReduceKernel(InputIteratorT d_in |
| | OutputT: data element type that is convertible to the value type of OutputIteratorT
|
| |
| | cub::if (segment_begin==segment_end) |
| |
|
| cub::NormalizeReductionOutput (block_aggregate, segment_begin, d_in) |
| |
cub::DeviceReduce provides device-wide, parallel operations for computing a reduction across a sequence of data items residing within device-accessible memory.
Definition in file dispatch_reduce.cuh.