8 #ifndef SEGREDUCE_OFP_HPP_
9 #define SEGREDUCE_OFP_HPP_
13 #include "util/cuda_launch.hpp"
14 #include "util/ofp_context.hpp"
16 #if CUDART_VERSION >= 11000
20 #include "hipcub/hipcub.hpp"
22 #include "cub/cub.cuh"
// segreduce: segmented reduction. For every segment, the elements of `input`
// that belong to it are combined with the binary functor `op` and a single
// result is written to the matching slot of `output`.
//
// NOTE(review): this listing is an extraction -- each line carries the number
// it had in the original header, and several original lines are not shown
// (the rest of the parameter list, braces, preprocessor branches, the
// declarations of `i` and `j`). Confirm every detail against the full header.
//
// Parameters visible here:
//   input        - indexable sequence of the values to reduce.
//   count        - used as the end bound of the last segment's element range.
//   segments     - indexable sequence of segment offsets; segments[i+1] is
//                  used as the end bound of segment i.
//   num_segments - number of segments (and of results written to `output`).
//   output       - indexable sequence receiving one reduced value per segment.
//   op           - binary functor, invoked as op(accumulated, element).
//   init         - forwarded to DeviceSegmentedReduce::Reduce as the last
//                  argument; its use in the plain-loop path is not visible
//                  in this listing.
// The body also uses `context` (getTemporalCUB()), which presumably is the
// trailing parameter cut off after `init,` -- TODO confirm.
32 template<
typename input_it,
33 typename segments_it,
typename output_it,
typename op_t,
typename type_t>
34 void segreduce(input_it input,
int count, segments_it segments,
35 int num_segments, output_it output, op_t op, type_t init,
// --- Plain-loop path --------------------------------------------------------
// Walk every segment except the last one; the last segment is handled
// separately further down because its end bound is `count`, not segments[i+1].
41 for ( ; i < num_segments - 1; i++)
// When j already equals the start of the next segment there is nothing to
// fold for segment i (presumably j was set to segments[i] on an omitted
// line -- TODO confirm), so move on to the next segment.
45 if (j == segments[i+1]) {
continue;}
// Fold the remaining elements of segment i into output[i].
48 for ( ; j < segments[i+1] ; j++)
50 output[i] = op(output[i],input[j]);
// Last segment: same fold, but bounded by the total element count.
60 for ( ; j < count ; j++)
62 output[i] = op(output[i],input[j]);
// --- hipCUB path ------------------------------------------------------------
// Reduce is called twice. The first call passes NULL as the temporary storage
// pointer; temp_storage_bytes (taken by reference by Reduce) is then used to
// size the scratch buffer -- presumably this first call only reports the
// required size, as is the CUB/hipCUB convention (verify against the docs).
69 size_t temp_storage_bytes = 0;
71 hipcub::DeviceSegmentedReduce::Reduce(NULL, temp_storage_bytes, input, output,
72 num_segments, segments, segments + 1, op, init);
// Scratch buffer obtained from the context and resized to temp_storage_bytes.
74 auto & temporal = context.getTemporalCUB();
75 temporal.resize(temp_storage_bytes);
// Second call: performs the reduction using the scratch buffer. `segments`
// and `segments + 1` are passed as the begin-offset and end-offset iterators.
77 hipcub::DeviceSegmentedReduce::Reduce(temporal.getDeviceBuffer<0>(), temp_storage_bytes, input, output,
78 num_segments, segments, segments + 1, op, init);
// --- Second backend path ----------------------------------------------------
// Same two-call sequence as above; the lines holding the call heads are
// missing from this listing (presumably the cub:: equivalents -- TODO confirm).
82 size_t temp_storage_bytes = 0;
85 num_segments, segments, segments + 1, op, init);
87 auto & temporal = context.getTemporalCUB();
88 temporal.resize(temp_storage_bytes);
91 num_segments, segments, segments + 1, op, init);
OutputIteratorT OffsetT ReductionOpT OuputT init
[in] The initial value of the reduction
Converts a type into a constant type.
static CUB_RUNTIME_FUNCTION cudaError_t Reduce(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_segments, OffsetIteratorT d_begin_offsets, OffsetIteratorT d_end_offsets, ReductionOp reduction_op, T initial_value, cudaStream_t stream=0, bool debug_synchronous=false)
Computes a device-wide segmented reduction using the specified binary reduction_op functor.