8 #ifndef REDUCE_OFP_HPP_
9 #define REDUCE_OFP_HPP_
13 #include "util/cuda_util.hpp"
14 #include "util/ofp_context.hpp"
16 #if CUDART_VERSION >= 11000
20 #include "hipcub/hipcub.hpp"
22 #include "cub/cub.cuh"
// Reduces `count` elements read from `input` with the binary functor `op`
// and writes the result through `output`. `gpuContext` supplies the scratch
// buffer handed to the device-wide reduction calls below.
//
// Three alternative code paths are visible: a sequential loop, a hipcub
// path and a third sequence with the same shape as the hipcub one.
// NOTE(review): this listing appears to have lost lines (braces, whatever
// preprocessor guards select between the paths, and the callee of two of
// the calls) - confirm against the complete file before changing any code.
32 template<
typename input_it,
typename output_it,
typename reduce_op>
33 void reduce(input_it input,
int count, output_it output, reduce_op op,
gpu::ofp_context_t& gpuContext)
// Sequential path: folds input[1] .. input[count-1] into output[0] via op.
// NOTE(review): the statement that seeds output[0] is not visible in this
// listing - verify that it exists and what happens when count == 0.
38 for (
int i = 1 ; i < count ; i++)
40 output[0] = op(output[0],input[i]);
// hipcub path, first call: the temp-storage pointer is NULL and
// temp_storage_bytes starts at 0. Presumably this is a size query that only
// fills in temp_storage_bytes (CUB's documented convention; assumed to hold
// for hipcub too - TODO confirm).
// NOTE(review): the value returned by Reduce is discarded here and below.
47 size_t temp_storage_bytes = 0;
48 hipcub::DeviceReduce::Reduce(NULL,
49 temp_storage_bytes,input, output, count, op, op.reduceInitValue());
// Resize the context-owned scratch buffer to the size reported above.
51 auto & temporal = gpuContext.getTemporalCUB();
52 temporal.resize(temp_storage_bytes);
// Second call: same arguments, now with the scratch buffer as temp storage;
// op.reduceInitValue() is passed as the initial value of the reduction.
54 hipcub::DeviceReduce::Reduce(temporal.template getDeviceBuffer<0>(),
55 temp_storage_bytes,input, output, count, op, op.reduceInitValue());
// Third path: same declare / call / resize / call sequence as above.
// NOTE(review): the lines naming the callee and its first argument are
// missing from this listing; presumably cub::DeviceReduce::Reduce, given
// the "cub/cub.cuh" include - confirm.
58 size_t temp_storage_bytes = 0;
60 temp_storage_bytes, input, output, count, op, op.reduceInitValue());
62 auto & temporal = gpuContext.getTemporalCUB();
63 temporal.resize(temp_storage_bytes);
66 temp_storage_bytes, input, output, count, op, op.reduceInitValue());
Converts a type into a constant type.
static CUB_RUNTIME_FUNCTION cudaError_t Reduce(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_items, ReductionOpT reduction_op, T init, cudaStream_t stream=0, bool debug_synchronous=false)
Computes a device-wide reduction using the specified binary reduction_op functor and initial value init.