// Header providing reduce(): combines `count` elements read from `input`
// into output[0] using the binary functor `op`.
// Three strategies are visible below: a sequential loop, a CUB/hipCUB
// DeviceReduce::Reduce path (inside #ifdef REDUCE_WITH_CUB), and a call to
// mgpu::reduce.
// NOTE(review): this listing appears to have lost lines (braces,
// #else/#endif, trailing arguments of the Reduce calls), so the exact
// preprocessor condition selecting each path is not visible here.
8 #ifndef REDUCE_OFP_HPP_ 9 #define REDUCE_OFP_HPP_ 13 #include "util/cuda_launch.hpp" 15 #if CUDART_VERSION >= 11000 19 #include "hipcub/hipcub.hpp" 21 #include "cub/cub.cuh" 23 #ifndef REDUCE_WITH_CUB 24 #define REDUCE_WITH_CUB 30 #include "util/cuda/moderngpu/kernel_reduce.hxx" 33 #include "util/cuda/ofp_context.hxx" 37 template<
typename input_it,
typename output_it,
typename reduce_op>
38 void reduce(input_it input,
int count, output_it output,
reduce_op op, mgpu::ofp_context_t& context)
// Sequential path: folds input[0..count-1] into output[0] with op, one
// element per iteration.
// NOTE(review): the loop starts at i = 0 and reads output[0] on the first
// iteration; confirm output[0] is seeded before this point (no such line is
// visible in this listing).
43 for (
int i = 0 ; i < count ; i++)
45 output[0] = op(output[0],input[i]);
// CUB path, hipcub:: variant. The first Reduce call is made with a NULL
// temporary buffer and temp_storage_bytes = 0; its result is then used to
// size the scratch buffer, i.e. this looks like the usual CUB size-query
// call (confirm against the CUB/hipCUB documentation).
49 #ifdef REDUCE_WITH_CUB 53 void *d_temp_storage = NULL;
54 size_t temp_storage_bytes = 0;
55 hipcub::DeviceReduce::Reduce(d_temp_storage, temp_storage_bytes,input,
// Resize the scratch buffer obtained from the context to the byte count
// reported by the call above.
61 auto & temporal = context.getTemporalCUB();
62 temporal.resize(temp_storage_bytes);
// Second Reduce call: same input, now passing the buffer returned by
// getDeviceBuffer<0>() as temporary storage.
65 hipcub::DeviceReduce::Reduce(temporal.template getDeviceBuffer<0>(), temp_storage_bytes,input,
// Second temporary-storage block with the same declarations and resize as
// above; the Reduce calls belonging to it are not visible in this listing
// (presumably the cub:: counterpart of the hipcub:: calls; TODO confirm).
72 void *d_temp_storage = NULL;
73 size_t temp_storage_bytes = 0;
80 auto & temporal = context.getTemporalCUB();
81 temporal.resize(temp_storage_bytes);
// Path that forwards the same arguments unchanged to mgpu::reduce
// (presumably taken when REDUCE_WITH_CUB is not defined; TODO confirm).
93 mgpu::reduce(input,count,output,op,context);
Convert a type into a constant type
static CUB_RUNTIME_FUNCTION cudaError_t Reduce(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_items, ReductionOpT reduction_op, T init, cudaStream_t stream=0, bool debug_synchronous=false)
Computes a device-wide reduction using the specified binary reduction_op functor and initial value init.
This class is a functor for the "for_each" algorithm