13#include "util/cuda_launch.hpp" 
   14#include "util/ofp_context.hpp" 
   16#if CUDART_VERSION >= 11000 
   20            #include "hipcub/hipcub.hpp" 
   22            #include "cub/cub.cuh" 
   32    template<
typename input_it, 
typename output_it, 
typename reduce_op>
 
   38    for (
int i = 0 ; i < count ; i++)
 
   40        output[0] = op(output[0],input[i]);
 
   47        size_t temp_storage_bytes = 0;
 
   48        hipcub::DeviceReduce::Reduce(NULL,
 
   49            temp_storage_bytes,input, output, count, op, 
false);
 
   51        auto & temporal = context.getTemporalCUB();
 
   52        temporal.resize(temp_storage_bytes);
 
   54        hipcub::DeviceReduce::Reduce(temporal.template getDeviceBuffer<0>(),
 
   55            temp_storage_bytes,input, output, count, op, 
false);
 
   58        size_t temp_storage_bytes = 0;
 
   60            temp_storage_bytes, input, output, count, op, 
false);
 
   62        auto & temporal = context.getTemporalCUB();
 
   63        temporal.resize(temp_storage_bytes);
 
   66            temp_storage_bytes, input, output, count, op, 
false);
 
Convert a type into a constant type.
static CUB_RUNTIME_FUNCTION cudaError_t Reduce(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_items, ReductionOpT reduction_op, T init, cudaStream_t stream=0, bool debug_synchronous=false)
Computes a device-wide reduction using the specified binary reduction_op functor and initial value init.
This class is a functor for the "for_each" algorithm.