// NOTE(review): this listing is an incomplete excerpt. The numbers at the start of
// some lines are residue of the original file's line numbering, and the preprocessor
// conditionals, braces and several statements between the visible lines are missing.
// The comments below describe only what the visible lines show.
//
// scan: computes a prefix sum over `count` elements read from `input` and stores the
// result through `output`. Three variants are visible below: a plain index loop, a
// hipcub DeviceScan::ExclusiveSum path (followed by a second temp-storage set-up whose
// call is not visible), and a fallback that forwards to mgpu::scan. Which variant is
// compiled is presumably chosen by preprocessor conditions that are not visible in
// this excerpt -- TODO confirm against the full file.
//
// input   - iterator over the values to scan
// count   - number of elements; the visible guards return immediately when it is 0
// output  - iterator that receives the scan result
// context - provides the temporary buffer (getTemporalCUB) and is forwarded to mgpu::scan
13 #include "util/cuda_launch.hpp" 15 #if CUDART_VERSION >= 11000 19 #include "hipcub/hipcub.hpp" 21 #include "cub/cub.cuh" 30 #include "util/cuda/moderngpu/kernel_scan.hxx" 33 #include "util/cuda/ofp_context.hxx" 37 template<
typename input_it,
typename output_it>
38 void scan(input_it input,
int count, output_it output, mgpu::ofp_context_t& context)
// Loop variant: nothing to do for an empty range.
42 if (count == 0) {
return;}
// Walks the elements from index 1; each step adds `prec` to the previously written
// output element. The initialisation of `prec` and the store of `next` are not
// visible in this excerpt.
46 for (
int i = 1 ; i < count ; i++)
48 auto next = prec + output[i-1];
// hipcub variant: nothing to do for an empty range.
58 if (count == 0) {
return;}
// First ExclusiveSum call is made with a NULL temp-storage pointer and a zero size.
// Per the CUB/hipcub convention this is presumably the size query that fills
// temp_storage_bytes without doing any work -- TODO confirm (the call is truncated here).
60 void *d_temp_storage = NULL;
61 size_t temp_storage_bytes = 0;
62 hipcub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes,input,
// Size the context-owned temporary buffer to the number of bytes reported above.
66 auto & temporal = context.getTemporalCUB();
67 temporal.resize(temp_storage_bytes);
// Second ExclusiveSum call passes the context buffer as temp storage; this is the
// call that actually produces the scan (remaining arguments are truncated here).
70 hipcub::DeviceScan::ExclusiveSum(temporal.template getDeviceBuffer<0>(), temp_storage_bytes,input,
// Same temp-storage set-up repeated for another variant; the ExclusiveSum calls that
// belong to it are not visible in this excerpt (presumably the cub:: equivalent).
76 void *d_temp_storage = NULL;
77 size_t temp_storage_bytes = 0;
82 auto & temporal = context.getTemporalCUB();
83 temporal.resize(temp_storage_bytes);
// Fallback variant: delegate the whole scan to moderngpu.
93 mgpu::scan(input,count,output,context);
Converts a type into a constant type.
static CUB_RUNTIME_FUNCTION cudaError_t ExclusiveSum(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_items, cudaStream_t stream=0, bool debug_synchronous=false)
Computes a device-wide exclusive prefix sum. The value of 0 is applied as the initial value, and is assigned to *d_out.