13#include "util/cuda_launch.hpp"
14#include "util/ofp_context.hpp"
16#if CUDART_VERSION >= 11000
20 #include "hipcub/hipcub.hpp"
22 #include "cub/cub.cuh"
32 template<
typename input_it,
typename output_it>
37 if (count == 0) {
return;}
41 for (
int i = 1 ; i < count ; i++)
43 auto next = prec + output[i-1];
49 if (count == 0)
return;
53 size_t temp_storage_bytes = 0;
54 hipcub::DeviceScan::ExclusiveSum(NULL,
55 temp_storage_bytes,input, output, count);
57 auto & temporal = context.getTemporalCUB();
58 temporal.resize(temp_storage_bytes);
60 hipcub::DeviceScan::ExclusiveSum(temporal.template getDeviceBuffer<0>(),
61 temp_storage_bytes, input, output, count);
65 size_t temp_storage_bytes = 0;
67 temp_storage_bytes, input, output, count);
69 auto & temporal = context.getTemporalCUB();
70 temporal.resize(temp_storage_bytes);
73 temp_storage_bytes,input, output, count);
Converts a type into a constant type.
static CUB_RUNTIME_FUNCTION cudaError_t ExclusiveSum(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_items, cudaStream_t stream=0, bool debug_synchronous=false)
Computes a device-wide exclusive prefix sum. The value of 0 is applied as the initial value, and is assigned to the first output element (*d_out).