OpenFPM  5.2.0
Project that contains the implementation of distributed structures
scan_ofp.cuh
1 /*
2  * scan_ofp.cuh
3  *
4  * Created on: May 15, 2019
5  * Author: i-bird
6  */
7 
8 #ifndef SCAN_OFP_HPP_
9 #define SCAN_OFP_HPP_
10 
11 #ifdef __NVCC__
12 
13 #include "util/cuda_util.hpp"
14 #include "util/ofp_context.hpp"
15 
16 #if CUDART_VERSION >= 11000
17  // Here we have for sure CUDA >= 11
18  #ifndef CUDA_ON_CPU
19  #ifdef __HIP__
20  #include "hipcub/hipcub.hpp"
21  #else
22  #include "cub/cub.cuh"
23  #endif
24  #endif
25 #else
26  #include "cub_old/cub.cuh"
27 #endif
28 
29 
namespace openfpm
{
	/*! \brief Exclusive prefix sum (scan) of a sequence
	 *
	 * Writes into output[i] the sum of input[0] ... input[i-1]; output[0] is 0.
	 * Nothing is read or written when count is 0.
	 *
	 * Depending on the build configuration the work is done by
	 *  - a sequential host loop (CUDA_ON_CPU),
	 *  - hipcub::DeviceScan::ExclusiveSum (__HIP__),
	 *  - cub::DeviceScan::ExclusiveSum (otherwise).
	 *
	 * \tparam input_it iterator/pointer type of the input sequence
	 * \tparam output_it iterator/pointer type of the output sequence
	 *
	 * \param input first element of the input sequence
	 * \param count number of elements to scan
	 * \param output first element of the output sequence (at least count elements)
	 * \param gpuContext context that owns the temporary buffer used by CUB/hipCUB
	 *        (not used in the CUDA_ON_CPU path)
	 *
	 * \note The status codes returned by the CUB/hipCUB calls are not checked.
	 *
	 */
	template<typename input_it, typename output_it>
	void scan(input_it input, int count, output_it output, gpu::ofp_context_t& gpuContext)
	{
#ifdef CUDA_ON_CPU

		if (count == 0) {return;}

		// prec always holds input[i-1]. input[i] is read before output[i] is
		// written, so the result is still correct if output aliases input.
		auto prec = input[0];
		output[0] = 0;
		for (int i = 1 ; i < count ; i++)
		{
			auto next = prec + output[i-1];
			prec = input[i];
			output[i] = next;
		}

#else
		if (count == 0) return;

	#ifdef __HIP__

		// First call with a NULL workspace: only asks how many bytes of
		// temporary storage the scan needs (returned in temp_storage_bytes)
		size_t temp_storage_bytes = 0;
		hipcub::DeviceScan::ExclusiveSum(NULL,
				temp_storage_bytes,input, output, count);

		// Resize the temporary buffer kept in the context to the requested size
		auto & temporal = gpuContext.getTemporalCUB();
		temporal.resize(temp_storage_bytes);

		// Second call with the real workspace: performs the scan
		hipcub::DeviceScan::ExclusiveSum(temporal.template getDeviceBuffer<0>(),
				temp_storage_bytes, input, output, count);

	#else

		// First call with a NULL workspace: only asks how many bytes of
		// temporary storage the scan needs (returned in temp_storage_bytes)
		size_t temp_storage_bytes = 0;
		cub::DeviceScan::ExclusiveSum(NULL,
				temp_storage_bytes, input, output, count);

		// Resize the temporary buffer kept in the context to the requested size
		auto & temporal = gpuContext.getTemporalCUB();
		temporal.resize(temp_storage_bytes);

		// Second call with the real workspace: performs the scan
		cub::DeviceScan::ExclusiveSum(temporal.template getDeviceBuffer<0>(),
				temp_storage_bytes,input, output, count);

	#endif
#endif
	}
}
79 
80 #endif /* __NVCC__ */
81 
82 #endif /* SCAN_OFP_HPP_ */
convert a type into constant type
Definition: aggregate.hpp:302
static CUB_RUNTIME_FUNCTION cudaError_t ExclusiveSum(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_items, cudaStream_t stream=0, bool debug_synchronous=false)
Computes a device-wide exclusive prefix sum. The value of 0 is applied as the initial value,...