OpenFPM_pdata  4.1.0
Project that contains the implementation of distributed structures
scan_ofp.cuh
1 /*
2  * scan_ofp.hpp
3  *
4  * Created on: May 15, 2019
5  * Author: i-bird
6  */
7 
8 #ifndef SCAN_OFP_HPP_
9 #define SCAN_OFP_HPP_
10 
11 #ifdef __NVCC__
12 
13 #include "util/cuda_launch.hpp"
14 
15 #if CUDART_VERSION >= 11000
16  #ifndef CUDA_ON_CPU
17  // Here we have for sure CUDA >= 11
18  #ifdef __HIP__
19  #include "hipcub/hipcub.hpp"
20  #else
21  #include "cub/cub.cuh"
22  #endif
23  #ifndef SCAN_WITH_CUB
24  #define SCAN_WITH_CUB
25  #endif
26  #endif
27 #else
28  // Here we have old CUDA
29  #include "cub_old/cub.cuh"
30  #include "util/cuda/moderngpu/kernel_scan.hxx"
31 #endif
32 
33 #include "util/cuda/ofp_context.hxx"
34 
35 namespace openfpm
36 {
37  template<typename input_it, typename output_it>
38  void scan(input_it input, int count, output_it output, mgpu::ofp_context_t& context)
39  {
40 #ifdef CUDA_ON_CPU
41 
42  if (count == 0) {return;}
43 
44  auto prec = input[0];
45  output[0] = 0;
46  for (int i = 1 ; i < count ; i++)
47  {
48  auto next = prec + output[i-1];
49  prec = input[i];
50  output[i] = next;
51  }
52 
53 #else
54  #ifdef SCAN_WITH_CUB
55 
56  #ifdef __HIP__
57 
58  if (count == 0) {return;}
59 
60  void *d_temp_storage = NULL;
61  size_t temp_storage_bytes = 0;
62  hipcub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes,input,
63  output,
64  count);
65 
66  auto & temporal = context.getTemporalCUB();
67  temporal.resize(temp_storage_bytes);
68 
69  // Run
70  hipcub::DeviceScan::ExclusiveSum(temporal.template getDeviceBuffer<0>(), temp_storage_bytes,input,
71  output,
72  count);
73 
74  #else
75 
76  void *d_temp_storage = NULL;
77  size_t temp_storage_bytes = 0;
78  cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes,input,
79  output,
80  count);
81 
82  auto & temporal = context.getTemporalCUB();
83  temporal.resize(temp_storage_bytes);
84 
85  // Run
86  cub::DeviceScan::ExclusiveSum(temporal.template getDeviceBuffer<0>(), temp_storage_bytes,input,
87  output,
88  count);
89 
90  #endif
91 
92  #else
93  mgpu::scan(input,count,output,context);
94  #endif
95 #endif
96  }
97 }
98 
99 #endif /* __NVCC__ */
100 
101 #endif /* SCAN_OFP_HPP_ */
convert a type into constant type
Definition: aggregate.hpp:292
static CUB_RUNTIME_FUNCTION cudaError_t ExclusiveSum(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_items, cudaStream_t stream=0, bool debug_synchronous=false)
Computes a device-wide exclusive prefix sum. The value of 0 is applied as the initial value,...