OpenFPM_pdata  4.1.0
Project that contains the implementation of distributed structures
reduce_ofp.cuh
1 /*
2  * reduce_ofp.hpp
3  *
4  * Created on: May 15, 2019
5  * Author: i-bird
6  */
7 
8 #ifndef REDUCE_OFP_HPP_
9 #define REDUCE_OFP_HPP_
10 
11 #ifdef __NVCC__
12 
#include <iterator>
#include <type_traits>

#include "util/cuda_launch.hpp"
14 
15 #if CUDART_VERSION >= 11000
16  #ifndef CUDA_ON_CPU
17  // Here we have for sure CUDA >= 11
18  #ifdef __HIP__
19  #include "hipcub/hipcub.hpp"
20  #else
21  #include "cub/cub.cuh"
22  #endif
23  #ifndef REDUCE_WITH_CUB
24  #define REDUCE_WITH_CUB
25  #endif
26  #endif
27 #else
28  // Here we have old CUDA
29  #include "cub_old/cub.cuh"
30  #include "util/cuda/moderngpu/kernel_reduce.hxx"
31 #endif
32 
33 #include "util/cuda/ofp_context.hxx"
34 
namespace openfpm
{
	/*! \brief Reduce `count` elements read from `input` with the binary functor `op`
	 *         and store the single result in `output[0]`
	 *
	 * Three compile-time back-ends are selected by the preprocessor:
	 *
	 * - `CUDA_ON_CPU`: a sequential host loop, seeded with 0.
	 * - `REDUCE_WITH_CUB`: `cub::DeviceReduce::Reduce` (or `hipcub::DeviceReduce::Reduce`
	 *   when `__HIP__` is defined), seeded with 0. The scratch space is the buffer
	 *   returned by `context.getTemporalCUB()`, resized to the size requested by the
	 *   library.
	 * - otherwise: `mgpu::reduce`.
	 *
	 * NOTE(review): the CPU and CUB paths fold the input starting from 0, so for an
	 * operator whose identity is not 0 (e.g. max over negative values, product) the
	 * result includes that 0. Whether `mgpu::reduce` behaves the same way is not
	 * visible from this file -- confirm before relying on non-additive operators.
	 *
	 * \param input   iterator to the first element to reduce
	 * \param count   number of elements to reduce
	 * \param output  iterator to the location receiving the result (`output[0]`)
	 * \param op      binary reduction functor
	 * \param context context providing the temporary CUB buffer (CUB path) and
	 *                forwarded to `mgpu::reduce` (moderngpu path)
	 */
	template<typename input_it, typename output_it, typename reduce_op>
	void reduce(input_it input, int count, output_it output, reduce_op op, mgpu::ofp_context_t& context)
	{
#ifdef CUDA_ON_CPU

		// Host fallback: sequential left fold seeded with 0
		output[0] = 0;
		for (int i = 0 ; i < count ; i++)
		{
			output[0] = op(output[0],input[i]);
		}

#else
	#ifdef REDUCE_WITH_CUB

		// hipcub mirrors the cub interface, so only the namespace differs
		#ifdef __HIP__
		namespace reduce_backend = hipcub;
		#else
		namespace reduce_backend = cub;
		#endif

		// Type of the initial value: the output iterator's value type, or the input
		// value type when the output iterator does not expose one (value_type == void)
		typedef typename std::iterator_traits<output_it>::value_type out_value_type;
		typedef typename std::iterator_traits<input_it>::value_type in_value_type;
		typedef typename std::conditional<std::is_void<out_value_type>::value,
										  in_value_type,
										  out_value_type>::type init_type;

		// The argument following the functor in DeviceReduce::Reduce is the initial
		// value of the reduction (not a debug flag). Pass a correctly typed zero
		// instead of a bool literal: same seed as the CPU path, but the library sees
		// the real value type.
		const init_type init = static_cast<init_type>(0);

		// First call with a null scratch pointer: only queries the required size
		void *d_temp_storage = NULL;
		size_t temp_storage_bytes = 0;
		reduce_backend::DeviceReduce::Reduce(d_temp_storage, temp_storage_bytes,input,
											 output,
											 count,
											 op,
											 init);

		auto & temporal = context.getTemporalCUB();
		temporal.resize(temp_storage_bytes);

		// Run
		reduce_backend::DeviceReduce::Reduce(temporal.template getDeviceBuffer<0>(), temp_storage_bytes,input,
											 output,
											 count,
											 op,
											 init);

	#else
		mgpu::reduce(input,count,output,op,context);
	#endif
#endif
	}
}
98 
99 #endif
100 
101 #endif /* REDUCE_OFP_HPP_ */
convert a type into constant type
Definition: aggregate.hpp:292
static CUB_RUNTIME_FUNCTION cudaError_t Reduce(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_items, ReductionOpT reduction_op, T init, cudaStream_t stream=0, bool debug_synchronous=false)
Computes a device-wide reduction using the specified binary reduction_op functor and initial value in...
this class is a functor for "for_each" algorithm
Definition: trash_bin.hpp:142