OpenFPM  5.2.0
Project that contains the implementation of distributed structures
reduce_ofp.cuh
1 /*
2  * reduce_ofp.hpp
3  *
4  * Created on: May 15, 2019
5  * Author: i-bird
6  */
7 
8 #ifndef REDUCE_OFP_HPP_
9 #define REDUCE_OFP_HPP_
10 
11 #ifdef __NVCC__
12 
13 #include "util/cuda_util.hpp"
14 #include "util/ofp_context.hpp"
15 
16 #if CUDART_VERSION >= 11000
17  // Here we have for sure CUDA >= 11
18  #ifndef CUDA_ON_CPU
19  #ifdef __HIP__
20  #include "hipcub/hipcub.hpp"
21  #else
22  #include "cub/cub.cuh"
23  #endif
24  #endif
25 #else
26  #include "cub_old/cub.cuh"
27 #endif
28 
29 
30 namespace openfpm
31 {
32  template<typename input_it, typename output_it, typename reduce_op>
33  void reduce(input_it input, int count, output_it output, reduce_op op, gpu::ofp_context_t& gpuContext)
34  {
35 #ifdef CUDA_ON_CPU
36 
37  output[0] = input[0];
38  for (int i = 1 ; i < count ; i++)
39  {
40  output[0] = op(output[0],input[i]);
41  }
42 
43 #else
44 
45  #ifdef __HIP__
46 
47  size_t temp_storage_bytes = 0;
48  hipcub::DeviceReduce::Reduce(NULL,
49  temp_storage_bytes,input, output, count, op, op.reduceInitValue());
50 
51  auto & temporal = gpuContext.getTemporalCUB();
52  temporal.resize(temp_storage_bytes);
53 
54  hipcub::DeviceReduce::Reduce(temporal.template getDeviceBuffer<0>(),
55  temp_storage_bytes,input, output, count, op, op.reduceInitValue());
56  #else
57 
58  size_t temp_storage_bytes = 0;
60  temp_storage_bytes, input, output, count, op, op.reduceInitValue());
61 
62  auto & temporal = gpuContext.getTemporalCUB();
63  temporal.resize(temp_storage_bytes);
64 
65  cub::DeviceReduce::Reduce(temporal.template getDeviceBuffer<0>(),
66  temp_storage_bytes, input, output, count, op, op.reduceInitValue());
67 
68  #endif
69 #endif
70  }
71 }
72 
73 #endif
74 
75 #endif /* REDUCE_OFP_HPP_ */
convert a type into constant type
Definition: aggregate.hpp:302
static CUB_RUNTIME_FUNCTION cudaError_t Reduce(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_items, ReductionOpT reduction_op, T init, cudaStream_t stream=0, bool debug_synchronous=false)
Computes a device-wide reduction using the specified binary reduction_op functor and initial value in...