OpenFPM  5.2.0
Project that contains the implementation of distributed structures
segreduce_ofp.cuh
1 /*
2  * segreduce_ofp.cuh
3  *
4  * Created on: May 15, 2019
5  * Author: i-bird
6  */
7 
8  #ifndef SEGREDUCE_OFP_HPP_
9  #define SEGREDUCE_OFP_HPP_
10 
11  #ifdef __NVCC__
12 
13  #include "util/cuda_util.hpp"
14  #include "util/ofp_context.hpp"
15 
16  #if CUDART_VERSION >= 11000
17  // Here we have for sure CUDA >= 11
18  #ifndef CUDA_ON_CPU
19  #ifdef __HIP__
20  #include "hipcub/hipcub.hpp"
21  #else
22  #include "cub/cub.cuh"
23  #endif
24  #endif
25 #else
26  #include "cub_old/cub.cuh"
27 #endif
28 
29 
30  namespace openfpm
31  {
32  template<typename input_it,
33  typename segments_it, typename output_it, typename op_t, typename type_t>
34  void segreduce(input_it input, int count, segments_it segments,
35  int num_segments, output_it output, op_t op, type_t init,
36  gpu::ofp_context_t& gpuContext)
37  {
38  #ifdef CUDA_ON_CPU
39 
40  int i = 0;
41  for ( ; i < num_segments - 1; i++)
42  {
43  int j = segments[i];
44  output[i] = init;
45  if (j == segments[i+1]) {continue;}
46  output[i] = input[j];
47  ++j;
48  for ( ; j < segments[i+1] ; j++)
49  {
50  output[i] = op(output[i],input[j]);
51  }
52  }
53 
54  // Last segment
55  int j = segments[i];
56  if (j != count)
57  {
58  output[i] = input[j];
59  ++j;
60  for ( ; j < count ; j++)
61  {
62  output[i] = op(output[i],input[j]);
63  }
64  }
65 
66  #else
67  #ifdef __HIP__
68 
69  size_t temp_storage_bytes = 0;
70 
71  hipcub::DeviceSegmentedReduce::Reduce(NULL, temp_storage_bytes, input, output,
72  num_segments, segments, segments + 1, op, init);
73 
74  auto & temporal = gpuContext.getTemporalCUB();
75  temporal.resize(temp_storage_bytes);
76 
77  hipcub::DeviceSegmentedReduce::Reduce(temporal.getDeviceBuffer<0>(), temp_storage_bytes, input, output,
78  num_segments, segments, segments + 1, op, init);
79 
80  #else
81 
82  size_t temp_storage_bytes = 0;
83 
84  cub::DeviceSegmentedReduce::Reduce(NULL, temp_storage_bytes, input, output,
85  num_segments, segments, segments + 1, op, init);
86 
87  auto & temporal = gpuContext.getTemporalCUB();
88  temporal.resize(temp_storage_bytes);
89 
90  cub::DeviceSegmentedReduce::Reduce(temporal.template getDeviceBuffer<0>(), temp_storage_bytes, input, output,
91  num_segments, segments, segments + 1, op, init);
92 
93  #endif
94  #endif
95  }
96  }
97 
98  #endif /* __NVCC__ */
99 
100  #endif /* SEGREDUCE_OFP_HPP_ */
101 
OutputIteratorT OffsetT ReductionOpT OuputT init
< [in] The initial value of the reduction
convert a type into constant type
Definition: aggregate.hpp:302
static CUB_RUNTIME_FUNCTION cudaError_t Reduce(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_segments, OffsetIteratorT d_begin_offsets, OffsetIteratorT d_end_offsets, ReductionOp reduction_op, T initial_value, cudaStream_t stream=0, bool debug_synchronous=false)
Computes a device-wide segmented reduction using the specified binary reduction_op functor.