OpenFPM_pdata  4.1.0
Project that contains the implementation of distributed structures
 
Loading...
Searching...
No Matches
segreduce_ofp.cuh
1/*
2 * segreduce_ofp.cuh
3 *
4 * Created on: May 15, 2019
5 * Author: i-bird
6 */
7
8 #ifndef SEGREDUCE_OFP_HPP_
9 #define SEGREDUCE_OFP_HPP_
10
11 #ifdef __NVCC__
12
13 #include "util/cuda_launch.hpp"
14 #include "util/ofp_context.hpp"
15
16 #if CUDART_VERSION >= 11000
17 // Here we have for sure CUDA >= 11
18 #ifndef CUDA_ON_CPU
19 #ifdef __HIP__
20 #include "hipcub/hipcub.hpp"
21 #else
22 #include "cub/cub.cuh"
23 #endif
24 #endif
25#else
26 #include "cub_old/cub.cuh"
27#endif
28
29
30 namespace openfpm
31 {
/*! \brief Segmented reduction: reduce every segment of `input` with `op` and store one value per segment
 *
 * Segment `s` starts at `segments[s]`. Where it ends depends on the backend:
 *
 * - CUDA_ON_CPU (host loop): segment `s` ends at `segments[s+1]`, except the last one, which ends
 *   at `count`. Only `segments[0 .. num_segments-1]` is read.
 * - CUB / hipCUB (device): `segments` is used as the begin-offset iterator and `segments + 1` as the
 *   end-offset iterator, so `segments[num_segments]` must be readable as well. `count` is not
 *   used on this path.
 *
 * \param input iterator to the values to reduce
 * \param count total number of input elements (closes the last segment on the host path)
 * \param segments iterator to the start offset of each segment
 * \param num_segments number of segments; when it is <= 0 the function returns without touching `output`
 * \param output iterator receiving one reduced value per segment
 * \param op binary reduction functor
 * \param init value stored for an empty segment on the host path; forwarded as the initial value
 *             of the reduction to CUB / hipCUB on the device path
 * \param context context providing the temporary storage buffer used by CUB / hipCUB
 *
 */
template<typename input_it,
         typename segments_it, typename output_it, typename op_t, typename type_t>
void segreduce(input_it input, int count, segments_it segments,
               int num_segments, output_it output, op_t op, type_t init,
               gpu::ofp_context_t & context)
{
	// No segment means no output slot: return before any access to segments[0] / output[0]
	if (num_segments <= 0)
	{return;}

#ifdef CUDA_ON_CPU

	for (int seg = 0 ; seg < num_segments ; ++seg)
	{
		const int seg_begin = segments[seg];

		// The last segment has no successor offset: it is closed by the total element count
		const int seg_end = (seg + 1 < num_segments) ? segments[seg + 1] : count;

		// Empty segment (the last one included): the result is the initial value
		if (seg_begin >= seg_end)
		{
			output[seg] = init;
			continue;
		}

		// Seed with the first element, then fold the remaining ones left to right
		output[seg] = input[seg_begin];
		for (int j = seg_begin + 1 ; j < seg_end ; ++j)
		{
			output[seg] = op(output[seg],input[j]);
		}
	}

#else

	// hipCUB mirrors the CUB interface; only the namespace differs
#ifdef __HIP__
	namespace cub_backend = hipcub;
#else
	namespace cub_backend = cub;
#endif

	size_t temp_storage_bytes = 0;

	// First call with a null temporary buffer: only queries the required size into temp_storage_bytes
	cub_backend::DeviceSegmentedReduce::Reduce(nullptr, temp_storage_bytes, input, output,
	                                           num_segments, segments, segments + 1, op, init);

	// Size the temporary buffer held by the context to what the query reported
	auto & temporal = context.getTemporalCUB();
	temporal.resize(temp_storage_bytes);

	// Second call performs the actual segmented reduction
	cub_backend::DeviceSegmentedReduce::Reduce(temporal.template getDeviceBuffer<0>(), temp_storage_bytes, input, output,
	                                           num_segments, segments, segments + 1, op, init);

#endif
}
96 }
97
98 #endif /* __NVCC__ */
99
100 #endif /* SEGREDUCE_OFP_HPP_ */
101
OutputIteratorT OffsetT ReductionOpT OuputT init
< [in] The initial value of the reduction
convert a type into constant type
static CUB_RUNTIME_FUNCTION cudaError_t Reduce(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_segments, OffsetIteratorT d_begin_offsets, OffsetIteratorT d_end_offsets, ReductionOp reduction_op, T initial_value, cudaStream_t stream=0, bool debug_synchronous=false)
Computes a device-wide segmented reduction using the specified binary reduction_op functor.