OpenFPM_pdata  4.1.0
Project that contain the implementation of distributed structures
segreduce_ofp.cuh
1 /*
2  * segreduce_ofp.hpp
3  *
4  * Created on: May 15, 2019
5  * Author: i-bird
6  */
7 
8  #ifndef SEGREDUCE_OFP_HPP_
9  #define SEGREDUCE_OFP_HPP_
10 
11  #ifdef __NVCC__
12 
13  #include "Vector/map_vector.hpp"
14  #include "util/cuda_launch.hpp"
15  #include "util/cuda/segreduce_ofp.cuh"
16 
17  #if CUDART_VERSION >= 11000
18  #ifndef CUDA_ON_CPU
19  // Here we have for sure CUDA >= 11
20  #ifdef __HIP__
21  #undef __CUDACC__
22  #undef __CUDA__
23  #include <thrust/reduce.h>
24  #define __CUDACC__
25  #define __CUDA__
26  #else
27  #include "util/cuda/moderngpu/kernel_segreduce.hxx"
28  #endif
29  #endif
30  #else
31  #include "util/cuda/moderngpu/kernel_segreduce.hxx"
32  #endif
33  #include "util/cuda/ofp_context.hxx"
34 
35 template<typename segments_it, typename keys_type, typename output_it, typename seg_type, typename type_t>
36 __global__ void seg_to_keys(segments_it segs, keys_type keys, seg_type seg_out ,output_it output, int n_count, int num_segments,type_t init)
37 {
38  int tid = blockIdx.x * blockDim.x + threadIdx.x;
39 
40  if (tid >= num_segments) {return;}
41 
42  int s = segs[tid];
43  int s_p1 = (tid == num_segments -1)?n_count:segs[tid+1];
44 
45  int n_ele = s_p1 - s;
46 
47  seg_out.template get<1>(tid) = (s != s_p1);
48  output[tid] = init;
49 
50  for (int j = 0 ; j < n_ele ; j++)
51  {
52  keys.template get<0>(s + j) = tid;
53  }
54 }
55 
56 template<typename output_it, typename out_tmp_type ,typename segs_type>
57 __global__ void realign_output(output_it out, out_tmp_type out_tmp, segs_type segs, int num_segments)
58 {
59  int tid = blockIdx.x * blockDim.x + threadIdx.x;
60 
61  if (tid >= num_segments) {return;}
62 
63  int t = segs.template get<2>(tid);
64  int to_copy = segs.template get<1>(tid);
65 
66  auto op = out_tmp.template get<0>(t);
67 
68  if (to_copy == 1)
69  {out[tid] = op;}
70 }
71 
72  namespace openfpm
73  {
74  template<typename input_it,
75  typename segments_it, typename output_it, typename op_t, typename type_t>
76  void segreduce(input_it input, int count, segments_it segments,
77  int num_segments, output_it output, op_t op, type_t init,
78  mgpu::ofp_context_t & context)
79  {
80  #ifdef CUDA_ON_CPU
81 
82  int i = 0;
83  for ( ; i < num_segments - 1; i++)
84  {
85  int j = segments[i];
86  output[i] = init;
87  if (j == segments[i+1]) {continue;}
88  output[i] = input[j];
89  ++j;
90  for ( ; j < segments[i+1] ; j++)
91  {
92  output[i] = op(output[i],input[j]);
93  }
94  }
95 
96  // Last segment
97  int j = segments[i];
98  if (j != count)
99  {
100  output[i] = input[j];
101  ++j;
102  for ( ; j < count ; j++)
103  {
104  output[i] = op(output[i],input[j]);
105  }
106  }
107 
108  #else
109 
110  #ifdef __HIP__
111 
112  typedef typename std::remove_pointer<segments_it>::type index_type;
113  typedef typename std::remove_pointer<output_it>::type out_type;
114 
116  keys.resize(count);
117 
119  segs_out.resize(num_segments);
120 
122  out_tmp.resize(num_segments);
123 
124  grid_sm<1,void> g(num_segments);
125 
126  auto it = g.getGPUIterator();
127 
128  CUDA_LAUNCH(seg_to_keys,it,segments,keys.toKernel(),segs_out.toKernel(),output,count,num_segments,init);
129 
130  openfpm::scan((index_type *)segs_out.template getDeviceBuffer<1>(),num_segments,(index_type *)segs_out.template getDeviceBuffer<2>(),context);
131 
132  thrust::pair<index_type *,out_type *> new_end;
133  new_end = thrust::reduce_by_key(thrust::device, (segments_it)keys.template getDeviceBuffer<0>(),((segments_it)keys.template getDeviceBuffer<0>()) + count,
134  input,
135  (segments_it)segs_out.template getDeviceBuffer<0>(),
136  (output_it)out_tmp.template getDeviceBuffer<0>(),
137  thrust::equal_to<int>(),
138  op);
139 
140  // .. Not so easy to emulate a segmented reduce we have to track the zeros segments and realign the output
141 
142  CUDA_LAUNCH(realign_output,it,output,out_tmp.toKernel(),segs_out.toKernel(),num_segments);
143 
144  #else
145 
146  mgpu::segreduce(input,count,segments,num_segments,output,op,init,context);
147 
148  #endif
149 
150  #endif
151  }
152  }
153 
154  #endif /* __NVCC__ */
155 
156  #endif /* SCAN_OFP_HPP_ */
157 
convert a type into constant type
Definition: aggregate.hpp:292
OutputIteratorT OffsetT ReductionOpT OuputT init
< [in] The initial value of the reduction
Implementation of 1-D std::vector like structure.
Definition: map_vector.hpp:202