OpenFPM 5.2.0
Project that contains the implementation of distributed structures
cudify_cuda.hpp
#ifndef __CUDIFY_CUDA_HPP__
#define __CUDIFY_CUDA_HPP__

#define CUDA_ON_BACKEND CUDA_BACKEND_CUDA
#include <cuda_runtime.h>
#include <boost/preprocessor.hpp>

#ifdef DEFAULT_CUDA_THREADS
constexpr size_t default_kernel_wg_threads_ = static_cast<size_t>(DEFAULT_CUDA_THREADS);
#else
constexpr size_t default_kernel_wg_threads_ = static_cast<size_t>(1024);
#endif
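
// Default workgroup (CUDA block) size used by kernel launches; it can be
// overridden at configure time by defining DEFAULT_CUDA_THREADS, e.g.
// -DDEFAULT_CUDA_THREADS=256 (illustrative value).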

#if CUDART_VERSION >= 11000 && defined(__NVCC__)
    #include "cub/util_type.cuh"
    #include "cub/block/block_scan.cuh"
#endif

#ifdef __NVCC__
#include "operators.hpp"

template<typename lambda_f>
__global__ void kernel_launch_lambda(lambda_f f)
{
    dim3 bid = blockIdx;
    dim3 tid = threadIdx;
    f(bid,tid);
}

template<typename lambda_f>
__global__ void kernel_launch_lambda_tls(lambda_f f)
{
    f();
}
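
// The two trampolines above exist because a __global__ entry point cannot be a
// lambda itself: the host side instantiates the template with the lambda's
// closure type and the kernel body invokes it. A sketch of the two calling
// conventions (nvcc extended-lambda support is assumed):
//
//   auto k1 = [=] __device__ (dim3 bid, dim3 tid) { /* uses bid/tid */ };
//   kernel_launch_lambda<<<wthr,thr>>>(k1);      // invoked as f(bid,tid)
//
//   auto k2 = [=] __device__ () { /* derives its own indices */ };
//   kernel_launch_lambda_tls<<<wthr,thr>>>(k2);  // invoked as f()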
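/*! \brief Complete a launch configuration when the caller left it unspecified
 *
 * If thr.x holds the sentinel 0xFFFFFFFF, wthr is interpreted as the total
 * number of work items per dimension. The block size suggested by the CUDA
 * occupancy calculator is distributed across the used dimensions in powers of
 * two, and wthr is rewritten as the number of blocks needed to cover the work.
 *
 * \param _kernel kernel whose occupancy is queried
 * \param wthr in: work items per dimension, out: blocks per dimension
 * \param thr in: sentinel or explicit block size, out: block size
 */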
template<typename dim3Type, typename... Args>
void FixConfigLaunch(void (* _kernel)(Args...), dim3Type & wthr, dim3Type & thr)
{
    // the caller already chose an explicit thread configuration
    if (thr.x != 0xFFFFFFFF)
    {return;}

    int blockSize = 0;   // block size returned by the launch configurator
    int minGridSize = 0; // minimum grid size needed to achieve the
                         // maximum occupancy for a full device launch

    cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, *_kernel, 0, 0);

    // number of dimensions actually carrying work
    int dim = (wthr.x != 0) + (wthr.y != 0) + (wthr.z != 0);
    if (dim == 0)
    {return;}

    size_t tot_work = 0;

    unsigned int wthr_x = wthr.x;
    unsigned int wthr_y = wthr.y;
    unsigned int wthr_z = wthr.z;

    if (dim == 1)
    {tot_work = wthr.x;}
    else if (dim == 2)
    {tot_work = wthr.x * wthr.y;}
    else if (dim == 3)
    {tot_work = wthr.x * wthr.y * wthr.z;}

    if (tot_work == 0)
    {
        thr.x = 0;
        thr.y = 0;
        thr.z = 0;

        wthr.x = 0;
        wthr.y = 0;
        wthr.z = 0;

        // return here: the power-of-two rounding below would produce n == 0
        // and the distribution loop would never terminate
        return;
    }

    // round tot_work up to the nearest power of 2
    size_t tot_work_2 = tot_work;
    tot_work_2--;
    tot_work_2 |= tot_work_2 >> 1;
    tot_work_2 |= tot_work_2 >> 2;
    tot_work_2 |= tot_work_2 >> 4;
    tot_work_2 |= tot_work_2 >> 8;
    tot_work_2 |= tot_work_2 >> 16;
    tot_work_2++;

    size_t n = (tot_work <= (size_t)blockSize)?tot_work_2:(size_t)blockSize;

    thr.x = 1;
    thr.y = 1;
    thr.z = 1;

    // distribute the n threads across the used dimensions in powers of two
    int dir = 0;

    while (n != 1)
    {
        if (dir % 3 == 0)
        {thr.x = thr.x << 1;}
        else if (dir % 3 == 1)
        {thr.y = thr.y << 1;}
        else if (dir % 3 == 2)
        {thr.z = thr.z << 1;}

        n = n >> 1;

        dir++;
        dir %= dim;
    }

    // rewrite wthr from work items to number of blocks, rounding up
    if (dim >= 1)
    {wthr.x = wthr_x / thr.x + ((wthr_x % thr.x != 0)?1:0);}

    if (dim >= 2)
    {wthr.y = wthr_y / thr.y + ((wthr_y % thr.y != 0)?1:0);}
    else
    {wthr.y = 1;}

    if (dim >= 3)
    {wthr.z = wthr_z / thr.z + ((wthr_z % thr.z != 0)?1:0);}
    else
    {wthr.z = 1;}

    // crop: if a single block covers a dimension, shrink it to the exact work size
    if (dim >= 1 && wthr.x == 1)
    {thr.x = wthr_x;}

    if (dim >= 2 && wthr.y == 1)
    {thr.y = wthr_y;}

    if (dim == 3 && wthr.z == 1)
    {thr.z = wthr_z;}
}
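
// Worked example (illustrative): wthr = (1000,0,0) with thr.x = 0xFFFFFFFF
// gives dim = 1 and tot_work = 1000, which rounds up to 1024. If the occupancy
// calculator suggests a block of at least 1000 threads, the loop produces
// thr = (1024,1,1) and wthr = (1,1,1), and the crop step then shrinks thr.x to
// the exact work size 1000; with a suggested block of 256 the result is instead
// thr = (256,1,1) and wthr = (4,1,1).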

#endif

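// Nothing to initialize for the CUDA backend; the function is presumably kept
// so that all cudify backends expose the same initialization entry point.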
static void init_wrappers()
{}

#if defined(SE_CLASS1) || defined(CUDA_CHECK_LAUNCH)

#define CUDA_LAUNCH(cuda_call,ite, ...) \
        {\
        cudaDeviceSynchronize(); \
        {\
            cudaError_t e = cudaGetLastError();\
            if (e != cudaSuccess)\
            {\
                std::string error = cudaGetErrorString(e);\
                std::cout << "CUDA: an error occurred before this CUDA_LAUNCH, detected in: " << __FILE__ << ":" << __LINE__ << " " << error << std::endl;\
            }\
        }\
        CHECK_SE_CLASS1_PRE\
        if (ite.wthr.x != 0)\
        {cuda_call<<<ite.wthr,ite.thr>>>(__VA_ARGS__);}\
        cudaDeviceSynchronize(); \
        {\
            cudaError_t e = cudaGetLastError();\
            if (e != cudaSuccess)\
            {\
                std::string error = cudaGetErrorString(e);\
                std::cout << "CUDA error in: " << __FILE__ << ":" << __LINE__ << " " << error << std::endl;\
            }\
            CHECK_SE_CLASS1_POST(#cuda_call,__VA_ARGS__)\
        }\
        }
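
// Usage sketch (illustrative; `ite` stands for any object exposing OpenFPM-style
// launch members wthr and thr, such as the result of a getGPUIterator() call):
//   CUDA_LAUNCH(my_kernel, ite, arg1, arg2);
// The expansion synchronizes and checks for pending errors before the launch,
// runs the kernel only when ite.wthr.x != 0, then synchronizes and checks again.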

#define CUDA_LAUNCH_DIM3(cuda_call,wthr,thr, ...) \
        {\
        cudaDeviceSynchronize(); \
        {\
            cudaError_t e = cudaGetLastError();\
            if (e != cudaSuccess)\
            {\
                std::string error = cudaGetErrorString(e);\
                std::cout << "CUDA: an error occurred before this CUDA_LAUNCH, detected in: " << __FILE__ << ":" << __LINE__ << " " << error << std::endl;\
            }\
        }\
        CHECK_SE_CLASS1_PRE\
        cuda_call<<<wthr,thr>>>(__VA_ARGS__);\
        cudaDeviceSynchronize(); \
        {\
            cudaError_t e = cudaGetLastError();\
            if (e != cudaSuccess)\
            {\
                std::string error = cudaGetErrorString(e);\
                std::cout << "CUDA error in: " << __FILE__ << ":" << __LINE__ << " " << error << std::endl;\
            }\
            CHECK_SE_CLASS1_POST(#cuda_call,__VA_ARGS__)\
        }\
        }

#define CUDA_LAUNCH_DIM3_DEBUG_SE1(cuda_call,wthr,thr, ...) \
        {\
        cudaDeviceSynchronize(); \
        {\
            cudaError_t e = cudaGetLastError();\
            if (e != cudaSuccess)\
            {\
                std::string error = cudaGetErrorString(e);\
                std::cout << "CUDA: an error occurred before this CUDA_LAUNCH, detected in: " << __FILE__ << ":" << __LINE__ << " " << error << std::endl;\
            }\
        }\
        CHECK_SE_CLASS1_PRE\
        cuda_call<<<wthr,thr>>>(__VA_ARGS__);\
        cudaDeviceSynchronize(); \
        {\
            cudaError_t e = cudaGetLastError();\
            if (e != cudaSuccess)\
            {\
                std::string error = cudaGetErrorString(e);\
                std::cout << "CUDA error in: " << __FILE__ << ":" << __LINE__ << " " << error << std::endl;\
            }\
        }\
        }

#define CUDA_LAUNCH_LAMBDA(ite, lambda_f, ...) \
        {\
        cudaDeviceSynchronize(); \
        {\
            cudaError_t e = cudaGetLastError();\
            if (e != cudaSuccess)\
            {\
                std::string error = cudaGetErrorString(e);\
                std::cout << "CUDA: an error occurred before this CUDA_LAUNCH, detected in: " << __FILE__ << ":" << __LINE__ << " " << error << std::endl;\
            }\
        }\
        CHECK_SE_CLASS1_PRE\
        if (ite.wthr.x != 0)\
        {kernel_launch_lambda<<<ite.wthr,ite.thr>>>(lambda_f);}\
        cudaDeviceSynchronize(); \
        {\
            cudaError_t e = cudaGetLastError();\
            if (e != cudaSuccess)\
            {\
                std::string error = cudaGetErrorString(e);\
                std::cout << "CUDA error in: " << __FILE__ << ":" << __LINE__ << " " << error << std::endl;\
            }\
            CHECK_SE_CLASS1_POST("lambda",0)\
        }\
        }

#define CUDA_LAUNCH_LAMBDA_TLS(ite, lambda_f, ...) \
        {\
        cudaDeviceSynchronize(); \
        {\
            cudaError_t e = cudaGetLastError();\
            if (e != cudaSuccess)\
            {\
                std::string error = cudaGetErrorString(e);\
                std::cout << "CUDA: an error occurred before this CUDA_LAUNCH, detected in: " << __FILE__ << ":" << __LINE__ << " " << error << std::endl;\
            }\
        }\
        CHECK_SE_CLASS1_PRE\
        if (ite.wthr.x != 0)\
        {kernel_launch_lambda_tls<<<ite.wthr,ite.thr>>>(lambda_f);}\
        cudaDeviceSynchronize(); \
        {\
            cudaError_t e = cudaGetLastError();\
            if (e != cudaSuccess)\
            {\
                std::string error = cudaGetErrorString(e);\
                std::cout << "CUDA error in: " << __FILE__ << ":" << __LINE__ << " " << error << std::endl;\
            }\
            CHECK_SE_CLASS1_POST("lambda",0)\
        }\
        }

#define CUDA_LAUNCH_LAMBDA_DIM3_TLS(wthr_,thr_, lambda_f, ...) \
        {\
        cudaDeviceSynchronize(); \
        {\
            cudaError_t e = cudaGetLastError();\
            if (e != cudaSuccess)\
            {\
                std::string error = cudaGetErrorString(e);\
                std::cout << "CUDA: an error occurred before this CUDA_LAUNCH, detected in: " << __FILE__ << ":" << __LINE__ << " " << error << std::endl;\
            }\
        }\
        CHECK_SE_CLASS1_PRE\
        dim3 wthr__(wthr_);\
        dim3 thr__(thr_);\
        if (wthr__.x != 0)\
        {kernel_launch_lambda_tls<<<wthr__,thr__>>>(lambda_f);}\
        cudaDeviceSynchronize(); \
        {\
            cudaError_t e = cudaGetLastError();\
            if (e != cudaSuccess)\
            {\
                std::string error = cudaGetErrorString(e);\
                std::cout << "CUDA error in: " << __FILE__ << ":" << __LINE__ << " " << error << std::endl;\
            }\
            CHECK_SE_CLASS1_POST("lambda",0)\
        }\
        }

#define CUDA_CHECK() \
        {\
        cudaDeviceSynchronize(); \
        {\
            cudaError_t e = cudaGetLastError();\
            if (e != cudaSuccess)\
            {\
                std::string error = cudaGetErrorString(e);\
                std::cout << "CUDA: an error occurred before this CUDA_CHECK, detected in: " << __FILE__ << ":" << __LINE__ << " " << error << std::endl;\
            }\
        }\
        CHECK_SE_CLASS1_PRE\
        cudaDeviceSynchronize(); \
        {\
            cudaError_t e = cudaGetLastError();\
            if (e != cudaSuccess)\
            {\
                std::string error = cudaGetErrorString(e);\
                std::cout << "CUDA error in: " << __FILE__ << ":" << __LINE__ << " " << error << std::endl;\
            }\
            CHECK_SE_CLASS1_POST("no call","no args")\
        }\
        }
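
// CUDA_CHECK() can be placed between launches to localize asynchronous errors:
// it synchronizes the device and reports any error recorded since the last check.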

#else

template<typename... Args, typename ite_type>
void CUDA_LAUNCH(void (* _kernel)(Args...),ite_type ite,Args... args)
{
//  std::cout << "DEMANGLE " << typeid(decltype(_kernel)).name() << " " << ite.wthr.x << " " << ite.wthr.y << " " << ite.wthr.z << "/" << ite.thr.x << " " << ite.thr.y << " " << ite.thr.z << std::endl;

    #ifdef __NVCC__
    FixConfigLaunch(_kernel,ite.wthr,ite.thr);
    _kernel<<<ite.wthr,ite.thr>>>(args...);
    #else
    std::cout << __FILE__ << ":" << __LINE__ << " " << "CUDA_LAUNCH not implemented for this compiler" << std::endl;
    #endif
}
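
// Sketch of a call to this unchecked overload (illustrative): when
// ite.thr.x == 0xFFFFFFFF, FixConfigLaunch fills in a block size from the
// occupancy calculator before the launch:
//   CUDA_LAUNCH(my_kernel, ite, args...);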

template<typename... Args>
void CUDA_LAUNCH_DIM3(void (* _kernel)(Args...),dim3 wthr, dim3 thr,Args... args)
{
//  std::cout << "DEMANGLE " << typeid(decltype(_kernel)).name() << " " << wthr.x << " " << wthr.y << " " << wthr.z << "/" << thr.x << " " << thr.y << " " << thr.z << std::endl;

    #ifdef __NVCC__
    FixConfigLaunch(_kernel,wthr,thr);
    _kernel<<<wthr,thr>>>(args...);
    #else
    std::cout << __FILE__ << ":" << __LINE__ << " " << "CUDA_LAUNCH_DIM3 not implemented for this compiler" << std::endl;
    #endif
}

template<typename lambda_type, typename ite_type, typename... Args>
void CUDA_LAUNCH_LAMBDA(ite_type ite, lambda_type lambda_f, Args... args)
{
    #ifdef __NVCC__
    void (* _ker)(lambda_type) = kernel_launch_lambda;
    FixConfigLaunch(_ker,ite.wthr,ite.thr);

    kernel_launch_lambda<<<ite.wthr,ite.thr>>>(lambda_f);
    #else
    std::cout << __FILE__ << ":" << __LINE__ << " " << "CUDA_LAUNCH_LAMBDA not implemented for this compiler" << std::endl;
    #endif
}
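
// Illustrative call; the lambda must be invocable as f(bid,tid), matching
// kernel_launch_lambda (nvcc extended-lambda support is assumed):
//   CUDA_LAUNCH_LAMBDA(ite, [=] __device__ (dim3 bid, dim3 tid) { /* ... */ });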

static void CUDA_CHECK() {}

template<typename lambda_type, typename ite_type, typename... Args>
void CUDA_LAUNCH_LAMBDA_TLS(ite_type ite, lambda_type lambda_f, Args... args)
{
    #ifdef __NVCC__
    void (* _ker)(lambda_type) = kernel_launch_lambda_tls;
    FixConfigLaunch(_ker,ite.wthr,ite.thr);

    if (ite.wthr.x != 0)
    {kernel_launch_lambda_tls<<<ite.wthr,ite.thr>>>(lambda_f);}
    #else
    std::cout << __FILE__ << ":" << __LINE__ << " " << "CUDA_LAUNCH_LAMBDA_TLS not implemented for this compiler" << std::endl;
    #endif
}

template<typename lambda_type, typename... Args>
void CUDA_LAUNCH_LAMBDA_DIM3(dim3 wthr_, dim3 thr_, lambda_type lambda_f, Args... args)
{
    #ifdef __NVCC__
    void (* _ker)(lambda_type) = kernel_launch_lambda;
    FixConfigLaunch(_ker,wthr_,thr_);

    dim3 wthr__(wthr_);
    dim3 thr__(thr_);
    if (wthr__.x != 0)
    {kernel_launch_lambda<<<wthr__,thr__>>>(lambda_f);}
    #else
    std::cout << __FILE__ << ":" << __LINE__ << " " << "CUDA_LAUNCH_LAMBDA_DIM3 not implemented for this compiler" << std::endl;
    #endif
}

template<typename lambda_type, typename... Args>
void CUDA_LAUNCH_LAMBDA_DIM3_TLS(dim3 wthr_, dim3 thr_, lambda_type lambda_f, Args... args)
{
    #ifdef __NVCC__
    void (* _ker)(lambda_type) = kernel_launch_lambda_tls;
    FixConfigLaunch(_ker,wthr_,thr_);

    dim3 wthr__(wthr_);
    dim3 thr__(thr_);
    if (wthr__.x != 0)
    {kernel_launch_lambda_tls<<<wthr__,thr__>>>(lambda_f);}
    #else
    std::cout << __FILE__ << ":" << __LINE__ << " " << "CUDA_LAUNCH_LAMBDA_DIM3_TLS not implemented for this compiler" << std::endl;
    #endif
}

#endif

#endif // __CUDIFY_CUDA_HPP__