#ifndef __CUDIFY_CUDA_HPP__
#define __CUDIFY_CUDA_HPP__

#define CUDA_ON_BACKEND CUDA_BACKEND_CUDA

#include <cuda_runtime.h>
#include <boost/preprocessor.hpp>
#include <iostream>
#include <string>
// number of threads per block used when no explicit block size is given
#ifdef DEFAULT_CUDA_THREADS
constexpr size_t default_kernel_wg_threads_ = static_cast<size_t>(DEFAULT_CUDA_THREADS);
#else
constexpr size_t default_kernel_wg_threads_ = static_cast<size_t>(1024);
#endif
#if CUDART_VERSION >= 11000 && defined(__NVCC__)
#include "cub/util_type.cuh"
#include "cub/block/block_scan.cuh"
#endif
#include "operators.hpp"
/*! \brief Forward a device lambda that takes the block and thread index */
template<typename lambda_f>
__global__ void kernel_launch_lambda(lambda_f f)
{
    dim3 bid = blockIdx;
    dim3 tid = threadIdx;
    f(bid,tid);
}

/*! \brief Forward a device lambda that reads the thread/block indices itself (thread-local-storage variant) */
template<typename lambda_f>
__global__ void kernel_launch_lambda_tls(lambda_f f)
{
    f();
}
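/* Illustrative sketch of the lambda shape kernel_launch_lambda expects; the captures out
 * and N are hypothetical.
 *
 *   auto lam = [=] __device__ (dim3 bid, dim3 tid)
 *   {
 *       unsigned int i = bid.x * blockDim.x + tid.x;
 *       if (i < N) {out[i] = 0.0f;}
 *   };
 *
 * A lambda intended for the _tls variant instead takes no arguments and reads
 * blockIdx/threadIdx directly inside its body.
 */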
/*! \brief Complete a launch configuration when the block size has not been chosen
 *
 * thr.x == 0xFFFFFFFF is the sentinel for "block size not set": pick a block size from
 * the CUDA occupancy calculator and recompute the grid size wthr accordingly.
 */
template<typename dim3Type, typename... Args>
void FixConfigLaunch(void (* _kernel)(Args...), dim3Type & wthr, dim3Type & thr)
{
    // a block size has already been chosen, nothing to fix
    if (thr.x != 0xFFFFFFFF)
    {return;}
    int blockSize;   // block size suggested by the occupancy calculator
    int minGridSize; // minimum grid size for full occupancy (unused)
    cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, *_kernel, 0, 0);
    // number of dimensions actually carrying work
    int dim = (wthr.x != 0) + (wthr.y != 0) + (wthr.z != 0);
    unsigned int wthr_x = wthr.x;
    unsigned int wthr_y = wthr.y;
    unsigned int wthr_z = wthr.z;
    // total number of work-items requested
    size_t tot_work = 1;
    if (dim == 1)
    {tot_work = wthr.x;}
    else if (dim == 2)
    {tot_work = wthr.x * wthr.y;}
    else if (dim == 3)
    {tot_work = wthr.x * wthr.y * wthr.z;}
    // round tot_work up to the next power of two
    size_t tot_work_2 = tot_work;
    tot_work_2--;
    tot_work_2 |= tot_work_2 >> 1;
    tot_work_2 |= tot_work_2 >> 2;
    tot_work_2 |= tot_work_2 >> 4;
    tot_work_2 |= tot_work_2 >> 8;
    tot_work_2 |= tot_work_2 >> 16;
    tot_work_2++;
    // threads per block: the whole rounded work if it fits, otherwise the occupancy suggestion
    size_t n = (tot_work <= (size_t)blockSize)?tot_work_2:blockSize;
    // distribute n threads over the used dimensions, doubling one dimension at a time
    thr.x = 1;
    thr.y = 1;
    thr.z = 1;
    int dir = 0;
    while (n > 1)
    {
        if (dir % 3 == 0)
        {thr.x = thr.x << 1;}
        else if (dir % 3 == 1)
        {thr.y = thr.y << 1;}
        else if (dir % 3 == 2)
        {thr.z = thr.z << 1;}
        n = n >> 1;
        dir = (dir + 1) % dim;
    }
    // recompute the grid size, rounding up on every dimension
    if (dim >= 1)
    {wthr.x = wthr.x / thr.x + ((wthr_x % thr.x != 0)?1:0);}
    if (dim >= 2)
    {wthr.y = wthr.y / thr.y + ((wthr_y % thr.y != 0)?1:0);}
    if (dim == 3)
    {wthr.z = wthr.z / thr.z + ((wthr_z % thr.z != 0)?1:0);}
    // a dimension that fits into a single block gets exactly the requested number of threads
    if (dim >= 1 && wthr.x == 1)
    {thr.x = wthr_x;}
    if (dim >= 2 && wthr.y == 1)
    {thr.y = wthr_y;}
    if (dim == 3 && wthr.z == 1)
    {thr.z = wthr_z;}
}
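/* Illustrative sketch of the sentinel mechanism handled by FixConfigLaunch: when thr.x is
 * left at 0xFFFFFFFF the block size is filled in by the occupancy calculator and wthr is
 * turned from "work items per dimension" into "blocks per dimension". my_kernel and its
 * arguments are hypothetical.
 *
 *   dim3 wthr(n_elements,1,1);   // work items on x
 *   dim3 thr(0xFFFFFFFF,1,1);    // block size not chosen yet
 *   FixConfigLaunch(my_kernel,wthr,thr);
 *   // wthr now holds the grid size and thr the block size; launch as usual
 *   my_kernel<<<wthr,thr>>>(ptr,n_elements);
 */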
static void init_wrappers() {}
#if defined(SE_CLASS1) || defined(CUDA_CHECK_LAUNCH)
#define CUDA_LAUNCH(cuda_call,ite, ...) \
    {\
    cudaDeviceSynchronize(); \
    {\
        cudaError_t e = cudaGetLastError();\
        if (e != cudaSuccess)\
        {\
            std::string error = cudaGetErrorString(e);\
            std::cout << "Cuda: an error has occurred before this CUDA_LAUNCH, detected in: " << __FILE__ << ":" << __LINE__ << " " << error << std::endl;\
        }\
    }\
    if (ite.wthr.x != 0)\
    {cuda_call<<<ite.wthr,ite.thr>>>(__VA_ARGS__);}\
    cudaDeviceSynchronize(); \
    {\
        cudaError_t e = cudaGetLastError();\
        if (e != cudaSuccess)\
        {\
            std::string error = cudaGetErrorString(e);\
            std::cout << "Cuda Error in: " << __FILE__ << ":" << __LINE__ << " " << error << std::endl;\
        }\
        CHECK_SE_CLASS1_POST(#cuda_call,__VA_ARGS__)\
    }\
    }
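/* Illustrative usage sketch: CUDA_LAUNCH expects an iterator-like configuration object
 * exposing .wthr and .thr members. The kernel vector_add, the object ite and the kernel
 * arguments are hypothetical.
 *
 *   CUDA_LAUNCH(vector_add,ite,a_dev,b_dev,n);
 *
 * When SE_CLASS1 or CUDA_CHECK_LAUNCH is defined the launch is synchronized and checked
 * before and after; otherwise the non-debug overloads further below are used.
 */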
#define CUDA_LAUNCH_DIM3(cuda_call,wthr,thr, ...) \
    {\
    cudaDeviceSynchronize(); \
    {\
        cudaError_t e = cudaGetLastError();\
        if (e != cudaSuccess)\
        {\
            std::string error = cudaGetErrorString(e);\
            std::cout << "Cuda: an error has occurred before this CUDA_LAUNCH, detected in: " << __FILE__ << ":" << __LINE__ << " " << error << std::endl;\
        }\
    }\
    cuda_call<<<wthr,thr>>>(__VA_ARGS__);\
    cudaDeviceSynchronize(); \
    {\
        cudaError_t e = cudaGetLastError();\
        if (e != cudaSuccess)\
        {\
            std::string error = cudaGetErrorString(e);\
            std::cout << "Cuda Error in: " << __FILE__ << ":" << __LINE__ << " " << error << std::endl;\
        }\
        CHECK_SE_CLASS1_POST(#cuda_call,__VA_ARGS__)\
    }\
    }
// same as CUDA_LAUNCH_DIM3 but without the SE_CLASS1 post-launch check
#define CUDA_LAUNCH_DIM3_DEBUG_SE1(cuda_call,wthr,thr, ...) \
    {\
    cudaDeviceSynchronize(); \
    {\
        cudaError_t e = cudaGetLastError();\
        if (e != cudaSuccess)\
        {\
            std::string error = cudaGetErrorString(e);\
            std::cout << "Cuda: an error has occurred before this CUDA_LAUNCH, detected in: " << __FILE__ << ":" << __LINE__ << " " << error << std::endl;\
        }\
    }\
    cuda_call<<<wthr,thr>>>(__VA_ARGS__);\
    cudaDeviceSynchronize(); \
    {\
        cudaError_t e = cudaGetLastError();\
        if (e != cudaSuccess)\
        {\
            std::string error = cudaGetErrorString(e);\
            std::cout << "Cuda Error in: " << __FILE__ << ":" << __LINE__ << " " << error << std::endl;\
        }\
    }\
    }
#define CUDA_LAUNCH_LAMBDA(ite, lambda_f, ...) \
    {\
    cudaDeviceSynchronize(); \
    {\
        cudaError_t e = cudaGetLastError();\
        if (e != cudaSuccess)\
        {\
            std::string error = cudaGetErrorString(e);\
            std::cout << "Cuda: an error has occurred before this CUDA_LAUNCH, detected in: " << __FILE__ << ":" << __LINE__ << " " << error << std::endl;\
        }\
    }\
    if (ite.wthr.x != 0)\
    {kernel_launch_lambda<<<ite.wthr,ite.thr>>>(lambda_f);}\
    cudaDeviceSynchronize(); \
    {\
        cudaError_t e = cudaGetLastError();\
        if (e != cudaSuccess)\
        {\
            std::string error = cudaGetErrorString(e);\
            std::cout << "Cuda Error in: " << __FILE__ << ":" << __LINE__ << " " << error << std::endl;\
        }\
        CHECK_SE_CLASS1_POST("lambda",0)\
    }\
    }
#define CUDA_LAUNCH_LAMBDA_TLS(ite, lambda_f, ...) \
    {\
    cudaDeviceSynchronize(); \
    {\
        cudaError_t e = cudaGetLastError();\
        if (e != cudaSuccess)\
        {\
            std::string error = cudaGetErrorString(e);\
            std::cout << "Cuda: an error has occurred before this CUDA_LAUNCH, detected in: " << __FILE__ << ":" << __LINE__ << " " << error << std::endl;\
        }\
    }\
    if (ite.wthr.x != 0)\
    {kernel_launch_lambda<<<ite.wthr,ite.thr>>>(lambda_f);}\
    cudaDeviceSynchronize(); \
    {\
        cudaError_t e = cudaGetLastError();\
        if (e != cudaSuccess)\
        {\
            std::string error = cudaGetErrorString(e);\
            std::cout << "Cuda Error in: " << __FILE__ << ":" << __LINE__ << " " << error << std::endl;\
        }\
        CHECK_SE_CLASS1_POST("lambda",0)\
    }\
    }
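/* Illustrative sketch: on this backend both CUDA_LAUNCH_LAMBDA and the _TLS variant forward
 * through the kernel_launch_lambda wrapper, so the lambda receives the block and thread
 * index. The configuration object ite and the captures are hypothetical.
 *
 *   CUDA_LAUNCH_LAMBDA(ite,[=] __device__ (dim3 bid, dim3 tid)
 *   {
 *       unsigned int i = bid.x * blockDim.x + tid.x;
 *       if (i < N) {out[i] += 1.0f;}
 *   });
 */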
#define CUDA_LAUNCH_LAMBDA_DIM3_TLS(wthr_,thr_, lambda_f, ...) \
    {\
    cudaDeviceSynchronize(); \
    {\
        cudaError_t e = cudaGetLastError();\
        if (e != cudaSuccess)\
        {\
            std::string error = cudaGetErrorString(e);\
            std::cout << "Cuda: an error has occurred before this CUDA_LAUNCH, detected in: " << __FILE__ << ":" << __LINE__ << " " << error << std::endl;\
        }\
    }\
    kernel_launch_lambda<<<wthr_,thr_>>>(lambda_f);\
    cudaDeviceSynchronize(); \
    {\
        cudaError_t e = cudaGetLastError();\
        if (e != cudaSuccess)\
        {\
            std::string error = cudaGetErrorString(e);\
            std::cout << "Cuda Error in: " << __FILE__ << ":" << __LINE__ << " " << error << std::endl;\
        }\
        CHECK_SE_CLASS1_POST("lambda",0)\
    }\
    }
#define CUDA_CHECK() \
    {\
    cudaDeviceSynchronize(); \
    {\
        cudaError_t e = cudaGetLastError();\
        if (e != cudaSuccess)\
        {\
            std::string error = cudaGetErrorString(e);\
            std::cout << "Cuda: an error has occurred before, detected in: " << __FILE__ << ":" << __LINE__ << " " << error << std::endl;\
        }\
    }\
    cudaDeviceSynchronize(); \
    {\
        cudaError_t e = cudaGetLastError();\
        if (e != cudaSuccess)\
        {\
            std::string error = cudaGetErrorString(e);\
            std::cout << "Cuda Error in: " << __FILE__ << ":" << __LINE__ << " " << error << std::endl;\
        }\
        CHECK_SE_CLASS1_POST("no call","no args")\
    }\
    }
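/* Illustrative sketch: CUDA_CHECK() can be dropped after a hand-written launch to surface
 * asynchronous errors at a known source location. my_kernel, grid, block and args are
 * hypothetical.
 *
 *   my_kernel<<<grid,block>>>(args);
 *   CUDA_CHECK();
 */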
#else

template<typename... Args, typename ite_type>
void CUDA_LAUNCH(void (* _kernel)(Args...), ite_type ite, Args... args)
{
#ifdef __NVCC__
    FixConfigLaunch(_kernel,ite.wthr,ite.thr);
    _kernel<<<ite.wthr,ite.thr>>>(args...);
#else
    std::cout << __FILE__ << ":" << __LINE__ << " " << "CUDA_LAUNCH not implemented for this compiler" << std::endl;
#endif
}
template<typename... Args>
void CUDA_LAUNCH_DIM3(void (* _kernel)(Args...), dim3 wthr, dim3 thr, Args... args)
{
#ifdef __NVCC__
    FixConfigLaunch(_kernel,wthr,thr);
    _kernel<<<wthr,thr>>>(args...);
#else
    std::cout << __FILE__ << ":" << __LINE__ << " " << "CUDA_LAUNCH_DIM3 not implemented for this compiler" << std::endl;
#endif
}
template<typename lambda_type, typename ite_type, typename... Args>
void CUDA_LAUNCH_LAMBDA(ite_type ite, lambda_type lambda_f, Args... args)
{
#ifdef __NVCC__
    void (* _ker)(lambda_type) = kernel_launch_lambda;
    FixConfigLaunch(_ker,ite.wthr,ite.thr);

    kernel_launch_lambda<<<ite.wthr,ite.thr>>>(lambda_f);
#else
    std::cout << __FILE__ << ":" << __LINE__ << " " << "CUDA_LAUNCH_LAMBDA not implemented for this compiler" << std::endl;
#endif
}
static void CUDA_CHECK() {}
template<typename lambda_type, typename ite_type, typename... Args>
void CUDA_LAUNCH_LAMBDA_TLS(ite_type ite, lambda_type lambda_f, Args... args)
{
#ifdef __NVCC__
    void (* _ker)(lambda_type) = kernel_launch_lambda;
    FixConfigLaunch(_ker,ite.wthr,ite.thr);

    if (ite.wthr.x != 0)
    {kernel_launch_lambda<<<ite.wthr,ite.thr>>>(lambda_f);}
#else
    std::cout << __FILE__ << ":" << __LINE__ << " " << "CUDA_LAUNCH_LAMBDA_TLS not implemented for this compiler" << std::endl;
#endif
}
template<typename lambda_type, typename... Args>
void CUDA_LAUNCH_LAMBDA_DIM3(dim3 wthr_, dim3 thr_, lambda_type lambda_f, Args... args)
{
#ifdef __NVCC__
    void (* _ker)(lambda_type) = kernel_launch_lambda;
    FixConfigLaunch(_ker,wthr_,thr_);

    if (wthr_.x != 0)
    {kernel_launch_lambda<<<wthr_,thr_>>>(lambda_f);}
#else
    std::cout << __FILE__ << ":" << __LINE__ << " " << "CUDA_LAUNCH_LAMBDA_DIM3 not implemented for this compiler" << std::endl;
#endif
}
template<typename lambda_type, typename... Args>
void CUDA_LAUNCH_LAMBDA_DIM3_TLS(dim3 wthr_, dim3 thr_, lambda_type lambda_f, Args... args)
{
#ifdef __NVCC__
    void (* _ker)(lambda_type) = kernel_launch_lambda;
    FixConfigLaunch(_ker,wthr_,thr_);

    if (wthr_.x != 0)
    {kernel_launch_lambda<<<wthr_,thr_>>>(lambda_f);}
#else
    std::cout << __FILE__ << ":" << __LINE__ << " " << "CUDA_LAUNCH_LAMBDA_DIM3_TLS not implemented for this compiler" << std::endl;
#endif
}

#endif

#endif // __CUDIFY_CUDA_HPP__