#define CUDA_ON_BACKEND CUDA_BACKEND_HIP

// hip_runtime.h selects its code path based on __NVCC__: hide the macro (if a
// CUDA compiler defined it) while including, so the HIP path is taken.
#ifdef __NVCC__
    #undef __NVCC__
    #include <hip/hip_runtime.h>
    #define __NVCC__
#else
    #include <hip/hip_runtime.h>
#endif
constexpr int default_kernel_wg_threads_ = 256;
// CUDA runtime types mapped onto their HIP equivalents, so CUDA-style host
// code compiles unchanged against this backend.
typedef hipError_t cudaError_t;
typedef hipStream_t cudaStream_t;
typedef hipDeviceProp_t cudaDeviceProp_t;
typedef cudaDeviceProp_t cudaDeviceProp;
typedef hipEvent_t cudaEvent_t;
typedef hipFuncAttributes cudaFuncAttributes;
#define cudaSuccess hipSuccess
// Nothing to initialize on the HIP backend; kept for interface parity with
// the other cudify backends.
static void init_wrappers()
{}
enum cudaMemcpyKind
{
    cudaMemcpyHostToHost = 0,
    cudaMemcpyHostToDevice = 1,
    cudaMemcpyDeviceToHost = 2,
    cudaMemcpyDeviceToDevice = 3,
    cudaMemcpyDefault = 4
};
static cudaError_t cudaMemcpyToSymbol(unsigned char * global_cuda_error_array, const void * mem, size_t sz, int offset, cudaMemcpyKind opt)
{
    hipMemcpyKind opt_;

    // Translate the CUDA-style copy kind into the HIP equivalent.
    switch (opt)
    {
    case cudaMemcpyHostToHost:
        opt_ = hipMemcpyHostToHost;
        break;
    case cudaMemcpyHostToDevice:
        opt_ = hipMemcpyHostToDevice;
        break;
    case cudaMemcpyDeviceToHost:
        opt_ = hipMemcpyDeviceToHost;
        break;
    case cudaMemcpyDeviceToDevice:
        opt_ = hipMemcpyDeviceToDevice;
        break;
    default:
        opt_ = hipMemcpyDefault;
        break;
    }

    return hipMemcpyToSymbol(global_cuda_error_array,mem,sz,offset,opt_);
}
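
// Example (sketch, not part of the original header): how the wrapper above is
// meant to be called. The symbol name device_flags is hypothetical; only the
// call shape mirrors the wrapper's signature.
__device__ unsigned char device_flags[16];

static void copy_flags_example()
{
    unsigned char host_flags[16] = {0};

    // Same call shape as the CUDA runtime; the kind value is translated to
    // hipMemcpyHostToDevice before forwarding to hipMemcpyToSymbol.
    cudaMemcpyToSymbol(device_flags, host_flags, sizeof(host_flags), 0,
                       cudaMemcpyHostToDevice);
}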
static cudaError_t cudaDeviceSynchronize()
{
    return hipDeviceSynchronize();
}
static cudaError_t cudaMemcpyFromSymbol(void * dev_mem, const unsigned char * global_cuda_error_array, size_t sz)
{
    return hipMemcpyFromSymbol(dev_mem,global_cuda_error_array,sz);
}
static const char* cudaGetErrorString(cudaError_t error)
{
    return hipGetErrorString(error);
}

static cudaError_t cudaGetDevice(int* device)
{
    return hipGetDevice(device);
}

static cudaError_t cudaSetDevice(int device)
{
    return hipSetDevice(device);
}

static cudaError_t cudaMemGetInfo(size_t* free, size_t* total)
{
    return hipMemGetInfo(free,total);
}

static cudaError_t cudaFuncGetAttributes(cudaFuncAttributes* attr, const void* func)
{
    return hipFuncGetAttributes(attr,func);
}
static cudaError_t cudaGetDeviceProperties(cudaDeviceProp* prop, int device)
{
    return hipGetDeviceProperties(prop,device);
}
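
// Example (sketch, not part of the original header): querying the active
// device through the wrappers above; the function name is illustrative.
inline size_t free_device_memory_example()
{
    int dev = 0;
    cudaGetDevice(&dev);

    cudaDeviceProp prop;                 // hipDeviceProp_t under the alias
    cudaGetDeviceProperties(&prop, dev);

    size_t free_b = 0, total_b = 0;
    cudaMemGetInfo(&free_b, &total_b);
    return free_b;
}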
static cudaError_t cudaEventCreate(cudaEvent_t* event)
{
    return hipEventCreate(event);
}

static cudaError_t cudaEventDestroy(cudaEvent_t event)
{
    return hipEventDestroy(event);
}

static cudaError_t cudaMalloc(void** devPtr, size_t size)
{
    return hipMalloc(devPtr,size);
}

static cudaError_t cudaMallocHost(void** ptr, size_t size)
{
    return hipHostMalloc(ptr,size);
}

static cudaError_t cudaFree(void* devPtr)
{
    return hipFree(devPtr);
}

static cudaError_t cudaFreeHost(void* ptr)
{
    return hipHostFree(ptr);
}

static cudaError_t cudaStreamSynchronize(cudaStream_t stream)
{
    return hipStreamSynchronize(stream);
}

static cudaError_t cudaEventRecord(cudaEvent_t event, cudaStream_t stream = 0)
{
    return hipEventRecord(event,stream);
}

static cudaError_t cudaEventSynchronize(cudaEvent_t event)
{
    return hipEventSynchronize(event);
}

static cudaError_t cudaEventElapsedTime(float* ms, cudaEvent_t start, cudaEvent_t end)
{
    return hipEventElapsedTime(ms,start,end);
}
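
// Example (sketch, not part of the original header): timing a stretch of GPU
// work with the event wrappers above. Insert real work where indicated; the
// function name is illustrative.
inline float time_gpu_section_example()
{
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    cudaEventRecord(start);
    // ... enqueue kernels / copies here ...
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);          // wait until the section has finished

    float ms = 0.0f;
    cudaEventElapsedTime(&ms, start, stop);

    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    return ms;
}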
static cudaError_t cudaGetDeviceCount(int* count)
{
    return hipGetDeviceCount(count);
}
static cudaError_t cudaMemcpy(void* dst, const void* src, size_t count, cudaMemcpyKind opt)
{
    hipMemcpyKind opt_;

    // Translate the CUDA-style copy kind into the HIP equivalent.
    switch (opt)
    {
    case cudaMemcpyHostToHost:
        opt_ = hipMemcpyHostToHost;
        break;
    case cudaMemcpyHostToDevice:
        opt_ = hipMemcpyHostToDevice;
        break;
    case cudaMemcpyDeviceToHost:
        opt_ = hipMemcpyDeviceToHost;
        break;
    case cudaMemcpyDeviceToDevice:
        opt_ = hipMemcpyDeviceToDevice;
        break;
    default:
        opt_ = hipMemcpyDefault;
        break;
    }

    return hipMemcpy(dst,src,count,opt_);
}
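
// Example (sketch, not part of the original header): a host/device round trip
// built only from the wrappers above. CUDA-style code compiles unchanged while
// every call lands in the HIP runtime; the function name is illustrative.
inline void roundtrip_example()
{
    const size_t n = 256;
    float *host_in, *host_out, *dev;

    cudaMallocHost((void **)&host_in, n * sizeof(float));
    cudaMallocHost((void **)&host_out, n * sizeof(float));
    cudaMalloc((void **)&dev, n * sizeof(float));

    for (size_t i = 0; i < n; i++) {host_in[i] = (float)i;}

    cudaMemcpy(dev, host_in, n * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(host_out, dev, n * sizeof(float), cudaMemcpyDeviceToHost);
    cudaDeviceSynchronize();

    cudaFree(dev);
    cudaFreeHost(host_in);
    cudaFreeHost(host_out);
}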
201#include "util/cudify/cudify_hardware_cpu.hpp"
202#include "util/cuda_util.hpp"
205#include "hipcub/hipcub.hpp"
206#include "hipcub/block/block_scan.hpp"
// Trampoline kernel: forwards the block/thread indices to a host-provided
// __device__ lambda.
template<typename lambda_f>
__global__ void kernel_launch_lambda(lambda_f f)
{
    dim3 bid = blockIdx;
    dim3 tid = threadIdx;
    f(bid,tid);
}

// Variant used by the _TLS launch macros: the lambda takes no arguments (on
// the CPU backends the indices travel through thread-local state instead).
template<typename lambda_f>
__global__ void kernel_launch_lambda_tls(lambda_f f)
{
    f();
}
namespace cub
{
    // Expose the CUB-style name so kernels written against cub::BlockScan
    // compile unchanged on top of hipCUB.
    template<typename T, unsigned int bd>
    using BlockScan = hipcub::BlockScan<T,bd>;
}
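
// Example (sketch, not part of the original header): a block-wide exclusive
// prefix sum written against the alias above; kernel and buffer names are
// illustrative.
template<unsigned int blockSize>
__global__ void block_scan_example(int * in, int * out)
{
    typedef cub::BlockScan<int, blockSize> BScan;
    __shared__ typename BScan::TempStorage temp;

    int val = in[threadIdx.x];
    BScan(temp).ExclusiveSum(val, val);   // val becomes the sum of all predecessors
    out[threadIdx.x] = val;
}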
// Compute the linear (total) size of a launch dimension, from either a
// dim3-like object or a plain integer, so empty launches can be detected.
template<typename T>
struct has_work_gpu_cl_lin_blocks_
{
    static unsigned int lin(const T & b)
    {
        return b.x * b.y * b.z;
    }
};

template<>
struct has_work_gpu_cl_lin_blocks_<unsigned int>
{
    static unsigned int lin(const unsigned int & b)
    {
        return b;
    }
};

template<>
struct has_work_gpu_cl_lin_blocks_<unsigned long>
{
    static unsigned int lin(const unsigned long & b)
    {
        return b;
    }
};

template<>
struct has_work_gpu_cl_lin_blocks_<int>
{
    static unsigned int lin(const int & b)
    {
        return b;
    }
};
// True when both the grid and the block contain at least one thread.
template<typename wthr_type, typename thr_type>
bool has_work_gpu_cl_(const wthr_type & wthr, const thr_type & thr)
{
    return (has_work_gpu_cl_lin_blocks_<typename std::remove_const<wthr_type>::type>::lin(wthr) *
            has_work_gpu_cl_lin_blocks_<typename std::remove_const<thr_type>::type>::lin(thr)) != 0;
}
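
// Example (sketch, not part of the original header): a launch with any zero
// extent carries no work, which is what the macros below use to skip the call.
inline void has_work_example()
{
    dim3 wthr(4,1,1);
    dim3 thr(0,1,1);
    bool do_launch = has_work_gpu_cl_(wthr, thr);   // false: 4*1*1 * 0*1*1 == 0
    (void)do_launch;
}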
// Kernel-launch macros. With PRINT_CUDA_LAUNCHES defined every launch is
// logged; otherwise the kernel is launched only when the grid actually
// contains work (see has_work_gpu_cl_ above).
#ifdef PRINT_CUDA_LAUNCHES

#define CUDA_LAUNCH(cuda_call,ite, ...)\
        {\
        std::cout << "Launching: " << #cuda_call << std::endl;\
        \
        hipLaunchKernelGGL(HIP_KERNEL_NAME(cuda_call), dim3(ite.wthr), dim3(ite.thr), 0, 0, __VA_ARGS__);\
        \
        CHECK_SE_CLASS1_POST(#cuda_call,__VA_ARGS__)\
        }

#define CUDA_LAUNCH_DIM3(cuda_call,wthr_,thr_, ...)\
        {\
        std::cout << "Launching: " << #cuda_call << std::endl;\
        \
        hipLaunchKernelGGL(HIP_KERNEL_NAME(cuda_call), dim3(wthr_), dim3(thr_), 0, 0, __VA_ARGS__);\
        \
        CHECK_SE_CLASS1_POST(#cuda_call,__VA_ARGS__)\
        }

#else

#define CUDA_LAUNCH(cuda_call,ite, ...) \
        {\
        if (has_work_gpu_cl_(ite.wthr,ite.thr) == true)\
        {hipLaunchKernelGGL(HIP_KERNEL_NAME(cuda_call), dim3(ite.wthr), dim3(ite.thr), 0, 0, __VA_ARGS__);}\
        \
        CHECK_SE_CLASS1_POST(#cuda_call,__VA_ARGS__)\
        }

#define CUDA_LAUNCH_DIM3(cuda_call,wthr_,thr_, ...)\
        {\
        if (has_work_gpu_cl_(wthr_,thr_) == true)\
        {hipLaunchKernelGGL(HIP_KERNEL_NAME(cuda_call),wthr_,thr_, 0, 0, __VA_ARGS__);}\
        \
        CHECK_SE_CLASS1_POST(#cuda_call,__VA_ARGS__)\
        }

#define CUDA_LAUNCH_LAMBDA(ite,lambda_f, ...)\
        {\
        if (has_work_gpu_cl_(ite.wthr,ite.thr) == true)\
        {hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel_launch_lambda),ite.wthr,ite.thr, 0, 0, lambda_f);}\
        \
        CHECK_SE_CLASS1_POST("kernel_launch_lambda",__VA_ARGS__)\
        }

#define CUDA_LAUNCH_LAMBDA_TLS(ite, lambda_f, ...) \
        {\
        if (ite.wthr.x != 0)\
        {hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel_launch_lambda_tls),ite.wthr,ite.thr,0,0,lambda_f);}\
        CHECK_SE_CLASS1_POST("kernel_launch_lambda_tls",__VA_ARGS__)\
        }

#define CUDA_LAUNCH_LAMBDA_DIM3_TLS(wthr_,thr_, lambda_f, ...) \
        {\
        dim3 wthr__(wthr_);\
        dim3 thr__(thr_);\
        if (wthr__.x != 0)\
        {hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel_launch_lambda_tls),wthr_,thr_, 0, 0, lambda_f);}\
        CHECK_SE_CLASS1_POST("kernel_launch_lambda_tls",__VA_ARGS__)\
        }

#define CUDA_LAUNCH_LAMBDA_DIM3(wthr_,thr_, lambda_f, ...) \
        {\
        dim3 wthr__(wthr_);\
        dim3 thr__(thr_);\
        if (wthr__.x != 0)\
        {hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel_launch_lambda),wthr_,thr_, 0, 0, lambda_f);}\
        CHECK_SE_CLASS1_POST("kernel_launch_lambda",__VA_ARGS__)\
        }

#endif
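
// Example (sketch, not part of the original header): typical use of the launch
// macros. The ite object is a hand-built stand-in for OpenFPM's launch-info
// objects (assumption: the macros only touch its .wthr and .thr members);
// fill_kernel and launch_example are illustrative names, and bounds checks
// are omitted for brevity.
template<typename T>
__global__ void fill_kernel(T * v, T a)
{
    v[blockIdx.x * blockDim.x + threadIdx.x] = a;
}

inline void launch_example(float * dev, size_t n)
{
    struct launch_info { dim3 wthr; dim3 thr; };
    launch_info ite { dim3((unsigned int)((n + 255) / 256)), dim3(256) };

    // Regular kernel: skipped entirely when the grid or block is empty.
    CUDA_LAUNCH(fill_kernel, ite, dev, 1.0f);

    // Lambda form: kernel_launch_lambda invokes f(blockIdx, threadIdx).
    CUDA_LAUNCH_LAMBDA(ite, ([dev] __device__ (dim3 bid, dim3 tid)
    {
        dev[bid.x * 256 + tid.x] = 2.0f;
    }));
}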