#ifndef CUDIFY_HIP_HPP_
#define CUDIFY_HIP_HPP_

#define CUDA_ON_BACKEND CUDA_BACKEND_HIP

#include <hip/hip_runtime.h>
constexpr int default_kernel_wg_threads_ = 256;

typedef hipError_t cudaError_t;
typedef hipStream_t cudaStream_t;
typedef hipDeviceProp_t cudaDeviceProp_t;
typedef cudaDeviceProp_t cudaDeviceProp;
typedef hipEvent_t cudaEvent_t;
typedef hipFuncAttributes cudaFuncAttributes;

#define cudaSuccess hipSuccess
static void init_wrappers() {}

enum cudaMemcpyKind
{
    cudaMemcpyHostToHost = 0,
    cudaMemcpyHostToDevice = 1,
    cudaMemcpyDeviceToHost = 2,
    cudaMemcpyDeviceToDevice = 3,
    cudaMemcpyDefault = 4
};
static cudaError_t cudaMemcpyToSymbol(unsigned char * global_cuda_error_array, const void * mem, size_t sz, int offset, cudaMemcpyKind opt)
{
    hipMemcpyKind opt_;

    switch (opt)
    {
    case cudaMemcpyHostToHost:
        opt_ = hipMemcpyHostToHost;
        break;
    case cudaMemcpyHostToDevice:
        opt_ = hipMemcpyHostToDevice;
        break;
    case cudaMemcpyDeviceToHost:
        opt_ = hipMemcpyDeviceToHost;
        break;
    case cudaMemcpyDeviceToDevice:
        opt_ = hipMemcpyDeviceToDevice;
        break;
    default:
        opt_ = hipMemcpyDefault;
        break;
    }

    return hipMemcpyToSymbol(global_cuda_error_array,mem,sz,offset,opt_);
}
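// Example (illustrative sketch only; the host buffer is hypothetical): the
// wrapper is tailored to OpenFPM's device-side error array, translating the
// CUDA copy kind before forwarding to hipMemcpyToSymbol:
//
//   __device__ unsigned char global_cuda_error_array[256];
//   unsigned char host_flags[256] = {0};
//   cudaMemcpyToSymbol(global_cuda_error_array, host_flags,
//                      sizeof(host_flags), 0, cudaMemcpyHostToDevice);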
static cudaError_t cudaDeviceSynchronize()
{
    return hipDeviceSynchronize();
}

static cudaError_t cudaMemcpyFromSymbol(void * dev_mem, const unsigned char * global_cuda_error_array, size_t sz)
{
    return hipMemcpyFromSymbol(dev_mem,global_cuda_error_array,sz);
}

static const char* cudaGetErrorString ( cudaError_t error )
{
    return hipGetErrorString(error);
}

static cudaError_t cudaGetDevice ( int* device )
{
    return hipGetDevice(device);
}

static cudaError_t cudaSetDevice ( int device )
{
    return hipSetDevice(device);
}

static cudaError_t cudaMemGetInfo ( size_t* free, size_t* total )
{
    return hipMemGetInfo(free,total);
}
static cudaError_t cudaFuncGetAttributes ( cudaFuncAttributes* attr, const void* func )
{
    return hipFuncGetAttributes(attr,func);
}

static cudaError_t cudaGetDeviceProperties ( cudaDeviceProp* prop, int device )
{
    return hipGetDeviceProperties(prop,device);
}

static cudaError_t cudaEventCreate ( cudaEvent_t* event )
{
    return hipEventCreate(event);
}

static cudaError_t cudaEventDestroy ( cudaEvent_t event )
{
    return hipEventDestroy(event);
}

static cudaError_t cudaMalloc ( void** devPtr, size_t size )
{
    return hipMalloc(devPtr,size);
}

static cudaError_t cudaMallocHost ( void** ptr, size_t size )
{
    return hipHostMalloc(ptr,size);
}

static cudaError_t cudaFree ( void* devPtr )
{
    return hipFree(devPtr);
}

static cudaError_t cudaFreeHost ( void* ptr )
{
    return hipHostFree(ptr);
}

static cudaError_t cudaStreamSynchronize ( cudaStream_t stream )
{
    return hipStreamSynchronize(stream);
}

static cudaError_t cudaEventRecord ( cudaEvent_t event, cudaStream_t stream = 0 )
{
    return hipEventRecord(event,stream);
}

static cudaError_t cudaEventSynchronize ( cudaEvent_t event )
{
    return hipEventSynchronize(event);
}

static cudaError_t cudaEventElapsedTime ( float* ms, cudaEvent_t start, cudaEvent_t end )
{
    return hipEventElapsedTime(ms,start,end);
}

static cudaError_t cudaGetDeviceCount ( int* count )
{
    return hipGetDeviceCount(count);
}
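// Example (illustrative sketch only): the event wrappers let existing
// CUDA-style timing code run unchanged on top of HIP:
//
//   cudaEvent_t start, stop;
//   cudaEventCreate(&start);
//   cudaEventCreate(&stop);
//   cudaEventRecord(start);
//   /* ... launch work ... */
//   cudaEventRecord(stop);
//   cudaEventSynchronize(stop);
//   float ms = 0.0f;
//   cudaEventElapsedTime(&ms, start, stop);
//   cudaEventDestroy(start);
//   cudaEventDestroy(stop);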
static cudaError_t cudaMemcpy ( void* dst, const void* src, size_t count, cudaMemcpyKind opt )
{
    hipMemcpyKind opt_;

    switch (opt)
    {
    case cudaMemcpyHostToHost:
        opt_ = hipMemcpyHostToHost;
        break;
    case cudaMemcpyHostToDevice:
        opt_ = hipMemcpyHostToDevice;
        break;
    case cudaMemcpyDeviceToHost:
        opt_ = hipMemcpyDeviceToHost;
        break;
    case cudaMemcpyDeviceToDevice:
        opt_ = hipMemcpyDeviceToDevice;
        break;
    default:
        opt_ = hipMemcpyDefault;
        break;
    }

    return hipMemcpy(dst,src,count,opt_);
}
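// Example (illustrative sketch only): a host->device->host round trip written
// against the CUDA names resolves entirely to HIP calls through this shim:
//
//   float host_buf[64] = { /* ... */ };
//   float * dev_buf = nullptr;
//   cudaMalloc((void **)&dev_buf, sizeof(host_buf));
//   cudaMemcpy(dev_buf, host_buf, sizeof(host_buf), cudaMemcpyHostToDevice);
//   /* ... run kernels on dev_buf ... */
//   cudaMemcpy(host_buf, dev_buf, sizeof(host_buf), cudaMemcpyDeviceToHost);
//   cudaFree(dev_buf);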
#include "util/cudify/cudify_hardware_cpu.hpp"
#include "hipcub/hipcub.hpp"
#include "hipcub/block/block_scan.hpp"
#include <type_traits> // for std::remove_const used by has_work_gpu_cl_ below
template<typename lambda_f>
__global__ void kernel_launch_lambda(lambda_f f)
{
    dim3 bid = blockIdx;
    dim3 tid = threadIdx;
    f(bid,tid);
}

template<typename lambda_f>
__global__ void kernel_launch_lambda_tls(lambda_f f)
{
    f();
}
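// Example (illustrative sketch only; the iterator is hypothetical):
// CUDA_LAUNCH_LAMBDA below runs a host-defined lambda through the
// kernel_launch_lambda trampoline, which hands it the block and thread
// indices:
//
//   auto ite = some_grid.getGPUIterator();  // object with .wthr/.thr members
//   CUDA_LAUNCH_LAMBDA(ite, [=] __device__ (dim3 bid, dim3 tid)
//   {
//       /* ... per-thread work ... */
//   });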
template<typename T, unsigned int bd>
using BlockScan = hipcub::BlockScan<T,bd>;
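// Example (illustrative sketch only): the alias exposes hipcub's block-wide
// scan under the name the CUDA code path uses:
//
//   template<unsigned int blk_sz>
//   __global__ void scan_block(int * data)
//   {
//       __shared__ typename BlockScan<int,blk_sz>::TempStorage temp;
//       int val = data[threadIdx.x];
//       BlockScan<int,blk_sz>(temp).ExclusiveSum(val,val);
//       data[threadIdx.x] = val;
//   }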
template<typename T>
struct has_work_gpu_cl_lin_blocks_
{
    static unsigned int lin(const T & b)
    {return b.x * b.y * b.z;}
};

template<>
struct has_work_gpu_cl_lin_blocks_<unsigned int>
{
    static unsigned int lin(const unsigned int & b)
    {return b;}
};

template<>
struct has_work_gpu_cl_lin_blocks_<unsigned long>
{
    static unsigned int lin(const unsigned long & b)
    {return b;}
};

template<>
struct has_work_gpu_cl_lin_blocks_<int>
{
    static unsigned int lin(const int & b)
    {return b;}
};
template<typename wthr_type, typename thr_type>
bool has_work_gpu_cl_(const wthr_type & wthr, const thr_type & thr)
{
    return (has_work_gpu_cl_lin_blocks_<typename std::remove_const<wthr_type>::type>::lin(wthr) *
            has_work_gpu_cl_lin_blocks_<typename std::remove_const<thr_type>::type>::lin(thr)) != 0;
}
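// Example (illustrative sketch only): the guard evaluates to false for an
// empty launch configuration, which is what lets CUDA_LAUNCH below skip the
// kernel call entirely:
//
//   has_work_gpu_cl_(dim3(0,1,1), dim3(256,1,1));  // false, nothing to run
//   has_work_gpu_cl_(dim3(4,1,1), dim3(256,1,1));  // true, launch proceeds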
#ifdef PRINT_CUDA_LAUNCHES

#define CUDA_LAUNCH(cuda_call,ite, ...)\
        {\
        std::cout << "Launching: " << #cuda_call << std::endl;\
        hipLaunchKernelGGL(HIP_KERNEL_NAME(cuda_call), dim3(ite.wthr), dim3(ite.thr), 0, 0, __VA_ARGS__);\
        CHECK_SE_CLASS1_POST(#cuda_call,__VA_ARGS__)\
        }

#define CUDA_LAUNCH_DIM3(cuda_call,wthr_,thr_, ...)\
        {\
        std::cout << "Launching: " << #cuda_call << std::endl;\
        hipLaunchKernelGGL(HIP_KERNEL_NAME(cuda_call), dim3(wthr_), dim3(thr_), 0, 0, __VA_ARGS__);\
        CHECK_SE_CLASS1_POST(#cuda_call,__VA_ARGS__)\
        }
#else

#define CUDA_LAUNCH(cuda_call,ite, ...) \
        {\
        if (has_work_gpu_cl_(ite.wthr,ite.thr) == true)\
        {hipLaunchKernelGGL(HIP_KERNEL_NAME(cuda_call), dim3(ite.wthr), dim3(ite.thr), 0, 0, __VA_ARGS__);}\
        CHECK_SE_CLASS1_POST(#cuda_call,__VA_ARGS__)\
        }

#define CUDA_LAUNCH_DIM3(cuda_call,wthr_,thr_, ...)\
        {\
        if (has_work_gpu_cl_(wthr_,thr_) == true)\
        {hipLaunchKernelGGL(HIP_KERNEL_NAME(cuda_call),wthr_,thr_, 0, 0, __VA_ARGS__);}\
        CHECK_SE_CLASS1_POST(#cuda_call,__VA_ARGS__)\
        }

#define CUDA_LAUNCH_LAMBDA(ite,lambda_f, ...)\
        {\
        if (has_work_gpu_cl_(ite.wthr,ite.thr) == true)\
        {hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel_launch_lambda),ite.wthr,ite.thr, 0, 0, lambda_f);}\
        CHECK_SE_CLASS1_POST("kernel_launch_lambda",__VA_ARGS__)\
        }

#define CUDA_LAUNCH_LAMBDA_TLS(ite, lambda_f, ...) \
        {\
        if (ite.wthr.x != 0)\
        {hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel_launch_lambda_tls),ite.wthr,ite.thr,0,0,lambda_f);}\
        CHECK_SE_CLASS1_POST("kernel_launch_lambda",__VA_ARGS__)\
        }

#define CUDA_LAUNCH_LAMBDA_DIM3_TLS(wthr_,thr_, lambda_f, ...) \
        {\
        if (has_work_gpu_cl_(wthr_,thr_) == true)\
        {hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel_launch_lambda_tls),wthr_,thr_, 0, 0, lambda_f);}\
        CHECK_SE_CLASS1_POST("kernel_launch_lambda",__VA_ARGS__)\
        }

#define CUDA_LAUNCH_LAMBDA_DIM3(wthr_,thr_, lambda_f, ...) \
        {\
        if (has_work_gpu_cl_(wthr_,thr_) == true)\
        {hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel_launch_lambda),wthr_,thr_, 0, 0, lambda_f);}\
        CHECK_SE_CLASS1_POST("kernel_launch_lambda",__VA_ARGS__)\
        }

#endif
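// Example (illustrative sketch only; the kernel and launch-site names are
// hypothetical): a CUDA-style launch site compiles unchanged against the HIP
// backend through the macros above:
//
//   template<typename T>
//   __global__ void fill_kernel(T * data, T val, size_t n)
//   {
//       size_t i = blockIdx.x * blockDim.x + threadIdx.x;
//       if (i < n) {data[i] = val;}
//   }
//
//   // ite is an ite_gpu-style object with .wthr/.thr members:
//   // CUDA_LAUNCH(fill_kernel<float>, ite, dev_ptr, 1.0f, n);
//   // CUDA_LAUNCH_DIM3(fill_kernel<float>, 4, 256, dev_ptr, 1.0f, n);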
#endif /* CUDIFY_HIP_HPP_ */