OpenFPM_pdata  4.1.0
Project that contains the implementation of distributed structures
cudify_hip.hpp
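Wrappers that map the CUDA runtime API and launch macros onto HIP, so OpenFPM code written against the CUDA names compiles unchanged on the HIP backend.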
#ifndef CUDIFY_HIP_HPP_
#define CUDIFY_HIP_HPP_

#include "config.h"

#define CUDA_ON_BACKEND CUDA_BACKEND_HIP

// Include hip_runtime.h with __NVCC__ temporarily undefined, so that the HIP
// headers do not mistake the compiler for nvcc; the macro is restored right
// after the include.
#ifdef __NVCC__
    #undef __NVCC__
    #include <hip/hip_runtime.h>
    #define __NVCC__
#else
    #include <hip/hip_runtime.h>
#endif

// Default number of threads per workgroup for kernel launches that do not
// specify their own block size.
constexpr int default_kernel_wg_threads_ = 256;

// Map the CUDA runtime types and status codes used by OpenFPM onto their HIP
// equivalents.
typedef hipError_t cudaError_t;
typedef hipStream_t cudaStream_t;
typedef hipDeviceProp_t cudaDeviceProp_t;
typedef cudaDeviceProp_t cudaDeviceProp;
typedef hipEvent_t cudaEvent_t;
typedef hipFuncAttributes cudaFuncAttributes;

#define cudaSuccess hipSuccess

// No-op on the HIP backend; kept so that code shared with the other cudify
// backends can call it unconditionally.
static void init_wrappers()
{}

enum cudaMemcpyKind
{
    cudaMemcpyHostToHost = 0,
    cudaMemcpyHostToDevice = 1,
    cudaMemcpyDeviceToHost = 2,
    cudaMemcpyDeviceToDevice = 3,
    cudaMemcpyDefault = 4
};

// Forward cudaMemcpyToSymbol to hipMemcpyToSymbol, translating the copy kind.
static cudaError_t cudaMemcpyToSymbol(unsigned char * global_cuda_error_array,const void * mem,size_t sz,int offset,cudaMemcpyKind opt)
{
    hipMemcpyKind opt_;

    switch (opt)
    {
    case cudaMemcpyHostToHost:
        opt_ = hipMemcpyHostToHost;
        break;

    case cudaMemcpyHostToDevice:
        opt_ = hipMemcpyHostToDevice;
        break;

    case cudaMemcpyDeviceToHost:
        opt_ = hipMemcpyDeviceToHost;
        break;

    case cudaMemcpyDeviceToDevice:
        opt_ = hipMemcpyDeviceToDevice;
        break;

    default:
        opt_ = hipMemcpyDefault;
        break;
    }

    return hipMemcpyToSymbol(global_cuda_error_array,mem,sz,offset,opt_);
}

static cudaError_t cudaDeviceSynchronize()
{
    return hipDeviceSynchronize();
}

static cudaError_t cudaMemcpyFromSymbol(void * dev_mem,const unsigned char * global_cuda_error_array,size_t sz)
{
    return hipMemcpyFromSymbol(dev_mem,global_cuda_error_array,sz);
}

static const char* cudaGetErrorString ( cudaError_t error )
{
    return hipGetErrorString(error);
}

static cudaError_t cudaGetDevice ( int* device )
{
    return hipGetDevice(device);
}

static cudaError_t cudaSetDevice ( int device )
{
    return hipSetDevice(device);
}

static cudaError_t cudaMemGetInfo ( size_t* free, size_t* total )
{
    return hipMemGetInfo(free,total);
}

static cudaError_t cudaFuncGetAttributes ( cudaFuncAttributes* attr, const void* func )
{
    return hipFuncGetAttributes(attr,func);
}

static cudaError_t cudaGetDeviceProperties ( cudaDeviceProp* prop, int device )
{
    return hipGetDeviceProperties(prop,device);
}

static cudaError_t cudaEventCreate ( cudaEvent_t* event )
{
    return hipEventCreate(event);
}

static cudaError_t cudaEventDestroy ( cudaEvent_t event )
{
    return hipEventDestroy(event);
}

static cudaError_t cudaMalloc ( void** devPtr, size_t size )
{
    return hipMalloc(devPtr,size);
}

static cudaError_t cudaMallocHost ( void** ptr, size_t size )
{
    return hipHostMalloc(ptr,size);
}

static cudaError_t cudaFree ( void* devPtr )
{
    return hipFree(devPtr);
}

static cudaError_t cudaFreeHost ( void* ptr )
{
    return hipHostFree(ptr);
}

static cudaError_t cudaStreamSynchronize ( cudaStream_t stream )
{
    return hipStreamSynchronize(stream);
}

static cudaError_t cudaEventRecord ( cudaEvent_t event, cudaStream_t stream = 0 )
{
    return hipEventRecord(event,stream);
}

static cudaError_t cudaEventSynchronize ( cudaEvent_t event )
{
    return hipEventSynchronize(event);
}

static cudaError_t cudaEventElapsedTime ( float* ms, cudaEvent_t start, cudaEvent_t end )
{
    return hipEventElapsedTime(ms,start,end);
}

static cudaError_t cudaGetDeviceCount ( int* count )
{
    return hipGetDeviceCount(count);
}

static cudaError_t cudaMemcpy ( void* dst, const void* src, size_t count, cudaMemcpyKind opt )
{
    hipMemcpyKind opt_;

    switch (opt)
    {
    case cudaMemcpyHostToHost:
        opt_ = hipMemcpyHostToHost;
        break;

    case cudaMemcpyHostToDevice:
        opt_ = hipMemcpyHostToDevice;
        break;

    case cudaMemcpyDeviceToHost:
        opt_ = hipMemcpyDeviceToHost;
        break;

    case cudaMemcpyDeviceToDevice:
        opt_ = hipMemcpyDeviceToDevice;
        break;

    default:
        opt_ = hipMemcpyDefault;
        break;
    }

    return hipMemcpy(dst,src,count,opt_);
}
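
// Usage sketch (illustrative addition, not part of the original header): with
// the wrappers above, CUDA-style host code compiles unchanged on HIP. The
// buffer names and sizes here are hypothetical.
//
//   float host_buf[64];
//   float * dev_buf;
//   cudaMalloc((void **)&dev_buf,sizeof(host_buf));
//   cudaMemcpy(dev_buf,host_buf,sizeof(host_buf),cudaMemcpyHostToDevice);
//   cudaDeviceSynchronize();
//   cudaFree(dev_buf);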

#ifdef __HIPCC__

#include "cudify_hardware_common.hpp"
#include "util/cuda_util.hpp"
#include <vector>
#include <string.h>
#include <type_traits>
#include "hipcub/hipcub.hpp"
#include "hipcub/block/block_scan.hpp"

// Trampoline kernel: forwards the block and thread indices to a device lambda.
template<typename lambda_f>
__global__ void kernel_launch_lambda(lambda_f f)
{
    dim3 bid = blockIdx;
    dim3 tid = threadIdx;
    f(bid,tid);
}

// Trampoline kernel for lambdas that query the indices themselves.
template<typename lambda_f>
__global__ void kernel_launch_lambda_tls(lambda_f f)
{
    f();
}

// Expose hipCUB's BlockScan under the cub:: name used by the CUDA backend.
namespace cub
{
    template<typename T, unsigned int bd>
    using BlockScan = hipcub::BlockScan<T,bd>;
}
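
// Illustrative sketch (not part of the original header): device code written
// against cub::BlockScan resolves to hipcub::BlockScan through the alias
// above. The kernel name and the block size of 256 are assumptions.
//
//   __global__ void scan_example(int * data)
//   {
//       typedef cub::BlockScan<int,256> BlockScanT;
//       __shared__ typename BlockScanT::TempStorage temp;
//       int v = data[threadIdx.x];
//       BlockScanT(temp).ExclusiveSum(v,v);
//       data[threadIdx.x] = v;
//   }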

// Return the linear (total) size of a launch-configuration component, which
// can be a dim3-like object with x,y,z members or a plain integer.
template<typename T>
struct has_work_gpu_cl_lin_blocks_
{
    static unsigned int lin(const T & b)
    {
        return b.x * b.y * b.z;
    }
};

template<>
struct has_work_gpu_cl_lin_blocks_<unsigned int>
{
    static unsigned int lin(const unsigned int & b)
    {
        return b;
    }
};

template<>
struct has_work_gpu_cl_lin_blocks_<unsigned long>
{
    static unsigned int lin(const unsigned long & b)
    {
        return b;
    }
};

template<>
struct has_work_gpu_cl_lin_blocks_<int>
{
    static unsigned int lin(const int & b)
    {
        return b;
    }
};

// A launch configuration has work only if both the grid and the block contain
// at least one thread.
template<typename wthr_type, typename thr_type>
bool has_work_gpu_cl_(const wthr_type & wthr, const thr_type & thr)
{
    return (has_work_gpu_cl_lin_blocks_<typename std::remove_const<wthr_type>::type>::lin(wthr) *
            has_work_gpu_cl_lin_blocks_<typename std::remove_const<thr_type>::type>::lin(thr)) != 0;
}
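
// For example, has_work_gpu_cl_(dim3(4,1,1),dim3(0,1,1)) is false: a launch
// with zero threads per block has no work, and the CUDA_LAUNCH macros below
// skip the kernel launch entirely in that case.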

#ifdef PRINT_CUDA_LAUNCHES

#define CUDA_LAUNCH(cuda_call,ite, ...)\
    {\
    CHECK_SE_CLASS1_PRE\
    \
    std::cout << "Launching: " << #cuda_call << std::endl;\
    \
    hipLaunchKernelGGL(HIP_KERNEL_NAME(cuda_call), dim3(ite.wthr), dim3(ite.thr), 0, 0, __VA_ARGS__);\
    \
    CHECK_SE_CLASS1_POST(#cuda_call,__VA_ARGS__)\
    }


#define CUDA_LAUNCH_DIM3(cuda_call,wthr_,thr_, ...)\
    {\
    dim3 wthr__(wthr_);\
    dim3 thr__(thr_);\
    \
    ite_gpu<1> itg;\
    itg.wthr = wthr__;\
    itg.thr = thr__;\
    \
    CHECK_SE_CLASS1_PRE\
    std::cout << "Launching: " << #cuda_call << std::endl;\
    \
    hipLaunchKernelGGL(HIP_KERNEL_NAME(cuda_call), dim3(itg.wthr), dim3(itg.thr), 0, 0, __VA_ARGS__);\
    \
    CHECK_SE_CLASS1_POST(#cuda_call,__VA_ARGS__)\
    }

#define CUDA_CHECK()

#else

#define CUDA_LAUNCH(cuda_call,ite, ...) \
    {\
    CHECK_SE_CLASS1_PRE\
    \
    if (has_work_gpu_cl_(ite.wthr,ite.thr) == true)\
    {hipLaunchKernelGGL(HIP_KERNEL_NAME(cuda_call), dim3(ite.wthr), dim3(ite.thr), 0, 0, __VA_ARGS__);}\
    \
    CHECK_SE_CLASS1_POST(#cuda_call,__VA_ARGS__)\
    }


#define CUDA_LAUNCH_DIM3(cuda_call,wthr_,thr_, ...)\
    {\
    CHECK_SE_CLASS1_PRE\
    \
    if (has_work_gpu_cl_(wthr_,thr_) == true)\
    {hipLaunchKernelGGL(HIP_KERNEL_NAME(cuda_call),wthr_,thr_, 0, 0, __VA_ARGS__);}\
    \
    CHECK_SE_CLASS1_POST(#cuda_call,__VA_ARGS__)\
    }

#define CUDA_LAUNCH_LAMBDA(ite,lambda_f, ...)\
    {\
    CHECK_SE_CLASS1_PRE\
    \
    if (has_work_gpu_cl_(ite.wthr,ite.thr) == true)\
    {hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel_launch_lambda),ite.wthr,ite.thr, 0, 0, lambda_f);}\
    \
    CHECK_SE_CLASS1_POST("kernel_launch_lambda",__VA_ARGS__)\
    }

#define CUDA_LAUNCH_LAMBDA_TLS(ite, lambda_f, ...) \
    {\
    CHECK_SE_CLASS1_PRE\
    if (ite.wthr.x != 0)\
    {hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel_launch_lambda_tls),ite.wthr,ite.thr,0,0,lambda_f);}\
    CHECK_SE_CLASS1_POST("kernel_launch_lambda_tls",__VA_ARGS__)\
    }

#define CUDA_LAUNCH_LAMBDA_DIM3_TLS(wthr_,thr_, lambda_f, ...) \
    {\
    dim3 wthr__(wthr_);\
    dim3 thr__(thr_);\
    CHECK_SE_CLASS1_PRE\
    if (wthr__.x != 0)\
    {hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel_launch_lambda_tls),wthr__,thr__, 0, 0, lambda_f);}\
    CHECK_SE_CLASS1_POST("kernel_launch_lambda_tls",__VA_ARGS__)\
    }

#define CUDA_LAUNCH_LAMBDA_DIM3(wthr_,thr_, lambda_f, ...) \
    {\
    dim3 wthr__(wthr_);\
    dim3 thr__(thr_);\
    CHECK_SE_CLASS1_PRE\
    if (wthr__.x != 0)\
    {hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel_launch_lambda),wthr__,thr__, 0, 0, lambda_f);}\
    CHECK_SE_CLASS1_POST("kernel_launch_lambda",__VA_ARGS__)\
    }

#define CUDA_CHECK()

#endif
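
// Usage sketch (illustrative addition, not part of the original header): the
// kernel name, data pointer and size below are hypothetical; ite_gpu comes
// from the OpenFPM launch-configuration helpers.
//
//   ite_gpu<1> ite;
//   ite.wthr = dim3(n / default_kernel_wg_threads_ + 1, 1, 1);
//   ite.thr  = dim3(default_kernel_wg_threads_, 1, 1);
//   CUDA_LAUNCH(my_kernel,ite,dev_ptr,n);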

#endif


#endif