OpenFPM  5.2.0
Project that contains the implementation of distributed structures
cudify_hip.hpp
#ifndef CUDIFY_HIP_HPP_
#define CUDIFY_HIP_HPP_

#include "config.h"

#define CUDA_ON_BACKEND CUDA_BACKEND_HIP

#ifdef __NVCC__
    #undef __NVCC__
    #include <hip/hip_runtime.h>
    #define __NVCC__
#else
    #include <hip/hip_runtime.h>
#endif

constexpr int default_kernel_wg_threads_ = 256;

typedef hipError_t cudaError_t;
typedef hipStream_t cudaStream_t;
typedef hipDeviceProp_t cudaDeviceProp_t;
typedef cudaDeviceProp_t cudaDeviceProp;
typedef hipEvent_t cudaEvent_t;
typedef hipFuncAttributes cudaFuncAttributes;

#define cudaSuccess hipSuccess

static void init_wrappers()
{}

enum cudaMemcpyKind
{
    cudaMemcpyHostToHost = 0,
    cudaMemcpyHostToDevice = 1,
    cudaMemcpyDeviceToHost = 2,
    cudaMemcpyDeviceToDevice = 3,
    cudaMemcpyDefault = 4
};

static cudaError_t cudaMemcpyToSymbol(unsigned char * global_cuda_error_array,const void * mem,size_t sz,int offset,cudaMemcpyKind opt)
{
    hipMemcpyKind opt_;

    switch (opt)
    {
    case cudaMemcpyHostToHost:
        opt_ = hipMemcpyHostToHost;
        break;

    case cudaMemcpyHostToDevice:
        opt_ = hipMemcpyHostToDevice;
        break;

    case cudaMemcpyDeviceToHost:
        opt_ = hipMemcpyDeviceToHost;
        break;

    case cudaMemcpyDeviceToDevice:
        opt_ = hipMemcpyDeviceToDevice;
        break;

    default:
        opt_ = hipMemcpyDefault;
        break;
    }

    return hipMemcpyToSymbol(global_cuda_error_array,mem,sz,offset,opt_);
}

static cudaError_t cudaDeviceSynchronize()
{
    return hipDeviceSynchronize();
}

static cudaError_t cudaMemcpyFromSymbol(void * dev_mem,const unsigned char * global_cuda_error_array,size_t sz)
{
    return hipMemcpyFromSymbol(dev_mem,global_cuda_error_array,sz);
}

static const char* cudaGetErrorString ( cudaError_t error )
{
    return hipGetErrorString(error);
}

static cudaError_t cudaGetDevice ( int* device )
{
    return hipGetDevice(device);
}

static cudaError_t cudaSetDevice ( int device )
{
    return hipSetDevice(device);
}

static cudaError_t cudaMemGetInfo ( size_t* free, size_t* total )
{
    return hipMemGetInfo(free,total);
}

static cudaError_t cudaFuncGetAttributes ( cudaFuncAttributes* attr, const void* func )
{
    return hipFuncGetAttributes(attr,func);
}

static cudaError_t cudaGetDeviceProperties ( cudaDeviceProp* prop, int device )
{
    return hipGetDeviceProperties(prop,device);
}

static cudaError_t cudaEventCreate ( cudaEvent_t* event )
{
    return hipEventCreate(event);
}

static cudaError_t cudaEventDestroy ( cudaEvent_t event )
{
    return hipEventDestroy(event);
}

static cudaError_t cudaMalloc ( void** devPtr, size_t size )
{
    return hipMalloc(devPtr,size);
}

static cudaError_t cudaMallocHost ( void** ptr, size_t size )
{
    return hipHostMalloc(ptr,size);
}

static cudaError_t cudaFree ( void* devPtr )
{
    return hipFree(devPtr);
}

static cudaError_t cudaFreeHost ( void* ptr )
{
    return hipHostFree(ptr);
}

static cudaError_t cudaStreamSynchronize ( cudaStream_t stream )
{
    return hipStreamSynchronize(stream);
}

static cudaError_t cudaEventRecord ( cudaEvent_t event, cudaStream_t stream = 0 )
{
    return hipEventRecord(event,stream);
}

static cudaError_t cudaEventSynchronize ( cudaEvent_t event )
{
    return hipEventSynchronize(event);
}

static cudaError_t cudaEventElapsedTime ( float* ms, cudaEvent_t start, cudaEvent_t end )
{
    return hipEventElapsedTime(ms,start,end);
}

static cudaError_t cudaGetDeviceCount ( int* count )
{
    return hipGetDeviceCount(count);
}

static cudaError_t cudaMemcpy ( void* dst, const void* src, size_t count, cudaMemcpyKind opt )
{
    hipMemcpyKind opt_;

    switch (opt)
    {
    case cudaMemcpyHostToHost:
        opt_ = hipMemcpyHostToHost;
        break;

    case cudaMemcpyHostToDevice:
        opt_ = hipMemcpyHostToDevice;
        break;

    case cudaMemcpyDeviceToHost:
        opt_ = hipMemcpyDeviceToHost;
        break;

    case cudaMemcpyDeviceToDevice:
        opt_ = hipMemcpyDeviceToDevice;
        break;

    default:
        opt_ = hipMemcpyDefault;
        break;
    }

    return hipMemcpy(dst,src,count,opt_);
}

#ifdef __HIPCC__

#include "util/cudify/cudify_hardware_cpu.hpp"
#include <vector>
#include <string.h>
#include "hipcub/hipcub.hpp"
#include "hipcub/block/block_scan.hpp"

template<typename lambda_f>
__global__ void kernel_launch_lambda(lambda_f f)
{
    dim3 bid = blockIdx;
    dim3 tid = threadIdx;
    f(bid,tid);
}

template<typename lambda_f>
__global__ void kernel_launch_lambda_tls(lambda_f f)
{
    f();
}

// Expose hipCUB's BlockScan under the cub:: name, so CUB-style code compiles unchanged on HIP
namespace cub
{
    template<typename T, unsigned int bd>
    using BlockScan = hipcub::BlockScan<T,bd>;
}

// Return the total number of work items from either a dim3-like or an integral grid/block specification
template<typename T>
struct has_work_gpu_cl_lin_blocks_
{
    static unsigned int lin(const T & b)
    {
        return b.x * b.y * b.z;
    }
};

template<>
struct has_work_gpu_cl_lin_blocks_<unsigned int>
{
    static unsigned int lin(const unsigned int & b)
    {
        return b;
    }
};

template<>
struct has_work_gpu_cl_lin_blocks_<unsigned long>
{
    static unsigned int lin(const unsigned long & b)
    {
        return b;
    }
};

template<>
struct has_work_gpu_cl_lin_blocks_<int>
{
    static unsigned int lin(const int & b)
    {
        return b;
    }
};

// A launch has work to do only if both the grid and the block sizes are non-zero
template<typename wthr_type, typename thr_type>
bool has_work_gpu_cl_(const wthr_type & wthr, const thr_type & thr)
{
    return (has_work_gpu_cl_lin_blocks_<typename std::remove_const<wthr_type>::type>::lin(wthr) *
            has_work_gpu_cl_lin_blocks_<typename std::remove_const<thr_type>::type>::lin(thr)) != 0;
}

#ifdef PRINT_CUDA_LAUNCHES

#define CUDA_LAUNCH(cuda_call,ite, ...)\
        {\
        CHECK_SE_CLASS1_PRE\
        \
        std::cout << "Launching: " << #cuda_call << std::endl;\
        \
        hipLaunchKernelGGL(HIP_KERNEL_NAME(cuda_call), dim3(ite.wthr), dim3(ite.thr), 0, 0, __VA_ARGS__);\
        \
        CHECK_SE_CLASS1_POST(#cuda_call,__VA_ARGS__)\
        }


#define CUDA_LAUNCH_DIM3(cuda_call,wthr_,thr_, ...)\
        {\
        dim3 wthr__(wthr_);\
        dim3 thr__(thr_);\
        \
        ite_gpu<1> itg;\
        itg.wthr = wthr__;\
        itg.thr = thr__;\
        \
        CHECK_SE_CLASS1_PRE\
        std::cout << "Launching: " << #cuda_call << std::endl;\
        \
        hipLaunchKernelGGL(HIP_KERNEL_NAME(cuda_call), dim3(itg.wthr), dim3(itg.thr), 0, 0, __VA_ARGS__);\
        \
        CHECK_SE_CLASS1_POST(#cuda_call,__VA_ARGS__)\
        }

#define CUDA_CHECK()

#else

#define CUDA_LAUNCH(cuda_call,ite, ...) \
        {\
        CHECK_SE_CLASS1_PRE\
        \
        if (has_work_gpu_cl_(ite.wthr,ite.thr) == true)\
        {hipLaunchKernelGGL(HIP_KERNEL_NAME(cuda_call), dim3(ite.wthr), dim3(ite.thr), 0, 0, __VA_ARGS__);}\
        \
        CHECK_SE_CLASS1_POST(#cuda_call,__VA_ARGS__)\
        }


#define CUDA_LAUNCH_DIM3(cuda_call,wthr_,thr_, ...)\
        {\
        CHECK_SE_CLASS1_PRE\
        \
        if (has_work_gpu_cl_(wthr_,thr_) == true)\
        {hipLaunchKernelGGL(HIP_KERNEL_NAME(cuda_call),wthr_,thr_, 0, 0, __VA_ARGS__);}\
        \
        CHECK_SE_CLASS1_POST(#cuda_call,__VA_ARGS__)\
        }

#define CUDA_LAUNCH_LAMBDA(ite,lambda_f, ...)\
        {\
        CHECK_SE_CLASS1_PRE\
        \
        if (has_work_gpu_cl_(ite.wthr,ite.thr) == true)\
        {hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel_launch_lambda),ite.wthr,ite.thr, 0, 0, lambda_f);}\
        \
        CHECK_SE_CLASS1_POST("kernel_launch_lambda",__VA_ARGS__)\
        }

#define CUDA_LAUNCH_LAMBDA_TLS(ite, lambda_f, ...) \
        {\
        CHECK_SE_CLASS1_PRE\
        if (ite.wthr.x != 0)\
        {hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel_launch_lambda_tls),ite.wthr,ite.thr,0,0,lambda_f);}\
        CHECK_SE_CLASS1_POST("kernel_launch_lambda_tls",__VA_ARGS__)\
        }

#define CUDA_LAUNCH_LAMBDA_DIM3_TLS(wthr_,thr_, lambda_f, ...) \
        {\
        dim3 wthr__(wthr_);\
        dim3 thr__(thr_);\
        CHECK_SE_CLASS1_PRE\
        if (wthr__.x != 0)\
        {hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel_launch_lambda_tls),wthr_,thr_, 0, 0, lambda_f);}\
        CHECK_SE_CLASS1_POST("kernel_launch_lambda_tls",__VA_ARGS__)\
        }

#define CUDA_LAUNCH_LAMBDA_DIM3(wthr_,thr_, lambda_f, ...) \
        {\
        dim3 wthr__(wthr_);\
        dim3 thr__(thr_);\
        CHECK_SE_CLASS1_PRE\
        if (wthr__.x != 0)\
        {hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel_launch_lambda),wthr_,thr_, 0, 0, lambda_f);}\
        CHECK_SE_CLASS1_POST("kernel_launch_lambda",__VA_ARGS__)\
        }

#define CUDA_CHECK()

#endif // PRINT_CUDA_LAUNCHES

#endif // __HIPCC__

#endif // CUDIFY_HIP_HPP_
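Usage illustration (not part of the header): a minimal sketch of a translation unit compiled with hipcc inside an OpenFPM build, showing the CUDA-named wrappers forwarding to HIP and a CUDA_LAUNCH_DIM3 kernel launch. The kernel `scale`, the sizes, and the empty fallback definitions of CHECK_SE_CLASS1_PRE / CHECK_SE_CLASS1_POST are assumptions made for the example (the fallbacks only match OpenFPM's behaviour when SE_CLASS1 checking is disabled).

// Hypothetical usage sketch, assuming an OpenFPM build with hipcc and SE_CLASS1 checking off
#include "cudify_hip.hpp"
#include <cstdio>

#ifndef CHECK_SE_CLASS1_PRE
#define CHECK_SE_CLASS1_PRE
#define CHECK_SE_CLASS1_POST(...)
#endif

// Example kernel: multiply every element of v by a
__global__ void scale(float * v, float a, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {v[i] *= a;}
}

int main()
{
    const int n = 1024;
    float host[n];
    for (int i = 0 ; i < n ; i++) {host[i] = 1.0f;}

    float * dev = nullptr;
    cudaMalloc((void **)&dev,n*sizeof(float));                    // forwards to hipMalloc
    cudaMemcpy(dev,host,n*sizeof(float),cudaMemcpyHostToDevice);  // forwards to hipMemcpy

    // Expands to hipLaunchKernelGGL, guarded by has_work_gpu_cl_
    CUDA_LAUNCH_DIM3(scale,4,default_kernel_wg_threads_,dev,2.0f,n);

    cudaDeviceSynchronize();                                      // forwards to hipDeviceSynchronize
    cudaMemcpy(host,dev,n*sizeof(float),cudaMemcpyDeviceToHost);
    printf("host[0] = %f\n",host[0]);

    cudaFree(dev);
    return 0;
}

In OpenFPM code the launch geometry is typically an ite_gpu object obtained from a container's GPU iterator and passed to CUDA_LAUNCH; the DIM3 variant is used here only to keep the sketch free of that machinery.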