OpenFPM_pdata 4.1.0
Project that contains the implementation of distributed structures
 
cudify_hip.hpp
#ifndef CUDIFY_HIP_HPP_
#define CUDIFY_HIP_HPP_

#include "config.h"

#define CUDA_ON_BACKEND CUDA_BACKEND_HIP

// Temporarily hide __NVCC__ so that hip_runtime.h selects the HIP-native code path
#ifdef __NVCC__
    #undef __NVCC__
    #include <hip/hip_runtime.h>
    #define __NVCC__
#else
    #include <hip/hip_runtime.h>
#endif

constexpr int default_kernel_wg_threads_ = 256;

// Map the CUDA runtime types onto their HIP equivalents, so that CUDA-style
// call sites compile unchanged on the HIP backend
typedef hipError_t cudaError_t;
typedef hipStream_t cudaStream_t;
typedef hipDeviceProp_t cudaDeviceProp_t;
typedef cudaDeviceProp_t cudaDeviceProp;
typedef hipEvent_t cudaEvent_t;
typedef hipFuncAttributes cudaFuncAttributes;

#define cudaSuccess hipSuccess

static void init_wrappers()
{}

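// A minimal sketch of what the aliases above enable (hypothetical caller code,
// not part of this header): CUDA-style error handling that dispatches to HIP.
//
//   cudaError_t err = cudaSetDevice(0);   // calls hipSetDevice
//   if (err != cudaSuccess)
//   {std::cout << cudaGetErrorString(err) << std::endl;}
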
enum cudaMemcpyKind
{
    cudaMemcpyHostToHost = 0,
    cudaMemcpyHostToDevice = 1,
    cudaMemcpyDeviceToHost = 2,
    cudaMemcpyDeviceToDevice = 3,
    cudaMemcpyDefault = 4
};

static cudaError_t cudaMemcpyToSymbol(unsigned char * global_cuda_error_array,const void * mem,size_t sz,int offset,cudaMemcpyKind opt)
{
    // translate the CUDA copy kind into its HIP equivalent
    hipMemcpyKind opt_;

    switch (opt)
    {
    case cudaMemcpyHostToHost:
        opt_ = hipMemcpyHostToHost;
        break;

    case cudaMemcpyHostToDevice:
        opt_ = hipMemcpyHostToDevice;
        break;

    case cudaMemcpyDeviceToHost:
        opt_ = hipMemcpyDeviceToHost;
        break;

    case cudaMemcpyDeviceToDevice:
        opt_ = hipMemcpyDeviceToDevice;
        break;

    default:
        opt_ = hipMemcpyDefault;
        break;
    }

    return hipMemcpyToSymbol(global_cuda_error_array,mem,sz,offset,opt_);
}

static cudaError_t cudaDeviceSynchronize()
{
    return hipDeviceSynchronize();
}

static cudaError_t cudaMemcpyFromSymbol(void * dev_mem,const unsigned char * global_cuda_error_array,size_t sz)
{
    return hipMemcpyFromSymbol(dev_mem,global_cuda_error_array,sz);
}

static const char* cudaGetErrorString ( cudaError_t error )
{
    return hipGetErrorString(error);
}

static cudaError_t cudaGetDevice ( int* device )
{
    return hipGetDevice(device);
}

static cudaError_t cudaSetDevice ( int device )
{
    return hipSetDevice(device);
}

static cudaError_t cudaMemGetInfo ( size_t* free, size_t* total )
{
    return hipMemGetInfo(free,total);
}

static cudaError_t cudaFuncGetAttributes ( cudaFuncAttributes* attr, const void* func )
{
    return hipFuncGetAttributes(attr,func);
}

static cudaError_t cudaGetDeviceProperties ( cudaDeviceProp* prop, int device )
{
    return hipGetDeviceProperties(prop,device);
}

static cudaError_t cudaEventCreate ( cudaEvent_t* event )
{
    return hipEventCreate(event);
}

static cudaError_t cudaEventDestroy ( cudaEvent_t event )
{
    return hipEventDestroy(event);
}

static cudaError_t cudaMalloc ( void** devPtr, size_t size )
{
    return hipMalloc(devPtr,size);
}

static cudaError_t cudaMallocHost ( void** ptr, size_t size )
{
    return hipHostMalloc(ptr,size);
}

static cudaError_t cudaFree ( void* devPtr )
{
    return hipFree(devPtr);
}

static cudaError_t cudaFreeHost ( void* ptr )
{
    return hipHostFree(ptr);
}

static cudaError_t cudaStreamSynchronize ( cudaStream_t stream )
{
    return hipStreamSynchronize(stream);
}

static cudaError_t cudaEventRecord ( cudaEvent_t event, cudaStream_t stream = 0 )
{
    return hipEventRecord(event,stream);
}

static cudaError_t cudaEventSynchronize ( cudaEvent_t event )
{
    return hipEventSynchronize(event);
}

static cudaError_t cudaEventElapsedTime ( float* ms, cudaEvent_t start, cudaEvent_t end )
{
    return hipEventElapsedTime(ms,start,end);
}

static cudaError_t cudaGetDeviceCount ( int* count )
{
    return hipGetDeviceCount(count);
}

static cudaError_t cudaMemcpy ( void* dst, const void* src, size_t count, cudaMemcpyKind opt )
{
    hipMemcpyKind opt_;

    switch (opt)
    {
    case cudaMemcpyHostToHost:
        opt_ = hipMemcpyHostToHost;
        break;

    case cudaMemcpyHostToDevice:
        opt_ = hipMemcpyHostToDevice;
        break;

    case cudaMemcpyDeviceToHost:
        opt_ = hipMemcpyDeviceToHost;
        break;

    case cudaMemcpyDeviceToDevice:
        opt_ = hipMemcpyDeviceToDevice;
        break;

    default:
        opt_ = hipMemcpyDefault;
        break;
    }

    return hipMemcpy(dst,src,count,opt_);
}
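
// Hypothetical usage sketch (not part of this header): the wrappers above make
// a CUDA-style allocate/copy round trip work on HIP.
//
//   float h[16] = {0};
//   float * d;
//   cudaMalloc((void **)&d,sizeof(h));                  // hipMalloc
//   cudaMemcpy(d,h,sizeof(h),cudaMemcpyHostToDevice);   // hipMemcpy
//   cudaMemcpy(h,d,sizeof(h),cudaMemcpyDeviceToHost);
//   cudaFree(d);                                        // hipFree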

#ifdef __HIPCC__

#include "util/cudify/cudify_hardware_cpu.hpp"
#include "util/cuda_util.hpp"
#include <vector>
#include <string.h>
#include "hipcub/hipcub.hpp"
#include "hipcub/block/block_scan.hpp"

// Trampoline kernel: forwards the block/thread indices to a device lambda
template<typename lambda_f>
__global__ void kernel_launch_lambda(lambda_f f)
{
    dim3 bid = blockIdx;
    dim3 tid = threadIdx;
    f(bid,tid);
}

// Trampoline kernel for lambdas that read the indices themselves
template<typename lambda_f>
__global__ void kernel_launch_lambda_tls(lambda_f f)
{
    f();
}
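
// These trampolines are what the CUDA_LAUNCH_LAMBDA* macros below instantiate;
// a minimal sketch (hypothetical caller code, ite as in CUDA_LAUNCH below):
//
//   auto lamb = [=] __device__ (dim3 & bid, dim3 & tid) { /* uses bid,tid */ };
//   CUDA_LAUNCH_LAMBDA(ite,lamb);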

namespace cub
{
    template<typename T, unsigned int bd>
    using BlockScan = hipcub::BlockScan<T,bd>;
}
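
// Through this alias, code written against CUB's block scan compiles on HIP;
// a minimal sketch inside a kernel body (assuming a 256-thread block and a
// per-thread value thread_val):
//
//   __shared__ typename cub::BlockScan<int,256>::TempStorage temp;
//   int out;
//   cub::BlockScan<int,256>(temp).ExclusiveSum(thread_val,out);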

// Compute the linear number of work-items from a dim3-like or integral launch size
template<typename T>
struct has_work_gpu_cl_lin_blocks_
{
    static unsigned int lin(const T & b)
    {
        return b.x * b.y * b.z;
    }
};

template<>
struct has_work_gpu_cl_lin_blocks_<unsigned int>
{
    static unsigned int lin(const unsigned int & b)
    {
        return b;
    }
};

template<>
struct has_work_gpu_cl_lin_blocks_<unsigned long>
{
    static unsigned int lin(const unsigned long & b)
    {
        return b;
    }
};

template<>
struct has_work_gpu_cl_lin_blocks_<int>
{
    static unsigned int lin(const int & b)
    {
        return b;
    }
};

// Return true only if the launch configuration contains at least one thread,
// so that the CUDA_LAUNCH* macros below can skip empty launches
template<typename wthr_type, typename thr_type>
bool has_work_gpu_cl_(const wthr_type & wthr, const thr_type & thr)
{
    return (has_work_gpu_cl_lin_blocks_<typename std::remove_const<wthr_type>::type>::lin(wthr) *
            has_work_gpu_cl_lin_blocks_<typename std::remove_const<thr_type>::type>::lin(thr)) != 0;
}
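
// Example (hypothetical values): a zero-sized grid produces no launch
//
//   has_work_gpu_cl_(dim3(0,1,1),256u);   // false: CUDA_LAUNCH skips the kernel
//   has_work_gpu_cl_(dim3(4,1,1),256u);   // true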

#ifdef PRINT_CUDA_LAUNCHES

#define CUDA_LAUNCH(cuda_call,ite, ...)\
    {\
    CHECK_SE_CLASS1_PRE\
    \
    std::cout << "Launching: " << #cuda_call << std::endl;\
    \
    hipLaunchKernelGGL(HIP_KERNEL_NAME(cuda_call), dim3(ite.wthr), dim3(ite.thr), 0, 0, __VA_ARGS__);\
    \
    CHECK_SE_CLASS1_POST(#cuda_call,__VA_ARGS__)\
    }


#define CUDA_LAUNCH_DIM3(cuda_call,wthr_,thr_, ...)\
    {\
    dim3 wthr__(wthr_);\
    dim3 thr__(thr_);\
    \
    ite_gpu<1> itg;\
    itg.wthr = wthr__;\
    itg.thr = thr__;\
    \
    CHECK_SE_CLASS1_PRE\
    std::cout << "Launching: " << #cuda_call << std::endl;\
    \
    hipLaunchKernelGGL(HIP_KERNEL_NAME(cuda_call), dim3(itg.wthr), dim3(itg.thr), 0, 0, __VA_ARGS__);\
    \
    CHECK_SE_CLASS1_POST(#cuda_call,__VA_ARGS__)\
    }

#define CUDA_CHECK()

#else

#define CUDA_LAUNCH(cuda_call,ite, ...) \
    {\
    CHECK_SE_CLASS1_PRE\
    \
    if (has_work_gpu_cl_(ite.wthr,ite.thr) == true)\
    {hipLaunchKernelGGL(HIP_KERNEL_NAME(cuda_call), dim3(ite.wthr), dim3(ite.thr), 0, 0, __VA_ARGS__);}\
    \
    CHECK_SE_CLASS1_POST(#cuda_call,__VA_ARGS__)\
    }


#define CUDA_LAUNCH_DIM3(cuda_call,wthr_,thr_, ...)\
    {\
    \
    CHECK_SE_CLASS1_PRE\
    \
    if (has_work_gpu_cl_(wthr_,thr_) == true)\
    {hipLaunchKernelGGL(HIP_KERNEL_NAME(cuda_call),wthr_,thr_, 0, 0, __VA_ARGS__);}\
    \
    CHECK_SE_CLASS1_POST(#cuda_call,__VA_ARGS__)\
    }

#define CUDA_LAUNCH_LAMBDA(ite,lambda_f, ...)\
    {\
    \
    CHECK_SE_CLASS1_PRE\
    \
    if (has_work_gpu_cl_(ite.wthr,ite.thr) == true)\
    {hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel_launch_lambda),ite.wthr,ite.thr, 0, 0, lambda_f);}\
    \
    CHECK_SE_CLASS1_POST("kernel_launch_lambda",__VA_ARGS__)\
    }

#define CUDA_LAUNCH_LAMBDA_TLS(ite, lambda_f, ...) \
    {\
    CHECK_SE_CLASS1_PRE\
    if (ite.wthr.x != 0)\
    {hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel_launch_lambda_tls),ite.wthr,ite.thr,0,0,lambda_f);}\
    CHECK_SE_CLASS1_POST("kernel_launch_lambda_tls",__VA_ARGS__)\
    }

#define CUDA_LAUNCH_LAMBDA_DIM3_TLS(wthr_,thr_, lambda_f, ...) \
    {\
    dim3 wthr__(wthr_);\
    dim3 thr__(thr_);\
    CHECK_SE_CLASS1_PRE\
    if (wthr__.x != 0)\
    {hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel_launch_lambda_tls),wthr__,thr__, 0, 0, lambda_f);}\
    CHECK_SE_CLASS1_POST("kernel_launch_lambda_tls",__VA_ARGS__)\
    }

#define CUDA_LAUNCH_LAMBDA_DIM3(wthr_,thr_, lambda_f, ...) \
    {\
    dim3 wthr__(wthr_);\
    dim3 thr__(thr_);\
    CHECK_SE_CLASS1_PRE\
    if (wthr__.x != 0)\
    {hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel_launch_lambda),wthr__,thr__, 0, 0, lambda_f);}\
    CHECK_SE_CLASS1_POST("kernel_launch_lambda",__VA_ARGS__)\
    }

#define CUDA_CHECK()

#endif /* PRINT_CUDA_LAUNCHES */

#endif /* __HIPCC__ */

#endif /* CUDIFY_HIP_HPP_ */
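
Taken together, a hedged end-to-end sketch of how caller code drives this backend (the kernel, sizes, and d_vec pointer are hypothetical; ite_gpu carries the wthr/thr launch configuration as used by the macros above):

    __global__ void scale(float * v, float s, int n)
    {
        int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < n) {v[i] *= s;}
    }

    ite_gpu<1> ite;
    ite.wthr = dim3(4,1,1);                          // 4 blocks
    ite.thr = dim3(default_kernel_wg_threads_,1,1);  // 256 threads per block
    CUDA_LAUNCH(scale,ite,d_vec,2.0f,1024);          // skipped entirely if the grid is empty
    cudaDeviceSynchronize();                         // hipDeviceSynchronize under the hood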