#ifndef CUDIFY_HIP_HPP_
#define CUDIFY_HIP_HPP_

#define CUDA_ON_BACKEND CUDA_BACKEND_HIP

#include <hip/hip_runtime.h>
#include <type_traits>   // for std::remove_const used below

constexpr int default_kernel_wg_threads_ = 256;
typedef hipError_t cudaError_t;
typedef hipStream_t cudaStream_t;
typedef hipDeviceProp_t cudaDeviceProp_t;
typedef cudaDeviceProp_t cudaDeviceProp;
typedef hipEvent_t cudaEvent_t;
typedef hipFuncAttributes cudaFuncAttributes;
#define cudaSuccess hipSuccess

static void init_wrappers()
{}

enum cudaMemcpyKind
{
    cudaMemcpyHostToHost = 0,
    cudaMemcpyHostToDevice = 1,
    cudaMemcpyDeviceToHost = 2,
    cudaMemcpyDeviceToDevice = 3,
    cudaMemcpyDefault = 4
};
static cudaError_t cudaMemcpyToSymbol(unsigned char * global_cuda_error_array, const void * mem,
                                      size_t sz, int offset, cudaMemcpyKind opt)
{
    hipMemcpyKind opt_;

    switch (opt)
    {
    case cudaMemcpyHostToHost:
        opt_ = hipMemcpyHostToHost;
        break;
    case cudaMemcpyHostToDevice:
        opt_ = hipMemcpyHostToDevice;
        break;
    case cudaMemcpyDeviceToHost:
        opt_ = hipMemcpyDeviceToHost;
        break;
    case cudaMemcpyDeviceToDevice:
        opt_ = hipMemcpyDeviceToDevice;
        break;
    default:
        opt_ = hipMemcpyDefault;
        break;
    }

    return hipMemcpyToSymbol(global_cuda_error_array, mem, sz, offset, opt_);
}
static cudaError_t cudaDeviceSynchronize()
{
    return hipDeviceSynchronize();
}
static cudaError_t cudaMemcpyFromSymbol(void * dev_mem, const unsigned char * global_cuda_error_array, size_t sz)
{
    return hipMemcpyFromSymbol(dev_mem, global_cuda_error_array, sz);
}
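// Illustrative sketch (not part of the original header) of how the two symbol
// wrappers above pair up; "flags" is a hypothetical host buffer, while
// global_cuda_error_array names the device symbol passed through to HIP:
//
//   unsigned char flags[16] = {0};
//   cudaMemcpyToSymbol(global_cuda_error_array, flags, sizeof(flags), 0,
//                      cudaMemcpyHostToDevice);                 // host -> device symbol
//   /* ... kernels record errors ... */
//   cudaMemcpyFromSymbol(flags, global_cuda_error_array, sizeof(flags));  // device symbol -> host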
static const char * cudaGetErrorString(cudaError_t error)
{
    return hipGetErrorString(error);
}

static cudaError_t cudaGetDevice(int * device)
{
    return hipGetDevice(device);
}

static cudaError_t cudaSetDevice(int device)
{
    return hipSetDevice(device);
}

static cudaError_t cudaMemGetInfo(size_t * free, size_t * total)
{
    return hipMemGetInfo(free, total);
}

static cudaError_t cudaFuncGetAttributes(cudaFuncAttributes * attr, const void * func)
{
    return hipFuncGetAttributes(attr, func);
}

static cudaError_t cudaGetDeviceProperties(cudaDeviceProp * prop, int device)
{
    return hipGetDeviceProperties(prop, device);
}
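// Illustrative sketch (an assumption, not from the original source): the
// wrappers follow the CUDA runtime API, so a standard device query works
// unchanged on top of HIP:
//
//   int dev = 0;
//   cudaDeviceProp prop;
//   if (cudaGetDevice(&dev) == cudaSuccess &&
//       cudaGetDeviceProperties(&prop, dev) == cudaSuccess)
//   { /* prop.name, prop.totalGlobalMem, ... are now valid */ }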
static cudaError_t cudaEventCreate(cudaEvent_t * event)
{
    return hipEventCreate(event);
}

static cudaError_t cudaEventDestroy(cudaEvent_t event)
{
    return hipEventDestroy(event);
}

static cudaError_t cudaMalloc(void ** devPtr, size_t size)
{
    return hipMalloc(devPtr, size);
}

// note: HIP names the pinned-memory allocator hipHostMalloc (not hipMallocHost)
static cudaError_t cudaMallocHost(void ** ptr, size_t size)
{
    return hipHostMalloc(ptr, size);
}

static cudaError_t cudaFree(void * devPtr)
{
    return hipFree(devPtr);
}

// note: the matching pinned-memory deallocator is hipHostFree
static cudaError_t cudaFreeHost(void * ptr)
{
    return hipHostFree(ptr);
}

static cudaError_t cudaStreamSynchronize(cudaStream_t stream)
{
    return hipStreamSynchronize(stream);
}

static cudaError_t cudaEventRecord(cudaEvent_t event, cudaStream_t stream = 0)
{
    return hipEventRecord(event, stream);
}

static cudaError_t cudaEventSynchronize(cudaEvent_t event)
{
    return hipEventSynchronize(event);
}

static cudaError_t cudaEventElapsedTime(float * ms, cudaEvent_t start, cudaEvent_t end)
{
    return hipEventElapsedTime(ms, start, end);
}
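// Illustrative timing sketch using the event wrappers above (standard
// CUDA/HIP event semantics; not taken from the original source):
//
//   cudaEvent_t start, stop;
//   cudaEventCreate(&start);
//   cudaEventCreate(&stop);
//   cudaEventRecord(start);              // records on the default stream
//   /* ... launch kernels ... */
//   cudaEventRecord(stop);
//   cudaEventSynchronize(stop);          // wait until "stop" has been reached
//   float ms = 0.0f;
//   cudaEventElapsedTime(&ms, start, stop);
//   cudaEventDestroy(start);
//   cudaEventDestroy(stop);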
static cudaError_t cudaGetDeviceCount(int * count)
{
    return hipGetDeviceCount(count);
}
static cudaError_t cudaMemcpy(void * dst, const void * src, size_t count, cudaMemcpyKind opt)
{
    hipMemcpyKind opt_;

    switch (opt)
    {
    case cudaMemcpyHostToHost:
        opt_ = hipMemcpyHostToHost;
        break;
    case cudaMemcpyHostToDevice:
        opt_ = hipMemcpyHostToDevice;
        break;
    case cudaMemcpyDeviceToHost:
        opt_ = hipMemcpyDeviceToHost;
        break;
    case cudaMemcpyDeviceToDevice:
        opt_ = hipMemcpyDeviceToDevice;
        break;
    default:
        opt_ = hipMemcpyDefault;
        break;
    }

    return hipMemcpy(dst, src, count, opt_);
}
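// Illustrative sketch (an assumption, not from the original source): a
// host/device round trip through the wrappers above, which forward to
// hipMalloc / hipMemcpy / hipFree:
//
//   float host[64] = {0.0f};
//   void * dev = nullptr;
//   cudaMalloc(&dev, sizeof(host));
//   cudaMemcpy(dev, host, sizeof(host), cudaMemcpyHostToDevice);
//   cudaMemcpy(host, dev, sizeof(host), cudaMemcpyDeviceToHost);
//   cudaFree(dev);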
#include "cudify_hardware_common.hpp"
#include "util/cuda_util.hpp"
#include "hipcub/hipcub.hpp"
#include "hipcub/block/block_scan.hpp"

template<typename lambda_f>
__global__ void kernel_launch_lambda(lambda_f f)
{
    // forward the block/thread indices to the lambda
    dim3 bid = blockIdx;
    dim3 tid = threadIdx;
    f(bid, tid);
}
template<typename lambda_f>
__global__ void kernel_launch_lambda_tls(lambda_f f)
{
    // TLS variant: the lambda takes no index arguments
    f();
}
template<typename T, unsigned int bd>
using BlockScan = hipcub::BlockScan<T, bd>;
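// Illustrative sketch of the BlockScan alias (hedged: it mirrors the
// hipcub/cub BlockScan API; the kernel below is hypothetical and not part of
// this header):
//
//   template<unsigned int bd>
//   __global__ void scan_example(int * out)
//   {
//       typedef BlockScan<int, bd> BScan;        // i.e. hipcub::BlockScan<int,bd>
//       __shared__ typename BScan::TempStorage temp;
//       int v = 1;
//       BScan(temp).ExclusiveSum(v, v);          // v becomes threadIdx.x
//       out[threadIdx.x] = v;
//   }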
// Compute the linear size of a grid/block descriptor: the generic case covers
// dim3-like types with x/y/z members, the specializations plain integers.
template<typename T>
struct has_work_gpu_cl_lin_blocks_
{
    static unsigned int lin(const T & b)
    {
        return b.x * b.y * b.z;
    }
};
template<>
struct has_work_gpu_cl_lin_blocks_<unsigned int>
{
    static unsigned int lin(const unsigned int & b)
    {
        return b;
    }
};
template<>
struct has_work_gpu_cl_lin_blocks_<unsigned long>
{
    static unsigned int lin(const unsigned long & b)
    {
        return b;
    }
};
template<>
struct has_work_gpu_cl_lin_blocks_<int>
{
    static unsigned int lin(const int & b)
    {
        return b;
    }
};
template<typename wthr_type, typename thr_type>
bool has_work_gpu_cl_(const wthr_type & wthr, const thr_type & thr)
{
    return (has_work_gpu_cl_lin_blocks_<typename std::remove_const<wthr_type>::type>::lin(wthr) *
            has_work_gpu_cl_lin_blocks_<typename std::remove_const<thr_type>::type>::lin(thr)) != 0;
}
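// A zero extent in either the grid or the block means there is no work to
// launch. Illustrative calls (not from the original source):
//
//   has_work_gpu_cl_(dim3(0,1,1), dim3(32,1,1));   // false: empty grid
//   has_work_gpu_cl_(4u, 256u);                    // true: 4 blocks of 256 threads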
// Launch macros: forward to hipLaunchKernelGGL, skipping the launch entirely
// when the grid or block is empty.
#ifdef PRINT_CUDA_LAUNCHES

#define CUDA_LAUNCH(cuda_call,ite, ...)\
    {\
    std::cout << "Launching: " << #cuda_call << std::endl;\
    hipLaunchKernelGGL(HIP_KERNEL_NAME(cuda_call), dim3(ite.wthr), dim3(ite.thr), 0, 0, __VA_ARGS__);\
    CHECK_SE_CLASS1_POST(#cuda_call,__VA_ARGS__)\
    }

#define CUDA_LAUNCH_DIM3(cuda_call,wthr_,thr_, ...)\
    {\
    std::cout << "Launching: " << #cuda_call << std::endl;\
    hipLaunchKernelGGL(HIP_KERNEL_NAME(cuda_call), dim3(wthr_), dim3(thr_), 0, 0, __VA_ARGS__);\
    CHECK_SE_CLASS1_POST(#cuda_call,__VA_ARGS__)\
    }

#else

#define CUDA_LAUNCH(cuda_call,ite, ...)\
    {\
    if (has_work_gpu_cl_(ite.wthr,ite.thr) == true)\
    {hipLaunchKernelGGL(HIP_KERNEL_NAME(cuda_call), dim3(ite.wthr), dim3(ite.thr), 0, 0, __VA_ARGS__);}\
    CHECK_SE_CLASS1_POST(#cuda_call,__VA_ARGS__)\
    }

#define CUDA_LAUNCH_DIM3(cuda_call,wthr_,thr_, ...)\
    {\
    if (has_work_gpu_cl_(wthr_,thr_) == true)\
    {hipLaunchKernelGGL(HIP_KERNEL_NAME(cuda_call),wthr_,thr_, 0, 0, __VA_ARGS__);}\
    CHECK_SE_CLASS1_POST(#cuda_call,__VA_ARGS__)\
    }

#define CUDA_LAUNCH_LAMBDA(ite,lambda_f, ...)\
    {\
    if (has_work_gpu_cl_(ite.wthr,ite.thr) == true)\
    {hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel_launch_lambda),ite.wthr,ite.thr, 0, 0, lambda_f);}\
    CHECK_SE_CLASS1_POST("kernel_launch_lambda",__VA_ARGS__)\
    }

#define CUDA_LAUNCH_LAMBDA_TLS(ite, lambda_f, ...)\
    {\
    if (ite.wthr.x != 0)\
    {hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel_launch_lambda_tls),ite.wthr,ite.thr,0,0,lambda_f);}\
    CHECK_SE_CLASS1_POST("kernel_launch_lambda",__VA_ARGS__)\
    }

#define CUDA_LAUNCH_LAMBDA_DIM3_TLS(wthr_,thr_, lambda_f, ...)\
    {\
    dim3 wthr__(wthr_);\
    if (wthr__.x != 0)\
    {hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel_launch_lambda_tls),wthr_,thr_, 0, 0, lambda_f);}\
    CHECK_SE_CLASS1_POST("kernel_launch_lambda",__VA_ARGS__)\
    }

#define CUDA_LAUNCH_LAMBDA_DIM3(wthr_,thr_, lambda_f, ...)\
    {\
    dim3 wthr__(wthr_);\
    if (wthr__.x != 0)\
    {hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel_launch_lambda),wthr_,thr_, 0, 0, lambda_f);}\
    CHECK_SE_CLASS1_POST("kernel_launch_lambda",__VA_ARGS__)\
    }

#endif
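// Usage sketch for the launch macros (illustrative; "my_kernel" and the "ite"
// grid descriptor exposing .wthr/.thr members are hypothetical, following how
// the macros above consume them):
//
//   CUDA_LAUNCH(my_kernel, ite, dev_ptr, n);
//   CUDA_LAUNCH_DIM3(my_kernel, dim3(4,1,1), dim3(256,1,1), dev_ptr, n);
//   CUDA_LAUNCH_LAMBDA(ite, [=] __device__ (dim3 bid, dim3 tid) { /* ... */ });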
#endif /* CUDIFY_HIP_HPP_ */