#ifndef CUDIFY_HIP_HPP_
#define CUDIFY_HIP_HPP_

#define CUDA_ON_BACKEND CUDA_BACKEND_HIP

#include <hip/hip_runtime.h>
#include <type_traits>   // for std::remove_const used below

constexpr int default_kernel_wg_threads_ = 256;
typedef hipError_t cudaError_t;
typedef hipStream_t cudaStream_t;
typedef hipDeviceProp_t cudaDeviceProp_t;
typedef cudaDeviceProp_t cudaDeviceProp;
typedef hipEvent_t cudaEvent_t;
typedef hipFuncAttributes cudaFuncAttributes;
#define cudaSuccess hipSuccess

static void init_wrappers()
{}

enum cudaMemcpyKind
{
    cudaMemcpyHostToHost = 0,
    cudaMemcpyHostToDevice = 1,
    cudaMemcpyDeviceToHost = 2,
    cudaMemcpyDeviceToDevice = 3,
    cudaMemcpyDefault = 4
};
static cudaError_t cudaMemcpyToSymbol(unsigned char * global_cuda_error_array, const void * mem,
                                      size_t sz, int offset, cudaMemcpyKind opt)
{
    hipMemcpyKind opt_;

    switch (opt)
    {
    case cudaMemcpyHostToHost:
        opt_ = hipMemcpyHostToHost;
        break;
    case cudaMemcpyHostToDevice:
        opt_ = hipMemcpyHostToDevice;
        break;
    case cudaMemcpyDeviceToHost:
        opt_ = hipMemcpyDeviceToHost;
        break;
    case cudaMemcpyDeviceToDevice:
        opt_ = hipMemcpyDeviceToDevice;
        break;
    default:
        opt_ = hipMemcpyDefault;
        break;
    }

    return hipMemcpyToSymbol(global_cuda_error_array, mem, sz, offset, opt_);
}
static cudaError_t cudaDeviceSynchronize()
{
    return hipDeviceSynchronize();
}
static cudaError_t cudaMemcpyFromSymbol(void * dev_mem, const unsigned char * global_cuda_error_array, size_t sz)
{
    return hipMemcpyFromSymbol(dev_mem, global_cuda_error_array, sz);
}
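// Illustrative sketch (not part of the original header) of how the two symbol
// wrappers above pair up; "flags" is a hypothetical host buffer, while
// global_cuda_error_array names the device symbol passed through to HIP:
//
//   unsigned char flags[16] = {0};
//   cudaMemcpyToSymbol(global_cuda_error_array, flags, sizeof(flags), 0,
//                      cudaMemcpyHostToDevice);                 // host -> device symbol
//   /* ... kernels record errors ... */
//   cudaMemcpyFromSymbol(flags, global_cuda_error_array, sizeof(flags));  // device symbol -> host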
static const char * cudaGetErrorString(cudaError_t error)
{
    return hipGetErrorString(error);
}

static cudaError_t cudaGetDevice(int * device)
{
    return hipGetDevice(device);
}

static cudaError_t cudaSetDevice(int device)
{
    return hipSetDevice(device);
}

static cudaError_t cudaMemGetInfo(size_t * free, size_t * total)
{
    return hipMemGetInfo(free, total);
}

static cudaError_t cudaFuncGetAttributes(cudaFuncAttributes * attr, const void * func)
{
    return hipFuncGetAttributes(attr, func);
}

static cudaError_t cudaGetDeviceProperties(cudaDeviceProp * prop, int device)
{
    return hipGetDeviceProperties(prop, device);
}
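// Illustrative sketch (an assumption, not from the original source): the
// wrappers follow the CUDA runtime API, so a standard device query works
// unchanged on top of HIP:
//
//   int dev = 0;
//   cudaDeviceProp prop;
//   if (cudaGetDevice(&dev) == cudaSuccess &&
//       cudaGetDeviceProperties(&prop, dev) == cudaSuccess)
//   { /* prop.name, prop.totalGlobalMem, ... are now valid */ }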
static cudaError_t cudaEventCreate(cudaEvent_t * event)
{
    return hipEventCreate(event);
}

static cudaError_t cudaEventDestroy(cudaEvent_t event)
{
    return hipEventDestroy(event);
}

static cudaError_t cudaMalloc(void ** devPtr, size_t size)
{
    return hipMalloc(devPtr, size);
}

// note: HIP names the pinned-memory allocator hipHostMalloc (not hipMallocHost)
static cudaError_t cudaMallocHost(void ** ptr, size_t size)
{
    return hipHostMalloc(ptr, size);
}

static cudaError_t cudaFree(void * devPtr)
{
    return hipFree(devPtr);
}

// note: the matching pinned-memory deallocator is hipHostFree
static cudaError_t cudaFreeHost(void * ptr)
{
    return hipHostFree(ptr);
}

static cudaError_t cudaStreamSynchronize(cudaStream_t stream)
{
    return hipStreamSynchronize(stream);
}

static cudaError_t cudaEventRecord(cudaEvent_t event, cudaStream_t stream = 0)
{
    return hipEventRecord(event, stream);
}

static cudaError_t cudaEventSynchronize(cudaEvent_t event)
{
    return hipEventSynchronize(event);
}

static cudaError_t cudaEventElapsedTime(float * ms, cudaEvent_t start, cudaEvent_t end)
{
    return hipEventElapsedTime(ms, start, end);
}
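// Illustrative timing sketch using the event wrappers above (standard
// CUDA/HIP event semantics; not taken from the original source):
//
//   cudaEvent_t start, stop;
//   cudaEventCreate(&start);
//   cudaEventCreate(&stop);
//   cudaEventRecord(start);              // records on the default stream
//   /* ... launch kernels ... */
//   cudaEventRecord(stop);
//   cudaEventSynchronize(stop);          // wait until "stop" has been reached
//   float ms = 0.0f;
//   cudaEventElapsedTime(&ms, start, stop);
//   cudaEventDestroy(start);
//   cudaEventDestroy(stop);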
static cudaError_t cudaGetDeviceCount(int * count)
{
    return hipGetDeviceCount(count);
}
static cudaError_t cudaMemcpy(void * dst, const void * src, size_t count, cudaMemcpyKind opt)
{
    hipMemcpyKind opt_;

    switch (opt)
    {
    case cudaMemcpyHostToHost:
        opt_ = hipMemcpyHostToHost;
        break;
    case cudaMemcpyHostToDevice:
        opt_ = hipMemcpyHostToDevice;
        break;
    case cudaMemcpyDeviceToHost:
        opt_ = hipMemcpyDeviceToHost;
        break;
    case cudaMemcpyDeviceToDevice:
        opt_ = hipMemcpyDeviceToDevice;
        break;
    default:
        opt_ = hipMemcpyDefault;
        break;
    }

    return hipMemcpy(dst, src, count, opt_);
}
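// Illustrative sketch (an assumption, not from the original source): a
// host/device round trip through the wrappers above, which forward to
// hipMalloc / hipMemcpy / hipFree:
//
//   float host[64] = {0.0f};
//   void * dev = nullptr;
//   cudaMalloc(&dev, sizeof(host));
//   cudaMemcpy(dev, host, sizeof(host), cudaMemcpyHostToDevice);
//   cudaMemcpy(host, dev, sizeof(host), cudaMemcpyDeviceToHost);
//   cudaFree(dev);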
#include "cudify_hardware_common.hpp"
#include "util/cuda_util.hpp"
#include "hipcub/hipcub.hpp"
#include "hipcub/block/block_scan.hpp"

template<typename lambda_f>
__global__ void kernel_launch_lambda(lambda_f f)
{
    // forward the block/thread indices to the lambda
    dim3 bid = blockIdx;
    dim3 tid = threadIdx;
    f(bid, tid);
}
template<typename lambda_f>
__global__ void kernel_launch_lambda_tls(lambda_f f)
{
    // TLS variant: the lambda takes no index arguments
    f();
}
template<typename T, unsigned int bd>
using BlockScan = hipcub::BlockScan<T, bd>;
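// Illustrative sketch of the BlockScan alias (hedged: it mirrors the
// hipcub/cub BlockScan API; the kernel below is hypothetical and not part of
// this header):
//
//   template<unsigned int bd>
//   __global__ void scan_example(int * out)
//   {
//       typedef BlockScan<int, bd> BScan;        // i.e. hipcub::BlockScan<int,bd>
//       __shared__ typename BScan::TempStorage temp;
//       int v = 1;
//       BScan(temp).ExclusiveSum(v, v);          // v becomes threadIdx.x
//       out[threadIdx.x] = v;
//   }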
// Compute the linear size of a grid/block descriptor: the generic case covers
// dim3-like types with x/y/z members, the specializations plain integers.
template<typename T>
struct has_work_gpu_cl_lin_blocks_
{
    static unsigned int lin(const T & b)
    {
        return b.x * b.y * b.z;
    }
};
template<>
struct has_work_gpu_cl_lin_blocks_<unsigned int>
{
    static unsigned int lin(const unsigned int & b)
    {
        return b;
    }
};
template<>
struct has_work_gpu_cl_lin_blocks_<unsigned long>
{
    static unsigned int lin(const unsigned long & b)
    {
        return b;
    }
};
template<>
struct has_work_gpu_cl_lin_blocks_<int>
{
    static unsigned int lin(const int & b)
    {
        return b;
    }
};
template<typename wthr_type, typename thr_type>
bool has_work_gpu_cl_(const wthr_type & wthr, const thr_type & thr)
{
    return (has_work_gpu_cl_lin_blocks_<typename std::remove_const<wthr_type>::type>::lin(wthr) *
            has_work_gpu_cl_lin_blocks_<typename std::remove_const<thr_type>::type>::lin(thr)) != 0;
}
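// A zero extent in either the grid or the block means there is no work to
// launch. Illustrative calls (not from the original source):
//
//   has_work_gpu_cl_(dim3(0,1,1), dim3(32,1,1));   // false: empty grid
//   has_work_gpu_cl_(4u, 256u);                    // true: 4 blocks of 256 threads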
// Launch macros: forward to hipLaunchKernelGGL, skipping the launch entirely
// when the grid or block is empty.
#ifdef PRINT_CUDA_LAUNCHES

#define CUDA_LAUNCH(cuda_call,ite, ...)\
    {\
    std::cout << "Launching: " << #cuda_call << std::endl;\
    hipLaunchKernelGGL(HIP_KERNEL_NAME(cuda_call), dim3(ite.wthr), dim3(ite.thr), 0, 0, __VA_ARGS__);\
    CHECK_SE_CLASS1_POST(#cuda_call,__VA_ARGS__)\
    }

#define CUDA_LAUNCH_DIM3(cuda_call,wthr_,thr_, ...)\
    {\
    std::cout << "Launching: " << #cuda_call << std::endl;\
    hipLaunchKernelGGL(HIP_KERNEL_NAME(cuda_call), dim3(wthr_), dim3(thr_), 0, 0, __VA_ARGS__);\
    CHECK_SE_CLASS1_POST(#cuda_call,__VA_ARGS__)\
    }

#else

#define CUDA_LAUNCH(cuda_call,ite, ...)\
    {\
    if (has_work_gpu_cl_(ite.wthr,ite.thr) == true)\
    {hipLaunchKernelGGL(HIP_KERNEL_NAME(cuda_call), dim3(ite.wthr), dim3(ite.thr), 0, 0, __VA_ARGS__);}\
    CHECK_SE_CLASS1_POST(#cuda_call,__VA_ARGS__)\
    }

#define CUDA_LAUNCH_DIM3(cuda_call,wthr_,thr_, ...)\
    {\
    if (has_work_gpu_cl_(wthr_,thr_) == true)\
    {hipLaunchKernelGGL(HIP_KERNEL_NAME(cuda_call),wthr_,thr_, 0, 0, __VA_ARGS__);}\
    CHECK_SE_CLASS1_POST(#cuda_call,__VA_ARGS__)\
    }

#define CUDA_LAUNCH_LAMBDA(ite,lambda_f, ...)\
    {\
    if (has_work_gpu_cl_(ite.wthr,ite.thr) == true)\
    {hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel_launch_lambda),ite.wthr,ite.thr, 0, 0, lambda_f);}\
    CHECK_SE_CLASS1_POST("kernel_launch_lambda",__VA_ARGS__)\
    }

#define CUDA_LAUNCH_LAMBDA_TLS(ite, lambda_f, ...)\
    {\
    if (ite.wthr.x != 0)\
    {hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel_launch_lambda_tls),ite.wthr,ite.thr,0,0,lambda_f);}\
    CHECK_SE_CLASS1_POST("kernel_launch_lambda",__VA_ARGS__)\
    }

#define CUDA_LAUNCH_LAMBDA_DIM3_TLS(wthr_,thr_, lambda_f, ...)\
    {\
    dim3 wthr__(wthr_);\
    if (wthr__.x != 0)\
    {hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel_launch_lambda_tls),wthr_,thr_, 0, 0, lambda_f);}\
    CHECK_SE_CLASS1_POST("kernel_launch_lambda",__VA_ARGS__)\
    }

#define CUDA_LAUNCH_LAMBDA_DIM3(wthr_,thr_, lambda_f, ...)\
    {\
    dim3 wthr__(wthr_);\
    if (wthr__.x != 0)\
    {hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel_launch_lambda),wthr_,thr_, 0, 0, lambda_f);}\
    CHECK_SE_CLASS1_POST("kernel_launch_lambda",__VA_ARGS__)\
    }

#endif
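// Usage sketch for the launch macros (illustrative; "my_kernel" and the "ite"
// grid descriptor exposing .wthr/.thr members are hypothetical, following how
// the macros above consume them):
//
//   CUDA_LAUNCH(my_kernel, ite, dev_ptr, n);
//   CUDA_LAUNCH_DIM3(my_kernel, dim3(4,1,1), dim3(256,1,1), dev_ptr, n);
//   CUDA_LAUNCH_LAMBDA(ite, [=] __device__ (dim3 bid, dim3 tid) { /* ... */ });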
#endif /* CUDIFY_HIP_HPP_ */