40#include "util_macro.cuh"
54#ifndef DOXYGEN_SHOULD_SKIP_THIS
/**
 * Aliases a single untyped chunk of temporary device storage into multiple
 * 256-byte-aligned sub-allocations.
 *
 * Two-phase usage (the standard CUB temp-storage protocol):
 *   1. Call with \p d_temp_storage == NULL: only \p temp_storage_bytes is
 *      written (total bytes required) and cudaSuccess is returned.
 *   2. Call again with a device allocation of at least that size: each
 *      \p allocations[i] is pointed at an aligned region of
 *      \p allocation_sizes[i] bytes inside \p d_temp_storage.
 *
 * \param d_temp_storage      [in] Device-accessible allocation of temporary
 *                            storage (NULL triggers the size query).
 * \param temp_storage_bytes  [in,out] Size in bytes of the \p d_temp_storage
 *                            allocation.
 * \param allocations         [in,out] Pointers to the device sub-allocations.
 * \param allocation_sizes    [in] Sizes in bytes of the sub-allocations needed.
 *
 * \return cudaSuccess, or cudaErrorInvalidValue (via CubDebug) when the
 *         provided storage is too small.
 */
template <int ALLOCATIONS>
__host__ __device__ __forceinline__
cudaError_t AliasTemporaries(
    void    *d_temp_storage,
    size_t  &temp_storage_bytes,
    void*   (&allocations)[ALLOCATIONS],
    size_t  (&allocation_sizes)[ALLOCATIONS])
{
    const int ALIGN_BYTES   = 256;
    const int ALIGN_MASK    = ~(ALIGN_BYTES - 1);

    // Exclusive prefix sum over the allocation requests, each request
    // rounded up to a multiple of ALIGN_BYTES.
    size_t allocation_offsets[ALLOCATIONS];
    size_t bytes_needed = 0;
    for (int i = 0; i < ALLOCATIONS; ++i)
    {
        size_t allocation_bytes = (allocation_sizes[i] + ALIGN_BYTES - 1) & ALIGN_MASK;
        allocation_offsets[i] = bytes_needed;
        bytes_needed += allocation_bytes;
    }

    // Extra slack so the base pointer itself can be rounded up to alignment.
    bytes_needed += ALIGN_BYTES - 1;

    // Size-query phase: report the requirement and return.
    if (!d_temp_storage)
    {
        temp_storage_bytes = bytes_needed;
        return cudaSuccess;
    }

    // Caller-provided storage must cover the computed requirement.
    if (temp_storage_bytes < bytes_needed)
    {
        return CubDebug(cudaErrorInvalidValue);
    }

    // Align the base pointer, then carve out the sub-allocations.
    d_temp_storage = (void *) ((size_t(d_temp_storage) + ALIGN_BYTES - 1) & ALIGN_MASK);
    for (int i = 0; i < ALLOCATIONS; ++i)
    {
        allocations[i] = static_cast<char*>(d_temp_storage) + allocation_offsets[i];
    }

    return cudaSuccess;
}
/**
 * Retrieves the PTX version that will be used on the current device
 * (major * 100 + minor * 10).
 *
 * \param ptx_version  [out] Receives the PTX version.
 *
 * \return cudaSuccess on success; cudaErrorInvalidConfiguration when the
 *         CUDA runtime API is unavailable in device code; otherwise the
 *         error reported by cudaFuncGetAttributes (via CubDebug).
 */
CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t PtxVersion(int &ptx_version)
{
    // Local struct whose only purpose is to force EmptyKernel<void> to be
    // instantiated so its attributes can be queried below.
    struct Dummy
    {
        /// Type definition of the EmptyKernel kernel entry point
        typedef void (*EmptyKernelPtr)();

        /// Force EmptyKernel<void> to be generated if this class is used
        CUB_RUNTIME_FUNCTION __forceinline__
        EmptyKernelPtr Empty()
        {
            return EmptyKernel<void>;
        }
    };

#ifndef CUB_RUNTIME_ENABLED
    (void)ptx_version;

    // CUDA API calls are not supported from this device
    return cudaErrorInvalidConfiguration;

#elif (CUB_PTX_ARCH > 0)

    // Device pass: the PTX version is known at compile time.
    ptx_version = CUB_PTX_ARCH;
    return cudaSuccess;

#else

    // Host pass: query the attributes of the empty kernel.
    cudaError_t error = cudaSuccess;
    do
    {
        cudaFuncAttributes empty_kernel_attrs;
        if (CubDebug(error = cudaFuncGetAttributes(&empty_kernel_attrs, EmptyKernel<void>))) break;
        ptx_version = empty_kernel_attrs.ptxVersion * 10;

    } while (0);

    return error;

#endif
}
/**
 * Retrieves the SM version of the given device
 * (major * 100 + minor * 10).
 *
 * \param sm_version      [out] Receives the SM version.
 * \param device_ordinal  [in] Ordinal of the device to query.
 *
 * \return cudaSuccess on success; cudaErrorInvalidConfiguration when the
 *         CUDA runtime API is unavailable in device code; otherwise the
 *         error reported by cudaDeviceGetAttribute (via CubDebug).
 */
CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t SmVersion(
    int &sm_version,
    int device_ordinal)
{
#ifndef CUB_RUNTIME_ENABLED
    (void)sm_version;
    (void)device_ordinal;

    // CUDA API calls are not supported from this device
    return cudaErrorInvalidConfiguration;

#else

    cudaError_t error = cudaSuccess;
    do
    {
        // Query major/minor compute capability and fold into one integer.
        int major, minor;
        if (CubDebug(error = cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, device_ordinal))) break;
        if (CubDebug(error = cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, device_ordinal))) break;
        sm_version = major * 100 + minor * 10;

    } while (0);

    return error;

#endif
}
193#ifndef DOXYGEN_SHOULD_SKIP_THIS
/**
 * Synchronizes the given stream.
 *
 * On the host pass this synchronizes only \p stream; in device code a
 * specific stream cannot be synchronized, so the whole device is
 * synchronized instead (when the runtime API is available).
 *
 * \param stream  [in] Stream to synchronize.
 *
 * \return The error status of the underlying synchronization call, or
 *         cudaErrorInvalidConfiguration when no runtime API is available.
 */
CUB_RUNTIME_FUNCTION __forceinline__
static cudaError_t SyncStream(cudaStream_t stream)
{
#if (CUB_PTX_ARCH == 0)
    // Host pass: synchronize just the requested stream.
    return cudaStreamSynchronize(stream);
#elif defined(CUB_RUNTIME_ENABLED)
    (void)stream;
    // Device can't yet sync on a specific stream
    return cudaDeviceSynchronize();
#else
    (void)stream;
    // Device can't even determine streams
    return cudaErrorInvalidConfiguration;
#endif
}
/**
 * Computes the maximum SM occupancy, in thread blocks, for executing the
 * given kernel function pointer \p kernel_ptr on the current device with
 * \p block_threads threads per thread block.
 *
 * \param max_sm_occupancy   [out] Receives the maximum number of thread
 *                           blocks that can reside on a multiprocessor.
 * \param kernel_ptr         [in] Kernel function pointer to query.
 * \param block_threads      [in] Number of threads per thread block.
 * \param dynamic_smem_bytes [in] Dynamically-allocated shared memory per
 *                           block, in bytes (default 0).
 *
 * \return The status of cudaOccupancyMaxActiveBlocksPerMultiprocessor, or
 *         cudaErrorInvalidConfiguration (via CubDebug) when the runtime API
 *         is unavailable.
 */
template <typename KernelPtr>
CUB_RUNTIME_FUNCTION __forceinline__
cudaError_t MaxSmOccupancy(
    int         &max_sm_occupancy,
    KernelPtr   kernel_ptr,
    int         block_threads,
    int         dynamic_smem_bytes = 0)
{
#ifndef CUB_RUNTIME_ENABLED
    (void)dynamic_smem_bytes;
    (void)block_threads;
    (void)kernel_ptr;
    (void)max_sm_occupancy;

    // CUDA API calls are not supported from this device
    return CubDebug(cudaErrorInvalidConfiguration);

#else

    return cudaOccupancyMaxActiveBlocksPerMultiprocessor(
        &max_sm_occupancy,
        kernel_ptr,
        block_threads,
        dynamic_smem_bytes);

#endif
}
/**
 * Kernel launch configuration: thread-block shape derived from an agent
 * policy, plus the resulting SM occupancy for a given kernel.
 */
struct KernelConfig
{
    int block_threads;      ///< Threads per thread block
    int items_per_thread;   ///< Items processed per thread
    int tile_size;          ///< block_threads * items_per_thread
    int sm_occupancy;       ///< Max resident blocks per SM for the kernel

    CUB_RUNTIME_FUNCTION __forceinline__
    KernelConfig() : block_threads(0), items_per_thread(0), tile_size(0), sm_occupancy(0) {}

    /// Initializes the configuration from \p AgentPolicyT's compile-time
    /// constants and queries occupancy for \p kernel_ptr.
    /// \return The status reported by MaxSmOccupancy.
    template <typename AgentPolicyT, typename KernelPtrT>
    CUB_RUNTIME_FUNCTION __forceinline__
    cudaError_t Init(KernelPtrT kernel_ptr)
    {
        block_threads       = AgentPolicyT::BLOCK_THREADS;
        items_per_thread    = AgentPolicyT::ITEMS_PER_THREAD;
        tile_size           = block_threads * items_per_thread;
        cudaError_t retval  = MaxSmOccupancy(sm_occupancy, kernel_ptr, block_threads);
        return retval;
    }
};
/**
 * Helper for dispatching into a policy chain.
 *
 * A chain is a linked list of tuning policies, newest PTX version first.
 * Invoke() walks toward older policies until it finds the first one whose
 * PTX_VERSION does not exceed the runtime \p ptx_version, then dispatches
 * \p op specialized on that policy.
 */
template <
    int         PTX_VERSION,
    typename    PolicyT,
    typename    PrevPolicyT>
struct ChainedPolicy
{
    /// The policy for the active compiler pass: this policy if the target
    /// PTX architecture is new enough, otherwise defer to the predecessor.
    typedef typename If<(CUB_PTX_ARCH < PTX_VERSION), typename PrevPolicyT::ActivePolicy, PolicyT>::Type ActivePolicy;

    /// Specializes and dispatches op in accordance to the first policy in
    /// the chain of adequate PTX version.
    template <typename FunctorT>
    CUB_RUNTIME_FUNCTION __forceinline__
    static cudaError_t Invoke(int ptx_version, FunctorT &op)
    {
        if (ptx_version < PTX_VERSION) {
            // Too new for this device: recurse toward an older policy.
            return PrevPolicyT::Invoke(ptx_version, op);
        }
        return op.template Invoke<PolicyT>();
    }
};
/**
 * Helper for dispatching into a policy chain (end-of-chain specialization).
 *
 * Selected when the previous-policy link refers back to the policy itself,
 * terminating the recursion: this policy is used unconditionally.
 */
template <
    int         PTX_VERSION,
    typename    PolicyT>
struct ChainedPolicy<PTX_VERSION, PolicyT, PolicyT>
{
    /// The policy for the active compiler pass.
    typedef PolicyT ActivePolicy;

    /// Specializes and dispatches op in accordance to the first policy in
    /// the chain of adequate PTX version.
    template <typename FunctorT>
    CUB_RUNTIME_FUNCTION __forceinline__
    static cudaError_t Invoke(int /*ptx_version*/, FunctorT &op) {
        return op.template Invoke<PolicyT>();
    }
};
CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t PtxVersion(int &ptx_version)
Retrieves the PTX version that will be used on the current device (major * 100 + minor * 10)
__global__ void EmptyKernel(void)
__host__ __device__ __forceinline__ cudaError_t AliasTemporaries(void *d_temp_storage, size_t &temp_storage_bytes, void *(&allocations)[ALLOCATIONS], size_t(&allocation_sizes)[ALLOCATIONS])
CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t SmVersion(int &sm_version, int device_ordinal)
Retrieves the SM version (major * 100 + minor * 10)
CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t MaxSmOccupancy(int &max_sm_occupancy, KernelPtr kernel_ptr, int block_threads, int dynamic_smem_bytes=0)
Computes maximum SM occupancy in thread blocks for executing the given kernel function pointer kernel_ptr on the current device with block_threads threads per thread block.
#define CubDebug(e)
Debug macro.
CUB_RUNTIME_FUNCTION static __forceinline__ cudaError_t SyncStream(cudaStream_t stream)
Optional outer namespace(s)
PolicyT ActivePolicy
The policy for the active compiler pass.
CUB_RUNTIME_FUNCTION static __forceinline__ cudaError_t Invoke(int, FunctorT &op)
Specializes and dispatches op in accordance to the first policy in the chain of adequate PTX version.
Helper for dispatching into a policy chain.
CUB_RUNTIME_FUNCTION static __forceinline__ cudaError_t Invoke(int ptx_version, FunctorT &op)
Specializes and dispatches op in accordance to the first policy in the chain of adequate PTX version.
If<(CUB_PTX_ARCH < PTX_VERSION), typename PrevPolicyT::ActivePolicy, PolicyT>::Type ActivePolicy
The policy for the active compiler pass.
Type selection (IF ? ThenType : ElseType)
#define CUB_PTX_ARCH
CUB_PTX_ARCH reflects the PTX version targeted by the active compiler pass (or zero during the host pass).