OpenFPM_pdata  4.1.0
Project that contains the implementation of distributed structures
util_device.cuh
/******************************************************************************
 * Copyright (c) 2011, Duane Merrill. All rights reserved.
 * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

#pragma once

#include "util_type.cuh"
#include "util_arch.cuh"
#include "util_debug.cuh"
#include "util_namespace.cuh"
#include "util_macro.cuh"

/// Optional outer namespace(s)
CUB_NS_PREFIX

/// CUB namespace
namespace cub {


#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document

// Carves a single temporary-storage blob into ALLOCATIONS aligned sub-buffers.
// When d_temp_storage is NULL, only the total size required is computed and
// returned through temp_storage_bytes.
template <int ALLOCATIONS>
__host__ __device__ __forceinline__
cudaError_t AliasTemporaries(
    void    *d_temp_storage,
    size_t  &temp_storage_bytes,
    void*   (&allocations)[ALLOCATIONS],
    size_t  (&allocation_sizes)[ALLOCATIONS])
{
    const int ALIGN_BYTES = 256;
    const int ALIGN_MASK  = ~(ALIGN_BYTES - 1);

    // Compute exclusive prefix sum over allocation requests
    size_t allocation_offsets[ALLOCATIONS];
    size_t bytes_needed = 0;
    for (int i = 0; i < ALLOCATIONS; ++i)
    {
        size_t allocation_bytes = (allocation_sizes[i] + ALIGN_BYTES - 1) & ALIGN_MASK;
        allocation_offsets[i] = bytes_needed;
        bytes_needed += allocation_bytes;
    }
    bytes_needed += ALIGN_BYTES - 1;

    // Check if the caller is simply requesting the size of the storage allocation
    if (!d_temp_storage)
    {
        temp_storage_bytes = bytes_needed;
        return cudaSuccess;
    }

    // Check if enough storage provided
    if (temp_storage_bytes < bytes_needed)
    {
        return CubDebug(cudaErrorInvalidValue);
    }

    // Alias
    d_temp_storage = (void *) ((size_t(d_temp_storage) + ALIGN_BYTES - 1) & ALIGN_MASK);
    for (int i = 0; i < ALLOCATIONS; ++i)
    {
        allocations[i] = static_cast<char*>(d_temp_storage) + allocation_offsets[i];
    }

    return cudaSuccess;
}
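
// Usage sketch (illustrative, not taken from the original header): the
// canonical two-phase protocol for AliasTemporaries.  A first call with
// d_temp_storage == NULL only reports the required size; after the caller
// allocates, a second call carves the buffer into 256-byte-aligned
// sub-allocations.  The sizes below are arbitrary example values.
inline cudaError_t ExampleAliasTemporaries()
{
    void   *d_temp_storage     = NULL;
    size_t  temp_storage_bytes = 0;

    void   *allocations[2];
    size_t  allocation_sizes[2] = {100, 300};   // two scratch regions (bytes)

    // Phase 1: query the total size (no pointers are written)
    cudaError_t error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes);
    if (error != cudaSuccess) return error;

    // Phase 2: allocate once, then alias the sub-buffers into the blob
    if (CubDebug(error = cudaMalloc(&d_temp_storage, temp_storage_bytes))) return error;
    error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes);

    // allocations[0] and allocations[1] now point into d_temp_storage
    cudaFree(d_temp_storage);
    return error;
}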


// Empty kernel used (via cudaFuncGetAttributes in PtxVersion below) to query
// the PTX version compiled for the current device
template <typename T>
__global__ void EmptyKernel(void) { }


#endif  // DOXYGEN_SHOULD_SKIP_THIS

/**
 * \brief Retrieves the PTX version that will be used on the current device (major * 100 + minor * 10)
 */
CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t PtxVersion(int &ptx_version)
{
    struct Dummy
    {
        // Entry-point type of EmptyKernel
        typedef void (*EmptyKernelPtr)();

        // Referencing EmptyKernel<void> here forces its instantiation
        CUB_RUNTIME_FUNCTION __forceinline__
        EmptyKernelPtr Empty()
        {
            return EmptyKernel<void>;
        }
    };


#ifndef CUB_RUNTIME_ENABLED
    (void)ptx_version;

    // CUDA API calls not supported from this device
    return cudaErrorInvalidConfiguration;

#elif (CUB_PTX_ARCH > 0)

    ptx_version = CUB_PTX_ARCH;
    return cudaSuccess;

#else

    cudaError_t error = cudaSuccess;
    do
    {
        cudaFuncAttributes empty_kernel_attrs;
        if (CubDebug(error = cudaFuncGetAttributes(&empty_kernel_attrs, EmptyKernel<void>))) break;
        ptx_version = empty_kernel_attrs.ptxVersion * 10;
    }
    while (0);

    return error;

#endif
}
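
// Usage sketch (illustrative): querying and decoding the PTX version for the
// current device.  The encoding is major * 100 + minor * 10, so e.g. PTX 6.1
// is reported as 610.
inline cudaError_t ExamplePtxVersion()
{
    int ptx_version = 0;
    cudaError_t error = PtxVersion(ptx_version);
    if (error != cudaSuccess) return error;

    int major = ptx_version / 100;          // e.g. 610 -> 6
    int minor = (ptx_version % 100) / 10;   // e.g. 610 -> 1
    (void)major; (void)minor;
    return cudaSuccess;
}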


/**
 * \brief Retrieves the SM version (major * 100 + minor * 10)
 */
CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t SmVersion(int &sm_version, int device_ordinal)
{
#ifndef CUB_RUNTIME_ENABLED
    (void)sm_version;
    (void)device_ordinal;

    // CUDA API calls not supported from this device
    return cudaErrorInvalidConfiguration;

#else

    cudaError_t error = cudaSuccess;
    do
    {
        // Fill in SM version
        int major, minor;
        if (CubDebug(error = cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, device_ordinal))) break;
        if (CubDebug(error = cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, device_ordinal))) break;
        sm_version = major * 100 + minor * 10;
    }
    while (0);

    return error;

#endif
}
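
// Usage sketch (illustrative): querying the compute capability of the current
// device.  Like PtxVersion, the result is encoded as major * 100 + minor * 10
// (e.g. 700 for an sm_70 device).
inline cudaError_t ExampleSmVersion(int &sm_version)
{
    int device_ordinal = 0;
    cudaError_t error = cudaGetDevice(&device_ordinal);
    if (error != cudaSuccess) return error;

    return SmVersion(sm_version, device_ordinal);
}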


#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document

// Synchronize the specified stream
CUB_RUNTIME_FUNCTION __forceinline__
static cudaError_t SyncStream(cudaStream_t stream)
{
#if (CUB_PTX_ARCH == 0)
    return cudaStreamSynchronize(stream);
#else
    (void)stream;
    // Device can't yet sync on a specific stream
    return cudaDeviceSynchronize();
#endif
}
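
// Usage sketch (illustrative): SyncStream is the synchronization point used
// after kernel launches when synchronous (debug) behavior is wanted.  On the
// host (CUB_PTX_ARCH == 0) it waits only on the given stream; in device code
// it must fall back to cudaDeviceSynchronize(), since a device cannot wait on
// a single stream.
inline cudaError_t ExampleSyncStream(cudaStream_t stream)
{
    // ... launch kernels into `stream` here ...
    return SyncStream(stream);
}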


/**
 * \brief Computes maximum SM occupancy in thread blocks for executing the given kernel function pointer \p kernel_ptr with \p block_threads per thread block
 */
template <typename KernelPtr>
CUB_RUNTIME_FUNCTION __forceinline__
cudaError_t MaxSmOccupancy(
    int         &max_sm_occupancy,
    KernelPtr   kernel_ptr,
    int         block_threads,
    int         dynamic_smem_bytes = 0)
{
#ifndef CUB_RUNTIME_ENABLED
    (void)dynamic_smem_bytes;
    (void)block_threads;
    (void)kernel_ptr;
    (void)max_sm_occupancy;

    // CUDA API calls not supported from this device
    return CubDebug(cudaErrorInvalidConfiguration);

#else

    return cudaOccupancyMaxActiveBlocksPerMultiprocessor(
        &max_sm_occupancy,
        kernel_ptr,
        block_threads,
        dynamic_smem_bytes);

#endif  // CUB_RUNTIME_ENABLED
}
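
// Usage sketch (illustrative): computing how many thread blocks of a kernel
// can be resident per SM.  ExampleOccupancyKernel and the launch parameters
// are hypothetical stand-ins.
static __global__ void ExampleOccupancyKernel(int *d_out)
{
    if (d_out) *d_out = 0;
}

inline cudaError_t ExampleMaxSmOccupancy(int &max_sm_occupancy)
{
    // 128 threads per block, no dynamically-allocated shared memory
    return MaxSmOccupancy(max_sm_occupancy, ExampleOccupancyKernel, 128);
}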


/******************************************************************************
 * Policy management
 ******************************************************************************/

/// Kernel dispatch configuration
struct KernelConfig
{
    int block_threads;
    int items_per_thread;
    int tile_size;
    int sm_occupancy;

    CUB_RUNTIME_FUNCTION __forceinline__
    KernelConfig() : block_threads(0), items_per_thread(0), tile_size(0), sm_occupancy(0) {}

    template <typename AgentPolicyT, typename KernelPtrT>
    CUB_RUNTIME_FUNCTION __forceinline__
    cudaError_t Init(KernelPtrT kernel_ptr)
    {
        block_threads    = AgentPolicyT::BLOCK_THREADS;
        items_per_thread = AgentPolicyT::ITEMS_PER_THREAD;
        tile_size        = block_threads * items_per_thread;
        cudaError_t retval = MaxSmOccupancy(sm_occupancy, kernel_ptr, block_threads);
        return retval;
    }
};
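
// Usage sketch (illustrative): binding a hypothetical agent policy to a
// kernel and filling in the launch configuration.  ExampleAgentPolicy and
// ExampleConfigKernel are assumptions made for the sake of the example.
struct ExampleAgentPolicy
{
    enum
    {
        BLOCK_THREADS    = 128,
        ITEMS_PER_THREAD = 4,
    };
};

static __global__ void ExampleConfigKernel() { }

inline cudaError_t ExampleKernelConfig(KernelConfig &config)
{
    // After Init: block_threads = 128, items_per_thread = 4, tile_size = 512,
    // and sm_occupancy = resident blocks per SM as reported by MaxSmOccupancy
    return config.Init<ExampleAgentPolicy>(ExampleConfigKernel);
}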


/// Helper for dispatching into a policy chain
template <int PTX_VERSION, typename PolicyT, typename PrevPolicyT>
struct ChainedPolicy
{
    /// The policy for the active compiler pass
    typedef typename If<(CUB_PTX_ARCH < PTX_VERSION), typename PrevPolicyT::ActivePolicy, PolicyT>::Type ActivePolicy;

    /// Specializes and dispatches op in accordance to the first policy in the chain of adequate PTX version
    template <typename FunctorT>
    CUB_RUNTIME_FUNCTION __forceinline__
    static cudaError_t Invoke(int ptx_version, FunctorT &op)
    {
        if (ptx_version < PTX_VERSION) {
            return PrevPolicyT::Invoke(ptx_version, op);
        }
        return op.template Invoke<PolicyT>();
    }
};

/// Helper for dispatching into a policy chain (end-of-chain specialization)
template <int PTX_VERSION, typename PolicyT>
struct ChainedPolicy<PTX_VERSION, PolicyT, PolicyT>
{
    /// The policy for the active compiler pass
    typedef PolicyT ActivePolicy;

    /// Specializes and dispatches op in accordance to the first policy in the chain of adequate PTX version
    template <typename FunctorT>
    CUB_RUNTIME_FUNCTION __forceinline__
    static cudaError_t Invoke(int /*ptx_version*/, FunctorT &op) {
        return op.template Invoke<PolicyT>();
    }
};
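
// Usage sketch (illustrative): a two-link policy chain dispatched at run
// time.  ExamplePolicy350/ExamplePolicy600 and their tuning values are
// hypothetical.  The oldest link names itself as its own predecessor, which
// selects the end-of-chain specialization above.
struct ExamplePolicy350 : ChainedPolicy<350, ExamplePolicy350, ExamplePolicy350>
{
    enum { BLOCK_THREADS = 128 };
};

struct ExamplePolicy600 : ChainedPolicy<600, ExamplePolicy600, ExamplePolicy350>
{
    enum { BLOCK_THREADS = 256 };
};

// Functor whose Invoke<PolicyT>() is specialized by the chain
struct ExampleDispatchOp
{
    int selected_block_threads;

    template <typename ActivePolicyT>
    CUB_RUNTIME_FUNCTION __forceinline__
    cudaError_t Invoke()
    {
        selected_block_threads = ActivePolicyT::BLOCK_THREADS;
        return cudaSuccess;
    }
};

inline cudaError_t ExampleChainedDispatch(int &selected_block_threads)
{
    int ptx_version = 0;
    cudaError_t error = PtxVersion(ptx_version);
    if (error != cudaSuccess) return error;

    // ptx_version >= 600 selects ExamplePolicy600 (256 threads);
    // 350 <= ptx_version < 600 falls back to ExamplePolicy350 (128 threads)
    ExampleDispatchOp op = {0};
    error = ExamplePolicy600::Invoke(ptx_version, op);
    selected_block_threads = op.selected_block_threads;
    return error;
}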



#endif  // Do not document



/** @} */       // end group UtilMgmt

}               // CUB namespace
CUB_NS_POSTFIX  // Optional outer namespace(s)