OpenFPM_pdata 4.1.0
Project that contains the implementation of distributed structures
util_device.cuh
/******************************************************************************
 * Copyright (c) 2011, Duane Merrill.  All rights reserved.
 * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

/**
 * \file
 * Properties of a given CUDA device and the corresponding PTX bundle
 */

#pragma once

#include "util_type.cuh"
#include "util_arch.cuh"
#include "util_debug.cuh"
#include "util_namespace.cuh"
#include "util_macro.cuh"

/// Optional outer namespace(s)
CUB_NS_PREFIX

/// CUB namespace
namespace cub {

#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document


/**
 * Aliases sub-allocations onto a single block of externally-allocated device
 * storage (or, when d_temp_storage is NULL, simply computes the amount of
 * storage needed).
 */
template <int ALLOCATIONS>
__host__ __device__ __forceinline__
cudaError_t AliasTemporaries(
    void    *d_temp_storage,                    ///< [in] Device allocation to alias into, or NULL to query the required size
    size_t  &temp_storage_bytes,                ///< [in,out] Size in bytes of \p d_temp_storage; set to the required size when querying
    void*   (&allocations)[ALLOCATIONS],        ///< [out] Pointers to the aliased sub-allocations
    size_t  (&allocation_sizes)[ALLOCATIONS])   ///< [in] Sizes in bytes requested for each sub-allocation
{
    const int ALIGN_BYTES   = 256;
    const int ALIGN_MASK    = ~(ALIGN_BYTES - 1);

    // Compute exclusive prefix sum over allocation requests
    size_t allocation_offsets[ALLOCATIONS];
    size_t bytes_needed = 0;
    for (int i = 0; i < ALLOCATIONS; ++i)
    {
        size_t allocation_bytes = (allocation_sizes[i] + ALIGN_BYTES - 1) & ALIGN_MASK;
        allocation_offsets[i] = bytes_needed;
        bytes_needed += allocation_bytes;
    }
    bytes_needed += ALIGN_BYTES - 1;

    // Check if the caller is simply requesting the size of the storage allocation
    if (!d_temp_storage)
    {
        temp_storage_bytes = bytes_needed;
        return cudaSuccess;
    }

    // Check if enough storage provided
    if (temp_storage_bytes < bytes_needed)
    {
        return CubDebug(cudaErrorInvalidValue);
    }

    // Alias
    d_temp_storage = (void *) ((size_t(d_temp_storage) + ALIGN_BYTES - 1) & ALIGN_MASK);
    for (int i = 0; i < ALLOCATIONS; ++i)
    {
        allocations[i] = static_cast<char*>(d_temp_storage) + allocation_offsets[i];
    }

    return cudaSuccess;
}

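/*
 * Illustrative usage sketch (not part of the original header): the standard
 * CUB two-pass pattern that AliasTemporaries supports. The buffer sizes
 * (num_items) are hypothetical placeholders.
 *
 *     void*  allocations[2];
 *     size_t allocation_sizes[2] = {num_items * sizeof(int),      // e.g. a keys buffer
 *                                   num_items * sizeof(float)};   // e.g. a values buffer
 *
 *     // Pass 1: d_temp_storage == NULL, so only the required size is computed
 *     void*  d_temp_storage     = NULL;
 *     size_t temp_storage_bytes = 0;
 *     AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes);
 *
 *     // Pass 2: carve 256-byte-aligned sub-allocations out of one device buffer
 *     cudaMalloc(&d_temp_storage, temp_storage_bytes);
 *     AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes);
 */
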

/**
 * Empty kernel for querying PTX manifest metadata (e.g., version) for the current device
 */
template <typename T>
__global__ void EmptyKernel(void) { }


#endif  // DOXYGEN_SHOULD_SKIP_THIS

/**
 * \brief Retrieves the PTX version that will be used on the current device (major * 100 + minor * 10)
 */
CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t PtxVersion(int &ptx_version)
{
    struct Dummy
    {
        /// Type definition of the EmptyKernel kernel entry point
        typedef void (*EmptyKernelPtr)();

        /// Force EmptyKernel<void> to be generated if this class is used
        CUB_RUNTIME_FUNCTION __forceinline__
        EmptyKernelPtr Empty()
        {
            return EmptyKernel<void>;
        }
    };


#ifndef CUB_RUNTIME_ENABLED
    (void)ptx_version;

    // CUDA API calls not supported from this device
    return cudaErrorInvalidConfiguration;

#elif (CUB_PTX_ARCH > 0)

    ptx_version = CUB_PTX_ARCH;
    return cudaSuccess;

#else

    cudaError_t error = cudaSuccess;
    do
    {
        cudaFuncAttributes empty_kernel_attrs;
        if (CubDebug(error = cudaFuncGetAttributes(&empty_kernel_attrs, EmptyKernel<void>))) break;
        ptx_version = empty_kernel_attrs.ptxVersion * 10;
    }
    while (0);

    return error;

#endif
}

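/*
 * Illustrative usage sketch (not part of the original header): querying the
 * PTX version from host code.
 *
 *     int ptx_version;
 *     cudaError_t error = PtxVersion(ptx_version);
 *     if (error == cudaSuccess)
 *         printf("PTX version: %d\n", ptx_version);   // e.g. 350 when compiled for compute_35
 */
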

/**
 * \brief Retrieves the SM version (major * 100 + minor * 10)
 */
CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t SmVersion(int &sm_version, int device_ordinal)
{
#ifndef CUB_RUNTIME_ENABLED
    (void)sm_version;
    (void)device_ordinal;

    // CUDA API calls not supported from this device
    return cudaErrorInvalidConfiguration;

#else

    cudaError_t error = cudaSuccess;
    do
    {
        // Fill in SM version
        int major, minor;
        if (CubDebug(error = cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, device_ordinal))) break;
        if (CubDebug(error = cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, device_ordinal))) break;
        sm_version = major * 100 + minor * 10;
    }
    while (0);

    return error;

#endif
}
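
/*
 * Illustrative usage sketch (not part of the original header): querying the
 * compute capability of the current device.
 *
 *     int device_ordinal;
 *     cudaGetDevice(&device_ordinal);
 *
 *     int sm_version;
 *     if (SmVersion(sm_version, device_ordinal) == cudaSuccess)
 *         printf("SM version: %d\n", sm_version);     // e.g. 700 on an sm_70 (Volta) device
 */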


#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document

/**
 * Synchronize the specified \p stream (device code can currently only
 * synchronize the whole device).
 */
CUB_RUNTIME_FUNCTION __forceinline__
static cudaError_t SyncStream(cudaStream_t stream)
{
#if (CUB_PTX_ARCH == 0)
    return cudaStreamSynchronize(stream);
#else
    (void)stream;
    // Device can't yet sync on a specific stream
    return cudaDeviceSynchronize();
#endif
}


/**
 * \brief Computes maximum SM occupancy in thread blocks for executing the given kernel
 * function pointer \p kernel_ptr on the current device with \p block_threads per thread block.
 */
template <typename KernelPtr>
CUB_RUNTIME_FUNCTION __forceinline__
cudaError_t MaxSmOccupancy(
    int         &max_sm_occupancy,      ///< [out] Maximum number of thread blocks that can reside on a single SM
    KernelPtr   kernel_ptr,             ///< [in] Kernel pointer for which to compute SM occupancy
    int         block_threads,          ///< [in] Number of threads per thread block
    int         dynamic_smem_bytes = 0) ///< [in] Dynamically-allocated shared memory in bytes (default 0)
{
#ifndef CUB_RUNTIME_ENABLED
    (void)dynamic_smem_bytes;
    (void)block_threads;
    (void)kernel_ptr;
    (void)max_sm_occupancy;

    // CUDA API calls not supported from this device
    return CubDebug(cudaErrorInvalidConfiguration);

#else

    return cudaOccupancyMaxActiveBlocksPerMultiprocessor(
        &max_sm_occupancy,
        kernel_ptr,
        block_threads,
        dynamic_smem_bytes);

#endif  // CUB_RUNTIME_ENABLED
}
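
/*
 * Illustrative usage sketch (not part of the original header): computing the
 * occupancy of a hypothetical 128-thread kernel. ExampleKernel is a
 * placeholder name.
 *
 *     __global__ void ExampleKernel() { ... }
 *
 *     int max_sm_occupancy;
 *     MaxSmOccupancy(max_sm_occupancy, ExampleKernel, 128);
 *     // max_sm_occupancy now holds the number of 128-thread blocks
 *     // that can be resident on a single SM at once
 */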


/******************************************************************************
 * Policy management
 ******************************************************************************/

/**
 * Kernel dispatch configuration
 */
struct KernelConfig
{
    int block_threads;
    int items_per_thread;
    int tile_size;
    int sm_occupancy;

    CUB_RUNTIME_FUNCTION __forceinline__
    KernelConfig() : block_threads(0), items_per_thread(0), tile_size(0), sm_occupancy(0) {}

    template <typename AgentPolicyT, typename KernelPtrT>
    CUB_RUNTIME_FUNCTION __forceinline__
    cudaError_t Init(KernelPtrT kernel_ptr)
    {
        block_threads       = AgentPolicyT::BLOCK_THREADS;
        items_per_thread    = AgentPolicyT::ITEMS_PER_THREAD;
        tile_size           = block_threads * items_per_thread;
        cudaError_t retval  = MaxSmOccupancy(sm_occupancy, kernel_ptr, block_threads);
        return retval;
    }
};
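
/*
 * Illustrative usage sketch (not part of the original header): AgentPolicyT is
 * expected to expose BLOCK_THREADS and ITEMS_PER_THREAD. MyAgentPolicy and
 * my_kernel are hypothetical names.
 *
 *     struct MyAgentPolicy { enum { BLOCK_THREADS = 128, ITEMS_PER_THREAD = 4 }; };
 *
 *     KernelConfig config;
 *     cudaError_t error = config.Init<MyAgentPolicy>(my_kernel);
 *     // config.tile_size == 512 (items processed per block per tile);
 *     // config.sm_occupancy holds resident blocks per SM for my_kernel
 */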



/**
 * Helper for dispatching into a policy chain
 */
template <int PTX_VERSION, typename PolicyT, typename PrevPolicyT>
struct ChainedPolicy
{
    /// The policy for the active compiler pass
    typedef typename If<(CUB_PTX_ARCH < PTX_VERSION), typename PrevPolicyT::ActivePolicy, PolicyT>::Type ActivePolicy;

    /// Specializes and dispatches op in accordance to the first policy in the chain of adequate PTX version
    template <typename FunctorT>
    CUB_RUNTIME_FUNCTION __forceinline__
    static cudaError_t Invoke(int ptx_version, FunctorT &op)
    {
        if (ptx_version < PTX_VERSION) {
            return PrevPolicyT::Invoke(ptx_version, op);
        }
        return op.template Invoke<PolicyT>();
    }
};

/// Helper for dispatching into a policy chain (end-of-chain specialization)
template <int PTX_VERSION, typename PolicyT>
struct ChainedPolicy<PTX_VERSION, PolicyT, PolicyT>
{
    /// The policy for the active compiler pass
    typedef PolicyT ActivePolicy;

    /// Specializes and dispatches op in accordance to the first policy in the chain of adequate PTX version
    template <typename FunctorT>
    CUB_RUNTIME_FUNCTION __forceinline__
    static cudaError_t Invoke(int /*ptx_version*/, FunctorT &op) {
        return op.template Invoke<PolicyT>();
    }
};
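
/*
 * Illustrative usage sketch (not part of the original header): a two-link
 * policy chain, mirroring how CUB's dispatch classes use ChainedPolicy.
 * Policy350, Policy500, and DispatchFunctor are hypothetical names.
 *
 *     struct Policy350 : ChainedPolicy<350, Policy350, Policy350> { ... };  // sm_35+ tuning (end of chain)
 *     struct Policy500 : ChainedPolicy<500, Policy500, Policy350> { ... };  // sm_50+ tuning
 *     typedef Policy500 MaxPolicy;
 *
 *     int ptx_version;
 *     PtxVersion(ptx_version);
 *     DispatchFunctor op;
 *     MaxPolicy::Invoke(ptx_version, op);  // walks the chain and calls
 *                                          // op.Invoke<PolicyT>() for the highest
 *                                          // policy whose PTX_VERSION <= ptx_version
 */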



#endif  // Do not document



/** @} */      // end group UtilMgmt

}               // CUB namespace
CUB_NS_POSTFIX  // Optional outer namespace(s)