OpenFPM_pdata  4.1.0
Project that contain the implementation of distributed structures
dispatch_scan.cuh
Go to the documentation of this file.
1 
2 /******************************************************************************
3  * Copyright (c) 2011, Duane Merrill. All rights reserved.
4  * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions are met:
8  * * Redistributions of source code must retain the above copyright
9  * notice, this list of conditions and the following disclaimer.
10  * * Redistributions in binary form must reproduce the above copyright
11  * notice, this list of conditions and the following disclaimer in the
12  * documentation and/or other materials provided with the distribution.
13  * * Neither the name of the NVIDIA CORPORATION nor the
14  * names of its contributors may be used to endorse or promote products
15  * derived from this software without specific prior written permission.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
19  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
20  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
21  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
22  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
23  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
24  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
26  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27  *
28  ******************************************************************************/
29 
35 #pragma once
36 
37 #include <stdio.h>
38 #include <iterator>
39 
40 #include "../../agent/agent_scan.cuh"
41 #include "../../thread/thread_operators.cuh"
42 #include "../../grid/grid_queue.cuh"
43 #include "../../util_arch.cuh"
44 #include "../../util_debug.cuh"
45 #include "../../util_device.cuh"
46 #include "../../util_namespace.cuh"
47 
49 CUB_NS_PREFIX
50 
52 namespace cub {
53 
54 
55 /******************************************************************************
56  * Kernel entry points
57  *****************************************************************************/
58 
62 template <
63  typename ScanTileStateT>
64 __global__ void DeviceScanInitKernel(
65  ScanTileStateT tile_state,
66  int num_tiles)
67 {
68  // Initialize tile status
69  tile_state.InitializeStatus(num_tiles);
70 }
71 
75 template <
76  typename ScanTileStateT,
77  typename NumSelectedIteratorT>
78 __global__ void DeviceCompactInitKernel(
79  ScanTileStateT tile_state,
80  int num_tiles,
81  NumSelectedIteratorT d_num_selected_out)
82 {
83  // Initialize tile status
84  tile_state.InitializeStatus(num_tiles);
85 
86  // Initialize d_num_selected_out
87  if ((blockIdx.x == 0) && (threadIdx.x == 0))
88  *d_num_selected_out = 0;
89 }
90 
91 
95 template <
96  typename ScanPolicyT,
97  typename InputIteratorT,
98  typename OutputIteratorT,
99  typename ScanTileStateT,
100  typename ScanOpT,
101  typename InitValueT,
102  typename OffsetT>
103 __launch_bounds__ (int(ScanPolicyT::BLOCK_THREADS))
104 __global__ void DeviceScanKernel(
105  InputIteratorT d_in,
106  OutputIteratorT d_out,
107  ScanTileStateT tile_state,
108  int start_tile,
109  ScanOpT scan_op,
110  InitValueT init_value,
112 {
113  // Thread block type for scanning input tiles
114  typedef AgentScan<
115  ScanPolicyT,
116  InputIteratorT,
117  OutputIteratorT,
118  ScanOpT,
119  InitValueT,
120  OffsetT> AgentScanT;
121 
122  // Shared memory for AgentScan
123  __shared__ typename AgentScanT::TempStorage temp_storage;
124 
125  // Process tiles
126  AgentScanT(temp_storage, d_in, d_out, scan_op, init_value).ConsumeRange(
127  num_items,
128  tile_state,
129  start_tile);
130 }
131 
132 
133 
134 
135 /******************************************************************************
136  * Dispatch
137  ******************************************************************************/
138 
139 
143 template <
144  typename InputIteratorT,
145  typename OutputIteratorT,
146  typename ScanOpT,
147  typename InitValueT,
148  typename OffsetT>
150 {
151  //---------------------------------------------------------------------
152  // Constants and Types
153  //---------------------------------------------------------------------
154 
155  enum
156  {
157  INIT_KERNEL_THREADS = 128
158  };
159 
160  // The output value type
161  typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ?
162  typename std::iterator_traits<InputIteratorT>::value_type, // ... then the input iterator's value type,
163  typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT; // ... else the output iterator's value type
164 
165  // Tile status descriptor interface type
167 
168 
169  //---------------------------------------------------------------------
170  // Tuning policies
171  //---------------------------------------------------------------------
172 
174  struct Policy600
175  {
176  typedef AgentScanPolicy<
177  CUB_SCALED_GRANULARITIES(128, 15, OutputT),
179  LOAD_DEFAULT,
182  ScanPolicyT;
183  };
184 
185 
187  struct Policy520
188  {
189  // Titan X: 32.47B items/s @ 48M 32-bit T
190  typedef AgentScanPolicy<
191  CUB_SCALED_GRANULARITIES(128, 12, OutputT),
193  LOAD_LDG,
196  ScanPolicyT;
197  };
198 
199 
201  struct Policy350
202  {
203  // GTX Titan: 29.5B items/s (232.4 GB/s) @ 48M 32-bit T
204  typedef AgentScanPolicy<
205  CUB_SCALED_GRANULARITIES(128, 12, OutputT),
207  LOAD_LDG,
210  ScanPolicyT;
211  };
212 
214  struct Policy300
215  {
216  typedef AgentScanPolicy<
217  CUB_SCALED_GRANULARITIES(256, 9, OutputT),
219  LOAD_DEFAULT,
222  ScanPolicyT;
223  };
224 
226  struct Policy200
227  {
228  // GTX 580: 20.3B items/s (162.3 GB/s) @ 48M 32-bit T
229  typedef AgentScanPolicy<
230  CUB_SCALED_GRANULARITIES(128, 12, OutputT),
232  LOAD_DEFAULT,
235  ScanPolicyT;
236  };
237 
239  struct Policy130
240  {
241  typedef AgentScanPolicy<
242  CUB_SCALED_GRANULARITIES(96, 21, OutputT),
244  LOAD_DEFAULT,
247  ScanPolicyT;
248  };
249 
251  struct Policy100
252  {
253  typedef AgentScanPolicy<
254  CUB_SCALED_GRANULARITIES(64, 9, OutputT),
256  LOAD_DEFAULT,
259  ScanPolicyT;
260  };
261 
262 
263  //---------------------------------------------------------------------
264  // Tuning policies of current PTX compiler pass
265  //---------------------------------------------------------------------
266 
267 #if (CUB_PTX_ARCH >= 600)
268  typedef Policy600 PtxPolicy;
269 
270 #elif (CUB_PTX_ARCH >= 520)
271  typedef Policy520 PtxPolicy;
272 
273 #elif (CUB_PTX_ARCH >= 350)
274  typedef Policy350 PtxPolicy;
275 
276 #elif (CUB_PTX_ARCH >= 300)
277  typedef Policy300 PtxPolicy;
278 
279 #elif (CUB_PTX_ARCH >= 200)
280  typedef Policy200 PtxPolicy;
281 
282 #elif (CUB_PTX_ARCH >= 130)
283  typedef Policy130 PtxPolicy;
284 
285 #else
286  typedef Policy100 PtxPolicy;
287 
288 #endif
289 
290  // "Opaque" policies (whose parameterizations aren't reflected in the type signature)
292 
293 
294  //---------------------------------------------------------------------
295  // Utilities
296  //---------------------------------------------------------------------
297 
301  template <typename KernelConfig>
302  CUB_RUNTIME_FUNCTION __forceinline__
303  static void InitConfigs(
304  int ptx_version,
305  KernelConfig &scan_kernel_config)
306  {
307  #if (CUB_PTX_ARCH > 0)
308  (void)ptx_version;
309 
310  // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy
311  scan_kernel_config.template Init<PtxAgentScanPolicy>();
312 
313  #else
314 
315  // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version
316  if (ptx_version >= 600)
317  {
318  scan_kernel_config.template Init<typename Policy600::ScanPolicyT>();
319  }
320  else if (ptx_version >= 520)
321  {
322  scan_kernel_config.template Init<typename Policy520::ScanPolicyT>();
323  }
324  else if (ptx_version >= 350)
325  {
326  scan_kernel_config.template Init<typename Policy350::ScanPolicyT>();
327  }
328  else if (ptx_version >= 300)
329  {
330  scan_kernel_config.template Init<typename Policy300::ScanPolicyT>();
331  }
332  else if (ptx_version >= 200)
333  {
334  scan_kernel_config.template Init<typename Policy200::ScanPolicyT>();
335  }
336  else if (ptx_version >= 130)
337  {
338  scan_kernel_config.template Init<typename Policy130::ScanPolicyT>();
339  }
340  else
341  {
342  scan_kernel_config.template Init<typename Policy100::ScanPolicyT>();
343  }
344 
345  #endif
346  }
347 
348 
353  {
354  int block_threads;
355  int items_per_thread;
356  int tile_items;
357 
358  template <typename PolicyT>
359  CUB_RUNTIME_FUNCTION __forceinline__
360  void Init()
361  {
362  block_threads = PolicyT::BLOCK_THREADS;
363  items_per_thread = PolicyT::ITEMS_PER_THREAD;
364  tile_items = block_threads * items_per_thread;
365  }
366  };
367 
368 
369  //---------------------------------------------------------------------
370  // Dispatch entrypoints
371  //---------------------------------------------------------------------
372 
377  template <
378  typename ScanInitKernelPtrT,
379  typename ScanSweepKernelPtrT>
380  CUB_RUNTIME_FUNCTION __forceinline__
381  static cudaError_t Dispatch(
382  void* d_temp_storage,
383  size_t& temp_storage_bytes,
384  InputIteratorT d_in,
385  OutputIteratorT d_out,
386  ScanOpT scan_op,
387  InitValueT init_value,
389  cudaStream_t stream,
390  bool debug_synchronous,
391  int /*ptx_version*/,
392  ScanInitKernelPtrT init_kernel,
393  ScanSweepKernelPtrT scan_kernel,
394  KernelConfig scan_kernel_config)
395  {
396 
397 #ifndef CUB_RUNTIME_ENABLED
398  (void)d_temp_storage;
399  (void)temp_storage_bytes;
400  (void)d_in;
401  (void)d_out;
402  (void)scan_op;
403  (void)init_value;
404  (void)num_items;
405  (void)stream;
406  (void)debug_synchronous;
407  (void)init_kernel;
408  (void)scan_kernel;
409  (void)scan_kernel_config;
410 
411  // Kernel launch not supported from this device
412  return CubDebug(cudaErrorNotSupported);
413 
414 #else
415  cudaError error = cudaSuccess;
416  do
417  {
418  // Get device ordinal
419  int device_ordinal;
420  if (CubDebug(error = cudaGetDevice(&device_ordinal))) break;
421 
422  // Get SM count
423  int sm_count;
424  if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break;
425 
426  // Number of input tiles
427  int tile_size = scan_kernel_config.block_threads * scan_kernel_config.items_per_thread;
428  int num_tiles = (num_items + tile_size - 1) / tile_size;
429 
430  // Specify temporary storage allocation requirements
431  size_t allocation_sizes[1];
432  if (CubDebug(error = ScanTileStateT::AllocationSize(num_tiles, allocation_sizes[0]))) break; // bytes needed for tile status descriptors
433 
434  // Compute allocation pointers into the single storage blob (or compute the necessary size of the blob)
435  void* allocations[1];
436  if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break;
437  if (d_temp_storage == NULL)
438  {
439  // Return if the caller is simply requesting the size of the storage allocation
440  break;
441  }
442 
443  // Return if empty problem
444  if (num_items == 0)
445  break;
446 
447  // Construct the tile status interface
449  if (CubDebug(error = tile_state.Init(num_tiles, allocations[0], allocation_sizes[0]))) break;
450 
451  // Log init_kernel configuration
452  int init_grid_size = (num_tiles + INIT_KERNEL_THREADS - 1) / INIT_KERNEL_THREADS;
453  if (debug_synchronous) _CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream);
454 
455  // Invoke init_kernel to initialize tile descriptors
456  init_kernel<<<init_grid_size, INIT_KERNEL_THREADS, 0, stream>>>(
457  tile_state,
458  num_tiles);
459 
460  // Check for failure to launch
461  if (CubDebug(error = cudaPeekAtLastError())) break;
462 
463  // Sync the stream if specified to flush runtime errors
464  if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
465 
466  // Get SM occupancy for scan_kernel
467  int scan_sm_occupancy;
468  if (CubDebug(error = MaxSmOccupancy(
469  scan_sm_occupancy, // out
470  scan_kernel,
471  scan_kernel_config.block_threads))) break;
472 
473  // Get max x-dimension of grid
474  int max_dim_x;
475  if (CubDebug(error = cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal))) break;;
476 
477  // Run grids in epochs (in case number of tiles exceeds max x-dimension
478  int scan_grid_size = CUB_MIN(num_tiles, max_dim_x);
479  for (int start_tile = 0; start_tile < num_tiles; start_tile += scan_grid_size)
480  {
481  // Log scan_kernel configuration
482  if (debug_synchronous) _CubLog("Invoking %d scan_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
483  start_tile, scan_grid_size, scan_kernel_config.block_threads, (long long) stream, scan_kernel_config.items_per_thread, scan_sm_occupancy);
484 
485  // Invoke scan_kernel
486  scan_kernel<<<scan_grid_size, scan_kernel_config.block_threads, 0, stream>>>(
487  d_in,
488  d_out,
489  tile_state,
490  start_tile,
491  scan_op,
492  init_value,
493  num_items);
494 
495  // Check for failure to launch
496  if (CubDebug(error = cudaPeekAtLastError())) break;
497 
498  // Sync the stream if specified to flush runtime errors
499  if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
500  }
501  }
502  while (0);
503 
504  return error;
505 
506 #endif // CUB_RUNTIME_ENABLED
507  }
508 
509 
513  CUB_RUNTIME_FUNCTION __forceinline__
514  static cudaError_t Dispatch(
515  void* d_temp_storage,
516  size_t& temp_storage_bytes,
517  InputIteratorT d_in,
518  OutputIteratorT d_out,
519  ScanOpT scan_op,
520  InitValueT init_value,
522  cudaStream_t stream,
523  bool debug_synchronous)
524  {
525  cudaError error = cudaSuccess;
526  do
527  {
528  // Get PTX version
529  int ptx_version;
530  if (CubDebug(error = PtxVersion(ptx_version))) break;
531 
532  // Get kernel kernel dispatch configurations
533  KernelConfig scan_kernel_config;
534  InitConfigs(ptx_version, scan_kernel_config);
535 
536  // Dispatch
537  if (CubDebug(error = Dispatch(
538  d_temp_storage,
539  temp_storage_bytes,
540  d_in,
541  d_out,
542  scan_op,
543  init_value,
544  num_items,
545  stream,
546  debug_synchronous,
547  ptx_version,
548  DeviceScanInitKernel<ScanTileStateT>,
549  DeviceScanKernel<PtxAgentScanPolicy, InputIteratorT, OutputIteratorT, ScanTileStateT, ScanOpT, InitValueT, OffsetT>,
550  scan_kernel_config))) break;
551  }
552  while (0);
553 
554  return error;
555  }
556 };
557 
558 
559 
560 } // CUB namespace
561 CUB_NS_POSTFIX // Optional outer namespace(s)
562 
563 
Cache as texture.
Definition: thread_load.cuh:69
Type equality test.
Definition: util_type.cuh:98
FlagsInputIteratorT SelectedOutputIteratorT NumSelectedIteratorT d_num_selected_out
[out] Pointer to the total number of items selected (i.e., length of d_selected_out)
#define CUB_SCALED_GRANULARITIES(NOMINAL_4B_BLOCK_THREADS, NOMINAL_4B_ITEMS_PER_THREAD, T)
Define both nominal threads-per-block and items-per-thread.
Definition: util_arch.cuh:141
AgentScan implements a stateful abstraction of CUDA thread blocks for participating in device-wide pr...
Definition: agent_scan.cuh:98
Default (no modifier)
Definition: thread_load.cuh:64
__host__ __device__ __forceinline__ cudaError_t AliasTemporaries(void *d_temp_storage, size_t &temp_storage_bytes, void *(&allocations)[ALLOCATIONS], size_t(&allocation_sizes)[ALLOCATIONS])
Definition: util_device.cuh:62
KeyT const ValueT ValueT OffsetT OffsetT num_items
[in] Total number of input data items
CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t MaxSmOccupancy(int &max_sm_occupancy, KernelPtr kernel_ptr, int block_threads, int dynamic_smem_bytes=0)
Computes maximum SM occupancy in thread blocks for executing the given kernel function pointer kernel...
Optional outer namespace(s)
__global__ void DeviceCompactInitKernel(ScanTileStateT tile_state, int num_tiles, NumSelectedIteratorT d_num_selected_out)
< Output iterator type for recording the number of items selected
CUB_RUNTIME_FUNCTION static __forceinline__ void InitConfigs(int ptx_version, KernelConfig &scan_kernel_config)
UniqueOutputIteratorT ValuesInputIteratorT AggregatesOutputIteratorT NumRunsOutputIteratorT ScanTileStateT tile_state
Tile status interface.
#define _CubLog(format,...)
Log macro for printf statements.
Definition: util_debug.cuh:112
OffsetT OffsetT
[in] Total number of input data items
OutputIteratorT ScanTileStateT int ScanOpT scan_op
Binary scan functor.
< The BlockScan algorithm to use
Definition: agent_scan.cuh:67
CUB_RUNTIME_FUNCTION static __forceinline__ cudaError_t Dispatch(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, ScanOpT scan_op, InitValueT init_value, OffsetT num_items, cudaStream_t stream, bool debug_synchronous, int, ScanInitKernelPtrT init_kernel, ScanSweepKernelPtrT scan_kernel, KernelConfig scan_kernel_config)
< Function type of cub::DeviceScanKernelPtrT
CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t PtxVersion(int &ptx_version)
Retrieves the PTX version that will be used on the current device (major * 100 + minor * 10)
CUB_RUNTIME_FUNCTION static __forceinline__ cudaError_t SyncStream(cudaStream_t stream)
Type selection (IF ? ThenType : ElseType)
Definition: util_type.cuh:72
OffsetsOutputIteratorT LengthsOutputIteratorT NumRunsOutputIteratorT ScanTileStateT EqualityOpT OffsetT int num_tiles
< [in] Total number of tiles for the entire problem
OutputIteratorT ScanTileStateT int ScanOpT InitValueT init_value
Initial value to seed the exclusive scan.
#define CUB_MIN(a, b)
Select minimum(a, b)
Definition: util_macro.cuh:66
UniqueOutputIteratorT ValuesInputIteratorT AggregatesOutputIteratorT NumRunsOutputIteratorT ScanTileStateT int start_tile
The starting tile for the current grid.
#define CubDebug(e)
Debug macro.
Definition: util_debug.cuh:94
__global__ void DeviceScanInitKernel(ScanTileStateT tile_state, int num_tiles)
< Tile status interface type
__launch_bounds__(int(AgentHistogramPolicyT::BLOCK_THREADS)) __global__ void DeviceHistogramSweepKernel(SampleIteratorT d_samples
< Signed integer type for global offsets
OutputIteratorT d_out
< [in] Pointer to the input sequence of data items
< Signed integer type for global offsets
CUB_RUNTIME_FUNCTION static __forceinline__ cudaError_t Dispatch(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, ScanOpT scan_op, InitValueT init_value, OffsetT num_items, cudaStream_t stream, bool debug_synchronous)