OpenFPM_pdata 4.1.0
Project that contains the implementation of distributed structures
dispatch_rle.cuh

/******************************************************************************
 * Copyright (c) 2011, Duane Merrill.  All rights reserved.
 * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

/**
 * \file
 * cub::DeviceRleDispatch provides device-wide, parallel operations for
 * run-length-encoding sequences of data items residing within
 * device-accessible memory.
 */

#pragma once

#include <stdio.h>
#include <iterator>

#include "dispatch_scan.cuh"
#include "../../agent/agent_rle.cuh"
#include "../../thread/thread_operators.cuh"
#include "../../grid/grid_queue.cuh"
#include "../../util_device.cuh"
#include "../../util_namespace.cuh"

/// Optional outer namespace(s)
CUB_NS_PREFIX

/// CUB namespace
namespace cub {

/******************************************************************************
 * Kernel entry points
 ******************************************************************************/

/**
 * Multi-block run-length-encode sweep kernel entry point.  Streams tiles of
 * input through AgentRle thread blocks, emitting run-offsets and run-lengths.
 */
template <
    typename            AgentRlePolicyT,            ///< Parameterized AgentRlePolicyT tuning policy type
    typename            InputIteratorT,             ///< Random-access input iterator type for reading input items
    typename            OffsetsOutputIteratorT,     ///< Random-access output iterator type for writing run-offset values
    typename            LengthsOutputIteratorT,     ///< Random-access output iterator type for writing run-length values
    typename            NumRunsOutputIteratorT,     ///< Output iterator type for recording the number of runs encountered
    typename            ScanTileStateT,             ///< Tile status interface type
    typename            EqualityOpT,                ///< Equality operator type
    typename            OffsetT>                    ///< Signed integer type for global offsets
__launch_bounds__ (int(AgentRlePolicyT::BLOCK_THREADS))
__global__ void DeviceRleSweepKernel(
    InputIteratorT              d_in,               ///< [in] Pointer to input sequence of data items
    OffsetsOutputIteratorT      d_offsets_out,      ///< [out] Pointer to output sequence of run-offsets
    LengthsOutputIteratorT      d_lengths_out,      ///< [out] Pointer to output sequence of run-lengths
    NumRunsOutputIteratorT      d_num_runs_out,     ///< [out] Pointer to total number of runs (i.e., the length of d_offsets_out)
    ScanTileStateT              tile_status,        ///< [in] Tile status interface
    EqualityOpT                 equality_op,        ///< [in] Equality operator for input items
    OffsetT                     num_items,          ///< [in] Total number of input items (i.e., the length of d_in)
    int                         num_tiles)          ///< [in] Total number of tiles for the entire problem
{
    // Thread block type for selecting data from input tiles
    typedef AgentRle<
        AgentRlePolicyT,
        InputIteratorT,
        OffsetsOutputIteratorT,
        LengthsOutputIteratorT,
        EqualityOpT,
        OffsetT> AgentRleT;

    // Shared memory for AgentRle
    __shared__ typename AgentRleT::TempStorage temp_storage;

    // Process tiles
    AgentRleT(temp_storage, d_in, d_offsets_out, d_lengths_out, equality_op, num_items).ConsumeRange(
        num_tiles,
        tile_status,
        d_num_runs_out);
}



/******************************************************************************
 * Dispatch
 ******************************************************************************/

/**
 * Utility class for dispatching the appropriately-tuned kernels for DeviceRle
 */
template <
    typename    InputIteratorT,             ///< Random-access input iterator type for reading input items
    typename    OffsetsOutputIteratorT,     ///< Random-access output iterator type for writing run-offset values
    typename    LengthsOutputIteratorT,     ///< Random-access output iterator type for writing run-length values
    typename    NumRunsOutputIteratorT,     ///< Output iterator type for recording the number of runs encountered
    typename    EqualityOpT,                ///< Equality operator type
    typename    OffsetT>                    ///< Signed integer type for global offsets
struct DeviceRleDispatch
{
    /******************************************************************************
     * Types and constants
     ******************************************************************************/

    // The input value type
    typedef typename std::iterator_traits<InputIteratorT>::value_type T;

    // The lengths output value type
    typedef typename If<(Equals<typename std::iterator_traits<LengthsOutputIteratorT>::value_type, void>::VALUE),  // LengthT = (if output iterator's value type is void) ?
        OffsetT,                                                                                                    // ... then the OffsetT type,
        typename std::iterator_traits<LengthsOutputIteratorT>::value_type>::Type LengthT;                          // ... else the output iterator's value type

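    // Note: with a discard-style lengths iterator (whose value_type is void),
    // run lengths are accumulated as OffsetT; with e.g. an int* iterator,
    // LengthT is int.  This keeps the dispatcher usable even when the caller
    // does not want the lengths materialized.
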
    enum
    {
        INIT_KERNEL_THREADS = 128,
    };

    // Tile status descriptor interface type
    typedef ReduceByKeyScanTileState<LengthT, OffsetT> ScanTileStateT;


    /******************************************************************************
     * Tuning policies
     ******************************************************************************/

    /// SM35
    struct Policy350
    {
        enum {
            NOMINAL_4B_ITEMS_PER_THREAD = 15,
            ITEMS_PER_THREAD            = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))),
        };

        typedef AgentRlePolicy<
                96,
                ITEMS_PER_THREAD,
                BLOCK_LOAD_DIRECT,
                LOAD_LDG,
                true,
                BLOCK_SCAN_WARP_SCANS>
            RleSweepPolicy;
    };

    /// SM30
    struct Policy300
    {
        enum {
            NOMINAL_4B_ITEMS_PER_THREAD = 5,
            ITEMS_PER_THREAD            = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))),
        };

        typedef AgentRlePolicy<
                256,
                ITEMS_PER_THREAD,
                BLOCK_LOAD_WARP_TRANSPOSE,
                LOAD_DEFAULT,
                true,
                BLOCK_SCAN_WARP_SCANS>
            RleSweepPolicy;
    };

    /// SM20
    struct Policy200
    {
        enum {
            NOMINAL_4B_ITEMS_PER_THREAD = 15,
            ITEMS_PER_THREAD            = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))),
        };

        typedef AgentRlePolicy<
                128,
                ITEMS_PER_THREAD,
                BLOCK_LOAD_WARP_TRANSPOSE,
                LOAD_DEFAULT,
                false,
                BLOCK_SCAN_WARP_SCANS>
            RleSweepPolicy;
    };

    /// SM13
    struct Policy130
    {
        enum {
            NOMINAL_4B_ITEMS_PER_THREAD = 9,
            ITEMS_PER_THREAD            = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))),
        };

        typedef AgentRlePolicy<
                64,
                ITEMS_PER_THREAD,
                BLOCK_LOAD_WARP_TRANSPOSE,
                LOAD_DEFAULT,
                true,
                BLOCK_SCAN_RAKING_MEMOIZE>
            RleSweepPolicy;
    };

    /// SM10
    struct Policy100
    {
        enum {
            NOMINAL_4B_ITEMS_PER_THREAD = 9,
            ITEMS_PER_THREAD            = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))),
        };

        typedef AgentRlePolicy<
                256,
                ITEMS_PER_THREAD,
                BLOCK_LOAD_WARP_TRANSPOSE,
                LOAD_DEFAULT,
                true,
                BLOCK_SCAN_RAKING_MEMOIZE>
            RleSweepPolicy;
    };


    /******************************************************************************
     * Tuning policies of current PTX compiler pass
     ******************************************************************************/

#if (CUB_PTX_ARCH >= 350)
    typedef Policy350 PtxPolicy;

#elif (CUB_PTX_ARCH >= 300)
    typedef Policy300 PtxPolicy;

#elif (CUB_PTX_ARCH >= 200)
    typedef Policy200 PtxPolicy;

#elif (CUB_PTX_ARCH >= 130)
    typedef Policy130 PtxPolicy;

#else
    typedef Policy100 PtxPolicy;

#endif

    // "Opaque" policies (whose parameterizations aren't reflected in the type signature)
    struct PtxRleSweepPolicy : PtxPolicy::RleSweepPolicy {};


    /******************************************************************************
     * Utilities
     ******************************************************************************/

    /**
     * Initialize kernel dispatch configurations with the policies corresponding
     * to the PTX assembly we will use
     */
    template <typename KernelConfig>
    CUB_RUNTIME_FUNCTION __forceinline__
    static void InitConfigs(
        int             ptx_version,
        KernelConfig    &device_rle_config)
    {
    #if (CUB_PTX_ARCH > 0)

        // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy
        device_rle_config.template Init<PtxRleSweepPolicy>();

    #else

        // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version
        if (ptx_version >= 350)
        {
            device_rle_config.template Init<typename Policy350::RleSweepPolicy>();
        }
        else if (ptx_version >= 300)
        {
            device_rle_config.template Init<typename Policy300::RleSweepPolicy>();
        }
        else if (ptx_version >= 200)
        {
            device_rle_config.template Init<typename Policy200::RleSweepPolicy>();
        }
        else if (ptx_version >= 130)
        {
            device_rle_config.template Init<typename Policy130::RleSweepPolicy>();
        }
        else
        {
            device_rle_config.template Init<typename Policy100::RleSweepPolicy>();
        }

    #endif
    }


    /**
     * Kernel dispatch configuration.  Mirrors the constants within AgentRlePolicyT.
     */
    struct KernelConfig
    {
        int                     block_threads;
        int                     items_per_thread;
        BlockLoadAlgorithm      load_policy;
        bool                    store_warp_time_slicing;
        BlockScanAlgorithm      scan_algorithm;

        template <typename AgentRlePolicyT>
        CUB_RUNTIME_FUNCTION __forceinline__
        void Init()
        {
            block_threads               = AgentRlePolicyT::BLOCK_THREADS;
            items_per_thread            = AgentRlePolicyT::ITEMS_PER_THREAD;
            load_policy                 = AgentRlePolicyT::LOAD_ALGORITHM;
            store_warp_time_slicing     = AgentRlePolicyT::STORE_WARP_TIME_SLICING;
            scan_algorithm              = AgentRlePolicyT::SCAN_ALGORITHM;
        }

        CUB_RUNTIME_FUNCTION __forceinline__
        void Print()
        {
            printf("%d, %d, %d, %d, %d",
                block_threads,
                items_per_thread,
                load_policy,
                store_warp_time_slicing,
                scan_algorithm);
        }
    };



    /******************************************************************************
     * Dispatch entrypoints
     ******************************************************************************/

    /**
     * Internal dispatch routine for computing a device-wide run-length-encode
     * using the specified kernel functions.
     */
    template <
        typename                    DeviceScanInitKernelPtr,    ///< Function type of cub::DeviceScanInitKernelPtr
        typename                    DeviceRleSweepKernelPtr>    ///< Function type of cub::DeviceRleSweepKernelPtr
    CUB_RUNTIME_FUNCTION __forceinline__
    static cudaError_t Dispatch(
        void*                       d_temp_storage,             ///< [in] Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to temp_storage_bytes and no work is done.
        size_t&                     temp_storage_bytes,         ///< [in,out] Reference to size in bytes of the d_temp_storage allocation
        InputIteratorT              d_in,                       ///< [in] Pointer to input sequence of data items
        OffsetsOutputIteratorT      d_offsets_out,              ///< [out] Pointer to output sequence of run-offsets
        LengthsOutputIteratorT      d_lengths_out,              ///< [out] Pointer to output sequence of run-lengths
        NumRunsOutputIteratorT      d_num_runs_out,             ///< [out] Pointer to total number of runs encountered (i.e., the length of d_offsets_out)
        EqualityOpT                 equality_op,                ///< [in] Equality operator for input items
        OffsetT                     num_items,                  ///< [in] Total number of input items (i.e., the length of d_in)
        cudaStream_t                stream,                     ///< [in] CUDA stream to launch kernels within
        bool                        debug_synchronous,          ///< [in] Whether to synchronize the stream after every kernel launch to check for errors
        int                         ptx_version,                ///< [in] PTX version of dispatch kernels
        DeviceScanInitKernelPtr     device_scan_init_kernel,    ///< [in] Kernel function pointer to parameterization of cub::DeviceScanInitKernel
        DeviceRleSweepKernelPtr     device_rle_sweep_kernel,    ///< [in] Kernel function pointer to parameterization of cub::DeviceRleSweepKernel
        KernelConfig                device_rle_config)          ///< [in] Dispatch parameters that match the policy that device_rle_sweep_kernel was compiled for
    {

#ifndef CUB_RUNTIME_ENABLED

        // Kernel launch not supported from this device
        return CubDebug(cudaErrorNotSupported);

#else

        cudaError error = cudaSuccess;
        do
        {
            // Get device ordinal
            int device_ordinal;
            if (CubDebug(error = cudaGetDevice(&device_ordinal))) break;

            // Get SM count
            int sm_count;
            if (CubDebug(error = cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break;

            // Number of input tiles
            int tile_size = device_rle_config.block_threads * device_rle_config.items_per_thread;
            int num_tiles = (num_items + tile_size - 1) / tile_size;
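            // (Ceiling division: each block consumes tile_size =
            // block_threads * items_per_thread inputs, and a final partial
            // tile is rounded up, e.g. 1000 items at tile_size 384 -> 3 tiles.)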

            // Specify temporary storage allocation requirements
            size_t allocation_sizes[1];
            if (CubDebug(error = ScanTileStateT::AllocationSize(num_tiles, allocation_sizes[0]))) break;    // bytes needed for tile status descriptors

            // Compute allocation pointers into the single storage blob (or compute the necessary size of the blob)
            void* allocations[1];
            if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break;
            if (d_temp_storage == NULL)
            {
                // Return if the caller is simply requesting the size of the storage allocation
                break;
            }

            // Construct the tile status interface
            ScanTileStateT tile_status;
            if (CubDebug(error = tile_status.Init(num_tiles, allocations[0], allocation_sizes[0]))) break;

            // Log device_scan_init_kernel configuration
            int init_grid_size = CUB_MAX(1, (num_tiles + INIT_KERNEL_THREADS - 1) / INIT_KERNEL_THREADS);
            if (debug_synchronous) _CubLog("Invoking device_scan_init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream);

            // Invoke device_scan_init_kernel to initialize tile descriptors and queue descriptors
            device_scan_init_kernel<<<init_grid_size, INIT_KERNEL_THREADS, 0, stream>>>(
                tile_status,
                num_tiles,
                d_num_runs_out);

            // Check for failure to launch
            if (CubDebug(error = cudaPeekAtLastError())) break;

            // Sync the stream if specified to flush runtime errors
            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;

            // Return if empty problem
            if (num_items == 0)
                break;

            // Get SM occupancy for device_rle_sweep_kernel
            int device_rle_kernel_sm_occupancy;
            if (CubDebug(error = MaxSmOccupancy(
                device_rle_kernel_sm_occupancy,             // out
                device_rle_sweep_kernel,
                device_rle_config.block_threads))) break;

            // Get max x-dimension of grid
            int max_dim_x;
            if (CubDebug(error = cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal))) break;

            // Get grid size for scanning tiles
            dim3 scan_grid_size;
            scan_grid_size.z = 1;
            scan_grid_size.y = ((unsigned int) num_tiles + max_dim_x - 1) / max_dim_x;
            scan_grid_size.x = CUB_MIN(num_tiles, max_dim_x);
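            // (When num_tiles exceeds the device's x-dimension limit, the
            // overflow is folded into the grid's y-dimension and the sweep
            // kernel recovers a linear tile index from blockIdx.{x,y}.)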

            // Log device_rle_sweep_kernel configuration
            if (debug_synchronous) _CubLog("Invoking device_rle_sweep_kernel<<<{%d,%d,%d}, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
                scan_grid_size.x, scan_grid_size.y, scan_grid_size.z, device_rle_config.block_threads, (long long) stream, device_rle_config.items_per_thread, device_rle_kernel_sm_occupancy);

            // Invoke device_rle_sweep_kernel
            device_rle_sweep_kernel<<<scan_grid_size, device_rle_config.block_threads, 0, stream>>>(
                d_in,
                d_offsets_out,
                d_lengths_out,
                d_num_runs_out,
                tile_status,
                equality_op,
                num_items,
                num_tiles);

            // Check for failure to launch
            if (CubDebug(error = cudaPeekAtLastError())) break;

            // Sync the stream if specified to flush runtime errors
            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;

        }
        while (0);

        return error;

#endif  // CUB_RUNTIME_ENABLED
    }


    /**
     * Internal dispatch routine
     */
    CUB_RUNTIME_FUNCTION __forceinline__
    static cudaError_t Dispatch(
        void*                       d_temp_storage,         ///< [in] Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to temp_storage_bytes and no work is done.
        size_t&                     temp_storage_bytes,     ///< [in,out] Reference to size in bytes of the d_temp_storage allocation
        InputIteratorT              d_in,                   ///< [in] Pointer to input sequence of data items
        OffsetsOutputIteratorT      d_offsets_out,          ///< [out] Pointer to output sequence of run-offsets
        LengthsOutputIteratorT      d_lengths_out,          ///< [out] Pointer to output sequence of run-lengths
        NumRunsOutputIteratorT      d_num_runs_out,         ///< [out] Pointer to total number of runs encountered (i.e., the length of d_offsets_out)
        EqualityOpT                 equality_op,            ///< [in] Equality operator for input items
        OffsetT                     num_items,              ///< [in] Total number of input items (i.e., the length of d_in)
        cudaStream_t                stream,                 ///< [in] CUDA stream to launch kernels within
        bool                        debug_synchronous)      ///< [in] Whether to synchronize the stream after every kernel launch to check for errors
    {
        cudaError error = cudaSuccess;
        do
        {
            // Get PTX version
            int ptx_version;
    #if (CUB_PTX_ARCH == 0)
            if (CubDebug(error = PtxVersion(ptx_version))) break;
    #else
            ptx_version = CUB_PTX_ARCH;
    #endif

            // Get kernel dispatch configuration
            KernelConfig device_rle_config;
            InitConfigs(ptx_version, device_rle_config);

            // Dispatch
            if (CubDebug(error = Dispatch(
                d_temp_storage,
                temp_storage_bytes,
                d_in,
                d_offsets_out,
                d_lengths_out,
                d_num_runs_out,
                equality_op,
                num_items,
                stream,
                debug_synchronous,
                ptx_version,
                DeviceCompactInitKernel<ScanTileStateT, NumRunsOutputIteratorT>,
                DeviceRleSweepKernel<PtxRleSweepPolicy, InputIteratorT, OffsetsOutputIteratorT, LengthsOutputIteratorT, NumRunsOutputIteratorT, ScanTileStateT, EqualityOpT, OffsetT>,
                device_rle_config))) break;
        }
        while (0);

        return error;
    }
};


}               // CUB namespace
CUB_NS_POSTFIX  // Optional outer namespace(s)
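
For orientation, the sketch below shows how this dispatcher is typically driven from host code, using the same two-phase temporary-storage idiom as the public cub::DeviceRunLengthEncode::NonTrivialRuns front-end that forwards to it. It is a minimal, illustrative example and not part of this file: the array contents, the int* iterator types, the file name, and the include path (which depends on how CUB is vendored) are all assumptions.

// usage_sketch.cu -- hypothetical caller, illustrative only
#include <cub/device/dispatch/dispatch_rle.cuh>
#include <cuda_runtime.h>
#include <cstdio>

int main()
{
    // Input with three non-trivial runs: {0,0}, {1,1,1}, {2,2,2}
    int h_in[] = {0, 0, 1, 1, 1, 2, 2, 2};
    int num_items = 8;

    int *d_in, *d_offsets, *d_lengths, *d_num_runs;
    cudaMalloc(&d_in,       sizeof(int) * num_items);
    cudaMalloc(&d_offsets,  sizeof(int) * num_items);
    cudaMalloc(&d_lengths,  sizeof(int) * num_items);
    cudaMalloc(&d_num_runs, sizeof(int));
    cudaMemcpy(d_in, h_in, sizeof(int) * num_items, cudaMemcpyHostToDevice);

    typedef cub::DeviceRleDispatch<int*, int*, int*, int*, cub::Equality, int> DispatchT;

    // Pass 1: d_temp_storage == NULL, so Dispatch only writes the required
    // size into temp_storage_bytes (tile status descriptors) and returns.
    void   *d_temp_storage     = NULL;
    size_t  temp_storage_bytes = 0;
    DispatchT::Dispatch(d_temp_storage, temp_storage_bytes,
        d_in, d_offsets, d_lengths, d_num_runs,
        cub::Equality(), num_items, 0, true);

    // Pass 2: provide the storage; this time the init and sweep kernels run.
    cudaMalloc(&d_temp_storage, temp_storage_bytes);
    DispatchT::Dispatch(d_temp_storage, temp_storage_bytes,
        d_in, d_offsets, d_lengths, d_num_runs,
        cub::Equality(), num_items, 0, true);
    cudaDeviceSynchronize();

    // Expected: d_offsets = {0, 2, 5}, d_lengths = {2, 3, 3}, *d_num_runs = 3
    int h_num_runs = 0;
    cudaMemcpy(&h_num_runs, d_num_runs, sizeof(int), cudaMemcpyDeviceToHost);
    printf("runs: %d\n", h_num_runs);
    return 0;
}

The size-query-then-invoke pattern mirrors the early exit at the d_temp_storage == NULL check inside Dispatch: the first call costs no kernel launches, and the caller owns (and may pool or reuse) the temporary allocation.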