OpenFPM_pdata  4.1.0
Project that contains the implementation of distributed structures
 
Loading...
Searching...
No Matches
dispatch_rle.cuh
Go to the documentation of this file.
1
2/******************************************************************************
3 * Copyright (c) 2011, Duane Merrill. All rights reserved.
4 * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions are met:
8 * * Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * * Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * * Neither the name of the NVIDIA CORPORATION nor the
14 * names of its contributors may be used to endorse or promote products
15 * derived from this software without specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
19 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
20 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
21 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
22 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
23 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
24 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
26 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 *
28 ******************************************************************************/
29
35#pragma once
36
37#include <stdio.h>
38#include <iterator>
39
40#include "dispatch_scan.cuh"
41#include "../../agent/agent_rle.cuh"
42#include "../../thread/thread_operators.cuh"
43#include "../../grid/grid_queue.cuh"
44#include "../../util_device.cuh"
45#include "../../util_namespace.cuh"
46
48CUB_NS_PREFIX
49
51namespace cub {
52
53
54/******************************************************************************
55 * Kernel entry points
56 *****************************************************************************/
57
// DeviceRleSweepKernel: device-wide run-length-encode sweep kernel.
// Each thread block instantiates an AgentRle (parameterized by the tuning
// policy AgentRlePolicyT) and consumes a range of input tiles, writing run
// offsets/lengths through the output iterators and coordinating the
// inter-block running prefix via the ScanTileStateT tile-status interface.
// NOTE(review): this is a Doxygen-rendered listing — the generator elided
// some original source lines (the trailing kernel parameters between
// listing lines 81 and 84, and the ConsumeRange(...) argument list between
// lines 98 and 102), so this block is not compilable exactly as shown.
65template <
66 typename AgentRlePolicyT,
67 typename InputIteratorT,
68 typename OffsetsOutputIteratorT,
69 typename LengthsOutputIteratorT,
70 typename NumRunsOutputIteratorT,
71 typename ScanTileStateT,
72 typename EqualityOpT,
73 typename OffsetT>
// Bound the launch configuration to the policy's block size so the compiler
// can budget registers against the intended occupancy.
74__launch_bounds__ (int(AgentRlePolicyT::BLOCK_THREADS))
75__global__ void DeviceRleSweepKernel(
76 InputIteratorT d_in,
77 OffsetsOutputIteratorT d_offsets_out,
78 LengthsOutputIteratorT d_lengths_out,
79 NumRunsOutputIteratorT d_num_runs_out,
80 ScanTileStateT tile_status,
81 EqualityOpT equality_op,
84{
85 // Thread block type for selecting data from input tiles
86 typedef AgentRle<
87 AgentRlePolicyT,
88 InputIteratorT,
89 OffsetsOutputIteratorT,
90 LengthsOutputIteratorT,
91 EqualityOpT,
92 OffsetT> AgentRleT;
93
94 // Shared memory for AgentRle
95 __shared__ typename AgentRleT::TempStorage temp_storage;
96
97 // Process tiles
// Construct the block-wide agent in place over the shared temp storage and
// sweep this block's tile range (ConsumeRange arguments elided by Doxygen).
98 AgentRleT(temp_storage, d_in, d_offsets_out, d_lengths_out, equality_op, num_items).ConsumeRange(
102}
103
104
105
106
107/******************************************************************************
108 * Dispatch
109 ******************************************************************************/
110
// Utility struct for dispatching the appropriately-tuned RLE sweep kernel.
// NOTE(review): the struct declaration line itself (original listing line 121,
// presumably "struct DeviceRleDispatch" — confirm against CUB's
// dispatch_rle.cuh) was elided by the Doxygen rendering, as were the names of
// the per-architecture tuning-policy structs and several AgentRlePolicy
// template arguments below.
114template <
115 typename InputIteratorT,
116 typename OffsetsOutputIteratorT,
117 typename LengthsOutputIteratorT,
118 typename NumRunsOutputIteratorT,
119 typename EqualityOpT,
120 typename OffsetT>
122{
123 /******************************************************************************
124 * Types and constants
125 ******************************************************************************/
126
127 // The input value type
128 typedef typename std::iterator_traits<InputIteratorT>::value_type T;
129
130 // The lengths output value type
131 typedef typename If<(Equals<typename std::iterator_traits<LengthsOutputIteratorT>::value_type, void>::VALUE), // LengthT = (if output iterator's value type is void) ?
132 OffsetT, // ... then the OffsetT type,
133 typename std::iterator_traits<LengthsOutputIteratorT>::value_type>::Type LengthT; // ... else the output iterator's value type
134
135 enum
136 {
// Block size used for the tile-status initialization kernel launch.
137 INIT_KERNEL_THREADS = 128,
138 };
139
140 // Tile status descriptor interface type
// NOTE(review): the ScanTileStateT typedef (original line 141) was elided by
// the rendering; it is referenced later by AllocationSize()/Init() — confirm
// its definition against the original header.
142
143
144 /******************************************************************************
145 * Tuning policies
146 ******************************************************************************/
147
// NOTE(review): policy struct name elided — from the PTX selection chain
// below this first policy is presumably Policy350 (SM35+); confirm.
150 {
151 enum {
// Items per thread are scaled down for value types larger than 4 bytes.
152 NOMINAL_4B_ITEMS_PER_THREAD = 15,
153 ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))),
154 };
155
// Several AgentRlePolicy template arguments (load algorithm, scan algorithm,
// typedef name) were elided by the rendering on the lines below.
156 typedef AgentRlePolicy<
157 96,
158 ITEMS_PER_THREAD,
160 LOAD_LDG,
161 true,
164 };
165
// NOTE(review): presumably Policy300 (SM30+) — name line elided; confirm.
168 {
169 enum {
170 NOMINAL_4B_ITEMS_PER_THREAD = 5,
171 ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))),
172 };
173
174 typedef AgentRlePolicy<
175 256,
176 ITEMS_PER_THREAD,
179 true,
182 };
183
// NOTE(review): presumably Policy200 (SM20+) — name line elided; confirm.
186 {
187 enum {
188 NOMINAL_4B_ITEMS_PER_THREAD = 15,
189 ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))),
190 };
191
192 typedef AgentRlePolicy<
193 128,
194 ITEMS_PER_THREAD,
197 false,
200 };
201
// NOTE(review): presumably Policy130 (SM13+) — name line elided; confirm.
204 {
205 enum {
206 NOMINAL_4B_ITEMS_PER_THREAD = 9,
207 ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))),
208 };
209
210 typedef AgentRlePolicy<
211 64,
212 ITEMS_PER_THREAD,
215 true,
218 };
219
// NOTE(review): presumably Policy100 (SM10+, the fallback below) — name line
// elided; confirm.
222 {
223 enum {
224 NOMINAL_4B_ITEMS_PER_THREAD = 9,
225 ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))),
226 };
227
228 typedef AgentRlePolicy<
229 256,
230 ITEMS_PER_THREAD,
233 true,
236 };
237
238
239 /******************************************************************************
240 * Tuning policies of current PTX compiler pass
241 ******************************************************************************/
242
// Select, at compile time, the tuning policy matching the PTX target of the
// current compiler pass (CUB_PTX_ARCH is 0 during the host pass).
243#if (CUB_PTX_ARCH >= 350)
244 typedef Policy350 PtxPolicy;
245
246#elif (CUB_PTX_ARCH >= 300)
247 typedef Policy300 PtxPolicy;
248
249#elif (CUB_PTX_ARCH >= 200)
250 typedef Policy200 PtxPolicy;
251
252#elif (CUB_PTX_ARCH >= 130)
253 typedef Policy130 PtxPolicy;
254
255#else
256 typedef Policy100 PtxPolicy;
257
258#endif
259
260 // "Opaque" policies (whose parameterizations aren't reflected in the type signature)
// NOTE(review): the PtxRleSweepPolicy declaration (original line 261) was
// elided by the rendering; it is referenced by InitConfigs() and the
// convenience Dispatch() below — confirm against the original header.
262
263
264 /******************************************************************************
265 * Utilities
266 ******************************************************************************/
267
// InitConfigs: populate the kernel dispatch configuration for the RLE sweep
// kernel. When compiled for the device (CUB_PTX_ARCH > 0) the configuration
// mirrors the policy of the current PTX pass; when compiled for the host it
// is chosen at run time from the device's actual PTX version so that the
// host-side launch parameters match the device-side kernel instantiation.
//
// ptx_version       - [in] PTX version of the target device (host path only)
// device_rle_config - [out] kernel launch configuration to initialize
271 template <typename KernelConfig>
272 CUB_RUNTIME_FUNCTION __forceinline__
273 static void InitConfigs(
274 int ptx_version,
275 KernelConfig& device_rle_config)
276 {
277 #if (CUB_PTX_ARCH > 0)
278
279 // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy
280 device_rle_config.template Init<PtxRleSweepPolicy>();
281
282 #else
283
284 // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version
285 if (ptx_version >= 350)
286 {
287 device_rle_config.template Init<typename Policy350::RleSweepPolicy>();
288 }
289 else if (ptx_version >= 300)
290 {
291 device_rle_config.template Init<typename Policy300::RleSweepPolicy>();
292 }
293 else if (ptx_version >= 200)
294 {
295 device_rle_config.template Init<typename Policy200::RleSweepPolicy>();
296 }
297 else if (ptx_version >= 130)
298 {
299 device_rle_config.template Init<typename Policy130::RleSweepPolicy>();
300 }
301 else
302 {
303 device_rle_config.template Init<typename Policy100::RleSweepPolicy>();
304 }
305
306 #endif
307 }
308
309
// Run-time kernel dispatch configuration: mirrors the compile-time
// AgentRlePolicy parameters so the host can compute tile sizes and launch
// bounds without instantiating the policy type.
// NOTE(review): the struct declaration line itself (original listing lines
// 310-313, presumably "struct KernelConfig") was elided by the Doxygen
// rendering.
314 {
315 int block_threads;
316 int items_per_thread;
317 BlockLoadAlgorithm load_policy;
318 bool store_warp_time_slicing;
319 BlockScanAlgorithm scan_algorithm;
320
// Copy the given policy's compile-time parameters into run-time fields.
321 template <typename AgentRlePolicyT>
322 CUB_RUNTIME_FUNCTION __forceinline__
323 void Init()
324 {
325 block_threads = AgentRlePolicyT::BLOCK_THREADS;
326 items_per_thread = AgentRlePolicyT::ITEMS_PER_THREAD;
327 load_policy = AgentRlePolicyT::LOAD_ALGORITHM;
328 store_warp_time_slicing = AgentRlePolicyT::STORE_WARP_TIME_SLICING;
329 scan_algorithm = AgentRlePolicyT::SCAN_ALGORITHM;
330 }
331
// Debug helper: print the configuration (enum fields print as their
// integer values).
332 CUB_RUNTIME_FUNCTION __forceinline__
333 void Print()
334 {
335 printf("%d, %d, %d, %d, %d",
336 block_threads,
337 items_per_thread,
338 load_policy,
339 store_warp_time_slicing,
340 scan_algorithm);
341 }
342 };
343
344
345 /******************************************************************************
346 * Dispatch entrypoints
347 ******************************************************************************/
348
// Internal dispatch routine: computes temp-storage requirements, initializes
// the tile-status descriptors via device_scan_init_kernel, then launches
// device_rle_sweep_kernel over the input. Follows the standard two-phase CUB
// protocol: when d_temp_storage is NULL only temp_storage_bytes is computed
// and the function returns without launching.
// NOTE(review): this is a Doxygen-rendered listing — the num_items parameter
// (original line 365), the ScanTileStateT construction (line 410), and parts
// of both kernel-launch argument lists (lines 419-421 and 457-461) were
// elided by the generator, so this block is not compilable exactly as shown.
353 template <
354 typename DeviceScanInitKernelPtr,
355 typename DeviceRleSweepKernelPtr>
356 CUB_RUNTIME_FUNCTION __forceinline__
357 static cudaError_t Dispatch(
358 void* d_temp_storage,
359 size_t& temp_storage_bytes,
360 InputIteratorT d_in,
361 OffsetsOutputIteratorT d_offsets_out,
362 LengthsOutputIteratorT d_lengths_out,
363 NumRunsOutputIteratorT d_num_runs_out,
364 EqualityOpT equality_op,
366 cudaStream_t stream,
367 bool debug_synchronous,
368 int ptx_version,
369 DeviceScanInitKernelPtr device_scan_init_kernel,
370 DeviceRleSweepKernelPtr device_rle_sweep_kernel,
371 KernelConfig device_rle_config)
372 {
373
374#ifndef CUB_RUNTIME_ENABLED
375
376 // Kernel launch not supported from this device
377 return CubDebug(cudaErrorNotSupported);
378
379#else
380
381 cudaError error = cudaSuccess;
// do/while(0) + break gives single-exit error handling for the CubDebug
// checks below.
382 do
383 {
384 // Get device ordinal
385 int device_ordinal;
386 if (CubDebug(error = cudaGetDevice(&device_ordinal))) break;
387
388 // Get SM count
389 int sm_count;
390 if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break;
391
392 // Number of input tiles
393 int tile_size = device_rle_config.block_threads * device_rle_config.items_per_thread;
394 int num_tiles = (num_items + tile_size - 1) / tile_size;
395
396 // Specify temporary storage allocation requirements
397 size_t allocation_sizes[1];
398 if (CubDebug(error = ScanTileStateT::AllocationSize(num_tiles, allocation_sizes[0]))) break; // bytes needed for tile status descriptors
399
400 // Compute allocation pointers into the single storage blob (or compute the necessary size of the blob)
401 void* allocations[1];
402 if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break;
403 if (d_temp_storage == NULL)
404 {
405 // Return if the caller is simply requesting the size of the storage allocation
406 break;
407 }
408
409 // Construct the tile status interface
// NOTE(review): the declaration of tile_status (original line 410) was
// elided by the rendering.
411 if (CubDebug(error = tile_status.Init(num_tiles, allocations[0], allocation_sizes[0]))) break;
412
413 // Log device_scan_init_kernel configuration
414 int init_grid_size = CUB_MAX(1, (num_tiles + INIT_KERNEL_THREADS - 1) / INIT_KERNEL_THREADS);
415 if (debug_synchronous) _CubLog("Invoking device_scan_init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream);
416
417 // Invoke device_scan_init_kernel to initialize tile descriptors and queue descriptors
418 device_scan_init_kernel<<<init_grid_size, INIT_KERNEL_THREADS, 0, stream>>>(
420 num_tiles,
422
423 // Check for failure to launch
424 if (CubDebug(error = cudaPeekAtLastError())) break;
425
426 // Sync the stream if specified to flush runtime errors
427 if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
428
429 // Return if empty problem
430 if (num_items == 0)
431 break;
432
433 // Get SM occupancy for device_rle_sweep_kernel
434 int device_rle_kernel_sm_occupancy;
435 if (CubDebug(error = MaxSmOccupancy(
436 device_rle_kernel_sm_occupancy, // out
437 device_rle_sweep_kernel,
438 device_rle_config.block_threads))) break;
439
440 // Get max x-dimension of grid
441 int max_dim_x;
// NOTE(review): stray duplicate ';' at the end of the next line — harmless,
// but should be removed in the original source.
442 if (CubDebug(error = cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal))) break;;
443
444 // Get grid size for scanning tiles
// Tiles beyond the grid's max x-dimension wrap into additional y rows.
445 dim3 scan_grid_size;
446 scan_grid_size.z = 1;
447 scan_grid_size.y = ((unsigned int) num_tiles + max_dim_x - 1) / max_dim_x;
448 scan_grid_size.x = CUB_MIN(num_tiles, max_dim_x);
449
450 // Log device_rle_sweep_kernel configuration
451 if (debug_synchronous) _CubLog("Invoking device_rle_sweep_kernel<<<{%d,%d,%d}, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
452 scan_grid_size.x, scan_grid_size.y, scan_grid_size.z, device_rle_config.block_threads, (long long) stream, device_rle_config.items_per_thread, device_rle_kernel_sm_occupancy);
453
454 // Invoke device_rle_sweep_kernel
455 device_rle_sweep_kernel<<<scan_grid_size, device_rle_config.block_threads, 0, stream>>>(
456 d_in,
462 num_items,
463 num_tiles);
464
465 // Check for failure to launch
466 if (CubDebug(error = cudaPeekAtLastError())) break;
467
468 // Sync the stream if specified to flush runtime errors
469 if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
470
471 }
472 while (0);
473
474 return error;
475
476#endif // CUB_RUNTIME_ENABLED
477 }
478
479
// Convenience dispatch entry point: determines the device's PTX version,
// initializes the matching kernel configuration, and forwards to the
// workhorse Dispatch() with concrete init/sweep kernel instantiations.
// NOTE(review): this is a Doxygen-rendered listing — the num_items parameter
// (original line 492) and several forwarded arguments (lines 516-519) were
// elided by the generator, so this block is not compilable exactly as shown.
483 CUB_RUNTIME_FUNCTION __forceinline__
484 static cudaError_t Dispatch(
485 void* d_temp_storage,
486 size_t& temp_storage_bytes,
487 InputIteratorT d_in,
488 OffsetsOutputIteratorT d_offsets_out,
489 LengthsOutputIteratorT d_lengths_out,
490 NumRunsOutputIteratorT d_num_runs_out,
491 EqualityOpT equality_op,
493 cudaStream_t stream,
494 bool debug_synchronous)
495 {
496 cudaError error = cudaSuccess;
497 do
498 {
499 // Get PTX version
// On the host pass (CUB_PTX_ARCH == 0) query the device at run time;
// on the device pass the compile-time target is authoritative.
500 int ptx_version;
501 #if (CUB_PTX_ARCH == 0)
502 if (CubDebug(error = PtxVersion(ptx_version))) break;
503 #else
504 ptx_version = CUB_PTX_ARCH;
505 #endif
506
507 // Get kernel dispatch configurations
508 KernelConfig device_rle_config;
509 InitConfigs(ptx_version, device_rle_config);
510
511 // Dispatch
512 if (CubDebug(error = Dispatch(
513 d_temp_storage,
514 temp_storage_bytes,
515 d_in,
520 num_items,
521 stream,
522 debug_synchronous,
523 ptx_version,
524 DeviceCompactInitKernel<ScanTileStateT, NumRunsOutputIteratorT>,
525 DeviceRleSweepKernel<PtxRleSweepPolicy, InputIteratorT, OffsetsOutputIteratorT, LengthsOutputIteratorT, NumRunsOutputIteratorT, ScanTileStateT, EqualityOpT, OffsetT>,
526 device_rle_config))) break;
527 }
528 while (0);
529
530 return error;
531 }
532};
533
534
535} // CUB namespace
536CUB_NS_POSTFIX // Optional outer namespace(s)
537
538
BlockRadixRank provides operations for ranking unsigned integer types within a CUDA thread block.
BlockLoadAlgorithm
cub::BlockLoadAlgorithm enumerates alternative algorithms for cub::BlockLoad to read a linear segment...
@ BLOCK_LOAD_DIRECT
@ BLOCK_LOAD_WARP_TRANSPOSE
@ LOAD_LDG
Cache as texture.
@ LOAD_DEFAULT
Default (no modifier)
#define _CubLog(format,...)
Log macro for printf statements.
CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t PtxVersion(int &ptx_version)
Retrieves the PTX version that will be used on the current device (major * 100 + minor * 10)
__host__ __device__ __forceinline__ cudaError_t AliasTemporaries(void *d_temp_storage, size_t &temp_storage_bytes, void *(&allocations)[ALLOCATIONS], size_t(&allocation_sizes)[ALLOCATIONS])
CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t MaxSmOccupancy(int &max_sm_occupancy, KernelPtr kernel_ptr, int block_threads, int dynamic_smem_bytes=0)
Computes maximum SM occupancy in thread blocks for executing the given kernel function pointer kernel...
#define CubDebug(e)
Debug macro.
CUB_RUNTIME_FUNCTION static __forceinline__ cudaError_t SyncStream(cudaStream_t stream)
#define CUB_MAX(a, b)
Select maximum(a, b)
#define CUB_MIN(a, b)
Select minimum(a, b)
Optional outer namespace(s)
KeyT const ValueT ValueT OffsetIteratorT OffsetIteratorT int
[in] The number of segments that comprise the sorting data
OffsetsOutputIteratorT d_offsets_out
< [in] Pointer to input sequence of data items
UniqueOutputIteratorT ValuesInputIteratorT AggregatesOutputIteratorT NumRunsOutputIteratorT d_num_runs_out
Pointer to total number of runs encountered (i.e., the length of d_unique_out)
KeyT const ValueT ValueT OffsetT OffsetT num_items
[in] Total number of input data items
OffsetsOutputIteratorT LengthsOutputIteratorT d_lengths_out
[out] Pointer to output sequence of run-lengths
__launch_bounds__(int(AgentHistogramPolicyT::BLOCK_THREADS)) __global__ void DeviceHistogramSweepKernel(SampleIteratorT d_samples
< Signed integer type for global offsets
OffsetsOutputIteratorT LengthsOutputIteratorT NumRunsOutputIteratorT ScanTileStateT EqualityOpT OffsetT int num_tiles
< [in] Total number of tiles for the entire problem
BlockScanAlgorithm
BlockScanAlgorithm enumerates alternative algorithms for cub::BlockScan to compute a parallel prefix ...
@ BLOCK_SCAN_WARP_SCANS
@ BLOCK_SCAN_RAKING_MEMOIZE
OffsetT OffsetT
[in] Total number of input data items
OffsetsOutputIteratorT LengthsOutputIteratorT NumRunsOutputIteratorT ScanTileStateT tile_status
[in] Tile status interface
UniqueOutputIteratorT ValuesInputIteratorT AggregatesOutputIteratorT NumRunsOutputIteratorT ScanTileStateT int EqualityOpT equality_op
KeyT equality operator.
< The BlockScan algorithm to use
Definition agent_rle.cuh:71
AgentRle implements a stateful abstraction of CUDA thread blocks for participating in device-wide run...
< Signed integer type for global offsets
CUB_RUNTIME_FUNCTION static __forceinline__ cudaError_t Dispatch(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OffsetsOutputIteratorT d_offsets_out, LengthsOutputIteratorT d_lengths_out, NumRunsOutputIteratorT d_num_runs_out, EqualityOpT equality_op, OffsetT num_items, cudaStream_t stream, bool debug_synchronous, int ptx_version, DeviceScanInitKernelPtr device_scan_init_kernel, DeviceRleSweepKernelPtr device_rle_sweep_kernel, KernelConfig device_rle_config)
< Function type of cub::DeviceRleSweepKernelPtr
CUB_RUNTIME_FUNCTION static __forceinline__ cudaError_t Dispatch(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OffsetsOutputIteratorT d_offsets_out, LengthsOutputIteratorT d_lengths_out, NumRunsOutputIteratorT d_num_runs_out, EqualityOpT equality_op, OffsetT num_items, cudaStream_t stream, bool debug_synchronous)
CUB_RUNTIME_FUNCTION static __forceinline__ void InitConfigs(int ptx_version, KernelConfig &device_rle_config)
Type equality test.
Definition util_type.cuh:99
Type selection (IF ? ThenType : ElseType)
Definition util_type.cuh:73
#define CUB_PTX_ARCH
CUB_PTX_ARCH reflects the PTX version targeted by the active compiler pass (or zero during the host p...
Definition util_arch.cuh:53