OpenFPM_pdata  4.1.0
Project that contains the implementation of distributed structures
 
dispatch_reduce_by_key.cuh
/******************************************************************************
 * Copyright (c) 2011, Duane Merrill.  All rights reserved.
 * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

/**
 * \file
 * cub::DeviceReduceByKey provides device-wide, parallel operations for
 * reducing segments of values residing within device-accessible memory.
 */

#pragma once

#include <stdio.h>
#include <iterator>

#include "dispatch_scan.cuh"
#include "../../agent/agent_reduce_by_key.cuh"
#include "../../thread/thread_operators.cuh"
#include "../../grid/grid_queue.cuh"
#include "../../util_device.cuh"
#include "../../util_namespace.cuh"

/// Optional outer namespace(s)
CUB_NS_PREFIX

/// CUB namespace
namespace cub {

/******************************************************************************
 * Kernel entry points
 *****************************************************************************/

/**
 * Multi-block reduce-by-key sweep kernel entry point
 */
template <
    typename            AgentReduceByKeyPolicyT,        ///< Parameterized AgentReduceByKeyPolicyT tuning policy type
    typename            KeysInputIteratorT,             ///< Random-access input iterator type for keys
    typename            UniqueOutputIteratorT,          ///< Random-access output iterator type for keys
    typename            ValuesInputIteratorT,           ///< Random-access input iterator type for values
    typename            AggregatesOutputIteratorT,      ///< Random-access output iterator type for values
    typename            NumRunsOutputIteratorT,         ///< Output iterator type for recording number of segments encountered
    typename            ScanTileStateT,                 ///< Tile status interface type
    typename            EqualityOpT,                    ///< KeyT equality operator type
    typename            ReductionOpT,                   ///< ValueT reduction operator type
    typename            OffsetT>                        ///< Signed integer type for global offsets
__launch_bounds__ (int(AgentReduceByKeyPolicyT::BLOCK_THREADS))
__global__ void DeviceReduceByKeyKernel(
    KeysInputIteratorT          d_keys_in,              ///< Pointer to the input sequence of keys
    UniqueOutputIteratorT       d_unique_out,           ///< Pointer to the output sequence of unique keys (one key per run)
    ValuesInputIteratorT        d_values_in,            ///< Pointer to the input sequence of corresponding values
    AggregatesOutputIteratorT   d_aggregates_out,       ///< Pointer to the output sequence of value aggregates (one aggregate per run)
    NumRunsOutputIteratorT      d_num_runs_out,         ///< Pointer to total number of runs encountered (i.e., the length of d_unique_out)
    ScanTileStateT              tile_state,             ///< Tile status interface
    int                         start_tile,             ///< The starting tile for the current grid
    EqualityOpT                 equality_op,            ///< KeyT equality operator
    ReductionOpT                reduction_op,           ///< ValueT reduction operator
    OffsetT                     num_items)              ///< Total number of items to select from
{
    // Thread block type for reducing tiles of value segments
    typedef AgentReduceByKey<
            AgentReduceByKeyPolicyT,
            KeysInputIteratorT,
            UniqueOutputIteratorT,
            ValuesInputIteratorT,
            AggregatesOutputIteratorT,
            NumRunsOutputIteratorT,
            EqualityOpT,
            ReductionOpT,
            OffsetT>
        AgentReduceByKeyT;

    // Shared memory for AgentReduceByKey
    __shared__ typename AgentReduceByKeyT::TempStorage temp_storage;

    // Process tiles
    AgentReduceByKeyT(temp_storage, d_keys_in, d_unique_out, d_values_in, d_aggregates_out, d_num_runs_out, equality_op, reduction_op).ConsumeRange(
        num_items,
        tile_state,
        start_tile);
}
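// Illustrative note (not part of the original source): one thread block is
// assigned per input tile, and tile_state is the decoupled look-back
// descriptor array through which blocks publish partial aggregates so that
// key runs spanning tile boundaries are stitched together in a single pass.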


/******************************************************************************
 * Dispatch
 ******************************************************************************/

/**
 * Utility class for dispatching the appropriately-tuned kernels for DeviceReduceByKey
 */
template <
    typename KeysInputIteratorT,        ///< Random-access input iterator type for keys
    typename UniqueOutputIteratorT,     ///< Random-access output iterator type for keys
    typename ValuesInputIteratorT,      ///< Random-access input iterator type for values
    typename AggregatesOutputIteratorT, ///< Random-access output iterator type for values
    typename NumRunsOutputIteratorT,    ///< Output iterator type for recording number of segments encountered
    typename EqualityOpT,               ///< KeyT equality operator type
    typename ReductionOpT,              ///< ValueT reduction operator type
    typename OffsetT>                   ///< Signed integer type for global offsets
struct DispatchReduceByKey
{
    //-------------------------------------------------------------------------
    // Types and constants
    //-------------------------------------------------------------------------

    // The input keys type
    typedef typename std::iterator_traits<KeysInputIteratorT>::value_type KeyInputT;

    // The output keys type
    typedef typename If<(Equals<typename std::iterator_traits<UniqueOutputIteratorT>::value_type, void>::VALUE),    // KeyOutputT = (if output iterator's value type is void) ?
        typename std::iterator_traits<KeysInputIteratorT>::value_type,                                              // ... then the input iterator's value type,
        typename std::iterator_traits<UniqueOutputIteratorT>::value_type>::Type KeyOutputT;                         // ... else the output iterator's value type

    // The input values type
    typedef typename std::iterator_traits<ValuesInputIteratorT>::value_type ValueInputT;

    // The output values type
    typedef typename If<(Equals<typename std::iterator_traits<AggregatesOutputIteratorT>::value_type, void>::VALUE),    // ValueOutputT = (if output iterator's value type is void) ?
        typename std::iterator_traits<ValuesInputIteratorT>::value_type,                                                // ... then the input iterator's value type,
        typename std::iterator_traits<AggregatesOutputIteratorT>::value_type>::Type ValueOutputT;                       // ... else the output iterator's value type

    enum
    {
        INIT_KERNEL_THREADS     = 128,
        MAX_INPUT_BYTES         = CUB_MAX(sizeof(KeyOutputT), sizeof(ValueOutputT)),
        COMBINED_INPUT_BYTES    = sizeof(KeyOutputT) + sizeof(ValueOutputT),
    };
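
    // Illustrative note (not part of the original source): for int keys and
    // double values, KeyOutputT is int (4B) and ValueOutputT is double (8B),
    // so MAX_INPUT_BYTES = CUB_MAX(4, 8) = 8 and COMBINED_INPUT_BYTES = 12.
    // These sizes drive the per-thread granularity chosen by the tuning
    // policies below.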

    // Tile status descriptor interface type
    typedef ReduceByKeyScanTileState<ValueOutputT, OffsetT> ScanTileStateT;

    //-------------------------------------------------------------------------
    // Tuning policies
    //-------------------------------------------------------------------------

    /// SM35
    struct Policy350
    {
        enum {
            NOMINAL_4B_ITEMS_PER_THREAD = 6,
            ITEMS_PER_THREAD            = (MAX_INPUT_BYTES <= 8) ? 6 : CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, ((NOMINAL_4B_ITEMS_PER_THREAD * 8) + COMBINED_INPUT_BYTES - 1) / COMBINED_INPUT_BYTES)),
        };

        typedef AgentReduceByKeyPolicy<
                128,
                ITEMS_PER_THREAD,
                BLOCK_LOAD_DIRECT,
                LOAD_LDG,
                BLOCK_SCAN_WARP_SCANS>
            ReduceByKeyPolicyT;
    };
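
    // Worked example (illustrative, not part of the original source): for the
    // int/double pairing above, MAX_INPUT_BYTES = 8 <= 8 keeps
    // ITEMS_PER_THREAD at 6. For a hypothetical 4-byte key with a 16-byte
    // value, MAX_INPUT_BYTES = 16 and COMBINED_INPUT_BYTES = 20, so
    // ITEMS_PER_THREAD = CUB_MIN(6, CUB_MAX(1, (6*8 + 20 - 1) / 20)) = 3,
    // scaling the tile down to keep its byte footprint near that of six
    // 4-byte items per thread.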

    /// SM30
    struct Policy300
    {
        enum {
            NOMINAL_4B_ITEMS_PER_THREAD = 6,
            ITEMS_PER_THREAD            = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, ((NOMINAL_4B_ITEMS_PER_THREAD * 8) + COMBINED_INPUT_BYTES - 1) / COMBINED_INPUT_BYTES)),
        };

        typedef AgentReduceByKeyPolicy<
                128,
                ITEMS_PER_THREAD,
                BLOCK_LOAD_WARP_TRANSPOSE,
                LOAD_DEFAULT,
                BLOCK_SCAN_WARP_SCANS>
            ReduceByKeyPolicyT;
    };

    /// SM20
    struct Policy200
    {
        enum {
            NOMINAL_4B_ITEMS_PER_THREAD = 11,
            ITEMS_PER_THREAD            = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, ((NOMINAL_4B_ITEMS_PER_THREAD * 8) + COMBINED_INPUT_BYTES - 1) / COMBINED_INPUT_BYTES)),
        };

        typedef AgentReduceByKeyPolicy<
                128,
                ITEMS_PER_THREAD,
                BLOCK_LOAD_WARP_TRANSPOSE,
                LOAD_DEFAULT,
                BLOCK_SCAN_WARP_SCANS>
            ReduceByKeyPolicyT;
    };

    /// SM13
    struct Policy130
    {
        enum {
            NOMINAL_4B_ITEMS_PER_THREAD = 7,
            ITEMS_PER_THREAD            = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, ((NOMINAL_4B_ITEMS_PER_THREAD * 8) + COMBINED_INPUT_BYTES - 1) / COMBINED_INPUT_BYTES)),
        };

        typedef AgentReduceByKeyPolicy<
                128,
                ITEMS_PER_THREAD,
                BLOCK_LOAD_WARP_TRANSPOSE,
                LOAD_DEFAULT,
                BLOCK_SCAN_WARP_SCANS>
            ReduceByKeyPolicyT;
    };

    /// SM11
    struct Policy110
    {
        enum {
            NOMINAL_4B_ITEMS_PER_THREAD = 5,
            ITEMS_PER_THREAD            = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 8) / COMBINED_INPUT_BYTES)),
        };

        typedef AgentReduceByKeyPolicy<
                64,
                ITEMS_PER_THREAD,
                BLOCK_LOAD_WARP_TRANSPOSE,
                LOAD_DEFAULT,
                BLOCK_SCAN_RAKING>
            ReduceByKeyPolicyT;
    };


    /******************************************************************************
     * Tuning policies of current PTX compiler pass
     ******************************************************************************/

#if (CUB_PTX_ARCH >= 350)
    typedef Policy350 PtxPolicy;

#elif (CUB_PTX_ARCH >= 300)
    typedef Policy300 PtxPolicy;

#elif (CUB_PTX_ARCH >= 200)
    typedef Policy200 PtxPolicy;

#elif (CUB_PTX_ARCH >= 130)
    typedef Policy130 PtxPolicy;

#else
    typedef Policy110 PtxPolicy;

#endif
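
    // Illustrative note (not part of the original source): CUB_PTX_ARCH is 0
    // during the host compilation pass, so this chain resolves to Policy110 on
    // the host; that is harmless because host-side launches size their grids
    // via InitConfigs() below, which consults the device's actual PTX version
    // at run time. When compiling device code for, e.g., sm_60,
    // CUB_PTX_ARCH = 600 selects Policy350.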

    // "Opaque" policies (whose parameterizations aren't reflected in the type signature)
    struct PtxReduceByKeyPolicy : PtxPolicy::ReduceByKeyPolicyT {};

    /******************************************************************************
     * Utilities
     ******************************************************************************/

    /**
     * Initialize kernel dispatch configurations with the policies corresponding to the PTX assembly we will use
     */
    template <typename KernelConfig>
    CUB_RUNTIME_FUNCTION __forceinline__
    static void InitConfigs(
        int             ptx_version,
        KernelConfig    &reduce_by_key_config)
    {
    #if (CUB_PTX_ARCH > 0)
        (void)ptx_version;

        // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy
        reduce_by_key_config.template Init<PtxReduceByKeyPolicy>();

    #else

        // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version
        if (ptx_version >= 350)
        {
            reduce_by_key_config.template Init<typename Policy350::ReduceByKeyPolicyT>();
        }
        else if (ptx_version >= 300)
        {
            reduce_by_key_config.template Init<typename Policy300::ReduceByKeyPolicyT>();
        }
        else if (ptx_version >= 200)
        {
            reduce_by_key_config.template Init<typename Policy200::ReduceByKeyPolicyT>();
        }
        else if (ptx_version >= 130)
        {
            reduce_by_key_config.template Init<typename Policy130::ReduceByKeyPolicyT>();
        }
        else
        {
            reduce_by_key_config.template Init<typename Policy110::ReduceByKeyPolicyT>();
        }

    #endif
    }

    /**
     * Kernel dispatch configuration
     */
    struct KernelConfig
    {
        int block_threads;
        int items_per_thread;
        int tile_items;

        template <typename PolicyT>
        CUB_RUNTIME_FUNCTION __forceinline__
        void Init()
        {
            block_threads       = PolicyT::BLOCK_THREADS;
            items_per_thread    = PolicyT::ITEMS_PER_THREAD;
            tile_items          = block_threads * items_per_thread;
        }
    };
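
    // Minimal usage sketch (illustrative, not part of the original source):
    //
    //     KernelConfig cfg;
    //     cfg.Init<typename Policy350::ReduceByKeyPolicyT>();
    //     // cfg.tile_items == cfg.block_threads * cfg.items_per_thread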


    //---------------------------------------------------------------------
    // Dispatch entrypoints
    //---------------------------------------------------------------------
    /**
     * Internal dispatch routine for computing a device-wide reduce-by-key using the
     * specified kernel functions.
     */
    template <
        typename                    ScanInitKernelT,        ///< Function type of cub::DeviceScanInitKernel
        typename                    ReduceByKeyKernelT>     ///< Function type of cub::DeviceReduceByKeyKernel
    CUB_RUNTIME_FUNCTION __forceinline__
    static cudaError_t Dispatch(
        void*                       d_temp_storage,         ///< [in] Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to temp_storage_bytes and no work is done.
        size_t&                     temp_storage_bytes,     ///< [in,out] Reference to size in bytes of d_temp_storage allocation
        KeysInputIteratorT          d_keys_in,              ///< [in] Pointer to the input sequence of keys
        UniqueOutputIteratorT       d_unique_out,           ///< [out] Pointer to the output sequence of unique keys (one key per run)
        ValuesInputIteratorT        d_values_in,            ///< [in] Pointer to the input sequence of corresponding values
        AggregatesOutputIteratorT   d_aggregates_out,       ///< [out] Pointer to the output sequence of value aggregates (one aggregate per run)
        NumRunsOutputIteratorT      d_num_runs_out,         ///< [out] Pointer to total number of runs encountered (i.e., the length of d_unique_out)
        EqualityOpT                 equality_op,            ///< [in] KeyT equality operator
        ReductionOpT                reduction_op,           ///< [in] ValueT reduction operator
        OffsetT                     num_items,              ///< [in] Total number of items to select from
        cudaStream_t                stream,                 ///< [in] CUDA stream to launch kernels within
        bool                        debug_synchronous,      ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors
        int                         /*ptx_version*/,        ///< [in] PTX version of dispatch kernels
        ScanInitKernelT             init_kernel,            ///< [in] Kernel function pointer to parameterization of cub::DeviceScanInitKernel
        ReduceByKeyKernelT          reduce_by_key_kernel,   ///< [in] Kernel function pointer to parameterization of cub::DeviceReduceByKeyKernel
        KernelConfig                reduce_by_key_config)   ///< [in] Dispatch parameters that match the policy that reduce_by_key_kernel was compiled for
    {

#ifndef CUB_RUNTIME_ENABLED
        (void)d_temp_storage;
        (void)temp_storage_bytes;
        (void)d_keys_in;
        (void)d_unique_out;
        (void)d_values_in;
        (void)d_aggregates_out;
        (void)d_num_runs_out;
        (void)equality_op;
        (void)reduction_op;
        (void)num_items;
        (void)stream;
        (void)debug_synchronous;
        (void)init_kernel;
        (void)reduce_by_key_kernel;
        (void)reduce_by_key_config;

        // Kernel launch not supported from this device
        return CubDebug(cudaErrorNotSupported);

#else

        cudaError error = cudaSuccess;
        do
        {
            // Get device ordinal
            int device_ordinal;
            if (CubDebug(error = cudaGetDevice(&device_ordinal))) break;

            // Get SM count
            int sm_count;
            if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break;

            // Number of input tiles
            int tile_size = reduce_by_key_config.block_threads * reduce_by_key_config.items_per_thread;
            int num_tiles = (num_items + tile_size - 1) / tile_size;

            // Specify temporary storage allocation requirements
            size_t allocation_sizes[1];
            if (CubDebug(error = ScanTileStateT::AllocationSize(num_tiles, allocation_sizes[0]))) break;    // bytes needed for tile status descriptors

            // Compute allocation pointers into the single storage blob (or compute the necessary size of the blob)
            void* allocations[1];
            if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break;
            if (d_temp_storage == NULL)
            {
                // Return if the caller is simply requesting the size of the storage allocation
                break;
            }
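            // (Illustrative note, not part of the original source: this early
            // exit implements the standard two-phase CUB protocol; a first
            // call with d_temp_storage == NULL only records the required size
            // in temp_storage_bytes, after which the caller allocates the
            // blob and invokes Dispatch again with identical arguments.)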

            // Construct the tile status interface
            ScanTileStateT tile_state;
            if (CubDebug(error = tile_state.Init(num_tiles, allocations[0], allocation_sizes[0]))) break;

            // Log init_kernel configuration
            int init_grid_size = CUB_MAX(1, (num_tiles + INIT_KERNEL_THREADS - 1) / INIT_KERNEL_THREADS);
            if (debug_synchronous) _CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream);

            // Invoke init_kernel to initialize tile descriptors
            init_kernel<<<init_grid_size, INIT_KERNEL_THREADS, 0, stream>>>(
                tile_state,
                num_tiles,
                d_num_runs_out);

            // Check for failure to launch
            if (CubDebug(error = cudaPeekAtLastError())) break;

            // Sync the stream if specified to flush runtime errors
            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;

            // Return if empty problem
            if (num_items == 0)
                break;

            // Get SM occupancy for reduce_by_key_kernel
            int reduce_by_key_sm_occupancy;
            if (CubDebug(error = MaxSmOccupancy(
                reduce_by_key_sm_occupancy,            // out
                reduce_by_key_kernel,
                reduce_by_key_config.block_threads))) break;

            // Get max x-dimension of grid
            int max_dim_x;
            if (CubDebug(error = cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal))) break;

            // Run grids in epochs (in case number of tiles exceeds max x-dimension)
            int scan_grid_size = CUB_MIN(num_tiles, max_dim_x);
            for (int start_tile = 0; start_tile < num_tiles; start_tile += scan_grid_size)
            {
                // Log reduce_by_key_kernel configuration
                if (debug_synchronous) _CubLog("Invoking %d reduce_by_key_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
                    start_tile, scan_grid_size, reduce_by_key_config.block_threads, (long long) stream, reduce_by_key_config.items_per_thread, reduce_by_key_sm_occupancy);

                // Invoke reduce_by_key_kernel
                reduce_by_key_kernel<<<scan_grid_size, reduce_by_key_config.block_threads, 0, stream>>>(
                    d_keys_in,
                    d_unique_out,
                    d_values_in,
                    d_aggregates_out,
                    d_num_runs_out,
                    tile_state,
                    start_tile,
                    equality_op,
                    reduction_op,
                    num_items);

                // Check for failure to launch
                if (CubDebug(error = cudaPeekAtLastError())) break;

                // Sync the stream if specified to flush runtime errors
                if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
            }
        }
        while (0);

        return error;

#endif  // CUB_RUNTIME_ENABLED
    }


    /**
     * Internal dispatch routine
     */
    CUB_RUNTIME_FUNCTION __forceinline__
    static cudaError_t Dispatch(
        void*                       d_temp_storage,         ///< [in] Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to temp_storage_bytes and no work is done.
        size_t&                     temp_storage_bytes,     ///< [in,out] Reference to size in bytes of d_temp_storage allocation
        KeysInputIteratorT          d_keys_in,              ///< [in] Pointer to the input sequence of keys
        UniqueOutputIteratorT       d_unique_out,           ///< [out] Pointer to the output sequence of unique keys (one key per run)
        ValuesInputIteratorT        d_values_in,            ///< [in] Pointer to the input sequence of corresponding values
        AggregatesOutputIteratorT   d_aggregates_out,       ///< [out] Pointer to the output sequence of value aggregates (one aggregate per run)
        NumRunsOutputIteratorT      d_num_runs_out,         ///< [out] Pointer to total number of runs encountered (i.e., the length of d_unique_out)
        EqualityOpT                 equality_op,            ///< [in] KeyT equality operator
        ReductionOpT                reduction_op,           ///< [in] ValueT reduction operator
        OffsetT                     num_items,              ///< [in] Total number of items to select from
        cudaStream_t                stream,                 ///< [in] CUDA stream to launch kernels within
        bool                        debug_synchronous)      ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors
    {
        cudaError error = cudaSuccess;
        do
        {
            // Get PTX version
            int ptx_version;
    #if (CUB_PTX_ARCH == 0)
            if (CubDebug(error = PtxVersion(ptx_version))) break;
    #else
            ptx_version = CUB_PTX_ARCH;
    #endif

            // Get kernel dispatch configurations
            KernelConfig reduce_by_key_config;
            InitConfigs(ptx_version, reduce_by_key_config);

            // Dispatch
            if (CubDebug(error = Dispatch(
                d_temp_storage,
                temp_storage_bytes,
                d_keys_in,
                d_unique_out,
                d_values_in,
                d_aggregates_out,
                d_num_runs_out,
                equality_op,
                reduction_op,
                num_items,
                stream,
                debug_synchronous,
                ptx_version,
                DeviceCompactInitKernel<ScanTileStateT, NumRunsOutputIteratorT>,
                DeviceReduceByKeyKernel<PtxReduceByKeyPolicy, KeysInputIteratorT, UniqueOutputIteratorT, ValuesInputIteratorT, AggregatesOutputIteratorT, NumRunsOutputIteratorT, ScanTileStateT, EqualityOpT, ReductionOpT, OffsetT>,
                reduce_by_key_config))) break;
        }
        while (0);

        return error;
    }
};

}               // CUB namespace
CUB_NS_POSTFIX  // Optional outer namespace(s)

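A minimal usage sketch (not part of the original header), assuming caller-provided device buffers d_keys_in (int*), d_values_in (float*), d_unique_out (int*), d_aggregates_out (float*), d_num_runs_out (int*), and an item count num_items; it shows the two-phase size-query/dispatch protocol implemented above:

typedef cub::DispatchReduceByKey<
    int*, int*, float*, float*, int*,
    cub::Equality, cub::Sum, int> DispatchT;

// Phase 1: with d_temp_storage == NULL, only temp_storage_bytes is written
void*  d_temp_storage     = NULL;
size_t temp_storage_bytes = 0;
DispatchT::Dispatch(d_temp_storage, temp_storage_bytes,
    d_keys_in, d_unique_out, d_values_in, d_aggregates_out, d_num_runs_out,
    cub::Equality(), cub::Sum(), num_items, 0, false);

// Phase 2: allocate the temporary blob and run the actual reduction
cudaMalloc(&d_temp_storage, temp_storage_bytes);
DispatchT::Dispatch(d_temp_storage, temp_storage_bytes,
    d_keys_in, d_unique_out, d_values_in, d_aggregates_out, d_num_runs_out,
    cub::Equality(), cub::Sum(), num_items, 0, false);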