OpenFPM_pdata  4.1.0
Project that contain the implementation of distributed structures
 
Loading...
Searching...
No Matches
dispatch_scan.cuh
Go to the documentation of this file.
1
2/******************************************************************************
3 * Copyright (c) 2011, Duane Merrill. All rights reserved.
4 * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions are met:
8 * * Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * * Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * * Neither the name of the NVIDIA CORPORATION nor the
14 * names of its contributors may be used to endorse or promote products
15 * derived from this software without specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
19 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
20 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
21 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
22 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
23 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
24 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
26 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 *
28 ******************************************************************************/
29
35#pragma once
36
37#include <stdio.h>
38#include <iterator>
39
40#include "../../agent/agent_scan.cuh"
41#include "../../thread/thread_operators.cuh"
42#include "../../grid/grid_queue.cuh"
43#include "../../util_arch.cuh"
44#include "../../util_debug.cuh"
45#include "../../util_device.cuh"
46#include "../../util_namespace.cuh"
47
49CUB_NS_PREFIX
50
52namespace cub {
53
54
55/******************************************************************************
56 * Kernel entry points
57 *****************************************************************************/
58
62template <
63 typename ScanTileStateT>
64__global__ void DeviceScanInitKernel(
65 ScanTileStateT tile_state,
66 int num_tiles)
67{
68 // Initialize tile status
69 tile_state.InitializeStatus(num_tiles);
70}
71
75template <
76 typename ScanTileStateT,
77 typename NumSelectedIteratorT>
79 ScanTileStateT tile_state,
80 int num_tiles,
81 NumSelectedIteratorT d_num_selected_out)
82{
83 // Initialize tile status
84 tile_state.InitializeStatus(num_tiles);
85
86 // Initialize d_num_selected_out
87 if ((blockIdx.x == 0) && (threadIdx.x == 0))
89}
90
91
95template <
96 typename ScanPolicyT,
97 typename InputIteratorT,
98 typename OutputIteratorT,
99 typename ScanTileStateT,
100 typename ScanOpT,
101 typename InitValueT,
102 typename OffsetT>
103__launch_bounds__ (int(ScanPolicyT::BLOCK_THREADS))
104__global__ void DeviceScanKernel(
105 InputIteratorT d_in,
106 OutputIteratorT d_out,
107 ScanTileStateT tile_state,
108 int start_tile,
109 ScanOpT scan_op,
110 InitValueT init_value,
112{
113 // Thread block type for scanning input tiles
114 typedef AgentScan<
115 ScanPolicyT,
116 InputIteratorT,
117 OutputIteratorT,
118 ScanOpT,
119 InitValueT,
120 OffsetT> AgentScanT;
121
122 // Shared memory for AgentScan
123 __shared__ typename AgentScanT::TempStorage temp_storage;
124
125 // Process tiles
126 AgentScanT(temp_storage, d_in, d_out, scan_op, init_value).ConsumeRange(
127 num_items,
129 start_tile);
130}
131
132
133
134
135/******************************************************************************
136 * Dispatch
137 ******************************************************************************/
138
139
143template <
144 typename InputIteratorT,
145 typename OutputIteratorT,
146 typename ScanOpT,
147 typename InitValueT,
148 typename OffsetT>
150{
151 //---------------------------------------------------------------------
152 // Constants and Types
153 //---------------------------------------------------------------------
154
155 enum
156 {
157 INIT_KERNEL_THREADS = 128
158 };
159
160 // The output value type
161 typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ?
162 typename std::iterator_traits<InputIteratorT>::value_type, // ... then the input iterator's value type,
163 typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT; // ... else the output iterator's value type
164
165 // Tile status descriptor interface type
167
168
169 //---------------------------------------------------------------------
170 // Tuning policies
171 //---------------------------------------------------------------------
172
175 {
176 typedef AgentScanPolicy<
177 CUB_SCALED_GRANULARITIES(128, 15, OutputT),
183 };
184
185
188 {
189 // Titan X: 32.47B items/s @ 48M 32-bit T
190 typedef AgentScanPolicy<
191 CUB_SCALED_GRANULARITIES(128, 12, OutputT),
193 LOAD_LDG,
197 };
198
199
202 {
203 // GTX Titan: 29.5B items/s (232.4 GB/s) @ 48M 32-bit T
204 typedef AgentScanPolicy<
205 CUB_SCALED_GRANULARITIES(128, 12, OutputT),
207 LOAD_LDG,
211 };
212
215 {
216 typedef AgentScanPolicy<
217 CUB_SCALED_GRANULARITIES(256, 9, OutputT),
223 };
224
227 {
228 // GTX 580: 20.3B items/s (162.3 GB/s) @ 48M 32-bit T
229 typedef AgentScanPolicy<
230 CUB_SCALED_GRANULARITIES(128, 12, OutputT),
236 };
237
240 {
241 typedef AgentScanPolicy<
242 CUB_SCALED_GRANULARITIES(96, 21, OutputT),
248 };
249
252 {
253 typedef AgentScanPolicy<
254 CUB_SCALED_GRANULARITIES(64, 9, OutputT),
260 };
261
262
263 //---------------------------------------------------------------------
264 // Tuning policies of current PTX compiler pass
265 //---------------------------------------------------------------------
266
267#if (CUB_PTX_ARCH >= 600)
268 typedef Policy600 PtxPolicy;
269
270#elif (CUB_PTX_ARCH >= 520)
271 typedef Policy520 PtxPolicy;
272
273#elif (CUB_PTX_ARCH >= 350)
274 typedef Policy350 PtxPolicy;
275
276#elif (CUB_PTX_ARCH >= 300)
277 typedef Policy300 PtxPolicy;
278
279#elif (CUB_PTX_ARCH >= 200)
280 typedef Policy200 PtxPolicy;
281
282#elif (CUB_PTX_ARCH >= 130)
283 typedef Policy130 PtxPolicy;
284
285#else
286 typedef Policy100 PtxPolicy;
287
288#endif
289
290 // "Opaque" policies (whose parameterizations aren't reflected in the type signature)
292
293
294 //---------------------------------------------------------------------
295 // Utilities
296 //---------------------------------------------------------------------
297
301 template <typename KernelConfig>
302 CUB_RUNTIME_FUNCTION __forceinline__
303 static void InitConfigs(
304 int ptx_version,
305 KernelConfig &scan_kernel_config)
306 {
307 #if (CUB_PTX_ARCH > 0)
308 (void)ptx_version;
309
310 // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy
311 scan_kernel_config.template Init<PtxAgentScanPolicy>();
312
313 #else
314
315 // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version
316 if (ptx_version >= 600)
317 {
318 scan_kernel_config.template Init<typename Policy600::ScanPolicyT>();
319 }
320 else if (ptx_version >= 520)
321 {
322 scan_kernel_config.template Init<typename Policy520::ScanPolicyT>();
323 }
324 else if (ptx_version >= 350)
325 {
326 scan_kernel_config.template Init<typename Policy350::ScanPolicyT>();
327 }
328 else if (ptx_version >= 300)
329 {
330 scan_kernel_config.template Init<typename Policy300::ScanPolicyT>();
331 }
332 else if (ptx_version >= 200)
333 {
334 scan_kernel_config.template Init<typename Policy200::ScanPolicyT>();
335 }
336 else if (ptx_version >= 130)
337 {
338 scan_kernel_config.template Init<typename Policy130::ScanPolicyT>();
339 }
340 else
341 {
342 scan_kernel_config.template Init<typename Policy100::ScanPolicyT>();
343 }
344
345 #endif
346 }
347
348
353 {
354 int block_threads;
355 int items_per_thread;
356 int tile_items;
357
358 template <typename PolicyT>
359 CUB_RUNTIME_FUNCTION __forceinline__
360 void Init()
361 {
362 block_threads = PolicyT::BLOCK_THREADS;
363 items_per_thread = PolicyT::ITEMS_PER_THREAD;
364 tile_items = block_threads * items_per_thread;
365 }
366 };
367
368
369 //---------------------------------------------------------------------
370 // Dispatch entrypoints
371 //---------------------------------------------------------------------
372
377 template <
378 typename ScanInitKernelPtrT,
379 typename ScanSweepKernelPtrT>
380 CUB_RUNTIME_FUNCTION __forceinline__
381 static cudaError_t Dispatch(
382 void* d_temp_storage,
383 size_t& temp_storage_bytes,
384 InputIteratorT d_in,
385 OutputIteratorT d_out,
386 ScanOpT scan_op,
387 InitValueT init_value,
389 cudaStream_t stream,
390 bool debug_synchronous,
391 int /*ptx_version*/,
392 ScanInitKernelPtrT init_kernel,
393 ScanSweepKernelPtrT scan_kernel,
394 KernelConfig scan_kernel_config)
395 {
396
397#ifndef CUB_RUNTIME_ENABLED
398 (void)d_temp_storage;
399 (void)temp_storage_bytes;
400 (void)d_in;
401 (void)d_out;
402 (void)scan_op;
403 (void)init_value;
404 (void)num_items;
405 (void)stream;
406 (void)debug_synchronous;
407 (void)init_kernel;
408 (void)scan_kernel;
409 (void)scan_kernel_config;
410
411 // Kernel launch not supported from this device
412 return CubDebug(cudaErrorNotSupported);
413
414#else
415 cudaError error = cudaSuccess;
416 do
417 {
418 // Get device ordinal
419 int device_ordinal;
420 if (CubDebug(error = cudaGetDevice(&device_ordinal))) break;
421
422 // Get SM count
423 int sm_count;
424 if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break;
425
426 // Number of input tiles
427 int tile_size = scan_kernel_config.block_threads * scan_kernel_config.items_per_thread;
428 int num_tiles = (num_items + tile_size - 1) / tile_size;
429
430 // Specify temporary storage allocation requirements
431 size_t allocation_sizes[1];
432 if (CubDebug(error = ScanTileStateT::AllocationSize(num_tiles, allocation_sizes[0]))) break; // bytes needed for tile status descriptors
433
434 // Compute allocation pointers into the single storage blob (or compute the necessary size of the blob)
435 void* allocations[1];
436 if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break;
437 if (d_temp_storage == NULL)
438 {
439 // Return if the caller is simply requesting the size of the storage allocation
440 break;
441 }
442
443 // Return if empty problem
444 if (num_items == 0)
445 break;
446
447 // Construct the tile status interface
449 if (CubDebug(error = tile_state.Init(num_tiles, allocations[0], allocation_sizes[0]))) break;
450
451 // Log init_kernel configuration
452 int init_grid_size = (num_tiles + INIT_KERNEL_THREADS - 1) / INIT_KERNEL_THREADS;
453 if (debug_synchronous) _CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream);
454
455 // Invoke init_kernel to initialize tile descriptors
456 init_kernel<<<init_grid_size, INIT_KERNEL_THREADS, 0, stream>>>(
458 num_tiles);
459
460 // Check for failure to launch
461 if (CubDebug(error = cudaPeekAtLastError())) break;
462
463 // Sync the stream if specified to flush runtime errors
464 if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
465
466 // Get SM occupancy for scan_kernel
467 int scan_sm_occupancy;
468 if (CubDebug(error = MaxSmOccupancy(
469 scan_sm_occupancy, // out
470 scan_kernel,
471 scan_kernel_config.block_threads))) break;
472
473 // Get max x-dimension of grid
474 int max_dim_x;
475 if (CubDebug(error = cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal))) break;;
476
477 // Run grids in epochs (in case number of tiles exceeds max x-dimension
478 int scan_grid_size = CUB_MIN(num_tiles, max_dim_x);
479 for (int start_tile = 0; start_tile < num_tiles; start_tile += scan_grid_size)
480 {
481 // Log scan_kernel configuration
482 if (debug_synchronous) _CubLog("Invoking %d scan_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
483 start_tile, scan_grid_size, scan_kernel_config.block_threads, (long long) stream, scan_kernel_config.items_per_thread, scan_sm_occupancy);
484
485 // Invoke scan_kernel
486 scan_kernel<<<scan_grid_size, scan_kernel_config.block_threads, 0, stream>>>(
487 d_in,
488 d_out,
491 scan_op,
493 num_items);
494
495 // Check for failure to launch
496 if (CubDebug(error = cudaPeekAtLastError())) break;
497
498 // Sync the stream if specified to flush runtime errors
499 if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
500 }
501 }
502 while (0);
503
504 return error;
505
506#endif // CUB_RUNTIME_ENABLED
507 }
508
509
513 CUB_RUNTIME_FUNCTION __forceinline__
514 static cudaError_t Dispatch(
515 void* d_temp_storage,
516 size_t& temp_storage_bytes,
517 InputIteratorT d_in,
518 OutputIteratorT d_out,
519 ScanOpT scan_op,
520 InitValueT init_value,
522 cudaStream_t stream,
523 bool debug_synchronous)
524 {
525 cudaError error = cudaSuccess;
526 do
527 {
528 // Get PTX version
529 int ptx_version;
530 if (CubDebug(error = PtxVersion(ptx_version))) break;
531
532 // Get kernel kernel dispatch configurations
533 KernelConfig scan_kernel_config;
534 InitConfigs(ptx_version, scan_kernel_config);
535
536 // Dispatch
537 if (CubDebug(error = Dispatch(
538 d_temp_storage,
539 temp_storage_bytes,
540 d_in,
541 d_out,
542 scan_op,
544 num_items,
545 stream,
546 debug_synchronous,
547 ptx_version,
548 DeviceScanInitKernel<ScanTileStateT>,
549 DeviceScanKernel<PtxAgentScanPolicy, InputIteratorT, OutputIteratorT, ScanTileStateT, ScanOpT, InitValueT, OffsetT>,
550 scan_kernel_config))) break;
551 }
552 while (0);
553
554 return error;
555 }
556};
557
558
559
560} // CUB namespace
561CUB_NS_POSTFIX // Optional outer namespace(s)
562
563
@ BLOCK_STORE_WARP_TRANSPOSE
@ BLOCK_STORE_TRANSPOSE
@ BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED
@ BLOCK_LOAD_DIRECT
@ BLOCK_LOAD_WARP_TRANSPOSE
@ BLOCK_LOAD_TRANSPOSE
@ LOAD_LDG
Cache as texture.
@ LOAD_DEFAULT
Default (no modifier)
#define _CubLog(format,...)
Log macro for printf statements.
CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t PtxVersion(int &ptx_version)
Retrieves the PTX version that will be used on the current device (major * 100 + minor * 10)
__host__ __device__ __forceinline__ cudaError_t AliasTemporaries(void *d_temp_storage, size_t &temp_storage_bytes, void *(&allocations)[ALLOCATIONS], size_t(&allocation_sizes)[ALLOCATIONS])
CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t MaxSmOccupancy(int &max_sm_occupancy, KernelPtr kernel_ptr, int block_threads, int dynamic_smem_bytes=0)
Computes maximum SM occupancy in thread blocks for executing the given kernel function pointer kernel...
#define CubDebug(e)
Debug macro.
CUB_RUNTIME_FUNCTION static __forceinline__ cudaError_t SyncStream(cudaStream_t stream)
#define CUB_MIN(a, b)
Select minimum(a, b)
Optional outer namespace(s)
__global__ void DeviceScanInitKernel(ScanTileStateT tile_state, int num_tiles)
< Tile status interface type
KeyT const ValueT ValueT OffsetT OffsetT num_items
[in] Total number of input data items
OutputIteratorT ScanTileStateT int ScanOpT InitValueT init_value
Initial value to seed the exclusive scan.
FlagsInputIteratorT SelectedOutputIteratorT NumSelectedIteratorT d_num_selected_out
[out] Pointer to the total number of items selected (i.e., length of d_selected_out)
__launch_bounds__(int(AgentHistogramPolicyT::BLOCK_THREADS)) __global__ void DeviceHistogramSweepKernel(SampleIteratorT d_samples
< Signed integer type for global offsets
UniqueOutputIteratorT ValuesInputIteratorT AggregatesOutputIteratorT NumRunsOutputIteratorT ScanTileStateT int start_tile
The starting tile for the current grid.
OffsetsOutputIteratorT LengthsOutputIteratorT NumRunsOutputIteratorT ScanTileStateT EqualityOpT OffsetT int num_tiles
< [in] Total number of tiles for the entire problem
@ BLOCK_SCAN_RAKING
@ BLOCK_SCAN_WARP_SCANS
@ BLOCK_SCAN_RAKING_MEMOIZE
OffsetT OffsetT
[in] Total number of input data items
__global__ void DeviceCompactInitKernel(ScanTileStateT tile_state, int num_tiles, NumSelectedIteratorT d_num_selected_out)
< Output iterator type for recording the number of items selected
UniqueOutputIteratorT ValuesInputIteratorT AggregatesOutputIteratorT NumRunsOutputIteratorT ScanTileStateT tile_state
Tile status interface.
OutputIteratorT ScanTileStateT int ScanOpT scan_op
Binary scan functor.
OutputIteratorT d_out
< [in] Pointer to the input sequence of data items
< The BlockScan algorithm to use
AgentScan implements a stateful abstraction of CUDA thread blocks for participating in device-wide pr...
< Signed integer type for global offsets
CUB_RUNTIME_FUNCTION static __forceinline__ cudaError_t Dispatch(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, ScanOpT scan_op, InitValueT init_value, OffsetT num_items, cudaStream_t stream, bool debug_synchronous)
CUB_RUNTIME_FUNCTION static __forceinline__ cudaError_t Dispatch(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, ScanOpT scan_op, InitValueT init_value, OffsetT num_items, cudaStream_t stream, bool debug_synchronous, int, ScanInitKernelPtrT init_kernel, ScanSweepKernelPtrT scan_kernel, KernelConfig scan_kernel_config)
< Function type of cub::DeviceScanKernelPtrT
CUB_RUNTIME_FUNCTION static __forceinline__ void InitConfigs(int ptx_version, KernelConfig &scan_kernel_config)
Type equality test.
Definition util_type.cuh:99
Type selection (IF ? ThenType : ElseType)
Definition util_type.cuh:73
#define CUB_SCALED_GRANULARITIES(NOMINAL_4B_BLOCK_THREADS, NOMINAL_4B_ITEMS_PER_THREAD, T)
Define both nominal threads-per-block and items-per-thread.