OpenFPM_pdata  4.1.0
Project that contains the implementation of distributed structures
 
Loading...
Searching...
No Matches
dispatch_select_if.cuh
Go to the documentation of this file.
1
2/******************************************************************************
3 * Copyright (c) 2011, Duane Merrill. All rights reserved.
4 * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions are met:
8 * * Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * * Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * * Neither the name of the NVIDIA CORPORATION nor the
14 * names of its contributors may be used to endorse or promote products
15 * derived from this software without specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
19 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
20 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
21 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
22 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
23 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
24 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
26 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 *
28 ******************************************************************************/
29
35#pragma once
36
37#include <stdio.h>
38#include <iterator>
39
40#include "dispatch_scan.cuh"
41#include "../../agent/agent_select_if.cuh"
42#include "../../thread/thread_operators.cuh"
43#include "../../grid/grid_queue.cuh"
44#include "../../util_device.cuh"
45#include "../../util_namespace.cuh"
46
48CUB_NS_PREFIX
49
51namespace cub {
52
53/******************************************************************************
54 * Kernel entry points
55 *****************************************************************************/
56
/**
 * Select kernel entry point (multi-block).
 *
 * Performs functor-based selection if SelectOpT functor type != NullType,
 * otherwise performs flag-based selection if FlagsInputIteratorT's value type != NullType,
 * otherwise performs discontinuity selection (keep unique).
 *
 * Launched with one thread block per input tile (grid may be 2D when the
 * tile count exceeds the max x-dimension); block size is
 * AgentSelectIfPolicyT::BLOCK_THREADS.
 */
template <
    typename            AgentSelectIfPolicyT,       ///< Parameterized AgentSelectIfPolicy tuning policy type
    typename            InputIteratorT,             ///< Random-access input iterator type for reading input items
    typename            FlagsInputIteratorT,        ///< Random-access input iterator type for reading selection flags (NullType* if a selection functor or discontinuity flagging is to be used for selection)
    typename            SelectedOutputIteratorT,    ///< Random-access output iterator type for writing selected items
    typename            NumSelectedIteratorT,       ///< Output iterator type for recording the number of items selected
    typename            ScanTileStateT,             ///< Tile status interface type
    typename            SelectOpT,                  ///< Selection operator type (NullType if selection flags or discontinuity flagging is to be used for selection)
    typename            EqualityOpT,                ///< Equality operator type (NullType if selection functor or selection flags is to be used for selection)
    typename            OffsetT,                    ///< Signed integer type for global offsets
    bool                KEEP_REJECTS>               ///< Whether or not we push rejected items to the back of the output
__launch_bounds__ (int(AgentSelectIfPolicyT::BLOCK_THREADS))
__global__ void DeviceSelectSweepKernel(
    InputIteratorT          d_in,                   ///< [in] Pointer to the input sequence of data items
    FlagsInputIteratorT     d_flags,                ///< [in] Pointer to the input sequence of selection flags (if applicable)
    SelectedOutputIteratorT d_selected_out,         ///< [out] Pointer to the output sequence of selected data items
    NumSelectedIteratorT    d_num_selected_out,     ///< [out] Pointer to the total number of items selected (i.e., length of \p d_selected_out)
    ScanTileStateT          tile_status,            ///< [in] Tile status interface
    SelectOpT               select_op,              ///< [in] Selection operator
    EqualityOpT             equality_op,            ///< [in] Equality operator
    OffsetT                 num_items,              ///< [in] Total number of input data items
    int                     num_tiles)              ///< [in] Total number of tiles for the entire problem
{
    // Thread block type for selecting data from input tiles
    typedef AgentSelectIf<
        AgentSelectIfPolicyT,
        InputIteratorT,
        FlagsInputIteratorT,
        SelectedOutputIteratorT,
        SelectOpT,
        EqualityOpT,
        OffsetT,
        KEEP_REJECTS> AgentSelectIfT;

    // Shared memory for AgentSelectIf
    __shared__ typename AgentSelectIfT::TempStorage temp_storage;

    // Process tiles
    AgentSelectIfT(temp_storage, d_in, d_flags, d_selected_out, select_op, equality_op, num_items).ConsumeRange(
        num_tiles,
        tile_status,
        d_num_selected_out);
}
107
108
109
110
111/******************************************************************************
112 * Dispatch
113 ******************************************************************************/
114
/**
 * Utility class for dispatching the appropriately-tuned kernels for DeviceSelect.
 *
 * Selects the tuning policy matching the target device's PTX version, sizes the
 * temporary-storage blob (tile status descriptors), and launches the init and
 * select sweep kernels on the caller-supplied stream.
 */
template <
    typename    InputIteratorT,                 ///< Random-access input iterator type for reading input items
    typename    FlagsInputIteratorT,            ///< Random-access input iterator type for reading selection flags (NullType* if a selection functor or discontinuity flagging is to be used for selection)
    typename    SelectedOutputIteratorT,        ///< Random-access output iterator type for writing selected items
    typename    NumSelectedIteratorT,           ///< Output iterator type for recording the number of items selected
    typename    SelectOpT,                      ///< Selection operator type (NullType if selection flags or discontinuity flagging is to be used for selection)
    typename    EqualityOpT,                    ///< Equality operator type (NullType if selection functor or selection flags is to be used for selection)
    typename    OffsetT,                        ///< Signed integer type for global offsets
    bool        KEEP_REJECTS>                   ///< Whether or not we push rejected items to the back of the output
struct DispatchSelectIf
{
    /******************************************************************************
     * Types and constants
     ******************************************************************************/

    // The output value type: if the output iterator's value type is void,
    // fall back to the input iterator's value type.
    typedef typename If<(Equals<typename std::iterator_traits<SelectedOutputIteratorT>::value_type, void>::VALUE),  // OutputT =  (if output iterator's value type is void) ?
        typename std::iterator_traits<InputIteratorT>::value_type,                                                  // ... then the input iterator's value type,
        typename std::iterator_traits<SelectedOutputIteratorT>::value_type>::Type OutputT;                          // ... else the output iterator's value type

    // The flag value type
    typedef typename std::iterator_traits<FlagsInputIteratorT>::value_type FlagT;

    enum
    {
        INIT_KERNEL_THREADS = 128,      // Block size used for the tile-status initialization kernel
    };

    // Tile status descriptor interface type
    typedef ScanTileState<OffsetT> ScanTileStateT;


    /******************************************************************************
     * Tuning policies
     ******************************************************************************/

    /// SM35
    struct Policy350
    {
        enum {
            NOMINAL_4B_ITEMS_PER_THREAD = 10,
            // Scale items-per-thread down for larger value types (floor of 1)
            ITEMS_PER_THREAD            = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(OutputT)))),
        };

        typedef AgentSelectIfPolicy<
                128,
                ITEMS_PER_THREAD,
                BLOCK_LOAD_DIRECT,
                LOAD_LDG,
                BLOCK_SCAN_WARP_SCANS>
            SelectIfPolicyT;
    };

    /// SM30
    struct Policy300
    {
        enum {
            NOMINAL_4B_ITEMS_PER_THREAD = 7,
            // Scale items-per-thread down for larger value types (floor of 3)
            ITEMS_PER_THREAD            = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(3, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(OutputT)))),
        };

        typedef AgentSelectIfPolicy<
                128,
                ITEMS_PER_THREAD,
                BLOCK_LOAD_WARP_TRANSPOSE,
                LOAD_DEFAULT,
                BLOCK_SCAN_WARP_SCANS>
            SelectIfPolicyT;
    };

    /// SM20
    struct Policy200
    {
        enum {
            NOMINAL_4B_ITEMS_PER_THREAD = (KEEP_REJECTS) ? 7 : 15,
            ITEMS_PER_THREAD            = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(OutputT)))),
        };

        typedef AgentSelectIfPolicy<
                128,
                ITEMS_PER_THREAD,
                BLOCK_LOAD_WARP_TRANSPOSE,
                LOAD_DEFAULT,
                BLOCK_SCAN_WARP_SCANS>
            SelectIfPolicyT;
    };

    /// SM13
    struct Policy130
    {
        enum {
            NOMINAL_4B_ITEMS_PER_THREAD = 9,
            ITEMS_PER_THREAD            = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(OutputT)))),
        };

        typedef AgentSelectIfPolicy<
                64,
                ITEMS_PER_THREAD,
                BLOCK_LOAD_WARP_TRANSPOSE,
                LOAD_DEFAULT,
                BLOCK_SCAN_RAKING_MEMOIZE>
            SelectIfPolicyT;
    };

    /// SM10
    struct Policy100
    {
        enum {
            NOMINAL_4B_ITEMS_PER_THREAD = 9,
            ITEMS_PER_THREAD            = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(OutputT)))),
        };

        typedef AgentSelectIfPolicy<
                64,
                ITEMS_PER_THREAD,
                BLOCK_LOAD_WARP_TRANSPOSE,
                LOAD_DEFAULT,
                BLOCK_SCAN_RAKING>
            SelectIfPolicyT;
    };


    /******************************************************************************
     * Tuning policies of current PTX compiler pass
     ******************************************************************************/

#if (CUB_PTX_ARCH >= 350)
    typedef Policy350 PtxPolicy;

#elif (CUB_PTX_ARCH >= 300)
    typedef Policy300 PtxPolicy;

#elif (CUB_PTX_ARCH >= 200)
    typedef Policy200 PtxPolicy;

#elif (CUB_PTX_ARCH >= 130)
    typedef Policy130 PtxPolicy;

#else
    typedef Policy100 PtxPolicy;

#endif

    // "Opaque" policies (whose parameterizations aren't reflected in the type signature)
    struct PtxSelectIfPolicyT : PtxPolicy::SelectIfPolicyT {};


    /******************************************************************************
     * Utilities
     ******************************************************************************/

    /**
     * Initialize kernel dispatch configurations with the policies corresponding
     * to the PTX assembly we will use.
     */
    template <typename KernelConfig>
    CUB_RUNTIME_FUNCTION __forceinline__
    static void InitConfigs(
        int             ptx_version,
        KernelConfig    &select_if_config)
    {
    #if (CUB_PTX_ARCH > 0)
        (void)ptx_version;

        // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy
        select_if_config.template Init<PtxSelectIfPolicyT>();

    #else

        // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version
        if (ptx_version >= 350)
        {
            select_if_config.template Init<typename Policy350::SelectIfPolicyT>();
        }
        else if (ptx_version >= 300)
        {
            select_if_config.template Init<typename Policy300::SelectIfPolicyT>();
        }
        else if (ptx_version >= 200)
        {
            select_if_config.template Init<typename Policy200::SelectIfPolicyT>();
        }
        else if (ptx_version >= 130)
        {
            select_if_config.template Init<typename Policy130::SelectIfPolicyT>();
        }
        else
        {
            select_if_config.template Init<typename Policy100::SelectIfPolicyT>();
        }

    #endif
    }


    /**
     * Kernel launch configuration: block size, per-thread grain, and the
     * resulting tile size, extracted from a tuning policy at runtime.
     */
    struct KernelConfig
    {
        int block_threads;
        int items_per_thread;
        int tile_items;

        template <typename PolicyT>
        CUB_RUNTIME_FUNCTION __forceinline__
        void Init()
        {
            block_threads       = PolicyT::BLOCK_THREADS;
            items_per_thread    = PolicyT::ITEMS_PER_THREAD;
            tile_items          = block_threads * items_per_thread;
        }
    };


    /******************************************************************************
     * Dispatch entrypoints
     ******************************************************************************/

    /**
     * Internal dispatch routine for computing a device-wide selection using the
     * specified kernel functions.  Sizes \p temp_storage_bytes when
     * \p d_temp_storage is NULL; otherwise performs the selection.
     */
    template <
        typename                    ScanInitKernelPtrT,         ///< Function type of cub::DeviceScanInitKernel
        typename                    SelectIfKernelPtrT>         ///< Function type of cub::DeviceSelectSweepKernel
    CUB_RUNTIME_FUNCTION __forceinline__
    static cudaError_t Dispatch(
        void*                       d_temp_storage,             ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
        size_t&                     temp_storage_bytes,         ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        InputIteratorT              d_in,                       ///< [in] Pointer to the input sequence of data items
        FlagsInputIteratorT         d_flags,                    ///< [in] Pointer to the input sequence of selection flags (if applicable)
        SelectedOutputIteratorT     d_selected_out,             ///< [out] Pointer to the output sequence of selected data items
        NumSelectedIteratorT        d_num_selected_out,         ///< [out] Pointer to the total number of items selected (i.e., length of \p d_selected_out)
        SelectOpT                   select_op,                  ///< [in] Selection operator
        EqualityOpT                 equality_op,                ///< [in] Equality operator
        OffsetT                     num_items,                  ///< [in] Total number of input items (i.e., length of \p d_in)
        cudaStream_t                stream,                     ///< [in] CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
        bool                        debug_synchronous,          ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
        int                         /*ptx_version*/,            ///< [in] PTX version of dispatch kernels
        ScanInitKernelPtrT          scan_init_kernel,           ///< [in] Kernel function pointer to parameterization of cub::DeviceScanInitKernel
        SelectIfKernelPtrT          select_if_kernel,           ///< [in] Kernel function pointer to parameterization of cub::DeviceSelectSweepKernel
        KernelConfig                select_if_config)           ///< [in] Dispatch parameters that match the policy that \p select_if_kernel was compiled for
    {

#ifndef CUB_RUNTIME_ENABLED
        (void)d_temp_storage;
        (void)temp_storage_bytes;
        (void)d_in;
        (void)d_flags;
        (void)d_selected_out;
        (void)d_num_selected_out;
        (void)select_op;
        (void)equality_op;
        (void)num_items;
        (void)stream;
        (void)debug_synchronous;
        (void)scan_init_kernel;
        (void)select_if_kernel;
        (void)select_if_config;

        // Kernel launch not supported from this device
        return CubDebug(cudaErrorNotSupported);

#else

        cudaError error = cudaSuccess;
        do
        {
            // Get device ordinal
            int device_ordinal;
            if (CubDebug(error = cudaGetDevice(&device_ordinal))) break;

            // Get SM count
            int sm_count;
            if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break;

            // Number of input tiles
            int tile_size = select_if_config.block_threads * select_if_config.items_per_thread;
            int num_tiles = (num_items + tile_size - 1) / tile_size;

            // Specify temporary storage allocation requirements
            size_t  allocation_sizes[1];
            if (CubDebug(error = ScanTileStateT::AllocationSize(num_tiles, allocation_sizes[0]))) break;    // bytes needed for tile status descriptors

            // Compute allocation pointers into the single storage blob (or compute the necessary size of the blob)
            void* allocations[1];
            if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break;
            if (d_temp_storage == NULL)
            {
                // Return if the caller is simply requesting the size of the storage allocation
                break;
            }

            // Construct the tile status interface
            ScanTileStateT tile_status;
            if (CubDebug(error = tile_status.Init(num_tiles, allocations[0], allocation_sizes[0]))) break;

            // Log scan_init_kernel configuration
            int init_grid_size = CUB_MAX(1, (num_tiles + INIT_KERNEL_THREADS - 1) / INIT_KERNEL_THREADS);
            if (debug_synchronous) _CubLog("Invoking scan_init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream);

            // Invoke scan_init_kernel to initialize tile descriptors
            scan_init_kernel<<<init_grid_size, INIT_KERNEL_THREADS, 0, stream>>>(
                tile_status,
                num_tiles,
                d_num_selected_out);

            // Check for failure to launch
            if (CubDebug(error = cudaPeekAtLastError())) break;

            // Sync the stream if specified to flush runtime errors
            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;

            // Return if empty problem
            if (num_items == 0)
                break;

            // Get SM occupancy for select_if_kernel
            int range_select_sm_occupancy;
            if (CubDebug(error = MaxSmOccupancy(
                range_select_sm_occupancy,            // out
                select_if_kernel,
                select_if_config.block_threads))) break;

            // Get max x-dimension of grid
            int max_dim_x;
            if (CubDebug(error = cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal))) break;

            // Get grid size for scanning tiles
            dim3 scan_grid_size;
            scan_grid_size.z = 1;
            scan_grid_size.y = ((unsigned int) num_tiles + max_dim_x - 1) / max_dim_x;
            scan_grid_size.x = CUB_MIN(num_tiles, max_dim_x);

            // Log select_if_kernel configuration
            if (debug_synchronous) _CubLog("Invoking select_if_kernel<<<{%d,%d,%d}, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
                scan_grid_size.x, scan_grid_size.y, scan_grid_size.z, select_if_config.block_threads, (long long) stream, select_if_config.items_per_thread, range_select_sm_occupancy);

            // Invoke select_if_kernel
            select_if_kernel<<<scan_grid_size, select_if_config.block_threads, 0, stream>>>(
                d_in,
                d_flags,
                d_selected_out,
                d_num_selected_out,
                tile_status,
                select_op,
                equality_op,
                num_items,
                num_tiles);

            // Check for failure to launch
            if (CubDebug(error = cudaPeekAtLastError())) break;

            // Sync the stream if specified to flush runtime errors
            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
        }
        while (0);

        return error;

#endif  // CUB_RUNTIME_ENABLED
    }


    /**
     * Internal dispatch routine: selects tuning policies by PTX version, then
     * forwards to the kernel-pointer Dispatch overload above.
     */
    CUB_RUNTIME_FUNCTION __forceinline__
    static cudaError_t Dispatch(
        void*                       d_temp_storage,             ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
        size_t&                     temp_storage_bytes,         ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        InputIteratorT              d_in,                       ///< [in] Pointer to the input sequence of data items
        FlagsInputIteratorT         d_flags,                    ///< [in] Pointer to the input sequence of selection flags (if applicable)
        SelectedOutputIteratorT     d_selected_out,             ///< [out] Pointer to the output sequence of selected data items
        NumSelectedIteratorT        d_num_selected_out,         ///< [out] Pointer to the total number of items selected (i.e., length of \p d_selected_out)
        SelectOpT                   select_op,                  ///< [in] Selection operator
        EqualityOpT                 equality_op,                ///< [in] Equality operator
        OffsetT                     num_items,                  ///< [in] Total number of input items (i.e., length of \p d_in)
        cudaStream_t                stream,                     ///< [in] CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
        bool                        debug_synchronous)          ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
    {
        cudaError error = cudaSuccess;
        do
        {
            // Get PTX version
            int ptx_version;
    #if (CUB_PTX_ARCH == 0)
            if (CubDebug(error = PtxVersion(ptx_version))) break;
    #else
            ptx_version = CUB_PTX_ARCH;
    #endif

            // Get kernel dispatch configurations
            KernelConfig select_if_config;
            InitConfigs(ptx_version, select_if_config);

            // Dispatch
            if (CubDebug(error = Dispatch(
                d_temp_storage,
                temp_storage_bytes,
                d_in,
                d_flags,
                d_selected_out,
                d_num_selected_out,
                select_op,
                equality_op,
                num_items,
                stream,
                debug_synchronous,
                ptx_version,
                DeviceCompactInitKernel<ScanTileStateT, NumSelectedIteratorT>,
                DeviceSelectSweepKernel<PtxSelectIfPolicyT, InputIteratorT, FlagsInputIteratorT, SelectedOutputIteratorT, NumSelectedIteratorT, ScanTileStateT, SelectOpT, EqualityOpT, OffsetT, KEEP_REJECTS>,
                select_if_config))) break;
        }
        while (0);

        return error;
    }
};
537
538
539} // CUB namespace
540CUB_NS_POSTFIX // Optional outer namespace(s)
541
542
BlockRadixRank provides operations for ranking unsigned integer types within a CUDA thread block.
@ BLOCK_LOAD_DIRECT
@ BLOCK_LOAD_WARP_TRANSPOSE
@ LOAD_LDG
Cache as texture.
@ LOAD_DEFAULT
Default (no modifier)
#define _CubLog(format,...)
Log macro for printf statements.
CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t PtxVersion(int &ptx_version)
Retrieves the PTX version that will be used on the current device (major * 100 + minor * 10)
__host__ __device__ __forceinline__ cudaError_t AliasTemporaries(void *d_temp_storage, size_t &temp_storage_bytes, void *(&allocations)[ALLOCATIONS], size_t(&allocation_sizes)[ALLOCATIONS])
CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t MaxSmOccupancy(int &max_sm_occupancy, KernelPtr kernel_ptr, int block_threads, int dynamic_smem_bytes=0)
Computes maximum SM occupancy in thread blocks for executing the given kernel function pointer kernel...
#define CubDebug(e)
Debug macro.
CUB_RUNTIME_FUNCTION static __forceinline__ cudaError_t SyncStream(cudaStream_t stream)
#define CUB_MAX(a, b)
Select maximum(a, b)
#define CUB_MIN(a, b)
Select minimum(a, b)
Optional outer namespace(s)
KeyT const ValueT ValueT OffsetIteratorT OffsetIteratorT int
[in] The number of segments that comprise the sorting data
FlagsInputIteratorT SelectedOutputIteratorT NumSelectedIteratorT ScanTileStateT SelectOpT select_op
[in] Selection operator
KeyT const ValueT ValueT OffsetT OffsetT num_items
[in] Total number of input data items
FlagsInputIteratorT SelectedOutputIteratorT NumSelectedIteratorT d_num_selected_out
[out] Pointer to the total number of items selected (i.e., length of d_selected_out)
__launch_bounds__(int(AgentHistogramPolicyT::BLOCK_THREADS)) __global__ void DeviceHistogramSweepKernel(SampleIteratorT d_samples
< Signed integer type for global offsets
FlagsInputIteratorT SelectedOutputIteratorT d_selected_out
[out] Pointer to the output sequence of selected data items
OffsetsOutputIteratorT LengthsOutputIteratorT NumRunsOutputIteratorT ScanTileStateT EqualityOpT OffsetT int num_tiles
< [in] Total number of tiles for the entire problem
@ BLOCK_SCAN_RAKING
@ BLOCK_SCAN_WARP_SCANS
@ BLOCK_SCAN_RAKING_MEMOIZE
FlagsInputIteratorT d_flags
< [in] Pointer to the input sequence of data items
OffsetT OffsetT
[in] Total number of input data items
OffsetsOutputIteratorT LengthsOutputIteratorT NumRunsOutputIteratorT ScanTileStateT tile_status
[in] Tile status interface
UniqueOutputIteratorT ValuesInputIteratorT AggregatesOutputIteratorT NumRunsOutputIteratorT ScanTileStateT int EqualityOpT equality_op
KeyT equality operator.
< The BlockScan algorithm to use
AgentSelectIf implements a stateful abstraction of CUDA thread blocks for participating in device-wid...
< Whether or not we push rejected items to the back of the output
CUB_RUNTIME_FUNCTION static __forceinline__ cudaError_t Dispatch(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, FlagsInputIteratorT d_flags, SelectedOutputIteratorT d_selected_out, NumSelectedIteratorT d_num_selected_out, SelectOpT select_op, EqualityOpT equality_op, OffsetT num_items, cudaStream_t stream, bool debug_synchronous)
CUB_RUNTIME_FUNCTION static __forceinline__ void InitConfigs(int ptx_version, KernelConfig &select_if_config)
CUB_RUNTIME_FUNCTION static __forceinline__ cudaError_t Dispatch(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, FlagsInputIteratorT d_flags, SelectedOutputIteratorT d_selected_out, NumSelectedIteratorT d_num_selected_out, SelectOpT select_op, EqualityOpT equality_op, OffsetT num_items, cudaStream_t stream, bool debug_synchronous, int, ScanInitKernelPtrT scan_init_kernel, SelectIfKernelPtrT select_if_kernel, KernelConfig select_if_config)
< Function type of cub::SelectIfKernelPtrT
Type equality test.
Definition util_type.cuh:99
Type selection (IF ? ThenType : ElseType)
Definition util_type.cuh:73
#define CUB_PTX_ARCH
CUB_PTX_ARCH reflects the PTX version targeted by the active compiler pass (or zero during the host p...
Definition util_arch.cuh:53