OpenFPM_pdata  4.1.0
Project that contains the implementation of distributed structures
dispatch_select_if.cuh
/******************************************************************************
 * Copyright (c) 2011, Duane Merrill. All rights reserved.
 * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

#pragma once

#include <stdio.h>
#include <iterator>

#include "dispatch_scan.cuh"
#include "../../agent/agent_select_if.cuh"
#include "../../thread/thread_operators.cuh"
#include "../../grid/grid_queue.cuh"
#include "../../util_device.cuh"
#include "../../util_namespace.cuh"

CUB_NS_PREFIX

namespace cub {

/******************************************************************************
 * Kernel entry points
 *****************************************************************************/

/**
 * Select kernel entry point (multi-block)
 */
template <
    typename            AgentSelectIfPolicyT,       ///< Parameterized AgentSelectIfPolicy tuning policy type
    typename            InputIteratorT,             ///< Random-access input iterator type for reading input items
    typename            FlagsInputIteratorT,        ///< Random-access input iterator type for reading selection flags
    typename            SelectedOutputIteratorT,    ///< Random-access output iterator type for writing selected items
    typename            NumSelectedIteratorT,       ///< Output iterator type for recording the number of items selected
    typename            ScanTileStateT,             ///< Tile status interface type
    typename            SelectOpT,                  ///< Selection operator type
    typename            EqualityOpT,                ///< Equality operator type
    typename            OffsetT,                    ///< Signed integer type for global offsets
    bool                KEEP_REJECTS>               ///< Whether or not we push rejected items to the back of the output
__launch_bounds__ (int(AgentSelectIfPolicyT::BLOCK_THREADS))
__global__ void DeviceSelectSweepKernel(
    InputIteratorT              d_in,               ///< [in] Pointer to the input sequence of data items
    FlagsInputIteratorT         d_flags,            ///< [in] Pointer to the input sequence of selection flags (if applicable)
    SelectedOutputIteratorT     d_selected_out,     ///< [out] Pointer to the output sequence of selected data items
    NumSelectedIteratorT        d_num_selected_out, ///< [out] Pointer to the total number of items selected (i.e., length of d_selected_out)
    ScanTileStateT              tile_status,        ///< [in] Tile status interface
    SelectOpT                   select_op,          ///< [in] Selection operator
    EqualityOpT                 equality_op,        ///< [in] Equality operator
    OffsetT                     num_items,          ///< [in] Total number of input data items
    int                         num_tiles)          ///< [in] Total number of tiles for the entire problem
{
    // Thread block type for selecting data from input tiles
    typedef AgentSelectIf<
        AgentSelectIfPolicyT,
        InputIteratorT,
        FlagsInputIteratorT,
        SelectedOutputIteratorT,
        SelectOpT,
        EqualityOpT,
        OffsetT,
        KEEP_REJECTS> AgentSelectIfT;

    // Shared memory for AgentSelectIf
    __shared__ typename AgentSelectIfT::TempStorage temp_storage;

    // Process tiles
    AgentSelectIfT(temp_storage, d_in, d_flags, d_selected_out, select_op, equality_op, num_items).ConsumeRange(
        num_tiles,
        tile_status,
        d_num_selected_out);
}




/******************************************************************************
 * Dispatch
 ******************************************************************************/

/**
 * Utility class for dispatching the appropriately-tuned kernels for DeviceSelect
 */
template <
    typename    InputIteratorT,             ///< Random-access input iterator type for reading input items
    typename    FlagsInputIteratorT,        ///< Random-access input iterator type for reading selection flags
    typename    SelectedOutputIteratorT,    ///< Random-access output iterator type for writing selected items
    typename    NumSelectedIteratorT,       ///< Output iterator type for recording the number of items selected
    typename    SelectOpT,                  ///< Selection operator type
    typename    EqualityOpT,                ///< Equality operator type
    typename    OffsetT,                    ///< Signed integer type for global offsets
    bool        KEEP_REJECTS>               ///< Whether or not we push rejected items to the back of the output
struct DispatchSelectIf
{
    /******************************************************************************
     * Types and constants
     ******************************************************************************/

    // The output value type
    typedef typename If<(Equals<typename std::iterator_traits<SelectedOutputIteratorT>::value_type, void>::VALUE),  // OutputT = (if output iterator's value type is void) ?
        typename std::iterator_traits<InputIteratorT>::value_type,                                                  // ... then the input iterator's value type,
        typename std::iterator_traits<SelectedOutputIteratorT>::value_type>::Type OutputT;                          // ... else the output iterator's value type

    // The flag value type
    typedef typename std::iterator_traits<FlagsInputIteratorT>::value_type FlagT;

    enum
    {
        INIT_KERNEL_THREADS = 128,
    };

    // Tile status descriptor interface type
    typedef ScanTileState<OffsetT> ScanTileStateT;


    /******************************************************************************
     * Tuning policies
     ******************************************************************************/

    /// SM35
    struct Policy350
    {
        enum {
            NOMINAL_4B_ITEMS_PER_THREAD = 10,
            ITEMS_PER_THREAD            = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(OutputT)))),
        };

        typedef AgentSelectIfPolicy<
                128,
                ITEMS_PER_THREAD,
                BLOCK_LOAD_DIRECT,
                LOAD_LDG,
                BLOCK_SCAN_WARP_SCANS>
            SelectIfPolicyT;
    };

    /// SM30
    struct Policy300
    {
        enum {
            NOMINAL_4B_ITEMS_PER_THREAD = 7,
            ITEMS_PER_THREAD            = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(3, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(OutputT)))),
        };

        typedef AgentSelectIfPolicy<
                128,
                ITEMS_PER_THREAD,
                BLOCK_LOAD_WARP_TRANSPOSE,
                LOAD_DEFAULT,
                BLOCK_SCAN_WARP_SCANS>
            SelectIfPolicyT;
    };

    /// SM20
    struct Policy200
    {
        enum {
            NOMINAL_4B_ITEMS_PER_THREAD = (KEEP_REJECTS) ? 7 : 15,
            ITEMS_PER_THREAD            = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(OutputT)))),
        };

        typedef AgentSelectIfPolicy<
                128,
                ITEMS_PER_THREAD,
                BLOCK_LOAD_WARP_TRANSPOSE,
                LOAD_DEFAULT,
                BLOCK_SCAN_WARP_SCANS>
            SelectIfPolicyT;
    };

    /// SM13
    struct Policy130
    {
        enum {
            NOMINAL_4B_ITEMS_PER_THREAD = 9,
            ITEMS_PER_THREAD            = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(OutputT)))),
        };

        typedef AgentSelectIfPolicy<
                64,
                ITEMS_PER_THREAD,
                BLOCK_LOAD_WARP_TRANSPOSE,
                LOAD_DEFAULT,
                BLOCK_SCAN_RAKING_MEMOIZE>
            SelectIfPolicyT;
    };

    /// SM10
    struct Policy100
    {
        enum {
            NOMINAL_4B_ITEMS_PER_THREAD = 9,
            ITEMS_PER_THREAD            = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(OutputT)))),
        };

        typedef AgentSelectIfPolicy<
                64,
                ITEMS_PER_THREAD,
                BLOCK_LOAD_WARP_TRANSPOSE,
                LOAD_DEFAULT,
                BLOCK_SCAN_RAKING>
            SelectIfPolicyT;
    };


    /******************************************************************************
     * Tuning policies of current PTX compiler pass
     ******************************************************************************/

#if (CUB_PTX_ARCH >= 350)
    typedef Policy350 PtxPolicy;

#elif (CUB_PTX_ARCH >= 300)
    typedef Policy300 PtxPolicy;

#elif (CUB_PTX_ARCH >= 200)
    typedef Policy200 PtxPolicy;

#elif (CUB_PTX_ARCH >= 130)
    typedef Policy130 PtxPolicy;

#else
    typedef Policy100 PtxPolicy;

#endif

    // "Opaque" policies (whose parameterizations aren't reflected in the type signature)
    struct PtxSelectIfPolicyT : PtxPolicy::SelectIfPolicyT {};


    /******************************************************************************
     * Utilities
     ******************************************************************************/

    template <typename KernelConfig>
    CUB_RUNTIME_FUNCTION __forceinline__
    static void InitConfigs(
        int             ptx_version,
        KernelConfig    &select_if_config)
    {
    #if (CUB_PTX_ARCH > 0)
        (void)ptx_version;

        // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy
        select_if_config.template Init<PtxSelectIfPolicyT>();

    #else

        // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version
        if (ptx_version >= 350)
        {
            select_if_config.template Init<typename Policy350::SelectIfPolicyT>();
        }
        else if (ptx_version >= 300)
        {
            select_if_config.template Init<typename Policy300::SelectIfPolicyT>();
        }
        else if (ptx_version >= 200)
        {
            select_if_config.template Init<typename Policy200::SelectIfPolicyT>();
        }
        else if (ptx_version >= 130)
        {
            select_if_config.template Init<typename Policy130::SelectIfPolicyT>();
        }
        else
        {
            select_if_config.template Init<typename Policy100::SelectIfPolicyT>();
        }

    #endif
    }


    /**
     * Kernel dispatch configuration
     */
    struct KernelConfig
    {
        int block_threads;
        int items_per_thread;
        int tile_items;

        template <typename PolicyT>
        CUB_RUNTIME_FUNCTION __forceinline__
        void Init()
        {
            block_threads       = PolicyT::BLOCK_THREADS;
            items_per_thread    = PolicyT::ITEMS_PER_THREAD;
            tile_items          = block_threads * items_per_thread;
        }
    };


    /******************************************************************************
     * Dispatch entrypoints
     ******************************************************************************/

    /**
     * Internal dispatch routine
     */
    template <
        typename                    ScanInitKernelPtrT,
        typename                    SelectIfKernelPtrT>
    CUB_RUNTIME_FUNCTION __forceinline__
    static cudaError_t Dispatch(
        void*                       d_temp_storage,
        size_t&                     temp_storage_bytes,
        InputIteratorT              d_in,
        FlagsInputIteratorT         d_flags,
        SelectedOutputIteratorT     d_selected_out,
        NumSelectedIteratorT        d_num_selected_out,
        SelectOpT                   select_op,
        EqualityOpT                 equality_op,
        OffsetT                     num_items,
        cudaStream_t                stream,
        bool                        debug_synchronous,
        int                         /*ptx_version*/,
        ScanInitKernelPtrT          scan_init_kernel,
        SelectIfKernelPtrT          select_if_kernel,
        KernelConfig                select_if_config)
    {

#ifndef CUB_RUNTIME_ENABLED
        (void)d_temp_storage;
        (void)temp_storage_bytes;
        (void)d_in;
        (void)d_flags;
        (void)d_selected_out;
        (void)d_num_selected_out;
        (void)select_op;
        (void)equality_op;
        (void)num_items;
        (void)stream;
        (void)debug_synchronous;
        (void)scan_init_kernel;
        (void)select_if_kernel;
        (void)select_if_config;

        // Kernel launch not supported from this device
        return CubDebug(cudaErrorNotSupported);

#else

        cudaError error = cudaSuccess;
        do
        {
            // Get device ordinal
            int device_ordinal;
            if (CubDebug(error = cudaGetDevice(&device_ordinal))) break;

            // Get SM count
            int sm_count;
            if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break;

            // Number of input tiles
            int tile_size = select_if_config.block_threads * select_if_config.items_per_thread;
            int num_tiles = (num_items + tile_size - 1) / tile_size;

            // Specify temporary storage allocation requirements
            size_t allocation_sizes[1];
            if (CubDebug(error = ScanTileStateT::AllocationSize(num_tiles, allocation_sizes[0]))) break;    // bytes needed for tile status descriptors

            // Compute allocation pointers into the single storage blob (or compute the necessary size of the blob)
            void* allocations[1];
            if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break;
            if (d_temp_storage == NULL)
            {
                // Return if the caller is simply requesting the size of the storage allocation
                break;
            }

            // Construct the tile status interface
            ScanTileStateT tile_status;
            if (CubDebug(error = tile_status.Init(num_tiles, allocations[0], allocation_sizes[0]))) break;

            // Log scan_init_kernel configuration
            int init_grid_size = CUB_MAX(1, (num_tiles + INIT_KERNEL_THREADS - 1) / INIT_KERNEL_THREADS);
            if (debug_synchronous) _CubLog("Invoking scan_init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream);

            // Invoke scan_init_kernel to initialize tile descriptors
            scan_init_kernel<<<init_grid_size, INIT_KERNEL_THREADS, 0, stream>>>(
                tile_status,
                num_tiles,
                d_num_selected_out);

            // Check for failure to launch
            if (CubDebug(error = cudaPeekAtLastError())) break;

            // Sync the stream if specified to flush runtime errors
            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;

            // Return if empty problem
            if (num_items == 0)
                break;

            // Get SM occupancy for select_if_kernel
            int range_select_sm_occupancy;
            if (CubDebug(error = MaxSmOccupancy(
                range_select_sm_occupancy,              // out
                select_if_kernel,
                select_if_config.block_threads))) break;

            // Get max x-dimension of grid
            int max_dim_x;
            if (CubDebug(error = cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal))) break;

            // Get grid size for scanning tiles
            dim3 scan_grid_size;
            scan_grid_size.z = 1;
            scan_grid_size.y = ((unsigned int) num_tiles + max_dim_x - 1) / max_dim_x;
            scan_grid_size.x = CUB_MIN(num_tiles, max_dim_x);

            // Log select_if_kernel configuration
            if (debug_synchronous) _CubLog("Invoking select_if_kernel<<<{%d,%d,%d}, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
                scan_grid_size.x, scan_grid_size.y, scan_grid_size.z, select_if_config.block_threads, (long long) stream, select_if_config.items_per_thread, range_select_sm_occupancy);

            // Invoke select_if_kernel
            select_if_kernel<<<scan_grid_size, select_if_config.block_threads, 0, stream>>>(
                d_in,
                d_flags,
                d_selected_out,
                d_num_selected_out,
                tile_status,
                select_op,
                equality_op,
                num_items,
                num_tiles);

            // Check for failure to launch
            if (CubDebug(error = cudaPeekAtLastError())) break;

            // Sync the stream if specified to flush runtime errors
            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
        }
        while (0);

        return error;

#endif  // CUB_RUNTIME_ENABLED
    }


    /**
     * Internal dispatch routine
     */
    CUB_RUNTIME_FUNCTION __forceinline__
    static cudaError_t Dispatch(
        void*                       d_temp_storage,
        size_t&                     temp_storage_bytes,
        InputIteratorT              d_in,
        FlagsInputIteratorT         d_flags,
        SelectedOutputIteratorT     d_selected_out,
        NumSelectedIteratorT        d_num_selected_out,
        SelectOpT                   select_op,
        EqualityOpT                 equality_op,
        OffsetT                     num_items,
        cudaStream_t                stream,
        bool                        debug_synchronous)
    {
        cudaError error = cudaSuccess;
        do
        {
            // Get PTX version
            int ptx_version;
    #if (CUB_PTX_ARCH == 0)
            if (CubDebug(error = PtxVersion(ptx_version))) break;
    #else
            ptx_version = CUB_PTX_ARCH;
    #endif

            // Get kernel dispatch configurations
            KernelConfig select_if_config;
            InitConfigs(ptx_version, select_if_config);

            // Dispatch
            if (CubDebug(error = Dispatch(
                d_temp_storage,
                temp_storage_bytes,
                d_in,
                d_flags,
                d_selected_out,
                d_num_selected_out,
                select_op,
                equality_op,
                num_items,
                stream,
                debug_synchronous,
                ptx_version,
                DeviceCompactInitKernel<ScanTileStateT, NumSelectedIteratorT>,
                DeviceSelectSweepKernel<PtxSelectIfPolicyT, InputIteratorT, FlagsInputIteratorT, SelectedOutputIteratorT, NumSelectedIteratorT, ScanTileStateT, SelectOpT, EqualityOpT, OffsetT, KEEP_REJECTS>,
                select_if_config))) break;
        }
        while (0);

        return error;
    }
};


}               // CUB namespace
CUB_NS_POSTFIX  // Optional outer namespace(s)

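The dispatch machinery above is normally reached through the public cub::DeviceSelect interface rather than instantiated directly. The sketch below (not part of this file) illustrates the two-phase temporary-storage pattern that Dispatch implements: a first call with d_temp_storage == NULL only computes temp_storage_bytes and returns, and a second call performs the selection. It is a minimal example under the assumption that CUB is on the include path; the IsEven functor and the array sizes are illustrative, and cub::DeviceSelect::If is expected to route through DispatchSelectIf::Dispatch internally.

// Minimal usage sketch for the select-if dispatch path (illustrative).
#include <cub/cub.cuh>
#include <cstdio>

struct IsEven
{
    __host__ __device__ __forceinline__
    bool operator()(const int &x) const { return (x % 2) == 0; }
};

int main()
{
    const int num_items = 8;
    int h_in[num_items] = {0, 1, 2, 3, 4, 5, 6, 7};

    int *d_in, *d_out, *d_num_selected_out;
    cudaMalloc(&d_in,  num_items * sizeof(int));
    cudaMalloc(&d_out, num_items * sizeof(int));
    cudaMalloc(&d_num_selected_out, sizeof(int));
    cudaMemcpy(d_in, h_in, num_items * sizeof(int), cudaMemcpyHostToDevice);

    // Phase 1: d_temp_storage == NULL, so the dispatch path only sizes the
    // temporary allocation (it breaks out right after AliasTemporaries).
    void   *d_temp_storage     = NULL;
    size_t  temp_storage_bytes = 0;
    cub::DeviceSelect::If(d_temp_storage, temp_storage_bytes,
                          d_in, d_out, d_num_selected_out, num_items, IsEven());

    // Phase 2: allocate the blob and run the init + sweep kernels.
    cudaMalloc(&d_temp_storage, temp_storage_bytes);
    cub::DeviceSelect::If(d_temp_storage, temp_storage_bytes,
                          d_in, d_out, d_num_selected_out, num_items, IsEven());

    int h_num_selected = 0;
    cudaMemcpy(&h_num_selected, d_num_selected_out, sizeof(int), cudaMemcpyDeviceToHost);
    printf("selected %d items\n", h_num_selected);   // expected: 4

    cudaFree(d_in); cudaFree(d_out); cudaFree(d_num_selected_out); cudaFree(d_temp_storage);
    return 0;
}

The same two calls could target DispatchSelectIf<...>::Dispatch directly, but the public wrapper is the intended entry point and supplies the placeholder flag and equality-operator types itself.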