OpenFPM_pdata  4.1.0
Project that contains the implementation of distributed structures
dispatch_reduce_by_key.cuh

/******************************************************************************
 * Copyright (c) 2011, Duane Merrill. All rights reserved.
 * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

#pragma once

#include <stdio.h>
#include <iterator>

#include "dispatch_scan.cuh"
#include "../../agent/agent_reduce_by_key.cuh"
#include "../../thread/thread_operators.cuh"
#include "../../grid/grid_queue.cuh"
#include "../../util_device.cuh"
#include "../../util_namespace.cuh"

/// Optional outer namespace(s)
CUB_NS_PREFIX

/// CUB namespace
namespace cub {

/******************************************************************************
 * Kernel entry points
 *****************************************************************************/

/**
 * Multi-block reduce-by-key sweep kernel entry point
 */
template <
    typename            AgentReduceByKeyPolicyT,    ///< Parameterized AgentReduceByKeyPolicy tuning policy type
    typename            KeysInputIteratorT,         ///< Random-access input iterator type for keys
    typename            UniqueOutputIteratorT,      ///< Random-access output iterator type for unique keys
    typename            ValuesInputIteratorT,       ///< Random-access input iterator type for values
    typename            AggregatesOutputIteratorT,  ///< Random-access output iterator type for value aggregates
    typename            NumRunsOutputIteratorT,     ///< Output iterator type for recording the number of runs encountered
    typename            ScanTileStateT,             ///< Tile status interface type
    typename            EqualityOpT,                ///< KeyT equality operator type
    typename            ReductionOpT,               ///< ValueT reduction operator type
    typename            OffsetT>                    ///< Signed integer type for global offsets
__launch_bounds__ (int(AgentReduceByKeyPolicyT::BLOCK_THREADS))
__global__ void DeviceReduceByKeyKernel(
    KeysInputIteratorT          d_keys_in,          ///< [in] Pointer to the input sequence of keys
    UniqueOutputIteratorT       d_unique_out,       ///< [out] Pointer to the output sequence of unique keys (one key per run)
    ValuesInputIteratorT        d_values_in,        ///< [in] Pointer to the input sequence of corresponding values
    AggregatesOutputIteratorT   d_aggregates_out,   ///< [out] Pointer to the output sequence of value aggregates (one aggregate per run)
    NumRunsOutputIteratorT      d_num_runs_out,     ///< [out] Pointer to the total number of runs encountered (i.e., the length of d_unique_out)
    ScanTileStateT              tile_state,         ///< Tile status interface
    int                         start_tile,         ///< The starting tile for the current grid
    EqualityOpT                 equality_op,        ///< KeyT equality operator
    ReductionOpT                reduction_op,       ///< ValueT reduction operator
    OffsetT                     num_items)          ///< [in] Total number of input data items
{
    // Thread block type for reducing tiles of value segments
    typedef AgentReduceByKey<
            AgentReduceByKeyPolicyT,
            KeysInputIteratorT,
            UniqueOutputIteratorT,
            ValuesInputIteratorT,
            AggregatesOutputIteratorT,
            NumRunsOutputIteratorT,
            EqualityOpT,
            ReductionOpT,
            OffsetT>
        AgentReduceByKeyT;

    // Shared memory for AgentReduceByKey
    __shared__ typename AgentReduceByKeyT::TempStorage temp_storage;

    // Process tiles
    AgentReduceByKeyT(temp_storage, d_keys_in, d_unique_out, d_values_in, d_aggregates_out, d_num_runs_out, equality_op, reduction_op).ConsumeRange(
        num_items,
        tile_state,
        start_tile);
}
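
// Note on cross-block cooperation: ConsumeRange walks the tiles assigned to
// this grid, and AgentReduceByKey publishes each tile's running aggregate
// through the ScanTileStateT descriptors so that later tiles can resolve
// their exclusive prefixes (CUB's decoupled look-back strategy). This is why
// a single sweep kernel suffices for the whole segmented reduction.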



/******************************************************************************
 * Dispatch
 ******************************************************************************/

/**
 * Utility class for dispatching the appropriately-tuned kernels for DeviceReduceByKey
 */
template <
    typename KeysInputIteratorT,            ///< Random-access input iterator type for keys
    typename UniqueOutputIteratorT,         ///< Random-access output iterator type for unique keys
    typename ValuesInputIteratorT,          ///< Random-access input iterator type for values
    typename AggregatesOutputIteratorT,     ///< Random-access output iterator type for value aggregates
    typename NumRunsOutputIteratorT,        ///< Output iterator type for recording the number of runs encountered
    typename EqualityOpT,                   ///< KeyT equality operator type
    typename ReductionOpT,                  ///< ValueT reduction operator type
    typename OffsetT>                       ///< Signed integer type for global offsets
struct DispatchReduceByKey
{
    //-------------------------------------------------------------------------
    // Types and constants
    //-------------------------------------------------------------------------

    // The input keys type
    typedef typename std::iterator_traits<KeysInputIteratorT>::value_type KeyInputT;

    // The output keys type
    typedef typename If<(Equals<typename std::iterator_traits<UniqueOutputIteratorT>::value_type, void>::VALUE),    // KeyOutputT = (if output iterator's value type is void) ?
        typename std::iterator_traits<KeysInputIteratorT>::value_type,                                              // ... then the input iterator's value type,
        typename std::iterator_traits<UniqueOutputIteratorT>::value_type>::Type KeyOutputT;                         // ... else the output iterator's value type

    // The input values type
    typedef typename std::iterator_traits<ValuesInputIteratorT>::value_type ValueInputT;

    // The output values type
    typedef typename If<(Equals<typename std::iterator_traits<AggregatesOutputIteratorT>::value_type, void>::VALUE),    // ValueOutputT = (if output iterator's value type is void) ?
        typename std::iterator_traits<ValuesInputIteratorT>::value_type,                                                // ... then the input iterator's value type,
        typename std::iterator_traits<AggregatesOutputIteratorT>::value_type>::Type ValueOutputT;                       // ... else the output iterator's value type

    enum
    {
        INIT_KERNEL_THREADS     = 128,
        MAX_INPUT_BYTES         = CUB_MAX(sizeof(KeyOutputT), sizeof(ValueOutputT)),
        COMBINED_INPUT_BYTES    = sizeof(KeyOutputT) + sizeof(ValueOutputT),
    };
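
    // These two byte counts drive the per-architecture tuning below: each
    // policy starts from a nominal items-per-thread chosen for 4-byte data
    // and scales it down as the combined key/value size grows, keeping a
    // tile's register and shared-memory footprint roughly constant.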

    // Tile status descriptor interface type
    typedef ReduceByKeyScanTileState<ValueOutputT, OffsetT> ScanTileStateT;


    //-------------------------------------------------------------------------
    // Tuning policies
    //-------------------------------------------------------------------------

    /// SM35
    struct Policy350
    {
        enum {
            NOMINAL_4B_ITEMS_PER_THREAD = 6,
            ITEMS_PER_THREAD            = (MAX_INPUT_BYTES <= 8) ? 6 : CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, ((NOMINAL_4B_ITEMS_PER_THREAD * 8) + COMBINED_INPUT_BYTES - 1) / COMBINED_INPUT_BYTES)),
        };

        typedef AgentReduceByKeyPolicy<
                128,
                ITEMS_PER_THREAD,
                BLOCK_LOAD_DIRECT,
                LOAD_LDG,
                BLOCK_SCAN_WARP_SCANS>
            ReduceByKeyPolicyT;
    };

    /// SM30
    struct Policy300
    {
        enum {
            NOMINAL_4B_ITEMS_PER_THREAD = 6,
            ITEMS_PER_THREAD            = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, ((NOMINAL_4B_ITEMS_PER_THREAD * 8) + COMBINED_INPUT_BYTES - 1) / COMBINED_INPUT_BYTES)),
        };

        typedef AgentReduceByKeyPolicy<
                128,
                ITEMS_PER_THREAD,
                BLOCK_LOAD_WARP_TRANSPOSE,
                LOAD_DEFAULT,
                BLOCK_SCAN_WARP_SCANS>
            ReduceByKeyPolicyT;
    };

    /// SM20
    struct Policy200
    {
        enum {
            NOMINAL_4B_ITEMS_PER_THREAD = 11,
            ITEMS_PER_THREAD            = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, ((NOMINAL_4B_ITEMS_PER_THREAD * 8) + COMBINED_INPUT_BYTES - 1) / COMBINED_INPUT_BYTES)),
        };

        typedef AgentReduceByKeyPolicy<
                128,
                ITEMS_PER_THREAD,
                BLOCK_LOAD_WARP_TRANSPOSE,
                LOAD_DEFAULT,
                BLOCK_SCAN_WARP_SCANS>
            ReduceByKeyPolicyT;
    };

    /// SM13
    struct Policy130
    {
        enum {
            NOMINAL_4B_ITEMS_PER_THREAD = 7,
            ITEMS_PER_THREAD            = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, ((NOMINAL_4B_ITEMS_PER_THREAD * 8) + COMBINED_INPUT_BYTES - 1) / COMBINED_INPUT_BYTES)),
        };

        typedef AgentReduceByKeyPolicy<
                128,
                ITEMS_PER_THREAD,
                BLOCK_LOAD_WARP_TRANSPOSE,
                LOAD_DEFAULT,
                BLOCK_SCAN_WARP_SCANS>
            ReduceByKeyPolicyT;
    };

    /// SM11
    struct Policy110
    {
        enum {
            NOMINAL_4B_ITEMS_PER_THREAD = 5,
            ITEMS_PER_THREAD            = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 8) / COMBINED_INPUT_BYTES)),
        };

        typedef AgentReduceByKeyPolicy<
                64,
                ITEMS_PER_THREAD,
                BLOCK_LOAD_WARP_TRANSPOSE,
                LOAD_DEFAULT,
                BLOCK_SCAN_RAKING>
            ReduceByKeyPolicyT;
    };


    /******************************************************************************
     * Tuning policies of current PTX compiler pass
     ******************************************************************************/

#if (CUB_PTX_ARCH >= 350)
    typedef Policy350 PtxPolicy;

#elif (CUB_PTX_ARCH >= 300)
    typedef Policy300 PtxPolicy;

#elif (CUB_PTX_ARCH >= 200)
    typedef Policy200 PtxPolicy;

#elif (CUB_PTX_ARCH >= 130)
    typedef Policy130 PtxPolicy;

#else
    typedef Policy110 PtxPolicy;

#endif
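
    // CUB_PTX_ARCH is zero during the host compilation pass, so on the host
    // this chain bottoms out at Policy110; the host instead selects the real
    // tuning at run time in InitConfigs() below, using the PTX version
    // reported for the actual device.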

    // "Opaque" policies (whose parameterizations aren't reflected in the type signature)
    struct PtxReduceByKeyPolicy : PtxPolicy::ReduceByKeyPolicyT {};


    /******************************************************************************
     * Utilities
     ******************************************************************************/

    /**
     * Initialize kernel dispatch configurations with the policies corresponding to the PTX assembly we will use
     */
    template <typename KernelConfig>
    CUB_RUNTIME_FUNCTION __forceinline__
    static void InitConfigs(
        int             ptx_version,
        KernelConfig    &reduce_by_key_config)
    {
    #if (CUB_PTX_ARCH > 0)
        (void)ptx_version;

        // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy
        reduce_by_key_config.template Init<PtxReduceByKeyPolicy>();

    #else

        // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version
        if (ptx_version >= 350)
        {
            reduce_by_key_config.template Init<typename Policy350::ReduceByKeyPolicyT>();
        }
        else if (ptx_version >= 300)
        {
            reduce_by_key_config.template Init<typename Policy300::ReduceByKeyPolicyT>();
        }
        else if (ptx_version >= 200)
        {
            reduce_by_key_config.template Init<typename Policy200::ReduceByKeyPolicyT>();
        }
        else if (ptx_version >= 130)
        {
            reduce_by_key_config.template Init<typename Policy130::ReduceByKeyPolicyT>();
        }
        else
        {
            reduce_by_key_config.template Init<typename Policy110::ReduceByKeyPolicyT>();
        }

    #endif
    }


    /**
     * Kernel dispatch configuration
     */
    struct KernelConfig
    {
        int block_threads;
        int items_per_thread;
        int tile_items;

        template <typename PolicyT>
        CUB_RUNTIME_FUNCTION __forceinline__
        void Init()
        {
            block_threads       = PolicyT::BLOCK_THREADS;
            items_per_thread    = PolicyT::ITEMS_PER_THREAD;
            tile_items          = block_threads * items_per_thread;
        }
    };


    //---------------------------------------------------------------------
    // Dispatch entrypoints
    //---------------------------------------------------------------------

    /**
     * Internal dispatch routine for computing a device-wide reduce-by-key using the
     * specified kernel functions.
     */
    template <
        typename                    ScanInitKernelT,            ///< Function type of cub::DeviceScanInitKernel
        typename                    ReduceByKeyKernelT>         ///< Function type of cub::DeviceReduceByKeyKernel
    CUB_RUNTIME_FUNCTION __forceinline__
    static cudaError_t Dispatch(
        void*                       d_temp_storage,             ///< [in] Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to temp_storage_bytes and no work is done.
        size_t&                     temp_storage_bytes,         ///< [in,out] Reference to size in bytes of d_temp_storage allocation
        KeysInputIteratorT          d_keys_in,                  ///< [in] Pointer to the input sequence of keys
        UniqueOutputIteratorT       d_unique_out,               ///< [out] Pointer to the output sequence of unique keys (one key per run)
        ValuesInputIteratorT        d_values_in,                ///< [in] Pointer to the input sequence of corresponding values
        AggregatesOutputIteratorT   d_aggregates_out,           ///< [out] Pointer to the output sequence of value aggregates (one aggregate per run)
        NumRunsOutputIteratorT      d_num_runs_out,             ///< [out] Pointer to total number of runs encountered (i.e., the length of d_unique_out)
        EqualityOpT                 equality_op,                ///< [in] KeyT equality operator
        ReductionOpT                reduction_op,               ///< [in] ValueT reduction operator
        OffsetT                     num_items,                  ///< [in] Total number of input data items
        cudaStream_t                stream,                     ///< [in] CUDA stream to launch kernels within
        bool                        debug_synchronous,          ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors
        int                         /*ptx_version*/,            ///< [in] PTX version of dispatch kernels
        ScanInitKernelT             init_kernel,                ///< [in] Kernel function pointer to parameterization of cub::DeviceScanInitKernel
        ReduceByKeyKernelT          reduce_by_key_kernel,       ///< [in] Kernel function pointer to parameterization of cub::DeviceReduceByKeyKernel
        KernelConfig                reduce_by_key_config)       ///< [in] Dispatch parameters that match the policy that reduce_by_key_kernel was compiled for
    {

#ifndef CUB_RUNTIME_ENABLED
        (void)d_temp_storage;
        (void)temp_storage_bytes;
        (void)d_keys_in;
        (void)d_unique_out;
        (void)d_values_in;
        (void)d_aggregates_out;
        (void)d_num_runs_out;
        (void)equality_op;
        (void)reduction_op;
        (void)num_items;
        (void)stream;
        (void)debug_synchronous;
        (void)init_kernel;
        (void)reduce_by_key_kernel;
        (void)reduce_by_key_config;

        // Kernel launch not supported from this device
        return CubDebug(cudaErrorNotSupported);

#else

        cudaError error = cudaSuccess;
        do
        {
            // Get device ordinal
            int device_ordinal;
            if (CubDebug(error = cudaGetDevice(&device_ordinal))) break;

            // Get SM count
            int sm_count;
            if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break;

            // Number of input tiles
            int tile_size = reduce_by_key_config.block_threads * reduce_by_key_config.items_per_thread;
            int num_tiles = (num_items + tile_size - 1) / tile_size;

            // Specify temporary storage allocation requirements
            size_t allocation_sizes[1];
            if (CubDebug(error = ScanTileStateT::AllocationSize(num_tiles, allocation_sizes[0]))) break;    // bytes needed for tile status descriptors

            // Compute allocation pointers into the single storage blob (or compute the necessary size of the blob)
            void* allocations[1];
            if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break;
            if (d_temp_storage == NULL)
            {
                // Return if the caller is simply requesting the size of the storage allocation
                break;
            }
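
            // This NULL probe is the first half of CUB's two-phase protocol:
            // callers invoke Dispatch once with d_temp_storage == NULL to
            // obtain temp_storage_bytes, allocate that much device storage,
            // and then invoke Dispatch again to perform the reduction.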

            // Construct the tile status interface
            ScanTileStateT tile_state;
            if (CubDebug(error = tile_state.Init(num_tiles, allocations[0], allocation_sizes[0]))) break;

            // Log init_kernel configuration
            int init_grid_size = CUB_MAX(1, (num_tiles + INIT_KERNEL_THREADS - 1) / INIT_KERNEL_THREADS);
            if (debug_synchronous) _CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream);

            // Invoke init_kernel to initialize tile descriptors
            init_kernel<<<init_grid_size, INIT_KERNEL_THREADS, 0, stream>>>(
                tile_state,
                num_tiles,
                d_num_runs_out);

            // Check for failure to launch
            if (CubDebug(error = cudaPeekAtLastError())) break;

            // Sync the stream if specified to flush runtime errors
            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;

            // Return if empty problem
            if (num_items == 0)
                break;

            // Get SM occupancy for reduce_by_key_kernel
            int reduce_by_key_sm_occupancy;
            if (CubDebug(error = MaxSmOccupancy(
                reduce_by_key_sm_occupancy,            // out
                reduce_by_key_kernel,
                reduce_by_key_config.block_threads))) break;

            // Get max x-dimension of grid
            int max_dim_x;
            if (CubDebug(error = cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal))) break;

            // Run grids in epochs (in case number of tiles exceeds max x-dimension)
            int scan_grid_size = CUB_MIN(num_tiles, max_dim_x);
            for (int start_tile = 0; start_tile < num_tiles; start_tile += scan_grid_size)
            {
                // Log reduce_by_key_kernel configuration
                if (debug_synchronous) _CubLog("Invoking %d reduce_by_key_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
                    start_tile, scan_grid_size, reduce_by_key_config.block_threads, (long long) stream, reduce_by_key_config.items_per_thread, reduce_by_key_sm_occupancy);

                // Invoke reduce_by_key_kernel
                reduce_by_key_kernel<<<scan_grid_size, reduce_by_key_config.block_threads, 0, stream>>>(
                    d_keys_in,
                    d_unique_out,
                    d_values_in,
                    d_aggregates_out,
                    d_num_runs_out,
                    tile_state,
                    start_tile,
                    equality_op,
                    reduction_op,
                    num_items);

                // Check for failure to launch
                if (CubDebug(error = cudaPeekAtLastError())) break;

                // Sync the stream if specified to flush runtime errors
                if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
            }
        }
        while (0);

        return error;

#endif  // CUB_RUNTIME_ENABLED
    }


    /**
     * Internal dispatch routine
     */
    CUB_RUNTIME_FUNCTION __forceinline__
    static cudaError_t Dispatch(
        void*                       d_temp_storage,             ///< [in] Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to temp_storage_bytes and no work is done.
        size_t&                     temp_storage_bytes,         ///< [in,out] Reference to size in bytes of d_temp_storage allocation
        KeysInputIteratorT          d_keys_in,                  ///< [in] Pointer to the input sequence of keys
        UniqueOutputIteratorT       d_unique_out,               ///< [out] Pointer to the output sequence of unique keys (one key per run)
        ValuesInputIteratorT        d_values_in,                ///< [in] Pointer to the input sequence of corresponding values
        AggregatesOutputIteratorT   d_aggregates_out,           ///< [out] Pointer to the output sequence of value aggregates (one aggregate per run)
        NumRunsOutputIteratorT      d_num_runs_out,             ///< [out] Pointer to total number of runs encountered (i.e., the length of d_unique_out)
        EqualityOpT                 equality_op,                ///< [in] KeyT equality operator
        ReductionOpT                reduction_op,               ///< [in] ValueT reduction operator
        OffsetT                     num_items,                  ///< [in] Total number of input data items
        cudaStream_t                stream,                     ///< [in] CUDA stream to launch kernels within
        bool                        debug_synchronous)          ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors
    {
        cudaError error = cudaSuccess;
        do
        {
            // Get PTX version
            int ptx_version;
    #if (CUB_PTX_ARCH == 0)
            if (CubDebug(error = PtxVersion(ptx_version))) break;
    #else
            ptx_version = CUB_PTX_ARCH;
    #endif

            // Get kernel dispatch configurations
            KernelConfig reduce_by_key_config;
            InitConfigs(ptx_version, reduce_by_key_config);

            // Dispatch
            if (CubDebug(error = Dispatch(
                d_temp_storage,
                temp_storage_bytes,
                d_keys_in,
                d_unique_out,
                d_values_in,
                d_aggregates_out,
                d_num_runs_out,
                equality_op,
                reduction_op,
                num_items,
                stream,
                debug_synchronous,
                ptx_version,
                DeviceCompactInitKernel<ScanTileStateT, NumRunsOutputIteratorT>,
                DeviceReduceByKeyKernel<PtxReduceByKeyPolicy, KeysInputIteratorT, UniqueOutputIteratorT, ValuesInputIteratorT, AggregatesOutputIteratorT, NumRunsOutputIteratorT, ScanTileStateT, EqualityOpT, ReductionOpT, OffsetT>,
                reduce_by_key_config))) break;
        }
        while (0);

        return error;
    }
};

}               // CUB namespace
CUB_NS_POSTFIX  // Optional outer namespace(s)

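For context, the sketch below shows how this dispatch layer is normally reached through CUB's public cub::DeviceReduce::ReduceByKey entry point, which forwards to DispatchReduceByKey::Dispatch using the two-phase temporary-storage protocol seen above. The array contents and sizes are illustrative assumptions, not part of this file:

#include <cub/cub.cuh>

int main()
{
    // Illustrative input: 8 items forming 3 key runs. Expected output:
    // unique keys {0, 1, 2} with aggregates {3, 5, 13} and 3 runs.
    int num_items   = 8;
    int h_keys[]    = {0, 0, 1, 1, 2, 2, 2, 2};
    int h_values[]  = {1, 2, 2, 3, 4, 3, 3, 3};

    int *d_keys_in, *d_values_in, *d_unique_out, *d_aggregates_out, *d_num_runs_out;
    cudaMalloc(&d_keys_in,        num_items * sizeof(int));
    cudaMalloc(&d_values_in,      num_items * sizeof(int));
    cudaMalloc(&d_unique_out,     num_items * sizeof(int));
    cudaMalloc(&d_aggregates_out, num_items * sizeof(int));
    cudaMalloc(&d_num_runs_out,   sizeof(int));
    cudaMemcpy(d_keys_in,   h_keys,   num_items * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(d_values_in, h_values, num_items * sizeof(int), cudaMemcpyHostToDevice);

    // Phase 1: with d_temp_storage == NULL the call only computes the
    // required temporary-storage size (the early break seen in Dispatch).
    void   *d_temp_storage = NULL;
    size_t  temp_storage_bytes = 0;
    cub::DeviceReduce::ReduceByKey(d_temp_storage, temp_storage_bytes,
        d_keys_in, d_unique_out, d_values_in, d_aggregates_out, d_num_runs_out,
        cub::Sum(), num_items);

    // Phase 2: allocate and run the actual reduce-by-key.
    cudaMalloc(&d_temp_storage, temp_storage_bytes);
    cub::DeviceReduce::ReduceByKey(d_temp_storage, temp_storage_bytes,
        d_keys_in, d_unique_out, d_values_in, d_aggregates_out, d_num_runs_out,
        cub::Sum(), num_items);
    cudaDeviceSynchronize();
    return 0;
}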