OpenFPM_pdata  4.1.0
Project that contains the implementation of distributed structures
dispatch_histogram.cuh
1 
2 /******************************************************************************
3  * Copyright (c) 2011, Duane Merrill. All rights reserved.
4  * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions are met:
8  * * Redistributions of source code must retain the above copyright
9  * notice, this list of conditions and the following disclaimer.
10  * * Redistributions in binary form must reproduce the above copyright
11  * notice, this list of conditions and the following disclaimer in the
12  * documentation and/or other materials provided with the distribution.
13  * * Neither the name of the NVIDIA CORPORATION nor the
14  * names of its contributors may be used to endorse or promote products
15  * derived from this software without specific prior written permission.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
19  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
20  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
21  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
22  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
23  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
24  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
26  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27  *
28  ******************************************************************************/
29 
35 #pragma once
36 
37 #include <stdio.h>
38 #include <iterator>
39 #include <limits>
40 
41 #include "../../agent/agent_histogram.cuh"
42 #include "../../util_debug.cuh"
43 #include "../../util_device.cuh"
44 #include "../../thread/thread_search.cuh"
45 #include "../../grid/grid_queue.cuh"
46 #include "../../util_namespace.cuh"
47 
49 CUB_NS_PREFIX
50 
52 namespace cub {
53 
54 
55 
56 /******************************************************************************
57  * Histogram kernel entry points
58  *****************************************************************************/
59 
63 template <
64  int NUM_ACTIVE_CHANNELS,
65  typename CounterT,
66  typename OffsetT>
67 __global__ void DeviceHistogramInitKernel(
68  ArrayWrapper<int, NUM_ACTIVE_CHANNELS> num_output_bins_wrapper, ///< Number of output histogram bins per channel
69  ArrayWrapper<CounterT*, NUM_ACTIVE_CHANNELS> d_output_histograms_wrapper, ///< Reference to final output histograms
70  GridQueue<int> tile_queue) ///< Drain queue descriptor for dynamically mapping tile data onto thread blocks
71 {
72  if ((threadIdx.x == 0) && (blockIdx.x == 0))
73  tile_queue.ResetDrain();
74 
75  int output_bin = (blockIdx.x * blockDim.x) + threadIdx.x;
76 
77  #pragma unroll
78  for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
79  {
80  if (output_bin < num_output_bins_wrapper.array[CHANNEL])
81  d_output_histograms_wrapper.array[CHANNEL][output_bin] = 0;
82  }
83 }
84 
85 
89 template <
90  typename AgentHistogramPolicyT,
91  int PRIVATIZED_SMEM_BINS,
92  int NUM_CHANNELS,
93  int NUM_ACTIVE_CHANNELS,
94  typename SampleIteratorT,
95  typename CounterT,
96  typename PrivatizedDecodeOpT,
97  typename OutputDecodeOpT,
98  typename OffsetT>
99 __launch_bounds__ (int(AgentHistogramPolicyT::BLOCK_THREADS))
100 __global__ void DeviceHistogramSweepKernel(
101  SampleIteratorT d_samples,
102  ArrayWrapper<int, NUM_ACTIVE_CHANNELS> num_output_bins_wrapper, ///< The number of bins per final output histogram
103  ArrayWrapper<int, NUM_ACTIVE_CHANNELS> num_privatized_bins_wrapper, ///< The number of bins per privatized histogram
104  ArrayWrapper<CounterT*, NUM_ACTIVE_CHANNELS> d_output_histograms_wrapper, ///< Reference to final output histograms
105  ArrayWrapper<CounterT*, NUM_ACTIVE_CHANNELS> d_privatized_histograms_wrapper, ///< Reference to privatized histograms
106  ArrayWrapper<OutputDecodeOpT, NUM_ACTIVE_CHANNELS> output_decode_op_wrapper, ///< Transform operators for determining output bin-ids from privatized counter indices, one per channel
107  ArrayWrapper<PrivatizedDecodeOpT, NUM_ACTIVE_CHANNELS> privatized_decode_op_wrapper, ///< Transform operators for determining privatized counter indices from samples, one per channel
108  OffsetT num_row_pixels, ///< The number of multi-channel pixels per row in the region of interest
109  OffsetT num_rows, ///< The number of rows in the region of interest
110  OffsetT row_stride_samples, ///< The number of samples between starts of consecutive rows in the region of interest
111  int tiles_per_row, ///< Number of image tiles per row
112  GridQueue<int> tile_queue) ///< Drain queue descriptor for dynamically mapping tile data onto thread blocks
113 {
114  // Thread block type for compositing input tiles
115  typedef AgentHistogram<
116  AgentHistogramPolicyT,
117  PRIVATIZED_SMEM_BINS,
118  NUM_CHANNELS,
119  NUM_ACTIVE_CHANNELS,
120  SampleIteratorT,
121  CounterT,
122  PrivatizedDecodeOpT,
123  OutputDecodeOpT,
124  OffsetT>
125  AgentHistogramT;
126 
127  // Shared memory for AgentHistogram
128  __shared__ typename AgentHistogramT::TempStorage temp_storage;
129 
130  AgentHistogramT agent(
131  temp_storage,
132  d_samples,
133  num_output_bins_wrapper.array,
134  num_privatized_bins_wrapper.array,
135  d_output_histograms_wrapper.array,
136  d_privatized_histograms_wrapper.array,
137  output_decode_op_wrapper.array,
138  privatized_decode_op_wrapper.array);
139 
140  // Initialize counters
141  agent.InitBinCounters();
142 
143  // Consume input tiles
144  agent.ConsumeTiles(
145  num_row_pixels,
146  num_rows,
147  row_stride_samples,
148  tiles_per_row,
149  tile_queue);
150 
151  // Store output to global (if necessary)
152  agent.StoreOutput();
153 
154 }
155 
156 
157 
158 
159 
160 
161 /******************************************************************************
162  * Dispatch
163  ******************************************************************************/
164 
168 template <
169  int NUM_CHANNELS,
170  int NUM_ACTIVE_CHANNELS,
171  typename SampleIteratorT,
172  typename CounterT,
173  typename LevelT,
174  typename OffsetT>
175 struct DipatchHistogram
176 {
177  //---------------------------------------------------------------------
178  // Types and constants
179  //---------------------------------------------------------------------
180 
182  typedef typename std::iterator_traits<SampleIteratorT>::value_type SampleT;
183 
184  enum
185  {
186  // Maximum number of bins per channel for which we will use a privatized smem strategy
187  MAX_PRIVATIZED_SMEM_BINS = 256
188  };
189 
190 
191  //---------------------------------------------------------------------
192  // Transform functors for converting samples to bin-ids
193  //---------------------------------------------------------------------
194 
195  // Searches for bin given a list of bin-boundary levels
196  template <typename LevelIteratorT>
197  struct SearchTransform
198  {
199  LevelIteratorT d_levels; // Pointer to levels array
200  int num_output_levels; // Number of levels in array
201 
202  // Initializer
203  __host__ __device__ __forceinline__ void Init(
204  LevelIteratorT d_levels, // Pointer to levels array
205  int num_output_levels) // Number of levels in array
206  {
207  this->d_levels = d_levels;
208  this->num_output_levels = num_output_levels;
209  }
210 
211  // Method for converting samples to bin-ids
212  template <CacheLoadModifier LOAD_MODIFIER, typename _SampleT>
213  __host__ __device__ __forceinline__ void BinSelect(_SampleT sample, int &bin, bool valid)
214  {
216  typedef typename If<IsPointer<LevelIteratorT>::VALUE,
217  CacheModifiedInputIterator<LOAD_MODIFIER, LevelT, OffsetT>, // Wrap the native input pointer with CacheModifiedInputIterator
218  LevelIteratorT>::Type // Directly use the supplied input iterator type
219  WrappedLevelIteratorT;
220 
221  WrappedLevelIteratorT wrapped_levels(d_levels);
222 
223  int num_bins = num_output_levels - 1;
224  if (valid)
225  {
226  bin = UpperBound(wrapped_levels, num_output_levels, (LevelT) sample) - 1;
227  if (bin >= num_bins)
228  bin = -1;
229  }
230  }
231  };
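BinSelect above is an upper-bound binary search over the bin-boundary levels: a sample lands in the bin whose half-open interval [d_levels[i], d_levels[i+1]) contains it, and samples at or beyond the last level are left unbinned (bin stays -1). A minimal host-side sketch of the same rule, with illustrative values that are not taken from this file:

#include <algorithm>
#include <cstdio>

// Return the bin containing 'sample', or -1 if it falls outside [levels[0], levels[num_levels-1])
int bin_of(const float* levels, int num_levels, float sample)
{
    int num_bins = num_levels - 1;
    // Index of the first level strictly greater than the sample, minus one
    int bin = int(std::upper_bound(levels, levels + num_levels, sample) - levels) - 1;
    return (bin < 0 || bin >= num_bins) ? -1 : bin;
}

int main()
{
    const float levels[] = {0.0f, 1.0f, 2.5f, 10.0f};   // 4 levels -> 3 bins
    std::printf("%d %d %d\n",
        bin_of(levels, 4, 0.5f),                        // 0
        bin_of(levels, 4, 2.5f),                        // 2
        bin_of(levels, 4, 10.0f));                      // -1 (at/above the last level)
    return 0;
}

On the device, a raw levels pointer is additionally wrapped in CacheModifiedInputIterator so the search reads the levels through the requested cache-load modifier.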
232 
233 
234  // Scales samples to evenly-spaced bins
235  struct ScaleTransform
236  {
237  int num_bins; // Number of levels in array
238  LevelT max; // Max sample level (exclusive)
239  LevelT min; // Min sample level (inclusive)
240  LevelT scale; // Bin scaling factor
241 
242  // Initializer
243  template <typename _LevelT>
244  __host__ __device__ __forceinline__ void Init(
245  int num_output_levels, // Number of levels in array
246  _LevelT max, // Max sample level (exclusive)
247  _LevelT min, // Min sample level (inclusive)
248  _LevelT scale) // Bin scaling factor
249  {
250  this->num_bins = num_output_levels - 1;
251  this->max = max;
252  this->min = min;
253  this->scale = scale;
254  }
255 
256  // Initializer (float specialization)
257  __host__ __device__ __forceinline__ void Init(
258  int num_output_levels, // Number of levels in array
259  float max, // Max sample level (exclusive)
260  float min, // Min sample level (inclusive)
261  float scale) // Bin scaling factor
262  {
263  this->num_bins = num_output_levels - 1;
264  this->max = max;
265  this->min = min;
266  this->scale = float(1.0) / scale;
267  }
268 
269  // Initializer (double specialization)
270  __host__ __device__ __forceinline__ void Init(
271  int num_output_levels, // Number of levels in array
272  double max, // Max sample level (exclusive)
273  double min, // Min sample level (inclusive)
274  double scale) // Bin scaling factor
275  {
276  this->num_bins = num_output_levels - 1;
277  this->max = max;
278  this->min = min;
279  this->scale = double(1.0) / scale;
280  }
281 
282  // Method for converting samples to bin-ids
283  template <CacheLoadModifier LOAD_MODIFIER, typename _SampleT>
284  __host__ __device__ __forceinline__ void BinSelect(_SampleT sample, int &bin, bool valid)
285  {
286  LevelT level_sample = (LevelT) sample;
287 
288  if (valid && (level_sample >= min) && (level_sample < max))
289  bin = (int) ((level_sample - min) / scale);
290  }
291 
292  // Method for converting samples to bin-ids (float specialization)
293  template <CacheLoadModifier LOAD_MODIFIER>
294  __host__ __device__ __forceinline__ void BinSelect(float sample, int &bin, bool valid)
295  {
296  LevelT level_sample = (LevelT) sample;
297 
298  if (valid && (level_sample >= min) && (level_sample < max))
299  bin = (int) ((level_sample - min) * scale);
300  }
301 
302  // Method for converting samples to bin-ids (double specialization)
303  template <CacheLoadModifier LOAD_MODIFIER>
304  __host__ __device__ __forceinline__ void BinSelect(double sample, int &bin, bool valid)
305  {
306  LevelT level_sample = (LevelT) sample;
307 
308  if (valid && (level_sample >= min) && (level_sample < max))
309  bin = (int) ((level_sample - min) * scale);
310  }
311  };
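ScaleTransform is the evenly-spaced counterpart: with num_bins = num_output_levels - 1 and scale = (max - min) / num_bins, a sample in [min, max) maps to bin (sample - min) / scale. The float and double Init() overloads store the reciprocal of the bin width so the per-sample division becomes a multiply. A small host-side sketch with hypothetical values:

#include <cstdio>

int main()
{
    // 4 levels spanning [0, 8) -> 3 even bins of width (8 - 0) / 3
    const float lower = 0.0f, upper = 8.0f;
    const int   num_levels = 4, num_bins = num_levels - 1;
    const float scale     = (upper - lower) / num_bins;   // bin width
    const float inv_scale = 1.0f / scale;                 // what the float Init() stores

    const float samples[] = {0.0f, 2.9f, 7.9f, 8.0f};
    for (float s : samples)
    {
        int bin = -1;                                      // out-of-range samples keep bin == -1
        if ((s >= lower) && (s < upper))
            bin = (int) ((s - lower) * inv_scale);         // multiply instead of divide
        std::printf("sample %.1f -> bin %d\n", s, bin);    // bins 0, 1, 2, then -1
    }
    return 0;
}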
312 
313 
314  // Pass-through bin transform operator
315  struct PassThruTransform
316  {
317  // Method for converting samples to bin-ids
318  template <CacheLoadModifier LOAD_MODIFIER, typename _SampleT>
319  __host__ __device__ __forceinline__ void BinSelect(_SampleT sample, int &bin, bool valid)
320  {
321  if (valid)
322  bin = (int) sample;
323  }
324  };
325 
326 
327 
328  //---------------------------------------------------------------------
329  // Tuning policies
330  //---------------------------------------------------------------------
331 
332  template <int NOMINAL_ITEMS_PER_THREAD>
333  struct TScale
334  {
335  enum
336  {
337  V_SCALE = (sizeof(SampleT) + sizeof(int) - 1) / sizeof(int),
338  VALUE = CUB_MAX((NOMINAL_ITEMS_PER_THREAD / NUM_ACTIVE_CHANNELS / V_SCALE), 1)
339  };
340  };
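For example (taking sizeof(int) == 4 for illustration): with unsigned char samples and three active channels, V_SCALE = (1 + 4 - 1) / 4 = 1, so a nominal 8 items per thread becomes VALUE = max(8 / 3 / 1, 1) = 2; with double samples on a single channel, V_SCALE = (8 + 4 - 1) / 4 = 2 and a nominal 16 becomes max(16 / 1 / 2, 1) = 8. The scaling keeps the per-thread working set roughly constant in bytes as the sample width and channel count grow.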
341 
342 
344  struct Policy110
345  {
346  // HistogramSweepPolicy
347  typedef AgentHistogramPolicy<
348  512,
349  (NUM_CHANNELS == 1) ? 8 : 2,
350  BLOCK_LOAD_DIRECT,
351  LOAD_DEFAULT,
352  true,
353  GMEM,
354  false>
355  HistogramSweepPolicy;
356  };
357 
359  struct Policy200
360  {
361  // HistogramSweepPolicy
362  typedef AgentHistogramPolicy<
363  (NUM_CHANNELS == 1) ? 256 : 128,
364  (NUM_CHANNELS == 1) ? 8 : 3,
365  (NUM_CHANNELS == 1) ? BLOCK_LOAD_DIRECT : BLOCK_LOAD_WARP_TRANSPOSE,
366  LOAD_DEFAULT,
367  true,
368  SMEM,
369  false>
370  HistogramSweepPolicy;
371  };
372 
374  struct Policy300
375  {
376  // HistogramSweepPolicy
377  typedef AgentHistogramPolicy<
378  512,
379  (NUM_CHANNELS == 1) ? 8 : 2,
380  BLOCK_LOAD_DIRECT,
381  LOAD_DEFAULT,
382  true,
383  GMEM,
384  false>
385  HistogramSweepPolicy;
386  };
387 
389  struct Policy350
390  {
391  // HistogramSweepPolicy
392  typedef AgentHistogramPolicy<
393  128,
394  TScale<8>::VALUE,
395  BLOCK_LOAD_DIRECT,
396  LOAD_LDG,
397  true,
398  BLEND,
399  true>
400  HistogramSweepPolicy;
401  };
402 
404  struct Policy500
405  {
406  // HistogramSweepPolicy
407  typedef AgentHistogramPolicy<
408  384,
409  TScale<16>::VALUE,
410  BLOCK_LOAD_DIRECT,
411  LOAD_LDG,
412  true,
413  SMEM,
414  false>
415  HistogramSweepPolicy;
416  };
417 
418 
419 
420  //---------------------------------------------------------------------
421  // Tuning policies of current PTX compiler pass
422  //---------------------------------------------------------------------
423 
424 #if (CUB_PTX_ARCH >= 500)
425  typedef Policy500 PtxPolicy;
426 
427 #elif (CUB_PTX_ARCH >= 350)
428  typedef Policy350 PtxPolicy;
429 
430 #elif (CUB_PTX_ARCH >= 300)
431  typedef Policy300 PtxPolicy;
432 
433 #elif (CUB_PTX_ARCH >= 200)
434  typedef Policy200 PtxPolicy;
435 
436 #else
437  typedef Policy110 PtxPolicy;
438 
439 #endif
440 
441  // "Opaque" policies (whose parameterizations aren't reflected in the type signature)
442  struct PtxHistogramSweepPolicy : PtxPolicy::HistogramSweepPolicy {};
443 
444 
445  //---------------------------------------------------------------------
446  // Utilities
447  //---------------------------------------------------------------------
448 
452  template <typename KernelConfig>
453  CUB_RUNTIME_FUNCTION __forceinline__
454  static cudaError_t InitConfigs(
455  int ptx_version,
456  KernelConfig &histogram_sweep_config)
457  {
458  #if (CUB_PTX_ARCH > 0)
459 
460  // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy
461  return histogram_sweep_config.template Init<PtxHistogramSweepPolicy>();
462 
463  #else
464 
465  // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version
466  if (ptx_version >= 500)
467  {
468  return histogram_sweep_config.template Init<typename Policy500::HistogramSweepPolicy>();
469  }
470  else if (ptx_version >= 350)
471  {
472  return histogram_sweep_config.template Init<typename Policy350::HistogramSweepPolicy>();
473  }
474  else if (ptx_version >= 300)
475  {
476  return histogram_sweep_config.template Init<typename Policy300::HistogramSweepPolicy>();
477  }
478  else if (ptx_version >= 200)
479  {
480  return histogram_sweep_config.template Init<typename Policy200::HistogramSweepPolicy>();
481  }
482  else if (ptx_version >= 110)
483  {
484  return histogram_sweep_config.template Init<typename Policy110::HistogramSweepPolicy>();
485  }
486  else
487  {
488  // No global atomic support
489  return cudaErrorNotSupported;
490  }
491 
492  #endif
493  }
494 
495 
499  struct KernelConfig
500  {
501  int block_threads;
502  int pixels_per_thread;
503 
504  template <typename BlockPolicy>
505  CUB_RUNTIME_FUNCTION __forceinline__
506  cudaError_t Init()
507  {
508  block_threads = BlockPolicy::BLOCK_THREADS;
509  pixels_per_thread = BlockPolicy::PIXELS_PER_THREAD;
510 
511  return cudaSuccess;
512  }
513  };
514 
515 
516  //---------------------------------------------------------------------
517  // Dispatch entrypoints
518  //---------------------------------------------------------------------
519 
523  template <
524  typename PrivatizedDecodeOpT,
525  typename OutputDecodeOpT,
526  typename DeviceHistogramInitKernelT,
527  typename DeviceHistogramSweepKernelT>
528  CUB_RUNTIME_FUNCTION __forceinline__
529  static cudaError_t PrivatizedDispatch(
530  void* d_temp_storage,
531  size_t& temp_storage_bytes,
532  SampleIteratorT d_samples,
533  CounterT* d_output_histograms[NUM_ACTIVE_CHANNELS],
534  int num_privatized_levels[NUM_ACTIVE_CHANNELS],
535  PrivatizedDecodeOpT privatized_decode_op[NUM_ACTIVE_CHANNELS],
536  int num_output_levels[NUM_ACTIVE_CHANNELS],
537  OutputDecodeOpT output_decode_op[NUM_ACTIVE_CHANNELS],
538  int max_num_output_bins,
539  OffsetT num_row_pixels,
540  OffsetT num_rows,
541  OffsetT row_stride_samples,
542  DeviceHistogramInitKernelT histogram_init_kernel,
543  DeviceHistogramSweepKernelT histogram_sweep_kernel,
544  KernelConfig histogram_sweep_config,
545  cudaStream_t stream,
546  bool debug_synchronous)
547  {
548  #ifndef CUB_RUNTIME_ENABLED
549 
550  // Kernel launch not supported from this device
551  return CubDebug(cudaErrorNotSupported);
552 
553  #else
554 
555  cudaError error = cudaSuccess;
556  do
557  {
558  // Get device ordinal
559  int device_ordinal;
560  if (CubDebug(error = cudaGetDevice(&device_ordinal))) break;
561 
562  // Get SM count
563  int sm_count;
564  if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break;
565 
566  // Get SM occupancy for histogram_sweep_kernel
567  int histogram_sweep_sm_occupancy;
568  if (CubDebug(error = MaxSmOccupancy(
569  histogram_sweep_sm_occupancy,
570  histogram_sweep_kernel,
571  histogram_sweep_config.block_threads))) break;
572 
573  // Get device occupancy for histogram_sweep_kernel
574  int histogram_sweep_occupancy = histogram_sweep_sm_occupancy * sm_count;
575 
576  if (num_row_pixels * NUM_CHANNELS == row_stride_samples)
577  {
578  // Treat as a single linear array of samples
579  num_row_pixels *= num_rows;
580  num_rows = 1;
581  row_stride_samples = num_row_pixels * NUM_CHANNELS;
582  }
583 
584  // Get grid dimensions, trying to keep total blocks ~histogram_sweep_occupancy
585  int pixels_per_tile = histogram_sweep_config.block_threads * histogram_sweep_config.pixels_per_thread;
586  int tiles_per_row = int(num_row_pixels + pixels_per_tile - 1) / pixels_per_tile;
587  int blocks_per_row = CUB_MIN(histogram_sweep_occupancy, tiles_per_row);
588  int blocks_per_col = (blocks_per_row > 0) ?
589  int(CUB_MIN(histogram_sweep_occupancy / blocks_per_row, num_rows)) :
590  0;
591  int num_thread_blocks = blocks_per_row * blocks_per_col;
592 
593  dim3 sweep_grid_dims;
594  sweep_grid_dims.x = (unsigned int) blocks_per_row;
595  sweep_grid_dims.y = (unsigned int) blocks_per_col;
596  sweep_grid_dims.z = 1;
597 
598  // Temporary storage allocation requirements
599  const int NUM_ALLOCATIONS = NUM_ACTIVE_CHANNELS + 1;
600  void* allocations[NUM_ALLOCATIONS];
601  size_t allocation_sizes[NUM_ALLOCATIONS];
602 
603  for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
604  allocation_sizes[CHANNEL] = size_t(num_thread_blocks) * (num_privatized_levels[CHANNEL] - 1) * sizeof(CounterT);
605 
606  allocation_sizes[NUM_ALLOCATIONS - 1] = GridQueue<int>::AllocationSize();
607 
608  // Alias the temporary allocations from the single storage blob (or compute the necessary size of the blob)
609  if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break;
610  if (d_temp_storage == NULL)
611  {
612  // Return if the caller is simply requesting the size of the storage allocation
613  break;
614  }
615 
616  // Construct the grid queue descriptor
617  GridQueue<int> tile_queue(allocations[NUM_ALLOCATIONS - 1]);
618 
619  // Setup array wrapper for histogram channel output (because we can't pass static arrays as kernel parameters)
620  ArrayWrapper<CounterT*, NUM_ACTIVE_CHANNELS> d_output_histograms_wrapper;
621  for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
622  d_output_histograms_wrapper.array[CHANNEL] = d_output_histograms[CHANNEL];
623 
624  // Setup array wrapper for privatized per-block histogram channel output (because we can't pass static arrays as kernel parameters)
625  ArrayWrapper<CounterT*, NUM_ACTIVE_CHANNELS> d_privatized_histograms_wrapper;
626  for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
627  d_privatized_histograms_wrapper.array[CHANNEL] = (CounterT*) allocations[CHANNEL];
628 
629  // Setup array wrapper for sweep bin transforms (because we can't pass static arrays as kernel parameters)
630  ArrayWrapper<PrivatizedDecodeOpT, NUM_ACTIVE_CHANNELS> privatized_decode_op_wrapper;
631  for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
632  privatized_decode_op_wrapper.array[CHANNEL] = privatized_decode_op[CHANNEL];
633 
634  // Setup array wrapper for aggregation bin transforms (because we can't pass static arrays as kernel parameters)
635  ArrayWrapper<OutputDecodeOpT, NUM_ACTIVE_CHANNELS> output_decode_op_wrapper;
636  for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
637  output_decode_op_wrapper.array[CHANNEL] = output_decode_op[CHANNEL];
638 
639  // Setup array wrapper for num privatized bins (because we can't pass static arrays as kernel parameters)
640  ArrayWrapper<int, NUM_ACTIVE_CHANNELS> num_privatized_bins_wrapper;
641  for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
642  num_privatized_bins_wrapper.array[CHANNEL] = num_privatized_levels[CHANNEL] - 1;
643 
644  // Setup array wrapper for num output bins (because we can't pass static arrays as kernel parameters)
645  ArrayWrapper<int, NUM_ACTIVE_CHANNELS> num_output_bins_wrapper;
646  for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
647  num_output_bins_wrapper.array[CHANNEL] = num_output_levels[CHANNEL] - 1;
648 
649  int histogram_init_block_threads = 256;
650  int histogram_init_grid_dims = (max_num_output_bins + histogram_init_block_threads - 1) / histogram_init_block_threads;
651 
652  // Log DeviceHistogramInitKernel configuration
653  if (debug_synchronous) _CubLog("Invoking DeviceHistogramInitKernel<<<%d, %d, 0, %lld>>>()\n",
654  histogram_init_grid_dims, histogram_init_block_threads, (long long) stream);
655 
656  // Invoke histogram_init_kernel
657  histogram_init_kernel<<<histogram_init_grid_dims, histogram_init_block_threads, 0, stream>>>(
658  num_output_bins_wrapper,
659  d_output_histograms_wrapper,
660  tile_queue);
661 
662  // Return if empty problem
663  if ((blocks_per_row == 0) || (blocks_per_col == 0))
664  break;
665 
666  // Log histogram_sweep_kernel configuration
667  if (debug_synchronous) _CubLog("Invoking histogram_sweep_kernel<<<{%d, %d, %d}, %d, 0, %lld>>>(), %d pixels per thread, %d SM occupancy\n",
668  sweep_grid_dims.x, sweep_grid_dims.y, sweep_grid_dims.z,
669  histogram_sweep_config.block_threads, (long long) stream, histogram_sweep_config.pixels_per_thread, histogram_sweep_sm_occupancy);
670 
671  // Invoke histogram_sweep_kernel
672  histogram_sweep_kernel<<<sweep_grid_dims, histogram_sweep_config.block_threads, 0, stream>>>(
673  d_samples,
674  num_output_bins_wrapper,
675  num_privatized_bins_wrapper,
676  d_output_histograms_wrapper,
677  d_privatized_histograms_wrapper,
678  output_decode_op_wrapper,
679  privatized_decode_op_wrapper,
680  num_row_pixels,
681  num_rows,
682  row_stride_samples,
683  tiles_per_row,
684  tile_queue);
685 
686  // Check for failure to launch
687  if (CubDebug(error = cudaPeekAtLastError())) break;
688 
689  // Sync the stream if specified to flush runtime errors
690  if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
691 
692  }
693  while (0);
694 
695  return error;
696 
697  #endif // CUB_RUNTIME_ENABLED
698  }
699 
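The grid sizing in PrivatizedDispatch caps the sweep launch at roughly one occupancy's worth of thread blocks: blocks_per_row is the smaller of the device occupancy and the number of tiles needed to cover a row, and blocks_per_col spreads the remaining occupancy over the rows. A host-side sketch of the same arithmetic with made-up numbers (illustrative only, no real device query):

#include <algorithm>
#include <cstdio>

int main()
{
    // Hypothetical inputs
    int sweep_occupancy       = 16 * 8;     // SM count * resident blocks per SM
    int block_threads         = 384;
    int pixels_per_thread     = 16;
    long long num_row_pixels  = 100000;
    long long num_rows        = 4;

    int pixels_per_tile = block_threads * pixels_per_thread;                              // 6144
    int tiles_per_row   = int((num_row_pixels + pixels_per_tile - 1) / pixels_per_tile);  // 17
    int blocks_per_row  = std::min(sweep_occupancy, tiles_per_row);                       // 17
    int blocks_per_col  = (blocks_per_row > 0)
        ? int(std::min<long long>(sweep_occupancy / blocks_per_row, num_rows))            // 4
        : 0;

    std::printf("sweep grid = {%d, %d, 1}, %d thread blocks\n",
        blocks_per_row, blocks_per_col, blocks_per_row * blocks_per_col);
    return 0;
}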
700 
701 
705  CUB_RUNTIME_FUNCTION
706  static cudaError_t DispatchRange(
707  void* d_temp_storage,
708  size_t& temp_storage_bytes,
709  SampleIteratorT d_samples,
710  CounterT* d_output_histograms[NUM_ACTIVE_CHANNELS],
711  int num_output_levels[NUM_ACTIVE_CHANNELS],
712  LevelT *d_levels[NUM_ACTIVE_CHANNELS],
713  OffsetT num_row_pixels,
714  OffsetT num_rows,
715  OffsetT row_stride_samples,
716  cudaStream_t stream,
717  bool debug_synchronous,
718  Int2Type<false> is_byte_sample)
719  {
720  cudaError error = cudaSuccess;
721  do
722  {
723  // Get PTX version
724  int ptx_version;
725  #if (CUB_PTX_ARCH == 0)
726  if (CubDebug(error = PtxVersion(ptx_version))) break;
727  #else
728  ptx_version = CUB_PTX_ARCH;
729  #endif
730 
731  // Get kernel dispatch configurations
732  KernelConfig histogram_sweep_config;
733  if (CubDebug(error = InitConfigs(ptx_version, histogram_sweep_config)))
734  break;
735 
736  // Use the search transform op for converting samples to privatized bins
737  typedef SearchTransform<LevelT*> PrivatizedDecodeOpT;
738 
739  // Use the pass-thru transform op for converting privatized bins to output bins
740  typedef PassThruTransform OutputDecodeOpT;
741 
742  PrivatizedDecodeOpT privatized_decode_op[NUM_ACTIVE_CHANNELS];
743  OutputDecodeOpT output_decode_op[NUM_ACTIVE_CHANNELS];
744  int max_levels = num_output_levels[0];
745 
746  for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
747  {
748  privatized_decode_op[channel].Init(d_levels[channel], num_output_levels[channel]);
749  if (num_output_levels[channel] > max_levels)
750  max_levels = num_output_levels[channel];
751  }
752  int max_num_output_bins = max_levels - 1;
753 
754  // Dispatch
755  if (max_num_output_bins > MAX_PRIVATIZED_SMEM_BINS)
756  {
757  // Too many bins to keep in shared memory.
758  const int PRIVATIZED_SMEM_BINS = 0;
759 
760  if (CubDebug(error = PrivatizedDispatch(
761  d_temp_storage,
762  temp_storage_bytes,
763  d_samples,
764  d_output_histograms,
765  num_output_levels,
766  privatized_decode_op,
767  num_output_levels,
768  output_decode_op,
769  max_num_output_bins,
770  num_row_pixels,
771  num_rows,
772  row_stride_samples,
773  DeviceHistogramInitKernel<NUM_ACTIVE_CHANNELS, CounterT, OffsetT>,
774  DeviceHistogramSweepKernel<PtxHistogramSweepPolicy, PRIVATIZED_SMEM_BINS, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, PrivatizedDecodeOpT, OutputDecodeOpT, OffsetT>,
775  histogram_sweep_config,
776  stream,
777  debug_synchronous))) break;
778  }
779  else
780  {
781  // Dispatch shared-privatized approach
782  const int PRIVATIZED_SMEM_BINS = MAX_PRIVATIZED_SMEM_BINS;
783 
784  if (CubDebug(error = PrivatizedDispatch(
785  d_temp_storage,
786  temp_storage_bytes,
787  d_samples,
788  d_output_histograms,
789  num_output_levels,
790  privatized_decode_op,
791  num_output_levels,
792  output_decode_op,
793  max_num_output_bins,
794  num_row_pixels,
795  num_rows,
796  row_stride_samples,
797  DeviceHistogramInitKernel<NUM_ACTIVE_CHANNELS, CounterT, OffsetT>,
798  DeviceHistogramSweepKernel<PtxHistogramSweepPolicy, PRIVATIZED_SMEM_BINS, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, PrivatizedDecodeOpT, OutputDecodeOpT, OffsetT>,
799  histogram_sweep_config,
800  stream,
801  debug_synchronous))) break;
802  }
803 
804  } while (0);
805 
806  return error;
807  }
808 
809 
813  CUB_RUNTIME_FUNCTION
814  static cudaError_t DispatchRange(
815  void* d_temp_storage,
816  size_t& temp_storage_bytes,
817  SampleIteratorT d_samples,
818  CounterT* d_output_histograms[NUM_ACTIVE_CHANNELS],
819  int num_output_levels[NUM_ACTIVE_CHANNELS],
820  LevelT *d_levels[NUM_ACTIVE_CHANNELS],
821  OffsetT num_row_pixels,
822  OffsetT num_rows,
823  OffsetT row_stride_samples,
824  cudaStream_t stream,
825  bool debug_synchronous,
826  Int2Type<true> is_byte_sample)
827  {
828  cudaError error = cudaSuccess;
829  do
830  {
831  // Get PTX version
832  int ptx_version;
833  #if (CUB_PTX_ARCH == 0)
834  if (CubDebug(error = PtxVersion(ptx_version))) break;
835  #else
836  ptx_version = CUB_PTX_ARCH;
837  #endif
838 
839  // Get kernel dispatch configurations
840  KernelConfig histogram_sweep_config;
841  if (CubDebug(error = InitConfigs(ptx_version, histogram_sweep_config)))
842  break;
843 
844  // Use the pass-thru transform op for converting samples to privatized bins
845  typedef PassThruTransform PrivatizedDecodeOpT;
846 
847  // Use the search transform op for converting privatized bins to output bins
848  typedef SearchTransform<LevelT*> OutputDecodeOpT;
849 
850  int num_privatized_levels[NUM_ACTIVE_CHANNELS];
851  PrivatizedDecodeOpT privatized_decode_op[NUM_ACTIVE_CHANNELS];
852  OutputDecodeOpT output_decode_op[NUM_ACTIVE_CHANNELS];
853  int max_levels = num_output_levels[0]; // Maximum number of levels in any channel
854 
855  for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
856  {
857  num_privatized_levels[channel] = 257;
858  output_decode_op[channel].Init(d_levels[channel], num_output_levels[channel]);
859 
860  if (num_output_levels[channel] > max_levels)
861  max_levels = num_output_levels[channel];
862  }
863  int max_num_output_bins = max_levels - 1;
864 
865  const int PRIVATIZED_SMEM_BINS = 256;
866 
867  if (CubDebug(error = PrivatizedDispatch(
868  d_temp_storage,
869  temp_storage_bytes,
870  d_samples,
871  d_output_histograms,
872  num_privatized_levels,
873  privatized_decode_op,
874  num_output_levels,
875  output_decode_op,
876  max_num_output_bins,
877  num_row_pixels,
878  num_rows,
879  row_stride_samples,
880  DeviceHistogramInitKernel<NUM_ACTIVE_CHANNELS, CounterT, OffsetT>,
881  DeviceHistogramSweepKernel<PtxHistogramSweepPolicy, PRIVATIZED_SMEM_BINS, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, PrivatizedDecodeOpT, OutputDecodeOpT, OffsetT>,
882  histogram_sweep_config,
883  stream,
884  debug_synchronous))) break;
885 
886  } while (0);
887 
888  return error;
889  }
890 
891 
895  CUB_RUNTIME_FUNCTION __forceinline__
896  static cudaError_t DispatchEven(
897  void* d_temp_storage,
898  size_t& temp_storage_bytes,
899  SampleIteratorT d_samples,
900  CounterT* d_output_histograms[NUM_ACTIVE_CHANNELS],
901  int num_output_levels[NUM_ACTIVE_CHANNELS],
902  LevelT lower_level[NUM_ACTIVE_CHANNELS],
903  LevelT upper_level[NUM_ACTIVE_CHANNELS],
904  OffsetT num_row_pixels,
905  OffsetT num_rows,
906  OffsetT row_stride_samples,
907  cudaStream_t stream,
908  bool debug_synchronous,
909  Int2Type<false> is_byte_sample)
910  {
911  cudaError error = cudaSuccess;
912  do
913  {
914  // Get PTX version
915  int ptx_version;
916  #if (CUB_PTX_ARCH == 0)
917  if (CubDebug(error = PtxVersion(ptx_version))) break;
918  #else
919  ptx_version = CUB_PTX_ARCH;
920  #endif
921 
922  // Get kernel dispatch configurations
923  KernelConfig histogram_sweep_config;
924  if (CubDebug(error = InitConfigs(ptx_version, histogram_sweep_config)))
925  break;
926 
927  // Use the scale transform op for converting samples to privatized bins
928  typedef ScaleTransform PrivatizedDecodeOpT;
929 
930  // Use the pass-thru transform op for converting privatized bins to output bins
931  typedef PassThruTransform OutputDecodeOpT;
932 
933  PrivatizedDecodeOpT privatized_decode_op[NUM_ACTIVE_CHANNELS];
934  OutputDecodeOpT output_decode_op[NUM_ACTIVE_CHANNELS];
935  int max_levels = num_output_levels[0];
936 
937  for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
938  {
939  int bins = num_output_levels[channel] - 1;
940  LevelT scale = (upper_level[channel] - lower_level[channel]) / bins;
941 
942  privatized_decode_op[channel].Init(num_output_levels[channel], upper_level[channel], lower_level[channel], scale);
943 
944  if (num_output_levels[channel] > max_levels)
945  max_levels = num_output_levels[channel];
946  }
947  int max_num_output_bins = max_levels - 1;
948 
949  if (max_num_output_bins > MAX_PRIVATIZED_SMEM_BINS)
950  {
951  // Too many bins to keep in shared memory. Dispatch the global-privatized approach.
952  const int PRIVATIZED_SMEM_BINS = 0;
953 
954  if (CubDebug(error = PrivatizedDispatch(
955  d_temp_storage,
956  temp_storage_bytes,
957  d_samples,
958  d_output_histograms,
959  num_output_levels,
960  privatized_decode_op,
961  num_output_levels,
962  output_decode_op,
963  max_num_output_bins,
964  num_row_pixels,
965  num_rows,
966  row_stride_samples,
967  DeviceHistogramInitKernel<NUM_ACTIVE_CHANNELS, CounterT, OffsetT>,
968  DeviceHistogramSweepKernel<PtxHistogramSweepPolicy, PRIVATIZED_SMEM_BINS, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, PrivatizedDecodeOpT, OutputDecodeOpT, OffsetT>,
969  histogram_sweep_config,
970  stream,
971  debug_synchronous))) break;
972  }
973  else
974  {
975  // Dispatch shared-privatized approach
976  const int PRIVATIZED_SMEM_BINS = MAX_PRIVATIZED_SMEM_BINS;
977 
978  if (CubDebug(error = PrivatizedDispatch(
979  d_temp_storage,
980  temp_storage_bytes,
981  d_samples,
982  d_output_histograms,
983  num_output_levels,
984  privatized_decode_op,
985  num_output_levels,
986  output_decode_op,
987  max_num_output_bins,
988  num_row_pixels,
989  num_rows,
990  row_stride_samples,
991  DeviceHistogramInitKernel<NUM_ACTIVE_CHANNELS, CounterT, OffsetT>,
992  DeviceHistogramSweepKernel<PtxHistogramSweepPolicy, PRIVATIZED_SMEM_BINS, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, PrivatizedDecodeOpT, OutputDecodeOpT, OffsetT>,
993  histogram_sweep_config,
994  stream,
995  debug_synchronous))) break;
996  }
997  }
998  while (0);
999 
1000  return error;
1001  }
1002 
1003 
1007  CUB_RUNTIME_FUNCTION __forceinline__
1008  static cudaError_t DispatchEven(
1009  void* d_temp_storage,
1010  size_t& temp_storage_bytes,
1011  SampleIteratorT d_samples,
1012  CounterT* d_output_histograms[NUM_ACTIVE_CHANNELS],
1013  int num_output_levels[NUM_ACTIVE_CHANNELS],
1014  LevelT lower_level[NUM_ACTIVE_CHANNELS],
1015  LevelT upper_level[NUM_ACTIVE_CHANNELS],
1016  OffsetT num_row_pixels,
1017  OffsetT num_rows,
1018  OffsetT row_stride_samples,
1019  cudaStream_t stream,
1020  bool debug_synchronous,
1021  Int2Type<true> is_byte_sample)
1022  {
1023  cudaError error = cudaSuccess;
1024  do
1025  {
1026  // Get PTX version
1027  int ptx_version;
1028  #if (CUB_PTX_ARCH == 0)
1029  if (CubDebug(error = PtxVersion(ptx_version))) break;
1030  #else
1031  ptx_version = CUB_PTX_ARCH;
1032  #endif
1033 
1034  // Get kernel dispatch configurations
1035  KernelConfig histogram_sweep_config;
1036  if (CubDebug(error = InitConfigs(ptx_version, histogram_sweep_config)))
1037  break;
1038 
1039  // Use the pass-thru transform op for converting samples to privatized bins
1040  typedef PassThruTransform PrivatizedDecodeOpT;
1041 
1042  // Use the scale transform op for converting privatized bins to output bins
1043  typedef ScaleTransform OutputDecodeOpT;
1044 
1045  int num_privatized_levels[NUM_ACTIVE_CHANNELS];
1046  PrivatizedDecodeOpT privatized_decode_op[NUM_ACTIVE_CHANNELS];
1047  OutputDecodeOpT output_decode_op[NUM_ACTIVE_CHANNELS];
1048  int max_levels = num_output_levels[0];
1049 
1050  for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
1051  {
1052  num_privatized_levels[channel] = 257;
1053 
1054  int bins = num_output_levels[channel] - 1;
1055  LevelT scale = (upper_level[channel] - lower_level[channel]) / bins;
1056  output_decode_op[channel].Init(num_output_levels[channel], upper_level[channel], lower_level[channel], scale);
1057 
1058  if (num_output_levels[channel] > max_levels)
1059  max_levels = num_output_levels[channel];
1060  }
1061  int max_num_output_bins = max_levels - 1;
1062 
1063  const int PRIVATIZED_SMEM_BINS = 256;
1064 
1065  if (CubDebug(error = PrivatizedDispatch(
1066  d_temp_storage,
1067  temp_storage_bytes,
1068  d_samples,
1069  d_output_histograms,
1070  num_privatized_levels,
1071  privatized_decode_op,
1072  num_output_levels,
1073  output_decode_op,
1074  max_num_output_bins,
1075  num_row_pixels,
1076  num_rows,
1077  row_stride_samples,
1078  DeviceHistogramInitKernel<NUM_ACTIVE_CHANNELS, CounterT, OffsetT>,
1079  DeviceHistogramSweepKernel<PtxHistogramSweepPolicy, PRIVATIZED_SMEM_BINS, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, PrivatizedDecodeOpT, OutputDecodeOpT, OffsetT>,
1080  histogram_sweep_config,
1081  stream,
1082  debug_synchronous))) break;
1083 
1084  }
1085  while (0);
1086 
1087  return error;
1088  }
1089 
1090 };
1091 
1092 
1093 } // CUB namespace
1094 CUB_NS_POSTFIX // Optional outer namespace(s)
1095 
1096 
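In normal use this dispatch layer is reached through the public cub::DeviceHistogram entry points (device_histogram.cuh), which select DispatchEven/DispatchRange and the byte-sample specialization automatically. A minimal sketch of the even-bin path using the usual two-phase CUB pattern (size query, then run); the function name and values are hypothetical, error checking is omitted, and the include path may differ for the CUB copy bundled with OpenFPM:

#include <cub/cub.cuh>

void histogram_even_example(const float* d_samples, int num_samples,
                            int* d_histogram, cudaStream_t stream)
{
    int   num_levels  = 7;        // 7 boundaries -> 6 evenly spaced bins
    float lower_level = 0.0f;     // inclusive lower bound of the first bin
    float upper_level = 12.0f;    // exclusive upper bound of the last bin
    // d_histogram must hold at least num_levels - 1 counters

    // First call: d_temp_storage == NULL, only temp_storage_bytes is written
    void*  d_temp_storage     = NULL;
    size_t temp_storage_bytes = 0;
    cub::DeviceHistogram::HistogramEven(d_temp_storage, temp_storage_bytes,
        d_samples, d_histogram, num_levels, lower_level, upper_level,
        num_samples, stream);

    cudaMalloc(&d_temp_storage, temp_storage_bytes);

    // Second call: builds the histogram
    cub::DeviceHistogram::HistogramEven(d_temp_storage, temp_storage_bytes,
        d_samples, d_histogram, num_levels, lower_level, upper_level,
        num_samples, stream);

    cudaFree(d_temp_storage);
}

The (num_levels, lower_level, upper_level) triple is what DispatchEven above turns into a per-channel ScaleTransform, and the two-call pattern corresponds to the temporary-storage query handled by AliasTemporaries in PrivatizedDispatch.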