#include <stdio.h>
#include <iterator>

#include "../../agent/agent_histogram.cuh"
#include "../../util_debug.cuh"
#include "../../util_device.cuh"
#include "../../thread/thread_search.cuh"
#include "../../grid/grid_queue.cuh"
#include "../../util_namespace.cuh"
/// Optional outer namespace(s)
CUB_NS_PREFIX

/// CUB namespace
namespace cub {


/******************************************************************************
 * Histogram kernel entry points
 *****************************************************************************/

/**
 * Histogram initialization kernel entry point
 */
template <
    int         NUM_ACTIVE_CHANNELS,    ///< Number of channels actively being histogrammed
    typename    CounterT,               ///< Integer type for counting sample occurrences per histogram bin
    typename    OffsetT>                ///< Signed integer type for global offsets
__global__ void DeviceHistogramInitKernel(
    ArrayWrapper<int, NUM_ACTIVE_CHANNELS>          num_output_bins_wrapper,        ///< Number of output histogram bins per channel
    ArrayWrapper<CounterT*, NUM_ACTIVE_CHANNELS>    d_output_histograms_wrapper,    ///< Histogram counter output arrays, one per channel
    GridQueue<int>                                  tile_queue)                     ///< Drain queue descriptor for dynamically mapping tile data onto thread blocks
{
    // Reset the drain queue once, from the first thread of the first block
    if ((threadIdx.x == 0) && (blockIdx.x == 0))
        tile_queue.ResetDrain();

    // Each thread zeroes one bin in every active channel's output histogram
    int output_bin = (blockIdx.x * blockDim.x) + threadIdx.x;

    #pragma unroll
    for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
    {
        if (output_bin < num_output_bins_wrapper.array[CHANNEL])
            d_output_histograms_wrapper.array[CHANNEL][output_bin] = 0;
    }
}
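// Worked launch example (illustrative, not part of the original source):
// zeroing 1000-bin output histograms with 256 threads per block requires
// ceil(1000 / 256) = 4 blocks, and threads whose output_bin exceeds a
// channel's bin count simply skip that channel.  PrivatizedDispatch (below)
// computes exactly this configuration:
//
//   int histogram_init_block_threads = 256;
//   int histogram_init_grid_dims =
//       (max_num_output_bins + histogram_init_block_threads - 1) /
//       histogram_init_block_threads;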
/**
 * Histogram privatized sweep kernel entry point (multi-block).
 * Computes privatized histograms, one per thread block.
 */
template <
    typename    AgentHistogramPolicyT,      ///< Parameterized AgentHistogramPolicy tuning policy type
    int         PRIVATIZED_SMEM_BINS,       ///< Maximum number of histogram bins per channel (e.g., up to 256)
    int         NUM_CHANNELS,               ///< Number of channels interleaved in the input data
    int         NUM_ACTIVE_CHANNELS,        ///< Number of channels actively being histogrammed
    typename    SampleIteratorT,            ///< The input iterator type
    typename    CounterT,                   ///< Integer type for counting sample occurrences per histogram bin
    typename    PrivatizedDecodeOpT,        ///< Transform operator type for determining privatized counter indices from samples, one per channel
    typename    OutputDecodeOpT,            ///< Transform operator type for determining output bin-ids from privatized counter indices, one per channel
    typename    OffsetT>                    ///< Signed integer type for global offsets
__launch_bounds__ (int(AgentHistogramPolicyT::BLOCK_THREADS))
__global__ void DeviceHistogramSweepKernel(
    SampleIteratorT                                         d_samples,                          ///< Input data to reduce
    ArrayWrapper<int, NUM_ACTIVE_CHANNELS>                  num_output_bins_wrapper,            ///< The number of bins per final output histogram
    ArrayWrapper<int, NUM_ACTIVE_CHANNELS>                  num_privatized_bins_wrapper,        ///< The number of bins per privatized histogram
    ArrayWrapper<CounterT*, NUM_ACTIVE_CHANNELS>            d_output_histograms_wrapper,        ///< Reference to final output histograms
    ArrayWrapper<CounterT*, NUM_ACTIVE_CHANNELS>            d_privatized_histograms_wrapper,    ///< Reference to privatized histograms
    ArrayWrapper<OutputDecodeOpT, NUM_ACTIVE_CHANNELS>      output_decode_op_wrapper,           ///< Transform operators for determining output bin-ids from privatized counter indices
    ArrayWrapper<PrivatizedDecodeOpT, NUM_ACTIVE_CHANNELS>  privatized_decode_op_wrapper,       ///< Transform operators for determining privatized counter indices from samples
    OffsetT                                                 num_row_pixels,                     ///< The number of multi-channel pixels per row in the region of interest
    OffsetT                                                 num_rows,                           ///< The number of rows in the region of interest
    OffsetT                                                 row_stride_samples,                 ///< The number of samples between starts of consecutive rows
    int                                                     tiles_per_row,                      ///< Number of image tiles per row
    GridQueue<int>                                          tile_queue)                         ///< Drain queue descriptor for dynamically mapping tile data onto thread blocks
{
    // Thread block type for compositing input tiles
    typedef AgentHistogram<
            AgentHistogramPolicyT,
            PRIVATIZED_SMEM_BINS,
            NUM_CHANNELS,
            NUM_ACTIVE_CHANNELS,
            SampleIteratorT,
            CounterT,
            PrivatizedDecodeOpT,
            OutputDecodeOpT,
            OffsetT>
        AgentHistogramT;

    // Shared memory for AgentHistogram
    __shared__ typename AgentHistogramT::TempStorage temp_storage;

    AgentHistogramT agent(
        temp_storage,
        d_samples,
        num_output_bins_wrapper.array,
        num_privatized_bins_wrapper.array,
        d_output_histograms_wrapper.array,
        d_privatized_histograms_wrapper.array,
        output_decode_op_wrapper.array,
        privatized_decode_op_wrapper.array);

    // Initialize counters
    agent.InitBinCounters();

    // Consume input tiles
    agent.ConsumeTiles(
        num_row_pixels,
        num_rows,
        row_stride_samples,
        tiles_per_row,
        tile_queue);

    // Store output to global (if necessary)
    agent.StoreOutput();
}
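// Dataflow sketch (illustrative): each thread block accumulates counts into
// its own privatized histogram, then folds those counts into the final
// output bins.  Two transforms apply per sample:
//
//   sample          --PrivatizedDecodeOpT::BinSelect-->  privatized bin
//   privatized bin  --OutputDecodeOpT::BinSelect----->   output bin
//
// The transform functors used by each dispatch flavor (SearchTransform,
// ScaleTransform, PassThruTransform) are defined below.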
/******************************************************************************
 * Dispatch
 *****************************************************************************/

/**
 * Utility class for dispatching the appropriately-tuned kernels for DeviceHistogram
 */
template <
    int         NUM_CHANNELS,           ///< Number of channels interleaved in the input data
    int         NUM_ACTIVE_CHANNELS,    ///< Number of channels actively being histogrammed
    typename    SampleIteratorT,        ///< Random-access input iterator type for reading input samples
    typename    CounterT,               ///< Integer type for counting sample occurrences per histogram bin
    typename    LevelT,                 ///< Type for specifying bin level boundaries
    typename    OffsetT>                ///< Signed integer type for global offsets
struct DispatchHistogram
{
    //---------------------------------------------------------------------
    // Types and constants
    //---------------------------------------------------------------------

    /// The sample value type of the input iterator
    typedef typename std::iterator_traits<SampleIteratorT>::value_type SampleT;

    enum
    {
        // Maximum number of bins per channel for which we will use the
        // shared-memory privatization strategy
        MAX_PRIVATIZED_SMEM_BINS = 256
    };
    //---------------------------------------------------------------------
    // Transform functors for converting samples to bin-ids
    //---------------------------------------------------------------------

    // Searches for bin given a list of bin-boundary levels
    template <typename LevelIteratorT>
    struct SearchTransform
    {
        LevelIteratorT  d_levels;           // Pointer to levels array
        int             num_output_levels;  // Number of levels in array

        // Initializer
        __host__ __device__ __forceinline__ void Init(
            LevelIteratorT  d_levels,           // Pointer to levels array
            int             num_output_levels)  // Number of levels in array
        {
            this->d_levels          = d_levels;
            this->num_output_levels = num_output_levels;
        }
        // Method for converting samples to bin-ids
        template <CacheLoadModifier LOAD_MODIFIER, typename _SampleT>
        __host__ __device__ __forceinline__ void BinSelect(_SampleT sample, int &bin, bool valid)
        {
            // Level iterator wrapper type: wrap native pointers with
            // CacheModifiedInputIterator; use any other iterator type directly
            typedef typename If<IsPointer<LevelIteratorT>::VALUE,
                    CacheModifiedInputIterator<LOAD_MODIFIER, LevelT, OffsetT>,
                    LevelIteratorT>::Type
                WrappedLevelIteratorT;

            WrappedLevelIteratorT wrapped_levels(d_levels);

            int num_bins = num_output_levels - 1;
            if (valid)
            {
                // Bin is the index of the last level boundary not greater than the sample
                bin = UpperBound(wrapped_levels, num_output_levels, (LevelT) sample) - 1;
                if (bin >= num_bins)
                    bin = -1;
            }
        }
    };
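    // Worked example (illustrative): with d_levels = {0, 5, 10} and
    // num_output_levels = 3, a sample of 7 yields
    // UpperBound(levels, 3, 7) = 2, so bin = 2 - 1 = 1 (the half-open
    // interval [5, 10)).  A sample of 12 yields UpperBound = 3 and a
    // candidate bin of 2, which is >= num_bins = 2 and is therefore marked
    // out-of-range (bin = -1).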
    // Scales samples to evenly-spaced bins
    struct ScaleTransform
    {
        int     num_bins;   // Number of bins
        LevelT  max;        // Max sample level (exclusive)
        LevelT  min;        // Min sample level (inclusive)
        LevelT  scale;      // Bin scaling factor

        // Initializer
        template <typename _LevelT>
        __host__ __device__ __forceinline__ void Init(
            int     num_output_levels,  // Number of levels in array
            _LevelT max,                // Max sample level (exclusive)
            _LevelT min,                // Min sample level (inclusive)
            _LevelT scale)              // Bin scaling factor
        {
            this->num_bins  = num_output_levels - 1;
            this->max       = max;
            this->min       = min;
            this->scale     = scale;
        }

        // Initializer (float specialization: store the reciprocal so BinSelect can multiply)
        __host__ __device__ __forceinline__ void Init(
            int     num_output_levels,  // Number of levels in array
            float   max,                // Max sample level (exclusive)
            float   min,                // Min sample level (inclusive)
            float   scale)              // Bin scaling factor
        {
            this->num_bins  = num_output_levels - 1;
            this->max       = max;
            this->min       = min;
            this->scale     = float(1.0) / scale;
        }

        // Initializer (double specialization: store the reciprocal so BinSelect can multiply)
        __host__ __device__ __forceinline__ void Init(
            int     num_output_levels,  // Number of levels in array
            double  max,                // Max sample level (exclusive)
            double  min,                // Min sample level (inclusive)
            double  scale)              // Bin scaling factor
        {
            this->num_bins  = num_output_levels - 1;
            this->max       = max;
            this->min       = min;
            this->scale     = double(1.0) / scale;
        }
        // Method for converting samples to bin-ids
        template <CacheLoadModifier LOAD_MODIFIER, typename _SampleT>
        __host__ __device__ __forceinline__ void BinSelect(_SampleT sample, int &bin, bool valid)
        {
            LevelT level_sample = (LevelT) sample;

            if (valid && (level_sample >= min) && (level_sample < max))
                bin = (int) ((level_sample - min) / scale);
        }

        // Method for converting samples to bin-ids (float specialization: multiply by the reciprocal scale)
        template <CacheLoadModifier LOAD_MODIFIER>
        __host__ __device__ __forceinline__ void BinSelect(float sample, int &bin, bool valid)
        {
            LevelT level_sample = (LevelT) sample;

            if (valid && (level_sample >= min) && (level_sample < max))
                bin = (int) ((level_sample - min) * scale);
        }

        // Method for converting samples to bin-ids (double specialization: multiply by the reciprocal scale)
        template <CacheLoadModifier LOAD_MODIFIER>
        __host__ __device__ __forceinline__ void BinSelect(double sample, int &bin, bool valid)
        {
            LevelT level_sample = (LevelT) sample;

            if (valid && (level_sample >= min) && (level_sample < max))
                bin = (int) ((level_sample - min) * scale);
        }
    };
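    // Worked example (illustrative): HistogramEven with lower_level = 0,
    // upper_level = 12, and num_output_levels = 4 gives num_bins = 3 and
    // scale = (12 - 0) / 3 = 4, so a sample of 7 lands in
    // bin = (int) ((7 - 0) / 4) = 1, i.e. the interval [4, 8).  For float
    // and double levels, Init stores the reciprocal of scale so that
    // BinSelect can multiply instead of divide.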
    // Pass-through bin transform operator
    struct PassThruTransform
    {
        // Method for converting samples to bin-ids
        template <CacheLoadModifier LOAD_MODIFIER, typename _SampleT>
        __host__ __device__ __forceinline__ void BinSelect(_SampleT sample, int &bin, bool valid)
        {
            if (valid)
                bin = (int) sample;
        }
    };
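    // Worked example (illustrative): for 8-bit samples the pass-thru
    // transform makes each sample its own privatized bin-id (e.g., sample
    // 0xAB -> bin 171), which is why the byte-sample dispatch routines below
    // privatize with 257 levels (256 bins).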
    //---------------------------------------------------------------------
    // Tuning policies
    //---------------------------------------------------------------------

    // Scales the nominal per-thread workload to account for the sample word
    // size and the number of actively-histogrammed channels
    template <int NOMINAL_ITEMS_PER_THREAD>
    struct TScale
    {
        enum
        {
            V_SCALE = (sizeof(SampleT) + sizeof(int) - 1) / sizeof(int),
            VALUE   = CUB_MAX((NOMINAL_ITEMS_PER_THREAD / NUM_ACTIVE_CHANNELS / V_SCALE), 1)
        };
    };
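    // Worked example (illustrative): histogramming the RGB channels of
    // uchar4 pixels gives SampleT = unsigned char and NUM_ACTIVE_CHANNELS = 3,
    // so V_SCALE = (1 + 4 - 1) / 4 = 1 and a nominal 8 items per thread
    // scales to VALUE = CUB_MAX(8 / 3 / 1, 1) = 2 pixels per thread.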
    /// SM11
    struct Policy110
    {
        // HistogramSweepPolicy
        typedef AgentHistogramPolicy<
                512,                                // Threads per block
                (NUM_CHANNELS == 1) ? 8 : 2,        // Pixels per thread
                BLOCK_LOAD_DIRECT,
                LOAD_DEFAULT,
                true,
                GMEM,
                false>
            HistogramSweepPolicy;
    };

    /// SM20
    struct Policy200
    {
        // HistogramSweepPolicy
        typedef AgentHistogramPolicy<
                (NUM_CHANNELS == 1) ? 256 : 128,    // Threads per block
                (NUM_CHANNELS == 1) ? 8 : 3,        // Pixels per thread
                (NUM_CHANNELS == 1) ? BLOCK_LOAD_DIRECT : BLOCK_LOAD_WARP_TRANSPOSE,
                LOAD_DEFAULT,
                true,
                SMEM,
                false>
            HistogramSweepPolicy;
    };

    /// SM30
    struct Policy300
    {
        // HistogramSweepPolicy
        typedef AgentHistogramPolicy<
                512,                                // Threads per block
                (NUM_CHANNELS == 1) ? 8 : 2,        // Pixels per thread
                BLOCK_LOAD_DIRECT,
                LOAD_DEFAULT,
                true,
                GMEM,
                false>
            HistogramSweepPolicy;
    };
    //---------------------------------------------------------------------
    // Tuning policies of current PTX compiler pass
    //---------------------------------------------------------------------

#if (CUB_PTX_ARCH >= 500)
    typedef Policy500 PtxPolicy;

#elif (CUB_PTX_ARCH >= 350)
    typedef Policy350 PtxPolicy;

#elif (CUB_PTX_ARCH >= 300)
    typedef Policy300 PtxPolicy;

#elif (CUB_PTX_ARCH >= 200)
    typedef Policy200 PtxPolicy;

#else
    typedef Policy110 PtxPolicy;

#endif

    // "Opaque" policies (whose parameterizations aren't reflected in the type signature)
    struct PtxHistogramSweepPolicy : PtxPolicy::HistogramSweepPolicy {};
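    // For example (illustrative): compiling for sm_61 sets CUB_PTX_ARCH to
    // 610 during the device pass, so PtxPolicy aliases Policy500 above;
    // during the host pass CUB_PTX_ARCH is 0 and the runtime selection in
    // InitConfigs (below) applies instead.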
    //---------------------------------------------------------------------
    // Utilities
    //---------------------------------------------------------------------

    // Initialize kernel dispatch configurations with the policies corresponding
    // to the PTX assembly we will use
    template <typename KernelConfig>
    CUB_RUNTIME_FUNCTION __forceinline__
    static cudaError_t InitConfigs(
        int             ptx_version,
        KernelConfig    &histogram_sweep_config)
    {
    #if (CUB_PTX_ARCH > 0)
        // We're on the device, so initialize the kernel dispatch
        // configurations with the current PTX policy
        return histogram_sweep_config.template Init<PtxHistogramSweepPolicy>();
    #else
        // We're on the host, so initialize the kernel dispatch configurations
        // with the policies that match the device's PTX version
        if (ptx_version >= 500)
        {
            return histogram_sweep_config.template Init<typename Policy500::HistogramSweepPolicy>();
        }
        else if (ptx_version >= 350)
        {
            return histogram_sweep_config.template Init<typename Policy350::HistogramSweepPolicy>();
        }
        else if (ptx_version >= 300)
        {
            return histogram_sweep_config.template Init<typename Policy300::HistogramSweepPolicy>();
        }
        else if (ptx_version >= 200)
        {
            return histogram_sweep_config.template Init<typename Policy200::HistogramSweepPolicy>();
        }
        else if (ptx_version >= 110)
        {
            return histogram_sweep_config.template Init<typename Policy110::HistogramSweepPolicy>();
        }
        else
        {
            // No global atomic support
            return cudaErrorNotSupported;
        }
    #endif
    }
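    // Typical host-side usage (illustrative), as performed by the dispatch
    // entrypoints below:
    //
    //   int ptx_version;
    //   if (CubDebug(error = PtxVersion(ptx_version))) break;
    //
    //   KernelConfig histogram_sweep_config;
    //   if (CubDebug(error = InitConfigs(ptx_version, histogram_sweep_config)))
    //       break;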
    // Kernel dispatch configuration
    struct KernelConfig
    {
        int block_threads;
        int pixels_per_thread;

        template <typename BlockPolicy>
        CUB_RUNTIME_FUNCTION __forceinline__
        cudaError_t Init()
        {
            block_threads     = BlockPolicy::BLOCK_THREADS;
            pixels_per_thread = BlockPolicy::PIXELS_PER_THREAD;

            return cudaSuccess;
        }
    };
    //---------------------------------------------------------------------
    // Dispatch entrypoints
    //---------------------------------------------------------------------

    /**
     * Privatization-based dispatch routine
     */
    template <
        typename    PrivatizedDecodeOpT,            ///< Transform operator type for determining privatized bin-ids from samples, one per channel
        typename    OutputDecodeOpT,                ///< Transform operator type for determining output bin-ids from privatized bin-ids, one per channel
        typename    DeviceHistogramInitKernelT,     ///< Function type of cub::DeviceHistogramInitKernel
        typename    DeviceHistogramSweepKernelT>    ///< Function type of cub::DeviceHistogramSweepKernel
    CUB_RUNTIME_FUNCTION __forceinline__
    static cudaError_t PrivatizedDispatch(
        void*                       d_temp_storage,                             ///< [in] Device-accessible allocation of temporary storage (NULL requests the required size, returned in temp_storage_bytes)
        size_t&                     temp_storage_bytes,                         ///< [in,out] Size in bytes of d_temp_storage allocation
        SampleIteratorT             d_samples,                                  ///< [in] The pointer to the input sequence of sample items
        CounterT*                   d_output_histograms[NUM_ACTIVE_CHANNELS],   ///< [out] Histogram counter output arrays, one per active channel
        int                         num_privatized_levels[NUM_ACTIVE_CHANNELS], ///< [in] Number of privatized bin level boundaries per active channel
        PrivatizedDecodeOpT         privatized_decode_op[NUM_ACTIVE_CHANNELS],  ///< [in] Privatized decode operators, one per channel
        int                         num_output_levels[NUM_ACTIVE_CHANNELS],     ///< [in] Number of output bin level boundaries per active channel
        OutputDecodeOpT             output_decode_op[NUM_ACTIVE_CHANNELS],      ///< [in] Output decode operators, one per channel
        int                         max_num_output_bins,                        ///< [in] Maximum number of output bins in any channel
        OffsetT                     num_row_pixels,                             ///< [in] Number of multi-channel pixels per row in the region of interest
        OffsetT                     num_rows,                                   ///< [in] Number of rows in the region of interest
        OffsetT                     row_stride_samples,                         ///< [in] Number of samples between starts of consecutive rows
        DeviceHistogramInitKernelT  histogram_init_kernel,                      ///< [in] Kernel function pointer to parameterization of cub::DeviceHistogramInitKernel
        DeviceHistogramSweepKernelT histogram_sweep_kernel,                     ///< [in] Kernel function pointer to parameterization of cub::DeviceHistogramSweepKernel
        KernelConfig                histogram_sweep_config,                     ///< [in] Dispatch parameters matching the policy histogram_sweep_kernel was compiled for
        cudaStream_t                stream,                                     ///< [in] CUDA stream to launch kernels within
        bool                        debug_synchronous)                          ///< [in] Whether to synchronize the stream after every kernel launch to check for errors
    {
    #ifndef CUB_RUNTIME_ENABLED
        // Kernel launch not supported from this device
        return CubDebug(cudaErrorNotSupported);
    #else
        cudaError error = cudaSuccess;
        do
        {
            // Get device ordinal
            int device_ordinal;
            if (CubDebug(error = cudaGetDevice(&device_ordinal))) break;

            // Get SM count
            int sm_count;
            if (CubDebug(error = cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break;

            // Get SM occupancy for histogram_sweep_kernel
            int histogram_sweep_sm_occupancy;
            if (CubDebug(error = MaxSmOccupancy(
                histogram_sweep_sm_occupancy,
                histogram_sweep_kernel,
                histogram_sweep_config.block_threads))) break;

            // Get device occupancy for histogram_sweep_kernel
            int histogram_sweep_occupancy = histogram_sweep_sm_occupancy * sm_count;

            // Get grid dimensions, trying to keep total blocks ~histogram_sweep_occupancy
            int pixels_per_tile     = histogram_sweep_config.block_threads * histogram_sweep_config.pixels_per_thread;
            int tiles_per_row       = int(num_row_pixels + pixels_per_tile - 1) / pixels_per_tile;
            int blocks_per_row      = CUB_MIN(histogram_sweep_occupancy, tiles_per_row);
            int blocks_per_col      = (blocks_per_row > 0) ?
                                        int(CUB_MIN(histogram_sweep_occupancy / blocks_per_row, num_rows)) :
                                        0;
            int num_thread_blocks   = blocks_per_row * blocks_per_col;

            dim3 sweep_grid_dims;
            sweep_grid_dims.x = (unsigned int) blocks_per_row;
            sweep_grid_dims.y = (unsigned int) blocks_per_col;
            sweep_grid_dims.z = 1;
            // Temporary storage allocation requirements
            const int   NUM_ALLOCATIONS = NUM_ACTIVE_CHANNELS + 1;
            void*       allocations[NUM_ALLOCATIONS];
            size_t      allocation_sizes[NUM_ALLOCATIONS];

            // One privatized histogram per thread block, per channel
            for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
                allocation_sizes[CHANNEL] = size_t(num_thread_blocks) * (num_privatized_levels[CHANNEL] - 1) * sizeof(CounterT);

            // Plus the tile-drain queue descriptor
            allocation_sizes[NUM_ALLOCATIONS - 1] = GridQueue<int>::AllocationSize();

            // Alias the temporary allocations from the single storage blob (or compute the necessary size of the blob)
            if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break;
            if (d_temp_storage == NULL)
            {
                // Return if the caller is simply requesting the size of the storage allocation
                return cudaSuccess;
            }

            // Construct the grid queue descriptor
            GridQueue<int> tile_queue(allocations[NUM_ALLOCATIONS - 1]);

            // Wrap the static arrays so they can be passed by value as kernel parameters
            ArrayWrapper<CounterT*, NUM_ACTIVE_CHANNELS> d_output_histograms_wrapper;
            for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
                d_output_histograms_wrapper.array[CHANNEL] = d_output_histograms[CHANNEL];

            ArrayWrapper<CounterT*, NUM_ACTIVE_CHANNELS> d_privatized_histograms_wrapper;
            for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
                d_privatized_histograms_wrapper.array[CHANNEL] = (CounterT*) allocations[CHANNEL];

            ArrayWrapper<PrivatizedDecodeOpT, NUM_ACTIVE_CHANNELS> privatized_decode_op_wrapper;
            for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
                privatized_decode_op_wrapper.array[CHANNEL] = privatized_decode_op[CHANNEL];

            ArrayWrapper<OutputDecodeOpT, NUM_ACTIVE_CHANNELS> output_decode_op_wrapper;
            for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
                output_decode_op_wrapper.array[CHANNEL] = output_decode_op[CHANNEL];

            ArrayWrapper<int, NUM_ACTIVE_CHANNELS> num_privatized_bins_wrapper;
            for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
                num_privatized_bins_wrapper.array[CHANNEL] = num_privatized_levels[CHANNEL] - 1;

            ArrayWrapper<int, NUM_ACTIVE_CHANNELS> num_output_bins_wrapper;
            for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
                num_output_bins_wrapper.array[CHANNEL] = num_output_levels[CHANNEL] - 1;
            int histogram_init_block_threads    = 256;
            int histogram_init_grid_dims        = (max_num_output_bins + histogram_init_block_threads - 1) / histogram_init_block_threads;

            // Log DeviceHistogramInitKernel configuration
            if (debug_synchronous) _CubLog("Invoking DeviceHistogramInitKernel<<<%d, %d, 0, %lld>>>()\n",
                histogram_init_grid_dims, histogram_init_block_threads, (long long) stream);

            // Invoke histogram_init_kernel to zero the output histograms and reset the tile queue
            histogram_init_kernel<<<histogram_init_grid_dims, histogram_init_block_threads, 0, stream>>>(
                num_output_bins_wrapper,
                d_output_histograms_wrapper,
                tile_queue);

            // Return if empty problem
            if ((blocks_per_row == 0) || (blocks_per_col == 0))
                break;

            // Log histogram_sweep_kernel configuration
            if (debug_synchronous) _CubLog("Invoking histogram_sweep_kernel<<<{%d, %d, %d}, %d, 0, %lld>>>(), %d pixels per thread, %d SM occupancy\n",
                sweep_grid_dims.x, sweep_grid_dims.y, sweep_grid_dims.z,
                histogram_sweep_config.block_threads, (long long) stream, histogram_sweep_config.pixels_per_thread, histogram_sweep_sm_occupancy);

            // Invoke histogram_sweep_kernel
            histogram_sweep_kernel<<<sweep_grid_dims, histogram_sweep_config.block_threads, 0, stream>>>(
                d_samples,
                num_output_bins_wrapper,
                num_privatized_bins_wrapper,
                d_output_histograms_wrapper,
                d_privatized_histograms_wrapper,
                output_decode_op_wrapper,
                privatized_decode_op_wrapper,
                num_row_pixels,
                num_rows,
                row_stride_samples,
                tiles_per_row,
                tile_queue);

            // Check for failure to launch
            if (CubDebug(error = cudaPeekAtLastError())) break;

            // Sync the stream if specified to flush runtime errors
            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
        }
        while (0);

        return error;

    #endif // CUB_RUNTIME_ENABLED
    }
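    // Callers follow CUB's two-pass temporary-storage idiom (illustrative
    // sketch; error handling omitted):
    //
    //   void*  d_temp_storage     = NULL;
    //   size_t temp_storage_bytes = 0;
    //   PrivatizedDispatch(d_temp_storage, temp_storage_bytes, ...);  // size query only
    //   cudaMalloc(&d_temp_storage, temp_storage_bytes);
    //   PrivatizedDispatch(d_temp_storage, temp_storage_bytes, ...);  // actual dispatch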
    /**
     * Dispatch routine for HistogramRange, specialized for sample types larger than 8 bits
     */
    CUB_RUNTIME_FUNCTION
    static cudaError_t DispatchRange(
        void*               d_temp_storage,                             ///< [in] Device-accessible allocation of temporary storage (NULL requests the required size)
        size_t&             temp_storage_bytes,                         ///< [in,out] Size in bytes of d_temp_storage allocation
        SampleIteratorT     d_samples,                                  ///< [in] The pointer to the multi-channel input sequence of data samples
        CounterT*           d_output_histograms[NUM_ACTIVE_CHANNELS],   ///< [out] Histogram counter output arrays, one per active channel
        int                 num_output_levels[NUM_ACTIVE_CHANNELS],     ///< [in] Number of boundaries (levels) per active channel
        LevelT              *d_levels[NUM_ACTIVE_CHANNELS],             ///< [in] Pointers to the arrays of level boundaries, one per active channel
        OffsetT             num_row_pixels,                             ///< [in] Number of multi-channel pixels per row in the region of interest
        OffsetT             num_rows,                                   ///< [in] Number of rows in the region of interest
        OffsetT             row_stride_samples,                         ///< [in] Number of samples between starts of consecutive rows
        cudaStream_t        stream,                                     ///< [in] CUDA stream to launch kernels within
        bool                debug_synchronous,                          ///< [in] Whether to synchronize the stream after every kernel launch
        Int2Type<false>     is_byte_sample)                             ///< [in] Marker type indicating whether or not SampleT is an 8-bit type
    {
        cudaError error = cudaSuccess;
        do
        {
            // Get PTX version
            int ptx_version;
    #if (CUB_PTX_ARCH == 0)
            if (CubDebug(error = PtxVersion(ptx_version))) break;
    #else
            ptx_version = CUB_PTX_ARCH;
    #endif

            // Get kernel dispatch configurations
            KernelConfig histogram_sweep_config;
            if (CubDebug(error = InitConfigs(ptx_version, histogram_sweep_config)))
                break;

            // Use the search transform op for converting samples to privatized bins
            typedef SearchTransform<LevelT*> PrivatizedDecodeOpT;

            // Use the pass-thru transform op for converting privatized bins to output bins
            typedef PassThruTransform OutputDecodeOpT;

            PrivatizedDecodeOpT privatized_decode_op[NUM_ACTIVE_CHANNELS];
            OutputDecodeOpT     output_decode_op[NUM_ACTIVE_CHANNELS];
            int                 max_levels = num_output_levels[0];

            for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
            {
                privatized_decode_op[channel].Init(d_levels[channel], num_output_levels[channel]);
                if (num_output_levels[channel] > max_levels)
                    max_levels = num_output_levels[channel];
            }
            int max_num_output_bins = max_levels - 1;

            if (max_num_output_bins > MAX_PRIVATIZED_SMEM_BINS)
            {
                // Too many bins to privatize in shared memory: privatize in global memory instead
                const int PRIVATIZED_SMEM_BINS = 0;

                if (CubDebug(error = PrivatizedDispatch(
                    d_temp_storage, temp_storage_bytes, d_samples,
                    d_output_histograms, num_output_levels, privatized_decode_op,
                    num_output_levels, output_decode_op, max_num_output_bins,
                    num_row_pixels, num_rows, row_stride_samples,
                    DeviceHistogramInitKernel<NUM_ACTIVE_CHANNELS, CounterT, OffsetT>,
                    DeviceHistogramSweepKernel<PtxHistogramSweepPolicy, PRIVATIZED_SMEM_BINS, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, PrivatizedDecodeOpT, OutputDecodeOpT, OffsetT>,
                    histogram_sweep_config, stream, debug_synchronous))) break;
            }
            else
            {
                // Dispatch the shared-memory privatization approach
                const int PRIVATIZED_SMEM_BINS = MAX_PRIVATIZED_SMEM_BINS;

                if (CubDebug(error = PrivatizedDispatch(
                    d_temp_storage, temp_storage_bytes, d_samples,
                    d_output_histograms, num_output_levels, privatized_decode_op,
                    num_output_levels, output_decode_op, max_num_output_bins,
                    num_row_pixels, num_rows, row_stride_samples,
                    DeviceHistogramInitKernel<NUM_ACTIVE_CHANNELS, CounterT, OffsetT>,
                    DeviceHistogramSweepKernel<PtxHistogramSweepPolicy, PRIVATIZED_SMEM_BINS, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, PrivatizedDecodeOpT, OutputDecodeOpT, OffsetT>,
                    histogram_sweep_config, stream, debug_synchronous))) break;
            }
        }
        while (0);

        return error;
    }
    /**
     * Dispatch routine for HistogramRange, specialized for 8-bit sample types
     * (privatizes a full 256-bin histogram per block, then searches for output bins)
     */
    CUB_RUNTIME_FUNCTION
    static cudaError_t DispatchRange(
        void*               d_temp_storage,                             ///< [in] Device-accessible allocation of temporary storage (NULL requests the required size)
        size_t&             temp_storage_bytes,                         ///< [in,out] Size in bytes of d_temp_storage allocation
        SampleIteratorT     d_samples,                                  ///< [in] The pointer to the multi-channel input sequence of data samples
        CounterT*           d_output_histograms[NUM_ACTIVE_CHANNELS],   ///< [out] Histogram counter output arrays, one per active channel
        int                 num_output_levels[NUM_ACTIVE_CHANNELS],     ///< [in] Number of boundaries (levels) per active channel
        LevelT              *d_levels[NUM_ACTIVE_CHANNELS],             ///< [in] Pointers to the arrays of level boundaries, one per active channel
        OffsetT             num_row_pixels,                             ///< [in] Number of multi-channel pixels per row in the region of interest
        OffsetT             num_rows,                                   ///< [in] Number of rows in the region of interest
        OffsetT             row_stride_samples,                         ///< [in] Number of samples between starts of consecutive rows
        cudaStream_t        stream,                                     ///< [in] CUDA stream to launch kernels within
        bool                debug_synchronous,                          ///< [in] Whether to synchronize the stream after every kernel launch
        Int2Type<true>      is_byte_sample)                             ///< [in] Marker type indicating that SampleT is an 8-bit type
    {
        cudaError error = cudaSuccess;
        do
        {
            // Get PTX version
            int ptx_version;
    #if (CUB_PTX_ARCH == 0)
            if (CubDebug(error = PtxVersion(ptx_version))) break;
    #else
            ptx_version = CUB_PTX_ARCH;
    #endif

            // Get kernel dispatch configurations
            KernelConfig histogram_sweep_config;
            if (CubDebug(error = InitConfigs(ptx_version, histogram_sweep_config)))
                break;

            // Use the pass-thru transform op for converting samples to privatized bins
            typedef PassThruTransform PrivatizedDecodeOpT;

            // Use the search transform op for converting privatized bins to output bins
            typedef SearchTransform<LevelT*> OutputDecodeOpT;

            int                 num_privatized_levels[NUM_ACTIVE_CHANNELS];
            PrivatizedDecodeOpT privatized_decode_op[NUM_ACTIVE_CHANNELS];
            OutputDecodeOpT     output_decode_op[NUM_ACTIVE_CHANNELS];
            int                 max_levels = num_output_levels[0];

            for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
            {
                num_privatized_levels[channel] = 257;
                output_decode_op[channel].Init(d_levels[channel], num_output_levels[channel]);

                if (num_output_levels[channel] > max_levels)
                    max_levels = num_output_levels[channel];
            }
            int max_num_output_bins = max_levels - 1;

            const int PRIVATIZED_SMEM_BINS = 256;

            if (CubDebug(error = PrivatizedDispatch(
                d_temp_storage, temp_storage_bytes, d_samples,
                d_output_histograms, num_privatized_levels, privatized_decode_op,
                num_output_levels, output_decode_op, max_num_output_bins,
                num_row_pixels, num_rows, row_stride_samples,
                DeviceHistogramInitKernel<NUM_ACTIVE_CHANNELS, CounterT, OffsetT>,
                DeviceHistogramSweepKernel<PtxHistogramSweepPolicy, PRIVATIZED_SMEM_BINS, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, PrivatizedDecodeOpT, OutputDecodeOpT, OffsetT>,
                histogram_sweep_config, stream, debug_synchronous))) break;
        }
        while (0);

        return error;
    }
    /**
     * Dispatch routine for HistogramEven, specialized for sample types larger than 8 bits
     */
    CUB_RUNTIME_FUNCTION __forceinline__
    static cudaError_t DispatchEven(
        void*               d_temp_storage,                             ///< [in] Device-accessible allocation of temporary storage (NULL requests the required size)
        size_t&             temp_storage_bytes,                         ///< [in,out] Size in bytes of d_temp_storage allocation
        SampleIteratorT     d_samples,                                  ///< [in] The pointer to the multi-channel input sequence of data samples
        CounterT*           d_output_histograms[NUM_ACTIVE_CHANNELS],   ///< [out] Histogram counter output arrays, one per active channel
        int                 num_output_levels[NUM_ACTIVE_CHANNELS],     ///< [in] Number of boundaries (levels) per active channel
        LevelT              lower_level[NUM_ACTIVE_CHANNELS],           ///< [in] Lower sample value bound (inclusive) for the lowest bin in each active channel
        LevelT              upper_level[NUM_ACTIVE_CHANNELS],           ///< [in] Upper sample value bound (exclusive) for the highest bin in each active channel
        OffsetT             num_row_pixels,                             ///< [in] Number of multi-channel pixels per row in the region of interest
        OffsetT             num_rows,                                   ///< [in] Number of rows in the region of interest
        OffsetT             row_stride_samples,                         ///< [in] Number of samples between starts of consecutive rows
        cudaStream_t        stream,                                     ///< [in] CUDA stream to launch kernels within
        bool                debug_synchronous,                          ///< [in] Whether to synchronize the stream after every kernel launch
        Int2Type<false>     is_byte_sample)                             ///< [in] Marker type indicating whether or not SampleT is an 8-bit type
    {
        cudaError error = cudaSuccess;
        do
        {
            // Get PTX version
            int ptx_version;
    #if (CUB_PTX_ARCH == 0)
            if (CubDebug(error = PtxVersion(ptx_version))) break;
    #else
            ptx_version = CUB_PTX_ARCH;
    #endif

            // Get kernel dispatch configurations
            KernelConfig histogram_sweep_config;
            if (CubDebug(error = InitConfigs(ptx_version, histogram_sweep_config)))
                break;

            // Use the scale transform op for converting samples to privatized bins
            typedef ScaleTransform PrivatizedDecodeOpT;

            // Use the pass-thru transform op for converting privatized bins to output bins
            typedef PassThruTransform OutputDecodeOpT;

            PrivatizedDecodeOpT privatized_decode_op[NUM_ACTIVE_CHANNELS];
            OutputDecodeOpT     output_decode_op[NUM_ACTIVE_CHANNELS];
            int                 max_levels = num_output_levels[0];

            for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
            {
                int    bins  = num_output_levels[channel] - 1;
                LevelT scale = (upper_level[channel] - lower_level[channel]) / bins;

                privatized_decode_op[channel].Init(num_output_levels[channel], upper_level[channel], lower_level[channel], scale);

                if (num_output_levels[channel] > max_levels)
                    max_levels = num_output_levels[channel];
            }
            int max_num_output_bins = max_levels - 1;

            if (max_num_output_bins > MAX_PRIVATIZED_SMEM_BINS)
            {
                // Too many bins to privatize in shared memory: privatize in global memory instead
                const int PRIVATIZED_SMEM_BINS = 0;

                if (CubDebug(error = PrivatizedDispatch(
                    d_temp_storage, temp_storage_bytes, d_samples,
                    d_output_histograms, num_output_levels, privatized_decode_op,
                    num_output_levels, output_decode_op, max_num_output_bins,
                    num_row_pixels, num_rows, row_stride_samples,
                    DeviceHistogramInitKernel<NUM_ACTIVE_CHANNELS, CounterT, OffsetT>,
                    DeviceHistogramSweepKernel<PtxHistogramSweepPolicy, PRIVATIZED_SMEM_BINS, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, PrivatizedDecodeOpT, OutputDecodeOpT, OffsetT>,
                    histogram_sweep_config, stream, debug_synchronous))) break;
            }
            else
            {
                // Dispatch the shared-memory privatization approach
                const int PRIVATIZED_SMEM_BINS = MAX_PRIVATIZED_SMEM_BINS;

                if (CubDebug(error = PrivatizedDispatch(
                    d_temp_storage, temp_storage_bytes, d_samples,
                    d_output_histograms, num_output_levels, privatized_decode_op,
                    num_output_levels, output_decode_op, max_num_output_bins,
                    num_row_pixels, num_rows, row_stride_samples,
                    DeviceHistogramInitKernel<NUM_ACTIVE_CHANNELS, CounterT, OffsetT>,
                    DeviceHistogramSweepKernel<PtxHistogramSweepPolicy, PRIVATIZED_SMEM_BINS, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, PrivatizedDecodeOpT, OutputDecodeOpT, OffsetT>,
                    histogram_sweep_config, stream, debug_synchronous))) break;
            }
        }
        while (0);

        return error;
    }
    /**
     * Dispatch routine for HistogramEven, specialized for 8-bit sample types
     * (privatizes a full 256-bin histogram per block, then scales to output bins)
     */
    CUB_RUNTIME_FUNCTION __forceinline__
    static cudaError_t DispatchEven(
        void*               d_temp_storage,                             ///< [in] Device-accessible allocation of temporary storage (NULL requests the required size)
        size_t&             temp_storage_bytes,                         ///< [in,out] Size in bytes of d_temp_storage allocation
        SampleIteratorT     d_samples,                                  ///< [in] The pointer to the multi-channel input sequence of data samples
        CounterT*           d_output_histograms[NUM_ACTIVE_CHANNELS],   ///< [out] Histogram counter output arrays, one per active channel
        int                 num_output_levels[NUM_ACTIVE_CHANNELS],     ///< [in] Number of boundaries (levels) per active channel
        LevelT              lower_level[NUM_ACTIVE_CHANNELS],           ///< [in] Lower sample value bound (inclusive) for the lowest bin in each active channel
        LevelT              upper_level[NUM_ACTIVE_CHANNELS],           ///< [in] Upper sample value bound (exclusive) for the highest bin in each active channel
        OffsetT             num_row_pixels,                             ///< [in] Number of multi-channel pixels per row in the region of interest
        OffsetT             num_rows,                                   ///< [in] Number of rows in the region of interest
        OffsetT             row_stride_samples,                         ///< [in] Number of samples between starts of consecutive rows
        cudaStream_t        stream,                                     ///< [in] CUDA stream to launch kernels within
        bool                debug_synchronous,                          ///< [in] Whether to synchronize the stream after every kernel launch
        Int2Type<true>      is_byte_sample)                             ///< [in] Marker type indicating that SampleT is an 8-bit type
    {
        cudaError error = cudaSuccess;
        do
        {
            // Get PTX version
            int ptx_version;
    #if (CUB_PTX_ARCH == 0)
            if (CubDebug(error = PtxVersion(ptx_version))) break;
    #else
            ptx_version = CUB_PTX_ARCH;
    #endif

            // Get kernel dispatch configurations
            KernelConfig histogram_sweep_config;
            if (CubDebug(error = InitConfigs(ptx_version, histogram_sweep_config)))
                break;

            // Use the pass-thru transform op for converting samples to privatized bins
            typedef PassThruTransform PrivatizedDecodeOpT;

            // Use the scale transform op for converting privatized bins to output bins
            typedef ScaleTransform OutputDecodeOpT;

            int                 num_privatized_levels[NUM_ACTIVE_CHANNELS];
            PrivatizedDecodeOpT privatized_decode_op[NUM_ACTIVE_CHANNELS];
            OutputDecodeOpT     output_decode_op[NUM_ACTIVE_CHANNELS];
            int                 max_levels = num_output_levels[0];

            for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
            {
                num_privatized_levels[channel] = 257;

                int    bins  = num_output_levels[channel] - 1;
                LevelT scale = (upper_level[channel] - lower_level[channel]) / bins;
                output_decode_op[channel].Init(num_output_levels[channel], upper_level[channel], lower_level[channel], scale);

                if (num_output_levels[channel] > max_levels)
                    max_levels = num_output_levels[channel];
            }
            int max_num_output_bins = max_levels - 1;

            const int PRIVATIZED_SMEM_BINS = 256;

            if (CubDebug(error = PrivatizedDispatch(
                d_temp_storage, temp_storage_bytes, d_samples,
                d_output_histograms, num_privatized_levels, privatized_decode_op,
                num_output_levels, output_decode_op, max_num_output_bins,
                num_row_pixels, num_rows, row_stride_samples,
                DeviceHistogramInitKernel<NUM_ACTIVE_CHANNELS, CounterT, OffsetT>,
                DeviceHistogramSweepKernel<PtxHistogramSweepPolicy, PRIVATIZED_SMEM_BINS, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, PrivatizedDecodeOpT, OutputDecodeOpT, OffsetT>,
                histogram_sweep_config, stream, debug_synchronous))) break;
        }
        while (0);

        return error;
    }
};


}               // CUB namespace
CUB_NS_POSTFIX  // Optional outer namespace(s)
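// Minimal host-side sketch of driving these entrypoints directly (not part
// of the original source; the single-channel 256-bin float setup below is an
// assumption for illustration -- the public DeviceHistogram::HistogramEven
// API performs this plumbing for you):
//
//   float* d_samples;           // num_pixels grayscale samples in [0.0f, 1.0f)
//   int*   d_histogram;         // 256 counters
//   int    num_levels[1]  = {257};
//   float  lower_level[1] = {0.0f};
//   float  upper_level[1] = {1.0f};
//
//   typedef DispatchHistogram<1, 1, float*, int, float, int> DispatchT;
//
//   void*  d_temp_storage     = NULL;
//   size_t temp_storage_bytes = 0;
//   DispatchT::DispatchEven(d_temp_storage, temp_storage_bytes, d_samples,
//       &d_histogram, num_levels, lower_level, upper_level,
//       num_pixels, 1, num_pixels, 0, false, Int2Type<false>());  // size query
//   cudaMalloc(&d_temp_storage, temp_storage_bytes);
//   DispatchT::DispatchEven(d_temp_storage, temp_storage_bytes, d_samples,
//       &d_histogram, num_levels, lower_level, upper_level,
//       num_pixels, 1, num_pixels, 0, false, Int2Type<false>());  // dispatch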