template<int NUM_CHANNELS, int NUM_ACTIVE_CHANNELS, typename SampleIteratorT, typename CounterT, typename LevelT, typename OffsetT>
struct cub::DipatchHistogram< NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, LevelT, OffsetT >
< Signed integer type for global offsets
Utility class for dispatching the appropriately-tuned kernels for DeviceHistogram
Definition at line 175 of file dispatch_histogram.cuh.
|
template<typename KernelConfig > |
CUB_RUNTIME_FUNCTION static __forceinline__ cudaError_t | InitConfigs (int ptx_version, KernelConfig &histogram_sweep_config) |
|
template<typename PrivatizedDecodeOpT , typename OutputDecodeOpT , typename DeviceHistogramInitKernelT , typename DeviceHistogramSweepKernelT > |
CUB_RUNTIME_FUNCTION static __forceinline__ cudaError_t | PrivatizedDispatch (void *d_temp_storage, size_t &temp_storage_bytes, SampleIteratorT d_samples, CounterT *d_output_histograms[NUM_ACTIVE_CHANNELS], int num_privatized_levels[NUM_ACTIVE_CHANNELS], PrivatizedDecodeOpT privatized_decode_op[NUM_ACTIVE_CHANNELS], int num_output_levels[NUM_ACTIVE_CHANNELS], OutputDecodeOpT output_decode_op[NUM_ACTIVE_CHANNELS], int max_num_output_bins, OffsetT num_row_pixels, OffsetT num_rows, OffsetT row_stride_samples, DeviceHistogramInitKernelT histogram_init_kernel, DeviceHistogramSweepKernelT histogram_sweep_kernel, KernelConfig histogram_sweep_config, cudaStream_t stream, bool debug_synchronous) |
| < Function type of cub::DeviceHistogramSweepKernel More...
|
|
static CUB_RUNTIME_FUNCTION cudaError_t | DispatchRange (void *d_temp_storage, size_t &temp_storage_bytes, SampleIteratorT d_samples, CounterT *d_output_histograms[NUM_ACTIVE_CHANNELS], int num_output_levels[NUM_ACTIVE_CHANNELS], LevelT *d_levels[NUM_ACTIVE_CHANNELS], OffsetT num_row_pixels, OffsetT num_rows, OffsetT row_stride_samples, cudaStream_t stream, bool debug_synchronous, Int2Type< false > is_byte_sample) |
|
static CUB_RUNTIME_FUNCTION cudaError_t | DispatchRange (void *d_temp_storage, size_t &temp_storage_bytes, SampleIteratorT d_samples, CounterT *d_output_histograms[NUM_ACTIVE_CHANNELS], int num_output_levels[NUM_ACTIVE_CHANNELS], LevelT *d_levels[NUM_ACTIVE_CHANNELS], OffsetT num_row_pixels, OffsetT num_rows, OffsetT row_stride_samples, cudaStream_t stream, bool debug_synchronous, Int2Type< true > is_byte_sample) |
|
CUB_RUNTIME_FUNCTION static __forceinline__ cudaError_t | DispatchEven (void *d_temp_storage, size_t &temp_storage_bytes, SampleIteratorT d_samples, CounterT *d_output_histograms[NUM_ACTIVE_CHANNELS], int num_output_levels[NUM_ACTIVE_CHANNELS], LevelT lower_level[NUM_ACTIVE_CHANNELS], LevelT upper_level[NUM_ACTIVE_CHANNELS], OffsetT num_row_pixels, OffsetT num_rows, OffsetT row_stride_samples, cudaStream_t stream, bool debug_synchronous, Int2Type< false > is_byte_sample) |
|
CUB_RUNTIME_FUNCTION static __forceinline__ cudaError_t | DispatchEven (void *d_temp_storage, size_t &temp_storage_bytes, SampleIteratorT d_samples, CounterT *d_output_histograms[NUM_ACTIVE_CHANNELS], int num_output_levels[NUM_ACTIVE_CHANNELS], LevelT lower_level[NUM_ACTIVE_CHANNELS], LevelT upper_level[NUM_ACTIVE_CHANNELS], OffsetT num_row_pixels, OffsetT num_rows, OffsetT row_stride_samples, cudaStream_t stream, bool debug_synchronous, Int2Type< true > is_byte_sample) |
|
template<int NUM_CHANNELS, int NUM_ACTIVE_CHANNELS, typename SampleIteratorT , typename CounterT , typename LevelT , typename OffsetT >
CUB_RUNTIME_FUNCTION static __forceinline__ cudaError_t cub::DipatchHistogram< NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, LevelT, OffsetT >::DispatchEven |
( |
void * |
d_temp_storage, |
|
|
size_t & |
temp_storage_bytes, |
|
|
SampleIteratorT |
d_samples, |
|
|
CounterT * |
d_output_histograms[NUM_ACTIVE_CHANNELS], |
|
|
int |
num_output_levels[NUM_ACTIVE_CHANNELS], |
|
|
LevelT |
lower_level[NUM_ACTIVE_CHANNELS], |
|
|
LevelT |
upper_level[NUM_ACTIVE_CHANNELS], |
|
|
OffsetT |
num_row_pixels, |
|
|
OffsetT |
num_rows, |
|
|
OffsetT |
row_stride_samples, |
|
|
cudaStream_t |
stream, |
|
|
bool |
debug_synchronous, |
|
|
Int2Type< false > |
is_byte_sample |
|
) |
| |
|
inlinestatic |
Dispatch routine for HistogramEven, specialized for sample types larger than 8-bit
- Parameters
-
[in] | d_temp_storage | Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to temp_storage_bytes and no work is done. |
[in,out] | temp_storage_bytes | Reference to size in bytes of d_temp_storage allocation |
[in] | d_samples | The pointer to the input sequence of sample items. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples). |
[out] | d_output_histograms | The pointers to the histogram counter output arrays, one for each active channel. For channeli, the allocation length of d_histograms[i] should be num_output_levels[i] - 1. |
[in] | num_output_levels | The number of bin level boundaries for delineating histogram samples in each active channel. Implies that the number of bins for channeli is num_output_levels[i] - 1. |
[in] | lower_level | The lower sample value bound (inclusive) for the lowest histogram bin in each active channel. |
[in] | upper_level | The upper sample value bound (exclusive) for the highest histogram bin in each active channel. |
[in] | num_row_pixels | The number of multi-channel pixels per row in the region of interest |
[in] | num_rows | The number of rows in the region of interest |
[in] | row_stride_samples | The number of samples between starts of consecutive rows in the region of interest |
[in] | stream | CUDA stream to launch kernels within. Default is stream0. |
[in] | debug_synchronous | Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is false . |
[in] | is_byte_sample | Marker type indicating whether or not SampleT is a 8b type |
Definition at line 896 of file dispatch_histogram.cuh.
template<int NUM_CHANNELS, int NUM_ACTIVE_CHANNELS, typename SampleIteratorT , typename CounterT , typename LevelT , typename OffsetT >
CUB_RUNTIME_FUNCTION static __forceinline__ cudaError_t cub::DipatchHistogram< NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, LevelT, OffsetT >::DispatchEven |
( |
void * |
d_temp_storage, |
|
|
size_t & |
temp_storage_bytes, |
|
|
SampleIteratorT |
d_samples, |
|
|
CounterT * |
d_output_histograms[NUM_ACTIVE_CHANNELS], |
|
|
int |
num_output_levels[NUM_ACTIVE_CHANNELS], |
|
|
LevelT |
lower_level[NUM_ACTIVE_CHANNELS], |
|
|
LevelT |
upper_level[NUM_ACTIVE_CHANNELS], |
|
|
OffsetT |
num_row_pixels, |
|
|
OffsetT |
num_rows, |
|
|
OffsetT |
row_stride_samples, |
|
|
cudaStream_t |
stream, |
|
|
bool |
debug_synchronous, |
|
|
Int2Type< true > |
is_byte_sample |
|
) |
| |
|
inlinestatic |
Dispatch routine for HistogramEven, specialized for 8-bit sample types (computes 256-bin privatized histograms and then reduces to user-specified levels)
- Parameters
-
[in] | d_temp_storage | Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to temp_storage_bytes and no work is done. |
[in,out] | temp_storage_bytes | Reference to size in bytes of d_temp_storage allocation |
[in] | d_samples | The pointer to the input sequence of sample items. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples). |
[out] | d_output_histograms | The pointers to the histogram counter output arrays, one for each active channel. For channeli, the allocation length of d_histograms[i] should be num_output_levels[i] - 1. |
[in] | num_output_levels | The number of bin level boundaries for delineating histogram samples in each active channel. Implies that the number of bins for channeli is num_output_levels[i] - 1. |
[in] | lower_level | The lower sample value bound (inclusive) for the lowest histogram bin in each active channel. |
[in] | upper_level | The upper sample value bound (exclusive) for the highest histogram bin in each active channel. |
[in] | num_row_pixels | The number of multi-channel pixels per row in the region of interest |
[in] | num_rows | The number of rows in the region of interest |
[in] | row_stride_samples | The number of samples between starts of consecutive rows in the region of interest |
[in] | stream | CUDA stream to launch kernels within. Default is stream0. |
[in] | debug_synchronous | Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is false . |
[in] | is_byte_sample | Marker type indicating whether or not SampleT is a 8b type |
Definition at line 1008 of file dispatch_histogram.cuh.
template<int NUM_CHANNELS, int NUM_ACTIVE_CHANNELS, typename SampleIteratorT , typename CounterT , typename LevelT , typename OffsetT >
static CUB_RUNTIME_FUNCTION cudaError_t cub::DipatchHistogram< NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, LevelT, OffsetT >::DispatchRange |
( |
void * |
d_temp_storage, |
|
|
size_t & |
temp_storage_bytes, |
|
|
SampleIteratorT |
d_samples, |
|
|
CounterT * |
d_output_histograms[NUM_ACTIVE_CHANNELS], |
|
|
int |
num_output_levels[NUM_ACTIVE_CHANNELS], |
|
|
LevelT * |
d_levels[NUM_ACTIVE_CHANNELS], |
|
|
OffsetT |
num_row_pixels, |
|
|
OffsetT |
num_rows, |
|
|
OffsetT |
row_stride_samples, |
|
|
cudaStream_t |
stream, |
|
|
bool |
debug_synchronous, |
|
|
Int2Type< false > |
is_byte_sample |
|
) |
| |
|
inlinestatic |
Dispatch routine for HistogramRange, specialized for sample types larger than 8bit
- Parameters
-
[in] | d_temp_storage | Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to temp_storage_bytes and no work is done. |
[in,out] | temp_storage_bytes | Reference to size in bytes of d_temp_storage allocation |
[in] | d_samples | The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples). |
[out] | d_output_histograms | The pointers to the histogram counter output arrays, one for each active channel. For channeli, the allocation length of d_histograms[i] should be num_output_levels[i] - 1. |
[in] | num_output_levels | The number of boundaries (levels) for delineating histogram samples in each active channel. Implies that the number of bins for channeli is num_output_levels[i] - 1. |
[in] | d_levels | The pointers to the arrays of boundaries (levels), one for each active channel. Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are inclusive and upper sample value boundaries are exclusive. |
[in] | num_row_pixels | The number of multi-channel pixels per row in the region of interest |
[in] | num_rows | The number of rows in the region of interest |
[in] | row_stride_samples | The number of samples between starts of consecutive rows in the region of interest |
[in] | stream | CUDA stream to launch kernels within. Default is stream0. |
[in] | debug_synchronous | Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is false . |
[in] | is_byte_sample | Marker type indicating whether or not SampleT is a 8b type |
Definition at line 706 of file dispatch_histogram.cuh.
template<int NUM_CHANNELS, int NUM_ACTIVE_CHANNELS, typename SampleIteratorT , typename CounterT , typename LevelT , typename OffsetT >
static CUB_RUNTIME_FUNCTION cudaError_t cub::DipatchHistogram< NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, LevelT, OffsetT >::DispatchRange |
( |
void * |
d_temp_storage, |
|
|
size_t & |
temp_storage_bytes, |
|
|
SampleIteratorT |
d_samples, |
|
|
CounterT * |
d_output_histograms[NUM_ACTIVE_CHANNELS], |
|
|
int |
num_output_levels[NUM_ACTIVE_CHANNELS], |
|
|
LevelT * |
d_levels[NUM_ACTIVE_CHANNELS], |
|
|
OffsetT |
num_row_pixels, |
|
|
OffsetT |
num_rows, |
|
|
OffsetT |
row_stride_samples, |
|
|
cudaStream_t |
stream, |
|
|
bool |
debug_synchronous, |
|
|
Int2Type< true > |
is_byte_sample |
|
) |
| |
|
inlinestatic |
Dispatch routine for HistogramRange, specialized for 8-bit sample types (computes 256-bin privatized histograms and then reduces to user-specified levels)
- Parameters
-
[in] | d_temp_storage | Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to temp_storage_bytes and no work is done. |
[in,out] | temp_storage_bytes | Reference to size in bytes of d_temp_storage allocation |
[in] | d_samples | The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples). |
[out] | d_output_histograms | The pointers to the histogram counter output arrays, one for each active channel. For channeli, the allocation length of d_histograms[i] should be num_output_levels[i] - 1. |
[in] | num_output_levels | The number of boundaries (levels) for delineating histogram samples in each active channel. Implies that the number of bins for channeli is num_output_levels[i] - 1. |
[in] | d_levels | The pointers to the arrays of boundaries (levels), one for each active channel. Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are inclusive and upper sample value boundaries are exclusive. |
[in] | num_row_pixels | The number of multi-channel pixels per row in the region of interest |
[in] | num_rows | The number of rows in the region of interest |
[in] | row_stride_samples | The number of samples between starts of consecutive rows in the region of interest |
[in] | stream | CUDA stream to launch kernels within. Default is stream0. |
[in] | debug_synchronous | Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is false . |
[in] | is_byte_sample | Marker type indicating whether or not SampleT is a 8b type |
Definition at line 814 of file dispatch_histogram.cuh.
template<int NUM_CHANNELS, int NUM_ACTIVE_CHANNELS, typename SampleIteratorT , typename CounterT , typename LevelT , typename OffsetT >
template<typename KernelConfig >
CUB_RUNTIME_FUNCTION static __forceinline__ cudaError_t cub::DipatchHistogram< NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, LevelT, OffsetT >::InitConfigs |
( |
int |
ptx_version, |
|
|
KernelConfig & |
histogram_sweep_config |
|
) |
| |
|
inlinestatic |
Initialize kernel dispatch configurations with the policies corresponding to the PTX assembly we will use
Definition at line 454 of file dispatch_histogram.cuh.
template<int NUM_CHANNELS, int NUM_ACTIVE_CHANNELS, typename SampleIteratorT , typename CounterT , typename LevelT , typename OffsetT >
template<typename PrivatizedDecodeOpT , typename OutputDecodeOpT , typename DeviceHistogramInitKernelT , typename DeviceHistogramSweepKernelT >
CUB_RUNTIME_FUNCTION static __forceinline__ cudaError_t cub::DipatchHistogram< NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, LevelT, OffsetT >::PrivatizedDispatch |
( |
void * |
d_temp_storage, |
|
|
size_t & |
temp_storage_bytes, |
|
|
SampleIteratorT |
d_samples, |
|
|
CounterT * |
d_output_histograms[NUM_ACTIVE_CHANNELS], |
|
|
int |
num_privatized_levels[NUM_ACTIVE_CHANNELS], |
|
|
PrivatizedDecodeOpT |
privatized_decode_op[NUM_ACTIVE_CHANNELS], |
|
|
int |
num_output_levels[NUM_ACTIVE_CHANNELS], |
|
|
OutputDecodeOpT |
output_decode_op[NUM_ACTIVE_CHANNELS], |
|
|
int |
max_num_output_bins, |
|
|
OffsetT |
num_row_pixels, |
|
|
OffsetT |
num_rows, |
|
|
OffsetT |
row_stride_samples, |
|
|
DeviceHistogramInitKernelT |
histogram_init_kernel, |
|
|
DeviceHistogramSweepKernelT |
histogram_sweep_kernel, |
|
|
KernelConfig |
histogram_sweep_config, |
|
|
cudaStream_t |
stream, |
|
|
bool |
debug_synchronous |
|
) |
| |
|
inlinestatic |
< Function type of cub::DeviceHistogramSweepKernel
Privatization-based dispatch routine
- Parameters
-
[in] | d_temp_storage | Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to temp_storage_bytes and no work is done. |
[in,out] | temp_storage_bytes | Reference to size in bytes of d_temp_storage allocation |
[in] | d_samples | The pointer to the input sequence of sample items. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples). |
[out] | d_output_histograms | The pointers to the histogram counter output arrays, one for each active channel. For channeli, the allocation length of d_histograms[i] should be num_output_levels[i] - 1. |
[in] | num_privatized_levels | The number of bin level boundaries for delineating histogram samples in each active channel. Implies that the number of bins for channeli is num_output_levels[i] - 1. |
[in] | privatized_decode_op | Transform operators for determining bin-ids from samples, one for each channel |
[in] | num_output_levels | The number of bin level boundaries for delineating histogram samples in each active channel. Implies that the number of bins for channeli is num_output_levels[i] - 1. |
[in] | output_decode_op | Transform operators for determining bin-ids from samples, one for each channel |
[in] | max_num_output_bins | Maximum number of output bins in any channel |
[in] | num_row_pixels | The number of multi-channel pixels per row in the region of interest |
[in] | num_rows | The number of rows in the region of interest |
[in] | row_stride_samples | The number of samples between starts of consecutive rows in the region of interest |
[in] | histogram_init_kernel | Kernel function pointer to parameterization of cub::DeviceHistogramInitKernel |
[in] | histogram_sweep_kernel | Kernel function pointer to parameterization of cub::DeviceHistogramSweepKernel |
[in] | histogram_sweep_config | Dispatch parameters that match the policy that histogram_sweep_kernel was compiled for |
[in] | stream | CUDA stream to launch kernels within. Default is stream0. |
[in] | debug_synchronous | Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is false . |
Definition at line 529 of file dispatch_histogram.cuh.