< Signed integer type for global offsets More...
< Signed integer type for global offsets
Utility class for dispatching the appropriately-tuned kernels for DeviceHistogram
Definition at line 175 of file dispatch_histogram.cuh.
Data Structures | |
struct | KernelConfig |
struct | PassThruTransform |
struct | Policy110 |
SM11. More... | |
struct | Policy200 |
SM20. More... | |
struct | Policy300 |
SM30. More... | |
struct | Policy350 |
SM35. More... | |
struct | Policy500 |
SM50. More... | |
struct | PtxHistogramSweepPolicy |
struct | ScaleTransform |
struct | SearchTransform |
struct | TScale |
Public Types | |
enum | { MAX_PRIVATIZED_SMEM_BINS = 256 } |
typedef std::iterator_traits< SampleIteratorT >::value_type | SampleT |
The sample value type of the input iterator. | |
typedef Policy110 | PtxPolicy |
Static Public Member Functions | |
template<typename KernelConfig > | |
CUB_RUNTIME_FUNCTION static __forceinline__ cudaError_t | InitConfigs (int ptx_version, KernelConfig &histogram_sweep_config) |
template<typename PrivatizedDecodeOpT , typename OutputDecodeOpT , typename DeviceHistogramInitKernelT , typename DeviceHistogramSweepKernelT > | |
CUB_RUNTIME_FUNCTION static __forceinline__ cudaError_t | PrivatizedDispatch (void *d_temp_storage, size_t &temp_storage_bytes, SampleIteratorT d_samples, CounterT *d_output_histograms[NUM_ACTIVE_CHANNELS], int num_privatized_levels[NUM_ACTIVE_CHANNELS], PrivatizedDecodeOpT privatized_decode_op[NUM_ACTIVE_CHANNELS], int num_output_levels[NUM_ACTIVE_CHANNELS], OutputDecodeOpT output_decode_op[NUM_ACTIVE_CHANNELS], int max_num_output_bins, OffsetT num_row_pixels, OffsetT num_rows, OffsetT row_stride_samples, DeviceHistogramInitKernelT histogram_init_kernel, DeviceHistogramSweepKernelT histogram_sweep_kernel, KernelConfig histogram_sweep_config, cudaStream_t stream, bool debug_synchronous) |
< Function type of cub::DeviceHistogramSweepKernel | |
static CUB_RUNTIME_FUNCTION cudaError_t | DispatchRange (void *d_temp_storage, size_t &temp_storage_bytes, SampleIteratorT d_samples, CounterT *d_output_histograms[NUM_ACTIVE_CHANNELS], int num_output_levels[NUM_ACTIVE_CHANNELS], LevelT *d_levels[NUM_ACTIVE_CHANNELS], OffsetT num_row_pixels, OffsetT num_rows, OffsetT row_stride_samples, cudaStream_t stream, bool debug_synchronous, Int2Type< false > is_byte_sample) |
static CUB_RUNTIME_FUNCTION cudaError_t | DispatchRange (void *d_temp_storage, size_t &temp_storage_bytes, SampleIteratorT d_samples, CounterT *d_output_histograms[NUM_ACTIVE_CHANNELS], int num_output_levels[NUM_ACTIVE_CHANNELS], LevelT *d_levels[NUM_ACTIVE_CHANNELS], OffsetT num_row_pixels, OffsetT num_rows, OffsetT row_stride_samples, cudaStream_t stream, bool debug_synchronous, Int2Type< true > is_byte_sample) |
CUB_RUNTIME_FUNCTION static __forceinline__ cudaError_t | DispatchEven (void *d_temp_storage, size_t &temp_storage_bytes, SampleIteratorT d_samples, CounterT *d_output_histograms[NUM_ACTIVE_CHANNELS], int num_output_levels[NUM_ACTIVE_CHANNELS], LevelT lower_level[NUM_ACTIVE_CHANNELS], LevelT upper_level[NUM_ACTIVE_CHANNELS], OffsetT num_row_pixels, OffsetT num_rows, OffsetT row_stride_samples, cudaStream_t stream, bool debug_synchronous, Int2Type< false > is_byte_sample) |
CUB_RUNTIME_FUNCTION static __forceinline__ cudaError_t | DispatchEven (void *d_temp_storage, size_t &temp_storage_bytes, SampleIteratorT d_samples, CounterT *d_output_histograms[NUM_ACTIVE_CHANNELS], int num_output_levels[NUM_ACTIVE_CHANNELS], LevelT lower_level[NUM_ACTIVE_CHANNELS], LevelT upper_level[NUM_ACTIVE_CHANNELS], OffsetT num_row_pixels, OffsetT num_rows, OffsetT row_stride_samples, cudaStream_t stream, bool debug_synchronous, Int2Type< true > is_byte_sample) |
typedef Policy110 cub::DipatchHistogram< NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, LevelT, OffsetT >::PtxPolicy |
Definition at line 437 of file dispatch_histogram.cuh.
typedef std::iterator_traits<SampleIteratorT>::value_type cub::DipatchHistogram< NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, LevelT, OffsetT >::SampleT |
The sample value type of the input iterator.
Definition at line 182 of file dispatch_histogram.cuh.
anonymous enum |
Definition at line 184 of file dispatch_histogram.cuh.
|
inlinestatic |
Dispatch routine for HistogramEven, specialized for sample types larger than 8-bit
[in] | d_temp_storage | Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to temp_storage_bytes and no work is done. |
[in,out] | temp_storage_bytes | Reference to size in bytes of d_temp_storage allocation |
[in] | d_samples | The pointer to the input sequence of sample items. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples). |
[out] | d_output_histograms | The pointers to the histogram counter output arrays, one for each active channel. For channeli, the allocation length of d_histograms[i] should be num_output_levels[i] - 1. |
[in] | num_output_levels | The number of bin level boundaries for delineating histogram samples in each active channel. Implies that the number of bins for channeli is num_output_levels[i] - 1. |
[in] | lower_level | The lower sample value bound (inclusive) for the lowest histogram bin in each active channel. |
[in] | upper_level | The upper sample value bound (exclusive) for the highest histogram bin in each active channel. |
[in] | num_row_pixels | The number of multi-channel pixels per row in the region of interest |
[in] | num_rows | The number of rows in the region of interest |
[in] | row_stride_samples | The number of samples between starts of consecutive rows in the region of interest |
[in] | stream | CUDA stream to launch kernels within. Default is stream0. |
[in] | debug_synchronous | Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is false . |
[in] | is_byte_sample | Marker type indicating whether or not SampleT is a 8b type |
Definition at line 896 of file dispatch_histogram.cuh.
|
inlinestatic |
Dispatch routine for HistogramEven, specialized for 8-bit sample types (computes 256-bin privatized histograms and then reduces to user-specified levels)
[in] | d_temp_storage | Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to temp_storage_bytes and no work is done. |
[in,out] | temp_storage_bytes | Reference to size in bytes of d_temp_storage allocation |
[in] | d_samples | The pointer to the input sequence of sample items. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples). |
[out] | d_output_histograms | The pointers to the histogram counter output arrays, one for each active channel. For channeli, the allocation length of d_histograms[i] should be num_output_levels[i] - 1. |
[in] | num_output_levels | The number of bin level boundaries for delineating histogram samples in each active channel. Implies that the number of bins for channeli is num_output_levels[i] - 1. |
[in] | lower_level | The lower sample value bound (inclusive) for the lowest histogram bin in each active channel. |
[in] | upper_level | The upper sample value bound (exclusive) for the highest histogram bin in each active channel. |
[in] | num_row_pixels | The number of multi-channel pixels per row in the region of interest |
[in] | num_rows | The number of rows in the region of interest |
[in] | row_stride_samples | The number of samples between starts of consecutive rows in the region of interest |
[in] | stream | CUDA stream to launch kernels within. Default is stream0. |
[in] | debug_synchronous | Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is false . |
[in] | is_byte_sample | Marker type indicating whether or not SampleT is a 8b type |
Definition at line 1008 of file dispatch_histogram.cuh.
|
inlinestatic |
Dispatch routine for HistogramRange, specialized for sample types larger than 8bit
[in] | d_temp_storage | Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to temp_storage_bytes and no work is done. |
[in,out] | temp_storage_bytes | Reference to size in bytes of d_temp_storage allocation |
[in] | d_samples | The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples). |
[out] | d_output_histograms | The pointers to the histogram counter output arrays, one for each active channel. For channeli, the allocation length of d_histograms[i] should be num_output_levels[i] - 1. |
[in] | num_output_levels | The number of boundaries (levels) for delineating histogram samples in each active channel. Implies that the number of bins for channeli is num_output_levels[i] - 1. |
[in] | d_levels | The pointers to the arrays of boundaries (levels), one for each active channel. Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are inclusive and upper sample value boundaries are exclusive. |
[in] | num_row_pixels | The number of multi-channel pixels per row in the region of interest |
[in] | num_rows | The number of rows in the region of interest |
[in] | row_stride_samples | The number of samples between starts of consecutive rows in the region of interest |
[in] | stream | CUDA stream to launch kernels within. Default is stream0. |
[in] | debug_synchronous | Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is false . |
[in] | is_byte_sample | Marker type indicating whether or not SampleT is a 8b type |
Definition at line 706 of file dispatch_histogram.cuh.
|
inlinestatic |
Dispatch routine for HistogramRange, specialized for 8-bit sample types (computes 256-bin privatized histograms and then reduces to user-specified levels)
[in] | d_temp_storage | Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to temp_storage_bytes and no work is done. |
[in,out] | temp_storage_bytes | Reference to size in bytes of d_temp_storage allocation |
[in] | d_samples | The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples). |
[out] | d_output_histograms | The pointers to the histogram counter output arrays, one for each active channel. For channeli, the allocation length of d_histograms[i] should be num_output_levels[i] - 1. |
[in] | num_output_levels | The number of boundaries (levels) for delineating histogram samples in each active channel. Implies that the number of bins for channeli is num_output_levels[i] - 1. |
[in] | d_levels | The pointers to the arrays of boundaries (levels), one for each active channel. Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are inclusive and upper sample value boundaries are exclusive. |
[in] | num_row_pixels | The number of multi-channel pixels per row in the region of interest |
[in] | num_rows | The number of rows in the region of interest |
[in] | row_stride_samples | The number of samples between starts of consecutive rows in the region of interest |
[in] | stream | CUDA stream to launch kernels within. Default is stream0. |
[in] | debug_synchronous | Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is false . |
[in] | is_byte_sample | Marker type indicating whether or not SampleT is a 8b type |
Definition at line 814 of file dispatch_histogram.cuh.
|
inlinestatic |
Initialize kernel dispatch configurations with the policies corresponding to the PTX assembly we will use
Definition at line 454 of file dispatch_histogram.cuh.
|
inlinestatic |
< Function type of cub::DeviceHistogramSweepKernel
Privatization-based dispatch routine
[in] | d_temp_storage | Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to temp_storage_bytes and no work is done. |
[in,out] | temp_storage_bytes | Reference to size in bytes of d_temp_storage allocation |
[in] | d_samples | The pointer to the input sequence of sample items. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples). |
[out] | d_output_histograms | The pointers to the histogram counter output arrays, one for each active channel. For channeli, the allocation length of d_histograms[i] should be num_output_levels[i] - 1. |
[in] | num_privatized_levels | The number of bin level boundaries for delineating histogram samples in each active channel. Implies that the number of bins for channeli is num_output_levels[i] - 1. |
[in] | privatized_decode_op | Transform operators for determining bin-ids from samples, one for each channel |
[in] | num_output_levels | The number of bin level boundaries for delineating histogram samples in each active channel. Implies that the number of bins for channeli is num_output_levels[i] - 1. |
[in] | output_decode_op | Transform operators for determining bin-ids from samples, one for each channel |
[in] | max_num_output_bins | Maximum number of output bins in any channel |
[in] | num_row_pixels | The number of multi-channel pixels per row in the region of interest |
[in] | num_rows | The number of rows in the region of interest |
[in] | row_stride_samples | The number of samples between starts of consecutive rows in the region of interest |
[in] | histogram_init_kernel | Kernel function pointer to parameterization of cub::DeviceHistogramInitKernel |
[in] | histogram_sweep_kernel | Kernel function pointer to parameterization of cub::DeviceHistogramSweepKernel |
[in] | histogram_sweep_config | Dispatch parameters that match the policy that histogram_sweep_kernel was compiled for |
[in] | stream | CUDA stream to launch kernels within. Default is stream0. |
[in] | debug_synchronous | Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is false . |
Definition at line 529 of file dispatch_histogram.cuh.