OpenFPM_pdata  4.1.0
Project that contains the implementation of distributed structures
 
Loading...
Searching...
No Matches
cub::DipatchHistogram< NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, LevelT, OffsetT > Struct Template Reference

OffsetT: Signed integer type for global offsets. More...

Detailed Description

template<int NUM_CHANNELS, int NUM_ACTIVE_CHANNELS, typename SampleIteratorT, typename CounterT, typename LevelT, typename OffsetT>
struct cub::DipatchHistogram< NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, LevelT, OffsetT >

Template parameter OffsetT: Signed integer type for global offsets.

Utility class for dispatching the appropriately-tuned kernels for DeviceHistogram

Definition at line 175 of file dispatch_histogram.cuh.

Data Structures

struct  KernelConfig
 
struct  PassThruTransform
 
struct  Policy110
 SM11. More...
 
struct  Policy200
 SM20. More...
 
struct  Policy300
 SM30. More...
 
struct  Policy350
 SM35. More...
 
struct  Policy500
 SM50. More...
 
struct  PtxHistogramSweepPolicy
 
struct  ScaleTransform
 
struct  SearchTransform
 
struct  TScale
 

Public Types

enum  { MAX_PRIVATIZED_SMEM_BINS = 256 }
 
typedef std::iterator_traits< SampleIteratorT >::value_type SampleT
 The sample value type of the input iterator.
 
typedef Policy110 PtxPolicy
 

Static Public Member Functions

template<typename KernelConfig >
CUB_RUNTIME_FUNCTION static __forceinline__ cudaError_t InitConfigs (int ptx_version, KernelConfig &histogram_sweep_config)
 
template<typename PrivatizedDecodeOpT , typename OutputDecodeOpT , typename DeviceHistogramInitKernelT , typename DeviceHistogramSweepKernelT >
CUB_RUNTIME_FUNCTION static __forceinline__ cudaError_t PrivatizedDispatch (void *d_temp_storage, size_t &temp_storage_bytes, SampleIteratorT d_samples, CounterT *d_output_histograms[NUM_ACTIVE_CHANNELS], int num_privatized_levels[NUM_ACTIVE_CHANNELS], PrivatizedDecodeOpT privatized_decode_op[NUM_ACTIVE_CHANNELS], int num_output_levels[NUM_ACTIVE_CHANNELS], OutputDecodeOpT output_decode_op[NUM_ACTIVE_CHANNELS], int max_num_output_bins, OffsetT num_row_pixels, OffsetT num_rows, OffsetT row_stride_samples, DeviceHistogramInitKernelT histogram_init_kernel, DeviceHistogramSweepKernelT histogram_sweep_kernel, KernelConfig histogram_sweep_config, cudaStream_t stream, bool debug_synchronous)
 Template parameter DeviceHistogramSweepKernelT: Function type of cub::DeviceHistogramSweepKernel
 
static CUB_RUNTIME_FUNCTION cudaError_t DispatchRange (void *d_temp_storage, size_t &temp_storage_bytes, SampleIteratorT d_samples, CounterT *d_output_histograms[NUM_ACTIVE_CHANNELS], int num_output_levels[NUM_ACTIVE_CHANNELS], LevelT *d_levels[NUM_ACTIVE_CHANNELS], OffsetT num_row_pixels, OffsetT num_rows, OffsetT row_stride_samples, cudaStream_t stream, bool debug_synchronous, Int2Type< false > is_byte_sample)
 
static CUB_RUNTIME_FUNCTION cudaError_t DispatchRange (void *d_temp_storage, size_t &temp_storage_bytes, SampleIteratorT d_samples, CounterT *d_output_histograms[NUM_ACTIVE_CHANNELS], int num_output_levels[NUM_ACTIVE_CHANNELS], LevelT *d_levels[NUM_ACTIVE_CHANNELS], OffsetT num_row_pixels, OffsetT num_rows, OffsetT row_stride_samples, cudaStream_t stream, bool debug_synchronous, Int2Type< true > is_byte_sample)
 
CUB_RUNTIME_FUNCTION static __forceinline__ cudaError_t DispatchEven (void *d_temp_storage, size_t &temp_storage_bytes, SampleIteratorT d_samples, CounterT *d_output_histograms[NUM_ACTIVE_CHANNELS], int num_output_levels[NUM_ACTIVE_CHANNELS], LevelT lower_level[NUM_ACTIVE_CHANNELS], LevelT upper_level[NUM_ACTIVE_CHANNELS], OffsetT num_row_pixels, OffsetT num_rows, OffsetT row_stride_samples, cudaStream_t stream, bool debug_synchronous, Int2Type< false > is_byte_sample)
 
CUB_RUNTIME_FUNCTION static __forceinline__ cudaError_t DispatchEven (void *d_temp_storage, size_t &temp_storage_bytes, SampleIteratorT d_samples, CounterT *d_output_histograms[NUM_ACTIVE_CHANNELS], int num_output_levels[NUM_ACTIVE_CHANNELS], LevelT lower_level[NUM_ACTIVE_CHANNELS], LevelT upper_level[NUM_ACTIVE_CHANNELS], OffsetT num_row_pixels, OffsetT num_rows, OffsetT row_stride_samples, cudaStream_t stream, bool debug_synchronous, Int2Type< true > is_byte_sample)
 

Member Typedef Documentation

◆ PtxPolicy

template<int NUM_CHANNELS, int NUM_ACTIVE_CHANNELS, typename SampleIteratorT , typename CounterT , typename LevelT , typename OffsetT >
typedef Policy110 cub::DipatchHistogram< NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, LevelT, OffsetT >::PtxPolicy

Definition at line 437 of file dispatch_histogram.cuh.

◆ SampleT

template<int NUM_CHANNELS, int NUM_ACTIVE_CHANNELS, typename SampleIteratorT , typename CounterT , typename LevelT , typename OffsetT >
typedef std::iterator_traits<SampleIteratorT>::value_type cub::DipatchHistogram< NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, LevelT, OffsetT >::SampleT

The sample value type of the input iterator.

Definition at line 182 of file dispatch_histogram.cuh.

Member Enumeration Documentation

◆ anonymous enum

template<int NUM_CHANNELS, int NUM_ACTIVE_CHANNELS, typename SampleIteratorT , typename CounterT , typename LevelT , typename OffsetT >
anonymous enum

Definition at line 184 of file dispatch_histogram.cuh.

Member Function Documentation

◆ DispatchEven() [1/2]

template<int NUM_CHANNELS, int NUM_ACTIVE_CHANNELS, typename SampleIteratorT , typename CounterT , typename LevelT , typename OffsetT >
CUB_RUNTIME_FUNCTION static __forceinline__ cudaError_t cub::DipatchHistogram< NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, LevelT, OffsetT >::DispatchEven ( void *  d_temp_storage,
size_t &  temp_storage_bytes,
SampleIteratorT  d_samples,
CounterT *  d_output_histograms[NUM_ACTIVE_CHANNELS],
int  num_output_levels[NUM_ACTIVE_CHANNELS],
LevelT  lower_level[NUM_ACTIVE_CHANNELS],
LevelT  upper_level[NUM_ACTIVE_CHANNELS],
OffsetT  num_row_pixels,
OffsetT  num_rows,
OffsetT  row_stride_samples,
cudaStream_t  stream,
bool  debug_synchronous,
Int2Type< false >  is_byte_sample 
)
inline static

Dispatch routine for HistogramEven, specialized for sample types larger than 8-bit

Parameters
[in] d_temp_storage — Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to temp_storage_bytes and no work is done.
[in,out] temp_storage_bytes — Reference to size in bytes of d_temp_storage allocation
[in] d_samples — The pointer to the input sequence of sample items. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples).
[out] d_output_histograms — The pointers to the histogram counter output arrays, one for each active channel. For channel i, the allocation length of d_output_histograms[i] should be num_output_levels[i] - 1.
[in] num_output_levels — The number of bin level boundaries for delineating histogram samples in each active channel. Implies that the number of bins for channel i is num_output_levels[i] - 1.
[in] lower_level — The lower sample value bound (inclusive) for the lowest histogram bin in each active channel.
[in] upper_level — The upper sample value bound (exclusive) for the highest histogram bin in each active channel.
[in] num_row_pixels — The number of multi-channel pixels per row in the region of interest
[in] num_rows — The number of rows in the region of interest
[in] row_stride_samples — The number of samples between starts of consecutive rows in the region of interest
[in] stream — CUDA stream to launch kernels within. Default is stream 0.
[in] debug_synchronous — Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is false.
[in] is_byte_sample — Marker type indicating whether or not SampleT is an 8-bit type

Definition at line 896 of file dispatch_histogram.cuh.

◆ DispatchEven() [2/2]

template<int NUM_CHANNELS, int NUM_ACTIVE_CHANNELS, typename SampleIteratorT , typename CounterT , typename LevelT , typename OffsetT >
CUB_RUNTIME_FUNCTION static __forceinline__ cudaError_t cub::DipatchHistogram< NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, LevelT, OffsetT >::DispatchEven ( void *  d_temp_storage,
size_t &  temp_storage_bytes,
SampleIteratorT  d_samples,
CounterT *  d_output_histograms[NUM_ACTIVE_CHANNELS],
int  num_output_levels[NUM_ACTIVE_CHANNELS],
LevelT  lower_level[NUM_ACTIVE_CHANNELS],
LevelT  upper_level[NUM_ACTIVE_CHANNELS],
OffsetT  num_row_pixels,
OffsetT  num_rows,
OffsetT  row_stride_samples,
cudaStream_t  stream,
bool  debug_synchronous,
Int2Type< true >  is_byte_sample 
)
inline static

Dispatch routine for HistogramEven, specialized for 8-bit sample types (computes 256-bin privatized histograms and then reduces to user-specified levels)

Parameters
[in] d_temp_storage — Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to temp_storage_bytes and no work is done.
[in,out] temp_storage_bytes — Reference to size in bytes of d_temp_storage allocation
[in] d_samples — The pointer to the input sequence of sample items. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples).
[out] d_output_histograms — The pointers to the histogram counter output arrays, one for each active channel. For channel i, the allocation length of d_output_histograms[i] should be num_output_levels[i] - 1.
[in] num_output_levels — The number of bin level boundaries for delineating histogram samples in each active channel. Implies that the number of bins for channel i is num_output_levels[i] - 1.
[in] lower_level — The lower sample value bound (inclusive) for the lowest histogram bin in each active channel.
[in] upper_level — The upper sample value bound (exclusive) for the highest histogram bin in each active channel.
[in] num_row_pixels — The number of multi-channel pixels per row in the region of interest
[in] num_rows — The number of rows in the region of interest
[in] row_stride_samples — The number of samples between starts of consecutive rows in the region of interest
[in] stream — CUDA stream to launch kernels within. Default is stream 0.
[in] debug_synchronous — Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is false.
[in] is_byte_sample — Marker type indicating whether or not SampleT is an 8-bit type

Definition at line 1008 of file dispatch_histogram.cuh.

◆ DispatchRange() [1/2]

template<int NUM_CHANNELS, int NUM_ACTIVE_CHANNELS, typename SampleIteratorT , typename CounterT , typename LevelT , typename OffsetT >
static CUB_RUNTIME_FUNCTION cudaError_t cub::DipatchHistogram< NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, LevelT, OffsetT >::DispatchRange ( void *  d_temp_storage,
size_t &  temp_storage_bytes,
SampleIteratorT  d_samples,
CounterT *  d_output_histograms[NUM_ACTIVE_CHANNELS],
int  num_output_levels[NUM_ACTIVE_CHANNELS],
LevelT *  d_levels[NUM_ACTIVE_CHANNELS],
OffsetT  num_row_pixels,
OffsetT  num_rows,
OffsetT  row_stride_samples,
cudaStream_t  stream,
bool  debug_synchronous,
Int2Type< false >  is_byte_sample 
)
inline static

Dispatch routine for HistogramRange, specialized for sample types larger than 8-bit

Parameters
[in] d_temp_storage — Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to temp_storage_bytes and no work is done.
[in,out] temp_storage_bytes — Reference to size in bytes of d_temp_storage allocation
[in] d_samples — The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples).
[out] d_output_histograms — The pointers to the histogram counter output arrays, one for each active channel. For channel i, the allocation length of d_output_histograms[i] should be num_output_levels[i] - 1.
[in] num_output_levels — The number of boundaries (levels) for delineating histogram samples in each active channel. Implies that the number of bins for channel i is num_output_levels[i] - 1.
[in] d_levels — The pointers to the arrays of boundaries (levels), one for each active channel. Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are inclusive and upper sample value boundaries are exclusive.
[in] num_row_pixels — The number of multi-channel pixels per row in the region of interest
[in] num_rows — The number of rows in the region of interest
[in] row_stride_samples — The number of samples between starts of consecutive rows in the region of interest
[in] stream — CUDA stream to launch kernels within. Default is stream 0.
[in] debug_synchronous — Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is false.
[in] is_byte_sample — Marker type indicating whether or not SampleT is an 8-bit type

Definition at line 706 of file dispatch_histogram.cuh.

◆ DispatchRange() [2/2]

template<int NUM_CHANNELS, int NUM_ACTIVE_CHANNELS, typename SampleIteratorT , typename CounterT , typename LevelT , typename OffsetT >
static CUB_RUNTIME_FUNCTION cudaError_t cub::DipatchHistogram< NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, LevelT, OffsetT >::DispatchRange ( void *  d_temp_storage,
size_t &  temp_storage_bytes,
SampleIteratorT  d_samples,
CounterT *  d_output_histograms[NUM_ACTIVE_CHANNELS],
int  num_output_levels[NUM_ACTIVE_CHANNELS],
LevelT *  d_levels[NUM_ACTIVE_CHANNELS],
OffsetT  num_row_pixels,
OffsetT  num_rows,
OffsetT  row_stride_samples,
cudaStream_t  stream,
bool  debug_synchronous,
Int2Type< true >  is_byte_sample 
)
inline static

Dispatch routine for HistogramRange, specialized for 8-bit sample types (computes 256-bin privatized histograms and then reduces to user-specified levels)

Parameters
[in] d_temp_storage — Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to temp_storage_bytes and no work is done.
[in,out] temp_storage_bytes — Reference to size in bytes of d_temp_storage allocation
[in] d_samples — The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples).
[out] d_output_histograms — The pointers to the histogram counter output arrays, one for each active channel. For channel i, the allocation length of d_output_histograms[i] should be num_output_levels[i] - 1.
[in] num_output_levels — The number of boundaries (levels) for delineating histogram samples in each active channel. Implies that the number of bins for channel i is num_output_levels[i] - 1.
[in] d_levels — The pointers to the arrays of boundaries (levels), one for each active channel. Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are inclusive and upper sample value boundaries are exclusive.
[in] num_row_pixels — The number of multi-channel pixels per row in the region of interest
[in] num_rows — The number of rows in the region of interest
[in] row_stride_samples — The number of samples between starts of consecutive rows in the region of interest
[in] stream — CUDA stream to launch kernels within. Default is stream 0.
[in] debug_synchronous — Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is false.
[in] is_byte_sample — Marker type indicating whether or not SampleT is an 8-bit type

Definition at line 814 of file dispatch_histogram.cuh.

◆ InitConfigs()

template<int NUM_CHANNELS, int NUM_ACTIVE_CHANNELS, typename SampleIteratorT , typename CounterT , typename LevelT , typename OffsetT >
template<typename KernelConfig >
CUB_RUNTIME_FUNCTION static __forceinline__ cudaError_t cub::DipatchHistogram< NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, LevelT, OffsetT >::InitConfigs ( int  ptx_version,
KernelConfig histogram_sweep_config 
)
inline static

Initialize kernel dispatch configurations with the policies corresponding to the PTX assembly we will use

Definition at line 454 of file dispatch_histogram.cuh.

◆ PrivatizedDispatch()

template<int NUM_CHANNELS, int NUM_ACTIVE_CHANNELS, typename SampleIteratorT , typename CounterT , typename LevelT , typename OffsetT >
template<typename PrivatizedDecodeOpT , typename OutputDecodeOpT , typename DeviceHistogramInitKernelT , typename DeviceHistogramSweepKernelT >
CUB_RUNTIME_FUNCTION static __forceinline__ cudaError_t cub::DipatchHistogram< NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, LevelT, OffsetT >::PrivatizedDispatch ( void *  d_temp_storage,
size_t &  temp_storage_bytes,
SampleIteratorT  d_samples,
CounterT *  d_output_histograms[NUM_ACTIVE_CHANNELS],
int  num_privatized_levels[NUM_ACTIVE_CHANNELS],
PrivatizedDecodeOpT  privatized_decode_op[NUM_ACTIVE_CHANNELS],
int  num_output_levels[NUM_ACTIVE_CHANNELS],
OutputDecodeOpT  output_decode_op[NUM_ACTIVE_CHANNELS],
int  max_num_output_bins,
OffsetT  num_row_pixels,
OffsetT  num_rows,
OffsetT  row_stride_samples,
DeviceHistogramInitKernelT  histogram_init_kernel,
DeviceHistogramSweepKernelT  histogram_sweep_kernel,
KernelConfig  histogram_sweep_config,
cudaStream_t  stream,
bool  debug_synchronous 
)
inline static

Template parameter DeviceHistogramSweepKernelT: Function type of cub::DeviceHistogramSweepKernel

Privatization-based dispatch routine

Parameters
[in] d_temp_storage — Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to temp_storage_bytes and no work is done.
[in,out] temp_storage_bytes — Reference to size in bytes of d_temp_storage allocation
[in] d_samples — The pointer to the input sequence of sample items. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples).
[out] d_output_histograms — The pointers to the histogram counter output arrays, one for each active channel. For channel i, the allocation length of d_output_histograms[i] should be num_output_levels[i] - 1.
[in] num_privatized_levels — The number of bin level boundaries for delineating histogram samples in each active channel. Implies that the number of bins for channel i is num_privatized_levels[i] - 1.
[in] privatized_decode_op — Transform operators for determining bin-ids from samples, one for each channel
[in] num_output_levels — The number of bin level boundaries for delineating histogram samples in each active channel. Implies that the number of bins for channel i is num_output_levels[i] - 1.
[in] output_decode_op — Transform operators for determining bin-ids from samples, one for each channel
[in] max_num_output_bins — Maximum number of output bins in any channel
[in] num_row_pixels — The number of multi-channel pixels per row in the region of interest
[in] num_rows — The number of rows in the region of interest
[in] row_stride_samples — The number of samples between starts of consecutive rows in the region of interest
[in] histogram_init_kernel — Kernel function pointer to parameterization of cub::DeviceHistogramInitKernel
[in] histogram_sweep_kernel — Kernel function pointer to parameterization of cub::DeviceHistogramSweepKernel
[in] histogram_sweep_config — Dispatch parameters that match the policy that histogram_sweep_kernel was compiled for
[in] stream — CUDA stream to launch kernels within. Default is stream 0.
[in] debug_synchronous — Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is false.

Definition at line 529 of file dispatch_histogram.cuh.


The documentation for this struct was generated from the following file: