template<typename AgentHistogramPolicyT, int PRIVATIZED_SMEM_BINS, int NUM_CHANNELS, int NUM_ACTIVE_CHANNELS, typename SampleIteratorT, typename CounterT, typename PrivatizedDecodeOpT, typename OutputDecodeOpT, typename OffsetT, int PTX_ARCH = CUB_PTX_ARCH>
struct cub::AgentHistogram< AgentHistogramPolicyT, PRIVATIZED_SMEM_BINS, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, PrivatizedDecodeOpT, OutputDecodeOpT, OffsetT, PTX_ARCH >
AgentHistogram implements a stateful abstraction of CUDA thread blocks for participating in device-wide histogram computation.
PTX_ARCH: PTX compute capability.
Definition at line 111 of file agent_histogram.cuh.
|
enum | {
BLOCK_THREADS = AgentHistogramPolicyT::BLOCK_THREADS,
PIXELS_PER_THREAD = AgentHistogramPolicyT::PIXELS_PER_THREAD,
SAMPLES_PER_THREAD = PIXELS_PER_THREAD * NUM_CHANNELS,
QUADS_PER_THREAD = SAMPLES_PER_THREAD / 4,
TILE_PIXELS = PIXELS_PER_THREAD * BLOCK_THREADS,
TILE_SAMPLES = SAMPLES_PER_THREAD * BLOCK_THREADS,
IS_RLE_COMPRESS = AgentHistogramPolicyT::IS_RLE_COMPRESS,
MEM_PREFERENCE,
IS_WORK_STEALING = AgentHistogramPolicyT::IS_WORK_STEALING
} |
| Constants.
|
|
typedef std::iterator_traits< SampleIteratorT >::value_type | SampleT |
| The sample type of the input iterator.
|
|
typedef CubVector< SampleT, NUM_CHANNELS >::Type | PixelT |
| The pixel type of SampleT.
|
|
typedef CubVector< SampleT, 4 >::Type | QuadT |
| The quad type of SampleT.
|
|
typedef If< IsPointer< SampleIteratorT >::VALUE, CacheModifiedInputIterator< LOAD_MODIFIER, SampleT, OffsetT >, SampleIteratorT >::Type | WrappedSampleIteratorT |
| Input iterator wrapper type (for applying cache modifier)
|
|
typedef CacheModifiedInputIterator< LOAD_MODIFIER, PixelT, OffsetT > | WrappedPixelIteratorT |
| Pixel input iterator type (for applying cache modifier)
|
|
typedef CacheModifiedInputIterator< LOAD_MODIFIER, QuadT, OffsetT > | WrappedQuadIteratorT |
| Quad input iterator type (for applying cache modifier)
|
|
typedef BlockLoad< SampleT, BLOCK_THREADS, SAMPLES_PER_THREAD, AgentHistogramPolicyT::LOAD_ALGORITHM > | BlockLoadSampleT |
| Parameterized BlockLoad type for samples.
|
|
typedef BlockLoad< PixelT, BLOCK_THREADS, PIXELS_PER_THREAD, AgentHistogramPolicyT::LOAD_ALGORITHM > | BlockLoadPixelT |
| Parameterized BlockLoad type for pixels.
|
|
typedef BlockLoad< QuadT, BLOCK_THREADS, QUADS_PER_THREAD, AgentHistogramPolicyT::LOAD_ALGORITHM > | BlockLoadQuadT |
| Parameterized BlockLoad type for quads.
|
|
|
__device__ __forceinline__ void | InitBinCounters (CounterT *privatized_histograms[NUM_ACTIVE_CHANNELS]) |
|
__device__ __forceinline__ void | InitSmemBinCounters () |
|
__device__ __forceinline__ void | InitGmemBinCounters () |
|
__device__ __forceinline__ void | StoreOutput (CounterT *privatized_histograms[NUM_ACTIVE_CHANNELS]) |
|
__device__ __forceinline__ void | StoreSmemOutput () |
|
__device__ __forceinline__ void | StoreGmemOutput () |
|
__device__ __forceinline__ void | AccumulatePixels (SampleT samples[PIXELS_PER_THREAD][NUM_CHANNELS], bool is_valid[PIXELS_PER_THREAD], CounterT *privatized_histograms[NUM_ACTIVE_CHANNELS], Int2Type< true > is_rle_compress) |
|
__device__ __forceinline__ void | AccumulatePixels (SampleT samples[PIXELS_PER_THREAD][NUM_CHANNELS], bool is_valid[PIXELS_PER_THREAD], CounterT *privatized_histograms[NUM_ACTIVE_CHANNELS], Int2Type< false > is_rle_compress) |
|
__device__ __forceinline__ void | AccumulateSmemPixels (SampleT samples[PIXELS_PER_THREAD][NUM_CHANNELS], bool is_valid[PIXELS_PER_THREAD]) |
|
__device__ __forceinline__ void | AccumulateGmemPixels (SampleT samples[PIXELS_PER_THREAD][NUM_CHANNELS], bool is_valid[PIXELS_PER_THREAD]) |
|
template<int _NUM_ACTIVE_CHANNELS> |
__device__ __forceinline__ void | LoadFullAlignedTile (OffsetT block_offset, int valid_samples, SampleT(&samples)[PIXELS_PER_THREAD][NUM_CHANNELS], Int2Type< _NUM_ACTIVE_CHANNELS > num_active_channels) |
|
__device__ __forceinline__ void | LoadFullAlignedTile (OffsetT block_offset, int valid_samples, SampleT(&samples)[PIXELS_PER_THREAD][NUM_CHANNELS], Int2Type< 1 > num_active_channels) |
|
__device__ __forceinline__ void | LoadTile (OffsetT block_offset, int valid_samples, SampleT(&samples)[PIXELS_PER_THREAD][NUM_CHANNELS], Int2Type< true > is_full_tile, Int2Type< true > is_aligned) |
|
__device__ __forceinline__ void | LoadTile (OffsetT block_offset, int valid_samples, SampleT(&samples)[PIXELS_PER_THREAD][NUM_CHANNELS], Int2Type< true > is_full_tile, Int2Type< false > is_aligned) |
|
__device__ __forceinline__ void | LoadTile (OffsetT block_offset, int valid_samples, SampleT(&samples)[PIXELS_PER_THREAD][NUM_CHANNELS], Int2Type< false > is_full_tile, Int2Type< true > is_aligned) |
|
__device__ __forceinline__ void | LoadTile (OffsetT block_offset, int valid_samples, SampleT(&samples)[PIXELS_PER_THREAD][NUM_CHANNELS], Int2Type< false > is_full_tile, Int2Type< false > is_aligned) |
|
template<bool IS_ALIGNED, bool IS_FULL_TILE> |
__device__ __forceinline__ void | ConsumeTile (OffsetT block_offset, int valid_samples) |
|
template<bool IS_ALIGNED> |
__device__ __forceinline__ void | ConsumeTiles (OffsetT num_row_pixels, OffsetT num_rows, OffsetT row_stride_samples, int tiles_per_row, GridQueue< int > tile_queue, Int2Type< true > is_work_stealing) |
|
template<bool IS_ALIGNED> |
__device__ __forceinline__ void | ConsumeTiles (OffsetT num_row_pixels, OffsetT num_rows, OffsetT row_stride_samples, int tiles_per_row, GridQueue< int > tile_queue, Int2Type< false > is_work_stealing) |
|
template<CacheLoadModifier _MODIFIER, typename _ValueT , typename _OffsetT > |
__device__ __forceinline__ SampleT * | NativePointer (CacheModifiedInputIterator< _MODIFIER, _ValueT, _OffsetT > itr) |
|
template<typename IteratorT > |
__device__ __forceinline__ SampleT * | NativePointer (IteratorT itr) |
|
__device__ __forceinline__ | AgentHistogram (TempStorage &temp_storage, SampleIteratorT d_samples, int(&num_output_bins)[NUM_ACTIVE_CHANNELS], int(&num_privatized_bins)[NUM_ACTIVE_CHANNELS], CounterT *(&d_output_histograms)[NUM_ACTIVE_CHANNELS], CounterT *(&d_privatized_histograms)[NUM_ACTIVE_CHANNELS], OutputDecodeOpT(&output_decode_op)[NUM_ACTIVE_CHANNELS], PrivatizedDecodeOpT(&privatized_decode_op)[NUM_ACTIVE_CHANNELS]) |
|
__device__ __forceinline__ void | ConsumeTiles (OffsetT num_row_pixels, OffsetT num_rows, OffsetT row_stride_samples, int tiles_per_row, GridQueue< int > tile_queue) |
|
__device__ __forceinline__ void | InitBinCounters () |
|
__device__ __forceinline__ void | StoreOutput () |
|
|
_TempStorage & | temp_storage |
| Reference to temp_storage.
|
|
WrappedSampleIteratorT | d_wrapped_samples |
| Sample input iterator (with cache modifier applied, if possible)
|
|
SampleT * | d_native_samples |
| Native pointer for input samples (possibly NULL if unavailable)
|
|
int(& | num_output_bins )[NUM_ACTIVE_CHANNELS] |
| The number of output bins for each channel.
|
|
int(& | num_privatized_bins )[NUM_ACTIVE_CHANNELS] |
| The number of privatized bins for each channel.
|
|
CounterT * | d_privatized_histograms [NUM_ACTIVE_CHANNELS] |
| Pointers to gmem privatized histograms for each channel.
|
|
CounterT *(& | d_output_histograms )[NUM_ACTIVE_CHANNELS] |
| Reference to final output histograms (gmem)
|
|
OutputDecodeOpT(& | output_decode_op )[NUM_ACTIVE_CHANNELS] |
| The transform operator for determining output bin-ids from privatized counter indices, one for each channel.
|
|
PrivatizedDecodeOpT(& | privatized_decode_op )[NUM_ACTIVE_CHANNELS] |
| The transform operator for determining privatized counter indices from samples, one for each channel.
|
|
bool | prefer_smem |
| Whether to prefer privatized smem counters vs privatized global counters.
|
|