#include <iterator>

#include "../util_type.cuh"
#include "../block/block_load.cuh"
#include "../grid/grid_queue.cuh"
#include "../iterator/cache_modified_input_iterator.cuh"
#include "../util_namespace.cuh"

/// Optional outer namespace(s)
CUB_NS_PREFIX

/// CUB namespace
namespace cub {
/// Memory preference for privatized histogram bin counters
enum BlockHistogramMemoryPreference
{
    GMEM,
    SMEM,
    BLEND
};
/**
 * Parameterizable tuning policy type for AgentHistogram
 */
template <
    int                             _BLOCK_THREADS,         ///< Threads per thread block
    int                             _PIXELS_PER_THREAD,     ///< Pixels per thread (per tile of input)
    BlockLoadAlgorithm              _LOAD_ALGORITHM,        ///< The BlockLoad algorithm to use
    CacheLoadModifier               _LOAD_MODIFIER,         ///< Cache load modifier for reading input elements
    bool                            _RLE_COMPRESS,          ///< Whether to perform localized RLE to compress samples before histogramming
    BlockHistogramMemoryPreference  _MEM_PREFERENCE,        ///< Whether to prefer privatized shared-memory bins (versus privatized global-memory bins)
    bool                            _WORK_STEALING>         ///< Whether to dequeue tiles from a global work queue
struct AgentHistogramPolicy
{
    // ... (enum constants and static members mirroring the template parameters above)
};
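// Illustrative sketch (not part of the original file): a hypothetical tuning-policy
// instantiation showing how the template parameters above are supplied.  The constants
// below are made-up example values, not the tuned settings CUB ships for any particular
// GPU architecture.
typedef AgentHistogramPolicy<
        256,                    // _BLOCK_THREADS:      threads per thread block
        8,                      // _PIXELS_PER_THREAD:  pixels per thread (per tile of input)
        BLOCK_LOAD_DIRECT,      // _LOAD_ALGORITHM:     BlockLoad algorithm for reading tiles
        LOAD_LDG,               // _LOAD_MODIFIER:      cache modifier for reading input elements
        true,                   // _RLE_COMPRESS:       run-length-compress samples before histogramming
        BLEND,                  // _MEM_PREFERENCE:     split blocks between smem and gmem privatized bins
        true>                   // _WORK_STEALING:      dequeue tiles from a global work queue
    ExampleHistogramPolicy;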
/**
 * AgentHistogram implements a stateful abstraction of CUDA thread blocks for
 * participating in device-wide histogram.
 */
template <
    typename    AgentHistogramPolicyT,  ///< Parameterized AgentHistogramPolicy tuning policy type
    int         PRIVATIZED_SMEM_BINS,   ///< Number of privatized shared-memory histogram bins of any channel (zero indicates privatized counters are maintained in device-accessible memory)
    int         NUM_CHANNELS,           ///< Number of channels interleaved in the input data
    int         NUM_ACTIVE_CHANNELS,    ///< Number of channels actively being histogrammed
    typename    SampleIteratorT,        ///< Random-access input iterator type for reading samples
    typename    CounterT,               ///< Integer type for counting sample occurrences per histogram bin
    typename    PrivatizedDecodeOpT,    ///< Transform operator type for determining privatized counter indices from samples, one for each channel
    typename    OutputDecodeOpT,        ///< Transform operator type for determining output bin-ids from privatized counter indices, one for each channel
    typename    OffsetT>                ///< Signed integer type for global offsets
struct AgentHistogram
{
    //---------------------------------------------------------------------
    // Types and constants
    //---------------------------------------------------------------------

    /// The sample type of the input iterator
    typedef typename std::iterator_traits<SampleIteratorT>::value_type SampleT;

    /// The pixel type of SampleT
    typedef typename CubVector<SampleT, NUM_CHANNELS>::Type PixelT;

    /// The quad type of SampleT
    typedef typename CubVector<SampleT, 4>::Type QuadT;
    /// Constants
    enum
    {
        BLOCK_THREADS       = AgentHistogramPolicyT::BLOCK_THREADS,

        PIXELS_PER_THREAD   = AgentHistogramPolicyT::PIXELS_PER_THREAD,
        SAMPLES_PER_THREAD  = PIXELS_PER_THREAD * NUM_CHANNELS,
        QUADS_PER_THREAD    = SAMPLES_PER_THREAD / 4,

        TILE_PIXELS         = PIXELS_PER_THREAD * BLOCK_THREADS,
        TILE_SAMPLES        = SAMPLES_PER_THREAD * BLOCK_THREADS,

        IS_RLE_COMPRESS     = AgentHistogramPolicyT::IS_RLE_COMPRESS,

        MEM_PREFERENCE      = (PRIVATIZED_SMEM_BINS > 0) ?
                                  AgentHistogramPolicyT::MEM_PREFERENCE :
                                  GMEM,

        IS_WORK_STEALING    = AgentHistogramPolicyT::IS_WORK_STEALING,
    };
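    // Worked example (hypothetical configuration, not a CUB-tuned policy): with
    // BLOCK_THREADS = 256, PIXELS_PER_THREAD = 8, and NUM_CHANNELS = 4 (e.g. RGBA data),
    // SAMPLES_PER_THREAD = 8 * 4 = 32, QUADS_PER_THREAD = 32 / 4 = 8,
    // TILE_PIXELS = 8 * 256 = 2048, and TILE_SAMPLES = 32 * 256 = 8192,
    // i.e. each thread block consumes 8192 samples (2048 pixels) per tile.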
    /// Cache load modifier for reading input elements
    static const CacheLoadModifier LOAD_MODIFIER = AgentHistogramPolicyT::LOAD_MODIFIER;

    /// Input iterator wrapper type (for applying cache modifier)
    typedef typename If<IsPointer<SampleIteratorT>::VALUE,
            CacheModifiedInputIterator<LOAD_MODIFIER, SampleT, OffsetT>,    // Wrap the native input pointer with CacheModifiedInputIterator
            SampleIteratorT>::Type                                          // Directly use the supplied input iterator type
        WrappedSampleIteratorT;

    /// Pixel input iterator type (for applying cache modifier)
    typedef CacheModifiedInputIterator<LOAD_MODIFIER, PixelT, OffsetT> WrappedPixelIteratorT;

    /// Quad input iterator type (for applying cache modifier)
    typedef CacheModifiedInputIterator<LOAD_MODIFIER, QuadT, OffsetT> WrappedQuadIteratorT;

    /// Parameterized BlockLoad type for samples
    typedef BlockLoad<SampleT, BLOCK_THREADS, SAMPLES_PER_THREAD, AgentHistogramPolicyT::LOAD_ALGORITHM> BlockLoadSampleT;

    /// Parameterized BlockLoad type for pixels
    typedef BlockLoad<PixelT, BLOCK_THREADS, PIXELS_PER_THREAD, AgentHistogramPolicyT::LOAD_ALGORITHM> BlockLoadPixelT;

    /// Parameterized BlockLoad type for quads
    typedef BlockLoad<QuadT, BLOCK_THREADS, QUADS_PER_THREAD, AgentHistogramPolicyT::LOAD_ALGORITHM> BlockLoadQuadT;

    /// Shared memory type required by this thread block
    struct _TempStorage
    {
        // Smem needed for the block-privatized smem histogram of each channel (with one word of padding)
        CounterT histograms[NUM_ACTIVE_CHANNELS][PRIVATIZED_SMEM_BINS + 1];

        // Index of the next tile dequeued by this thread block (work-stealing only)
        int tile_idx;

        // Aliasable storage layout
        union Aliasable
        {
            typename BlockLoadSampleT::TempStorage sample_load;     // Smem needed for loading a tile of samples
            typename BlockLoadPixelT::TempStorage  pixel_load;      // Smem needed for loading a tile of pixels
            typename BlockLoadQuadT::TempStorage   quad_load;       // Smem needed for loading a tile of quads
        } aliasable;
    };

    /// Temporary storage type (unionable)
    struct TempStorage : Uninitialized<_TempStorage> {};
    //---------------------------------------------------------------------
    // Per-thread fields
    //---------------------------------------------------------------------

    /// Reference to temp_storage
    _TempStorage &temp_storage;

    /// Sample input iterator (with cache modifier applied, if possible)
    WrappedSampleIteratorT d_wrapped_samples;

    /// Native pointer for input samples (possibly NULL if unavailable)
    SampleT* d_native_samples;

    /// The number of output bins for each channel
    int (&num_output_bins)[NUM_ACTIVE_CHANNELS];

    /// The number of privatized bins for each channel
    int (&num_privatized_bins)[NUM_ACTIVE_CHANNELS];

    /// Reference to gmem privatized histograms for each channel
    CounterT* d_privatized_histograms[NUM_ACTIVE_CHANNELS];

    /// Reference to final output histograms (gmem)
    CounterT* (&d_output_histograms)[NUM_ACTIVE_CHANNELS];

    /// The transform operator for determining output bin-ids from privatized counter indices, one for each channel
    OutputDecodeOpT (&output_decode_op)[NUM_ACTIVE_CHANNELS];

    /// The transform operator for determining privatized counter indices from samples, one for each channel
    PrivatizedDecodeOpT (&privatized_decode_op)[NUM_ACTIVE_CHANNELS];

    /// Whether to prefer privatized smem counters vs privatized global counters
    bool prefer_smem;

    //---------------------------------------------------------------------
    // Initialize privatized bin counters
    //---------------------------------------------------------------------

    // Initialize privatized bin counters
    __device__ __forceinline__ void InitBinCounters(CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS])
    {
        // Initialize histogram bin counts to zeros
        for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
        {
            for (int privatized_bin = threadIdx.x; privatized_bin < num_privatized_bins[CHANNEL]; privatized_bin += BLOCK_THREADS)
            {
                privatized_histograms[CHANNEL][privatized_bin] = 0;
            }
        }

        // Barrier to make sure all threads are done updating counters
        CTA_SYNC();
    }
    // Initialize privatized bin counters.  Specialized for privatized shared-memory counters
    __device__ __forceinline__ void InitSmemBinCounters()
    {
        CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS];

        for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
            privatized_histograms[CHANNEL] = temp_storage.histograms[CHANNEL];

        InitBinCounters(privatized_histograms);
    }
    // Initialize privatized bin counters.  Specialized for privatized global-memory counters
    __device__ __forceinline__ void InitGmemBinCounters()
    {
        InitBinCounters(d_privatized_histograms);
    }
    //---------------------------------------------------------------------
    // Update final output histograms
    //---------------------------------------------------------------------

    // Store privatized histogram to device-accessible memory
    __device__ __forceinline__ void StoreOutput(CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS])
    {
        // Barrier to make sure all threads are done updating counters
        CTA_SYNC();

        // Apply privatized bin transformations to output bin counts for each channel
        for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
        {
            int channel_bins = num_privatized_bins[CHANNEL];
            for (int privatized_bin = threadIdx.x;
                 privatized_bin < channel_bins;
                 privatized_bin += BLOCK_THREADS)
            {
                int      output_bin = -1;
                CounterT count      = privatized_histograms[CHANNEL][privatized_bin];
                bool     is_valid   = count > 0;

                output_decode_op[CHANNEL].template BinSelect<LOAD_MODIFIER>((SampleT) privatized_bin, output_bin, is_valid);

                if (output_bin >= 0)
                    atomicAdd(&d_output_histograms[CHANNEL][output_bin], count);
            }
        }
    }
    // Store privatized histogram to device-accessible memory.  Specialized for privatized shared-memory counters
    __device__ __forceinline__ void StoreSmemOutput()
    {
        CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS];

        for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
            privatized_histograms[CHANNEL] = temp_storage.histograms[CHANNEL];

        StoreOutput(privatized_histograms);
    }
    // Store privatized histogram to device-accessible memory.  Specialized for privatized global-memory counters
    __device__ __forceinline__ void StoreGmemOutput()
    {
        StoreOutput(d_privatized_histograms);
    }
    //---------------------------------------------------------------------
    // Tile accumulation
    //---------------------------------------------------------------------

    // Accumulate pixels.  Specialized for RLE compression.
    __device__ __forceinline__ void AccumulatePixels(
        SampleT         samples[PIXELS_PER_THREAD][NUM_CHANNELS],
        bool            is_valid[PIXELS_PER_THREAD],
        CounterT*       privatized_histograms[NUM_ACTIVE_CHANNELS],
        Int2Type<true>  is_rle_compress)
    {
        for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
        {
            // Bin pixels
            int bins[PIXELS_PER_THREAD];

            for (int PIXEL = 0; PIXEL < PIXELS_PER_THREAD; ++PIXEL)
            {
                bins[PIXEL] = -1;
                privatized_decode_op[CHANNEL].template BinSelect<LOAD_MODIFIER>(samples[PIXEL][CHANNEL], bins[PIXEL], is_valid[PIXEL]);
            }

            // Fold runs of identical bins into a single atomic update
            CounterT accumulator = 1;

            for (int PIXEL = 0; PIXEL < PIXELS_PER_THREAD - 1; ++PIXEL)
            {
                if (bins[PIXEL] != bins[PIXEL + 1])
                {
                    if (bins[PIXEL] >= 0)
                        atomicAdd(privatized_histograms[CHANNEL] + bins[PIXEL], accumulator);

                    accumulator = 0;
                }
                accumulator++;
            }

            // Last pixel
            if (bins[PIXELS_PER_THREAD - 1] >= 0)
                atomicAdd(privatized_histograms[CHANNEL] + bins[PIXELS_PER_THREAD - 1], accumulator);
        }
    }
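    // Worked example of the RLE specialization above (hypothetical values): with
    // PIXELS_PER_THREAD = 4 and a thread whose decoded bins are {5, 5, 5, 9}, the run of
    // three 5s is folded into a single atomicAdd of 3 to bin 5, followed by one atomicAdd
    // of 1 to bin 9 -- two atomic updates instead of four.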
    // Accumulate pixels.  Specialized for individual accumulation of each pixel.
    __device__ __forceinline__ void AccumulatePixels(
        SampleT         samples[PIXELS_PER_THREAD][NUM_CHANNELS],
        bool            is_valid[PIXELS_PER_THREAD],
        CounterT*       privatized_histograms[NUM_ACTIVE_CHANNELS],
        Int2Type<false> is_rle_compress)
    {
        for (int PIXEL = 0; PIXEL < PIXELS_PER_THREAD; ++PIXEL)
        {
            for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
            {
                int bin = -1;
                privatized_decode_op[CHANNEL].template BinSelect<LOAD_MODIFIER>(samples[PIXEL][CHANNEL], bin, is_valid[PIXEL]);
                if (bin >= 0)
                    atomicAdd(privatized_histograms[CHANNEL] + bin, 1);
            }
        }
    }
    // Accumulate pixels.  Specialized for privatized shared-memory counters
    __device__ __forceinline__ void AccumulateSmemPixels(
        SampleT samples[PIXELS_PER_THREAD][NUM_CHANNELS],
        bool    is_valid[PIXELS_PER_THREAD])
    {
        CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS];

        for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
            privatized_histograms[CHANNEL] = temp_storage.histograms[CHANNEL];

        AccumulatePixels(samples, is_valid, privatized_histograms, Int2Type<IS_RLE_COMPRESS>());
    }
    // Accumulate pixels.  Specialized for privatized global-memory counters
    __device__ __forceinline__ void AccumulateGmemPixels(
        SampleT samples[PIXELS_PER_THREAD][NUM_CHANNELS],
        bool    is_valid[PIXELS_PER_THREAD])
    {
        AccumulatePixels(samples, is_valid, d_privatized_histograms, Int2Type<IS_RLE_COMPRESS>());
    }
    //---------------------------------------------------------------------
    // Tile loading
    //---------------------------------------------------------------------

    // Load full, aligned tile using pixel iterator (multi-channel)
    template <int _NUM_ACTIVE_CHANNELS>
    __device__ __forceinline__ void LoadFullAlignedTile(
        OffsetT                         block_offset,
        int                             valid_samples,
        SampleT                         (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS],
        Int2Type<_NUM_ACTIVE_CHANNELS>  num_active_channels)
    {
        typedef PixelT AliasedPixels[PIXELS_PER_THREAD];

        WrappedPixelIteratorT d_wrapped_pixels((PixelT*) (d_native_samples + block_offset));

        // Load using a wrapped pixel iterator
        BlockLoadPixelT(temp_storage.aliasable.pixel_load).Load(
            d_wrapped_pixels,
            reinterpret_cast<AliasedPixels&>(samples));
    }
    // Load full, aligned tile using quad iterator (single-channel)
    __device__ __forceinline__ void LoadFullAlignedTile(
        OffsetT     block_offset,
        int         valid_samples,
        SampleT     (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS],
        Int2Type<1> num_active_channels)
    {
        typedef QuadT AliasedQuads[QUADS_PER_THREAD];

        WrappedQuadIteratorT d_wrapped_quads((QuadT*) (d_native_samples + block_offset));

        // Load using a wrapped quad iterator
        BlockLoadQuadT(temp_storage.aliasable.quad_load).Load(
            d_wrapped_quads,
            reinterpret_cast<AliasedQuads&>(samples));
    }
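    // For example, with single-channel unsigned char samples, QuadT is the 4-byte CUDA
    // vector type uchar4, so a full aligned tile is read as QUADS_PER_THREAD vectorized
    // loads per thread instead of SAMPLES_PER_THREAD scalar loads.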
    // Load tile of data.  Specialized for full tile of aligned samples
    __device__ __forceinline__ void LoadTile(
        OffsetT         block_offset,
        int             valid_samples,
        SampleT         (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS],
        Int2Type<true>  is_full_tile,
        Int2Type<true>  is_aligned)
    {
        LoadFullAlignedTile(block_offset, valid_samples, samples, Int2Type<NUM_ACTIVE_CHANNELS>());
    }
    // Load tile of data.  Specialized for full tile of unaligned samples
    __device__ __forceinline__ void LoadTile(
        OffsetT         block_offset,
        int             valid_samples,
        SampleT         (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS],
        Int2Type<true>  is_full_tile,
        Int2Type<false> is_aligned)
    {
        typedef SampleT AliasedSamples[SAMPLES_PER_THREAD];

        // Load using the wrapped sample iterator
        BlockLoadSampleT(temp_storage.aliasable.sample_load).Load(
            d_wrapped_samples + block_offset,
            reinterpret_cast<AliasedSamples&>(samples));
    }
    // Load tile of data.  Specialized for partial tile of aligned samples
    __device__ __forceinline__ void LoadTile(
        OffsetT         block_offset,
        int             valid_samples,
        SampleT         (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS],
        Int2Type<false> is_full_tile,
        Int2Type<true>  is_aligned)
    {
        typedef PixelT AliasedPixels[PIXELS_PER_THREAD];

        WrappedPixelIteratorT d_wrapped_pixels((PixelT*) (d_native_samples + block_offset));

        int valid_pixels = valid_samples / NUM_CHANNELS;

        // Load using a wrapped pixel iterator, guarded against out-of-bounds pixels
        BlockLoadPixelT(temp_storage.aliasable.pixel_load).Load(
            d_wrapped_pixels,
            reinterpret_cast<AliasedPixels&>(samples),
            valid_pixels);
    }
    // Load tile of data.  Specialized for partial tile of unaligned samples
    __device__ __forceinline__ void LoadTile(
        OffsetT         block_offset,
        int             valid_samples,
        SampleT         (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS],
        Int2Type<false> is_full_tile,
        Int2Type<false> is_aligned)
    {
        typedef SampleT AliasedSamples[SAMPLES_PER_THREAD];

        // Load using the wrapped sample iterator, guarded against out-of-bounds samples
        BlockLoadSampleT(temp_storage.aliasable.sample_load).Load(
            d_wrapped_samples + block_offset,
            reinterpret_cast<AliasedSamples&>(samples),
            valid_samples);
    }
    //---------------------------------------------------------------------
    // Tile consumption
    //---------------------------------------------------------------------

    // Consume a tile of data samples
    template <
        bool IS_ALIGNED,    // Whether the tile offset is aligned (quad-aligned for single-channel, pixel-aligned for multi-channel)
        bool IS_FULL_TILE>  // Whether the tile is full
    __device__ __forceinline__ void ConsumeTile(OffsetT block_offset, int valid_samples)
    {
        SampleT samples[PIXELS_PER_THREAD][NUM_CHANNELS];
        bool    is_valid[PIXELS_PER_THREAD];

        // Load tile
        LoadTile(
            block_offset,
            valid_samples,
            samples,
            Int2Type<IS_FULL_TILE>(),
            Int2Type<IS_ALIGNED>());

        // Set valid flags
        for (int PIXEL = 0; PIXEL < PIXELS_PER_THREAD; ++PIXEL)
            is_valid[PIXEL] = IS_FULL_TILE || (((threadIdx.x * PIXELS_PER_THREAD + PIXEL) * NUM_CHANNELS) < valid_samples);

        // Accumulate samples (shared-memory atomics are unavailable below PTX arch 120)
#if CUB_PTX_ARCH >= 120
        if (prefer_smem)
            AccumulateSmemPixels(samples, is_valid);
        else
            AccumulateGmemPixels(samples, is_valid);
#else
        AccumulateGmemPixels(samples, is_valid);
#endif
    }
    // Consume row tiles.  Specialized for work-stealing from a global tile queue
    template <bool IS_ALIGNED>
    __device__ __forceinline__ void ConsumeTiles(
        OffsetT         num_row_pixels,
        OffsetT         num_rows,
        OffsetT         row_stride_samples,
        int             tiles_per_row,
        GridQueue<int>  tile_queue,
        Int2Type<true>  is_work_stealing)
    {
        int     num_tiles            = num_rows * tiles_per_row;
        int     tile_idx             = (blockIdx.y * gridDim.x) + blockIdx.x;
        OffsetT num_even_share_tiles = gridDim.x * gridDim.y;

        while (tile_idx < num_tiles)
        {
            // Locate this tile within the image
            int     row         = tile_idx / tiles_per_row;
            int     col         = tile_idx - (row * tiles_per_row);
            OffsetT row_offset  = row * row_stride_samples;
            OffsetT col_offset  = (col * TILE_SAMPLES);
            OffsetT tile_offset = row_offset + col_offset;

            if (col == tiles_per_row - 1)
            {
                // Consume the partially-full tile at the end of the row
                OffsetT num_remaining = (num_row_pixels * NUM_CHANNELS) - col_offset;
                ConsumeTile<IS_ALIGNED, false>(tile_offset, num_remaining);
            }
            else
            {
                // Consume a full tile
                ConsumeTile<IS_ALIGNED, true>(tile_offset, TILE_SAMPLES);
            }

            CTA_SYNC();

            // Thread 0 dequeues the next tile index for the whole block
            if (threadIdx.x == 0)
                temp_storage.tile_idx = tile_queue.Drain(1) + num_even_share_tiles;

            CTA_SYNC();

            tile_idx = temp_storage.tile_idx;
        }
    }
    // Consume row tiles.  Specialized for even-share (striped across thread blocks)
    template <bool IS_ALIGNED>
    __device__ __forceinline__ void ConsumeTiles(
        OffsetT         num_row_pixels,
        OffsetT         num_rows,
        OffsetT         row_stride_samples,
        int             tiles_per_row,
        GridQueue<int>  tile_queue,
        Int2Type<false> is_work_stealing)
    {
        for (int row = blockIdx.y; row < num_rows; row += gridDim.y)
        {
            OffsetT row_begin   = row * row_stride_samples;
            OffsetT row_end     = row_begin + (num_row_pixels * NUM_CHANNELS);
            OffsetT tile_offset = row_begin + (blockIdx.x * TILE_SAMPLES);

            while (tile_offset < row_end)
            {
                OffsetT num_remaining = row_end - tile_offset;

                if (num_remaining < TILE_SAMPLES)
                {
                    // Consume the partial tile at the end of the row
                    ConsumeTile<IS_ALIGNED, false>(tile_offset, num_remaining);
                    break;
                }

                // Consume a full tile
                ConsumeTile<IS_ALIGNED, true>(tile_offset, TILE_SAMPLES);
                tile_offset += gridDim.x * TILE_SAMPLES;
            }
        }
    }
    // Return a native sample pointer (specialized for iterator types that do not wrap a native pointer)
    template <typename IteratorT>
    __device__ __forceinline__ SampleT* NativePointer(IteratorT itr)
    {
        return NULL;
    }
    //---------------------------------------------------------------------
    // Interface
    //---------------------------------------------------------------------

    // Constructor
    __device__ __forceinline__ AgentHistogram(
        TempStorage         &temp_storage,                                      ///< Reference to temp_storage
        SampleIteratorT     d_samples,                                          ///< Input sample data
        int                 (&num_output_bins)[NUM_ACTIVE_CHANNELS],            ///< The number of output bins for each channel
        int                 (&num_privatized_bins)[NUM_ACTIVE_CHANNELS],        ///< The number of privatized bins for each channel
        CounterT*           (&d_output_histograms)[NUM_ACTIVE_CHANNELS],        ///< Reference to final output histograms (gmem)
        CounterT*           (&d_privatized_histograms)[NUM_ACTIVE_CHANNELS],    ///< Reference to privatized histograms (gmem)
        OutputDecodeOpT     (&output_decode_op)[NUM_ACTIVE_CHANNELS],           ///< The transform operator for determining output bin-ids from privatized counter indices, one for each channel
        PrivatizedDecodeOpT (&privatized_decode_op)[NUM_ACTIVE_CHANNELS])       ///< The transform operator for determining privatized counter indices from samples, one for each channel
    :
        temp_storage(temp_storage.Alias()),
        d_wrapped_samples(d_samples),
        d_native_samples(NativePointer(d_wrapped_samples)),
        num_output_bins(num_output_bins),
        num_privatized_bins(num_privatized_bins),
        d_output_histograms(d_output_histograms),
        output_decode_op(output_decode_op),
        privatized_decode_op(privatized_decode_op),
        prefer_smem((MEM_PREFERENCE == SMEM) ?
            true :                              // prefer privatized smem histograms
            (MEM_PREFERENCE == GMEM) ?
                false :                         // prefer privatized gmem histograms
                blockIdx.x & 1)                 // BLEND: alternate thread blocks between smem and gmem histograms
    {
        int blockId = (blockIdx.y * gridDim.x) + blockIdx.x;

        // Initialize the locations of this block's privatized histograms
        for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
            this->d_privatized_histograms[CHANNEL] = d_privatized_histograms[CHANNEL] + (blockId * num_privatized_bins[CHANNEL]);
    }
    // Consume image: dispatch to aligned (vectorized) or unaligned tile consumption
    __device__ __forceinline__ void ConsumeTiles(
        OffsetT         num_row_pixels,         ///< The number of multi-channel pixels per row in the region of interest
        OffsetT         num_rows,               ///< The number of rows in the region of interest
        OffsetT         row_stride_samples,     ///< The number of samples between starts of consecutive rows in the region of interest
        int             tiles_per_row,          ///< Number of image tiles per row
        GridQueue<int>  tile_queue)             ///< Drain queue descriptor for dynamically mapping tile data onto thread blocks
    {
        // Check whether all row starting offsets are quad-aligned (single-channel) or pixel-aligned (multi-channel)
        int     quad_mask  = AlignBytes<QuadT>::ALIGN_BYTES - 1;
        int     pixel_mask = AlignBytes<PixelT>::ALIGN_BYTES - 1;
        size_t  row_bytes  = sizeof(SampleT) * row_stride_samples;

        bool quad_aligned_rows  = (NUM_CHANNELS == 1) && (SAMPLES_PER_THREAD % 4 == 0) &&       // Single channel
                                  ((size_t(d_native_samples) & quad_mask) == 0) &&              // Row start is quad-aligned
                                  ((num_rows == 1) || ((row_bytes & quad_mask) == 0));          // Quad-aligned row stride

        bool pixel_aligned_rows = (NUM_CHANNELS > 1) &&                                         // Multi channel
                                  ((size_t(d_native_samples) & pixel_mask) == 0) &&             // Row start is pixel-aligned
                                  ((row_bytes & pixel_mask) == 0);                              // Pixel-aligned row stride

        // Whether rows are aligned and can be vectorized
        if ((d_native_samples != NULL) && (quad_aligned_rows || pixel_aligned_rows))
            ConsumeTiles<true>(num_row_pixels, num_rows, row_stride_samples, tiles_per_row, tile_queue, Int2Type<IS_WORK_STEALING>());
        else
            ConsumeTiles<false>(num_row_pixels, num_rows, row_stride_samples, tiles_per_row, tile_queue, Int2Type<IS_WORK_STEALING>());
    }
    // Initialize privatized bin counters
    __device__ __forceinline__ void InitBinCounters()
    {
        if (prefer_smem)
            InitSmemBinCounters();
        else
            InitGmemBinCounters();
    }

    // Store privatized histogram to device-accessible memory
    __device__ __forceinline__ void StoreOutput()
    {
        if (prefer_smem)
            StoreSmemOutput();
        else
            StoreGmemOutput();
    }
};

}               // CUB namespace
CUB_NS_POSTFIX  // Optional outer namespace(s)
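// Illustrative sketch (not part of the original file): a minimal sweep kernel showing the
// intended call sequence for driving AgentHistogram from device code.  The kernel name
// ExampleHistogramSweepKernel and its parameter packaging are hypothetical; the member
// functions it invokes (the constructor, InitBinCounters, ConsumeTiles, and StoreOutput)
// are the ones defined above.  Fixed-size per-channel arrays are passed by value via
// cub::ArrayWrapper so they can travel through the kernel parameter list.
template <
    typename    AgentHistogramPolicyT,
    int         PRIVATIZED_SMEM_BINS,
    int         NUM_CHANNELS,
    int         NUM_ACTIVE_CHANNELS,
    typename    SampleIteratorT,
    typename    CounterT,
    typename    PrivatizedDecodeOpT,
    typename    OutputDecodeOpT,
    typename    OffsetT>
__global__ void ExampleHistogramSweepKernel(
    SampleIteratorT                                                 d_samples,                  // Input samples
    cub::ArrayWrapper<int, NUM_ACTIVE_CHANNELS>                     num_output_bins,            // Output bins per channel
    cub::ArrayWrapper<int, NUM_ACTIVE_CHANNELS>                     num_privatized_bins,        // Privatized bins per channel
    cub::ArrayWrapper<CounterT*, NUM_ACTIVE_CHANNELS>               d_output_histograms,        // Final output histograms
    cub::ArrayWrapper<CounterT*, NUM_ACTIVE_CHANNELS>               d_privatized_histograms,    // Gmem privatized histograms
    cub::ArrayWrapper<OutputDecodeOpT, NUM_ACTIVE_CHANNELS>         output_decode_op,           // Output bin-id decode ops
    cub::ArrayWrapper<PrivatizedDecodeOpT, NUM_ACTIVE_CHANNELS>     privatized_decode_op,       // Privatized bin-id decode ops
    OffsetT                                                         num_row_pixels,             // Pixels per row
    OffsetT                                                         num_rows,                   // Rows in the region of interest
    OffsetT                                                         row_stride_samples,         // Samples between row starts
    int                                                             tiles_per_row,              // Image tiles per row
    cub::GridQueue<int>                                             tile_queue)                 // Drain queue for work-stealing
{
    // Thread-block histogramming abstraction type
    typedef cub::AgentHistogram<
            AgentHistogramPolicyT, PRIVATIZED_SMEM_BINS, NUM_CHANNELS, NUM_ACTIVE_CHANNELS,
            SampleIteratorT, CounterT, PrivatizedDecodeOpT, OutputDecodeOpT, OffsetT>
        AgentHistogramT;

    // Shared memory for AgentHistogram
    __shared__ typename AgentHistogramT::TempStorage temp_storage;

    AgentHistogramT agent(
        temp_storage,
        d_samples,
        num_output_bins.array,
        num_privatized_bins.array,
        d_output_histograms.array,
        d_privatized_histograms.array,
        output_decode_op.array,
        privatized_decode_op.array);

    // Zero this block's privatized counters, consume the tiles assigned to this block,
    // then fold the privatized counts into the final output histograms
    agent.InitBinCounters();
    agent.ConsumeTiles(num_row_pixels, num_rows, row_stride_samples, tiles_per_row, tile_queue);
    agent.StoreOutput();
}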