#include "../util_type.cuh"
#include "../block/block_load.cuh"
#include "../grid/grid_queue.cuh"
#include "../iterator/cache_modified_input_iterator.cuh"
#include "../util_namespace.cuh"

enum BlockHistogramMemoryPreference
 
    int                             _PIXELS_PER_THREAD,

    BlockHistogramMemoryPreference  _MEM_PREFERENCE,

    typename    AgentHistogramPolicyT,
    int         PRIVATIZED_SMEM_BINS,

    int         NUM_ACTIVE_CHANNELS,
    typename    SampleIteratorT,

    typename    PrivatizedDecodeOpT,
    typename    OutputDecodeOpT,

    typedef typename std::iterator_traits<SampleIteratorT>::value_type SampleT;
 
        BLOCK_THREADS           = AgentHistogramPolicyT::BLOCK_THREADS,

        PIXELS_PER_THREAD       = AgentHistogramPolicyT::PIXELS_PER_THREAD,
        SAMPLES_PER_THREAD      = PIXELS_PER_THREAD * NUM_CHANNELS,
        QUADS_PER_THREAD        = SAMPLES_PER_THREAD / 4,

        TILE_PIXELS             = PIXELS_PER_THREAD * BLOCK_THREADS,
        TILE_SAMPLES            = SAMPLES_PER_THREAD * BLOCK_THREADS,

        IS_RLE_COMPRESS         = AgentHistogramPolicyT::IS_RLE_COMPRESS,

        MEM_PREFERENCE          = (PRIVATIZED_SMEM_BINS > 0) ?
                                        AgentHistogramPolicyT::MEM_PREFERENCE :
                                        GMEM,

        IS_WORK_STEALING        = AgentHistogramPolicyT::IS_WORK_STEALING,
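The policy parameters above are supplied as compile-time constants by a tuning policy, and the agent derives its tile geometry from them. A minimal sketch of such a policy and the derived sizes follows; every concrete value (256 threads, 8 pixels per thread, RGBA input) is an illustrative assumption, not a value taken from the library:

    #include <cub/block/block_load.cuh>     // cub::BlockLoadAlgorithm
    #include <cub/thread/thread_load.cuh>   // cub::CacheLoadModifier

    // Hypothetical tuning policy; all numeric choices are assumptions for illustration.
    struct ExampleHistogramPolicy
    {
        enum
        {
            BLOCK_THREADS       = 256,   // threads per thread block
            PIXELS_PER_THREAD   = 8,     // pixels per thread, per tile of input
            IS_RLE_COMPRESS     = true,  // run-length aggregate equal bins before issuing atomics
            IS_WORK_STEALING    = false, // statically partition tiles (no global work queue)
        };
        static const cub::BlockLoadAlgorithm LOAD_ALGORITHM = cub::BLOCK_LOAD_DIRECT;
        static const cub::CacheLoadModifier  LOAD_MODIFIER  = cub::LOAD_LDG;
        // MEM_PREFERENCE would additionally name one of the BlockHistogramMemoryPreference
        // values declared above (shared-memory versus global-memory privatization).
    };

    // For 4-channel (e.g. RGBA) input, NUM_CHANNELS == 4, the derived constants become:
    //   SAMPLES_PER_THREAD = PIXELS_PER_THREAD * NUM_CHANNELS   = 8 * 4    = 32
    //   TILE_PIXELS        = PIXELS_PER_THREAD * BLOCK_THREADS  = 8 * 256  = 2048
    //   TILE_SAMPLES       = SAMPLES_PER_THREAD * BLOCK_THREADS = 32 * 256 = 8192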
 
            SampleIteratorT>::Type WrappedSampleIteratorT;                  // wrapped sample input iterator (cache modifier applied to native pointers)

            AgentHistogramPolicyT::LOAD_ALGORITHM> BlockLoadSampleT;        // parameterized BlockLoad type for samples

            AgentHistogramPolicyT::LOAD_ALGORITHM> BlockLoadPixelT;         // parameterized BlockLoad type for pixels

            AgentHistogramPolicyT::LOAD_ALGORITHM> BlockLoadQuadT;          // parameterized BlockLoad type for quads

        CounterT histograms[NUM_ACTIVE_CHANNELS][PRIVATIZED_SMEM_BINS + 1]; // block-privatized smem histograms (with one word of padding)
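The shared-memory declaration above pairs per-channel privatized bins (padded by one counter each) with staging storage for the block-wide loads. Below is a hedged sketch of such a layout under assumed sizes; in particular, aliasing the three BlockLoad storages in a union is this sketch's reading of how the staging storage is typically shared, not a quotation of the library's struct:

    #include <cub/block/block_load.cuh>

    // Sketch of a per-block shared-memory layout: per-channel privatized bins plus
    // aliased BlockLoad staging storage.  The concrete sizes (3 active channels,
    // 256 bins, 256 threads, 8 pixels per thread, unsigned char samples) are
    // assumptions of this sketch.
    struct ExampleTempStorage
    {
        // Block-privatized histograms, one extra padding counter per channel
        unsigned int histograms[3][256 + 1];

        // Only one load granularity is exercised for any given tile, so the
        // BlockLoad temporary storages can alias the same shared memory.
        union
        {
            cub::BlockLoad<unsigned char, 256, 8>::TempStorage sample_load;  // 8 samples per thread
            cub::BlockLoad<uchar1,        256, 8>::TempStorage pixel_load;   // 8 single-channel pixels per thread
            cub::BlockLoad<uchar4,        256, 2>::TempStorage quad_load;    // 2 quads per thread
        } aliasable;
    };

    __global__ void ExampleKernel()
    {
        __shared__ ExampleTempStorage temp_storage;   // one instance per thread block
        (void) temp_storage;
    }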
 
    __device__ __forceinline__ void InitBinCounters(CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS])

        for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)

            for (int privatized_bin = threadIdx.x; privatized_bin < num_privatized_bins[CHANNEL]; privatized_bin += BLOCK_THREADS)

                privatized_histograms[CHANNEL][privatized_bin] = 0;

    __device__ __forceinline__ void InitSmemBinCounters()

        CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS];

        for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
            privatized_histograms[CHANNEL] = temp_storage.histograms[CHANNEL];

    __device__ __forceinline__ void InitGmemBinCounters()
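InitBinCounters zeroes the privatized counters cooperatively: each thread strides over every channel's bins in steps of BLOCK_THREADS. The following standalone kernel reproduces the same block-strided zeroing pattern; the kernel name and its parameters are illustrative, not names from the library:

    // Minimal standalone illustration of the block-strided zeroing pattern used
    // by InitBinCounters above.  counters, num_channels, and num_bins are
    // illustrative parameters of this sketch.
    __global__ void ZeroPrivatizedBins(int *counters, int num_channels, int num_bins)
    {
        for (int channel = 0; channel < num_channels; ++channel)
        {
            // Each thread clears bins channel*num_bins + {threadIdx.x, threadIdx.x + blockDim.x, ...}
            for (int bin = threadIdx.x; bin < num_bins; bin += blockDim.x)
                counters[channel * num_bins + bin] = 0;
        }
        __syncthreads();    // all bins zeroed before any thread starts accumulating
    }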
 
    __device__ __forceinline__ void StoreOutput(CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS])

        for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)

            for (int privatized_bin = threadIdx.x;
                    privatized_bin < channel_bins;
                    privatized_bin += BLOCK_THREADS)

                CounterT    count       = privatized_histograms[CHANNEL][privatized_bin];
                bool        is_valid    = count > 0;

    __device__ __forceinline__ void StoreSmemOutput()

        CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS];
        for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
            privatized_histograms[CHANNEL] = temp_storage.histograms[CHANNEL];

    __device__ __forceinline__ void StoreGmemOutput()
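StoreOutput walks the privatized bins in the same block-strided fashion, skips empty counters, and adds the non-zero counts into the final output histograms after mapping each privatized bin to an output bin. A hedged sketch of that flush for a single channel, with a placeholder OutputBinOf standing in for whatever decode operator performs the mapping (both names are assumptions of this sketch):

    // Illustrative flush of one channel's privatized counts into the global
    // output histogram.  OutputBinOf stands in for the decode operator that maps
    // privatized bins to output bins.
    __device__ __forceinline__ int OutputBinOf(int privatized_bin) { return privatized_bin; }

    __device__ void FlushChannel(
        const unsigned int *privatized,     // this block's counts for one channel
        unsigned int       *d_output,       // global output histogram for the channel
        int                 channel_bins)
    {
        for (int bin = threadIdx.x; bin < channel_bins; bin += blockDim.x)
        {
            unsigned int count = privatized[bin];
            if (count > 0)                                  // skip empty bins to save atomics
                atomicAdd(d_output + OutputBinOf(bin), count);
        }
    }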
 
    __device__ __forceinline__ void AccumulatePixels(
        SampleT             samples[PIXELS_PER_THREAD][NUM_CHANNELS],
        bool                is_valid[PIXELS_PER_THREAD],
        CounterT*           privatized_histograms[NUM_ACTIVE_CHANNELS],
        Int2Type<true>      is_rle_compress)

        for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)

            int bins[PIXELS_PER_THREAD];

            for (int PIXEL = 0; PIXEL < PIXELS_PER_THREAD; ++PIXEL)

                privatized_decode_op[CHANNEL].template BinSelect<LOAD_MODIFIER>(samples[PIXEL][CHANNEL], bins[PIXEL], is_valid[PIXEL]);

            CounterT accumulator = 1;

            for (int PIXEL = 0; PIXEL < PIXELS_PER_THREAD - 1; ++PIXEL)

                if (bins[PIXEL] != bins[PIXEL + 1])

                    if (bins[PIXEL] >= 0)
                        atomicAdd(privatized_histograms[CHANNEL] + bins[PIXEL], accumulator);

            if (bins[PIXELS_PER_THREAD - 1] >= 0)
                atomicAdd(privatized_histograms[CHANNEL] + bins[PIXELS_PER_THREAD - 1], accumulator);
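When IS_RLE_COMPRESS is enabled, every pixel's bin is decoded first and runs of equal consecutive bins are aggregated into a single atomicAdd. The excerpt above omits the accumulator bookkeeping between runs, so the sketch below spells out the complete run-length logic over a fixed-size bin array; the array length and counter types are assumptions of the sketch:

    // Self-contained sketch of the run-length-aggregated accumulation used when
    // IS_RLE_COMPRESS is enabled.  PIXELS == 8 and the int/unsigned int types are
    // assumptions of this sketch.
    __device__ void AccumulateRle(const int (&bins)[8], unsigned int *histogram)
    {
        const int PIXELS = 8;
        unsigned int accumulator = 1;               // length of the current run of equal bins

        for (int pixel = 0; pixel < PIXELS - 1; ++pixel)
        {
            if (bins[pixel] != bins[pixel + 1])
            {
                if (bins[pixel] >= 0)               // negative bin means "invalid / out of range"
                    atomicAdd(histogram + bins[pixel], accumulator);
                accumulator = 1;                    // start a new run
            }
            else
            {
                ++accumulator;                      // extend the current run, no atomic issued
            }
        }

        // Flush the final run
        if (bins[PIXELS - 1] >= 0)
            atomicAdd(histogram + bins[PIXELS - 1], accumulator);
    }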
 
    __device__ __forceinline__ void AccumulatePixels(
        SampleT             samples[PIXELS_PER_THREAD][NUM_CHANNELS],
        bool                is_valid[PIXELS_PER_THREAD],
        CounterT*           privatized_histograms[NUM_ACTIVE_CHANNELS],
        Int2Type<false>     is_rle_compress)

        for (int PIXEL = 0; PIXEL < PIXELS_PER_THREAD; ++PIXEL)

            for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)

                privatized_decode_op[CHANNEL].template BinSelect<LOAD_MODIFIER>(samples[PIXEL][CHANNEL], bin, is_valid[PIXEL]);

                    atomicAdd(privatized_histograms[CHANNEL] + bin, 1);

    __device__ __forceinline__ void AccumulateSmemPixels(
        SampleT             samples[PIXELS_PER_THREAD][NUM_CHANNELS],
        bool                is_valid[PIXELS_PER_THREAD])

        CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS];

        for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
            privatized_histograms[CHANNEL] = temp_storage.histograms[CHANNEL];

    __device__ __forceinline__ void AccumulateGmemPixels(
        SampleT             samples[PIXELS_PER_THREAD][NUM_CHANNELS],
        bool                is_valid[PIXELS_PER_THREAD])
 
    template <int _NUM_ACTIVE_CHANNELS>
    __device__ __forceinline__ void LoadFullAlignedTile(
        SampleT                         (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS],

        typedef PixelT AliasedPixels[PIXELS_PER_THREAD];

            reinterpret_cast<AliasedPixels&>(samples));

    __device__ __forceinline__ void LoadFullAlignedTile(
        SampleT                         (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS],
        Int2Type<1>                     num_active_channels)

        typedef QuadT AliasedQuads[QUADS_PER_THREAD];

            reinterpret_cast<AliasedQuads&>(samples));

    __device__ __forceinline__ void LoadTile(
        SampleT         (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS],
        Int2Type<true>  is_full_tile,
        Int2Type<true>  is_aligned)

        LoadFullAlignedTile(block_offset, valid_samples, samples, Int2Type<NUM_ACTIVE_CHANNELS>());

    __device__ __forceinline__ void LoadTile(
        SampleT         (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS],
        Int2Type<true>  is_full_tile,
        Int2Type<false> is_aligned)

        typedef SampleT AliasedSamples[SAMPLES_PER_THREAD];

            reinterpret_cast<AliasedSamples&>(samples));

    __device__ __forceinline__ void LoadTile(
        SampleT         (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS],
        Int2Type<false> is_full_tile,
        Int2Type<true>  is_aligned)

        typedef PixelT AliasedPixels[PIXELS_PER_THREAD];

        int valid_pixels = valid_samples / NUM_CHANNELS;

            reinterpret_cast<AliasedPixels&>(samples),

    __device__ __forceinline__ void LoadTile(
        SampleT         (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS],
        Int2Type<false> is_full_tile,
        Int2Type<false> is_aligned)

        typedef SampleT AliasedSamples[SAMPLES_PER_THREAD];

            reinterpret_cast<AliasedSamples&>(samples),

        SampleT     samples[PIXELS_PER_THREAD][NUM_CHANNELS];
        bool        is_valid[PIXELS_PER_THREAD];

            Int2Type<IS_FULL_TILE>(),
            Int2Type<IS_ALIGNED>());

        for (int PIXEL = 0; PIXEL < PIXELS_PER_THREAD; ++PIXEL)
            is_valid[PIXEL] = IS_FULL_TILE || (((threadIdx.x * PIXELS_PER_THREAD + PIXEL) * NUM_CHANNELS) < valid_samples);
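The LoadTile overloads pick a load granularity at compile time: aligned full tiles are loaded as whole pixels (or quads for single-channel data) by reinterpreting the per-thread sample array, while unaligned or partial tiles fall back to sample-granularity loads guarded by the valid-sample count. A hedged sketch of the same aliasing idea for 8 RGBA pixels per thread, using CUDA's built-in uchar4 in place of the library's PixelT (block size and pixel format are assumptions of this sketch):

    #include <cub/block/block_load.cuh>

    // Sketch: load 8 RGBA pixels per thread either as 8 uchar4 vectors (aligned
    // full tiles) or as 32 raw unsigned char samples (unaligned or partial tiles).
    template <int BLOCK_THREADS>
    struct TileLoader
    {
        typedef cub::BlockLoad<uchar4,        BLOCK_THREADS, 8>  BlockLoadPixel;
        typedef cub::BlockLoad<unsigned char, BLOCK_THREADS, 32> BlockLoadSample;

        union TempStorage
        {
            typename BlockLoadPixel::TempStorage  pixel;
            typename BlockLoadSample::TempStorage sample;
        };

        __device__ void LoadAligned(
            TempStorage         &storage,
            const unsigned char *d_samples,
            unsigned char       (&samples)[8][4])
        {
            // Alias the 8x4 byte array as 8 uchar4 pixels for vectorized loads
            typedef uchar4 AliasedPixels[8];
            BlockLoadPixel(storage.pixel).Load(
                reinterpret_cast<const uchar4*>(d_samples),
                reinterpret_cast<AliasedPixels&>(samples));
        }

        __device__ void LoadPartial(
            TempStorage         &storage,
            const unsigned char *d_samples,
            unsigned char       (&samples)[8][4],
            int                  valid_samples)
        {
            // Fall back to sample-granularity loads, guarded by the valid count
            typedef unsigned char AliasedSamples[32];
            BlockLoadSample(storage.sample).Load(
                d_samples,
                reinterpret_cast<AliasedSamples&>(samples),
                valid_samples);
        }
    };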
 
#if CUB_PTX_ARCH >= 120

    // ConsumeTiles: work-stealing variant (tiles dequeued from a global work queue)
    template <bool IS_ALIGNED>

        int         tile_idx                    = (blockIdx.y  * gridDim.x) + blockIdx.x;
        OffsetT     num_even_share_tiles        = gridDim.x * gridDim.y;

            OffsetT col_offset      = (col * TILE_SAMPLES);
            OffsetT tile_offset     = row_offset + col_offset;

                ConsumeTile<IS_ALIGNED, false>(tile_offset, num_remaining);

                ConsumeTile<IS_ALIGNED, true>(tile_offset, TILE_SAMPLES);

            if (threadIdx.x == 0)

    // ConsumeTiles: even-share variant (rows and tiles statically assigned to blocks)
    template <bool IS_ALIGNED>

        for (int row = blockIdx.y; row < num_rows; row += gridDim.y)

            OffsetT tile_offset = row_begin + (blockIdx.x * TILE_SAMPLES);

            while (tile_offset < row_end)

                OffsetT num_remaining = row_end - tile_offset;

                if (num_remaining < TILE_SAMPLES)

                    ConsumeTile<IS_ALIGNED, false>(tile_offset, num_remaining);

                ConsumeTile<IS_ALIGNED, true>(tile_offset, TILE_SAMPLES);
                tile_offset += gridDim.x * TILE_SAMPLES;
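In the even-share path each block owns every gridDim.y-th row, starts at its own blockIdx.x-th tile within that row, and strides across the row by gridDim.x tiles, taking the partial-tile branch only when fewer than TILE_SAMPLES remain. A standalone sketch of that traversal follows; ProcessTile and the kernel's parameters are assumptions standing in for the agent's ConsumeTile step:

    // Standalone sketch of the even-share tile traversal above: blockIdx.y picks
    // the starting row, blockIdx.x the starting tile within that row.
    __device__ void ProcessTile(long long /*tile_offset*/, int /*num_samples*/) { /* consume one tile */ }

    __global__ void ConsumeRowsEvenShare(
        long long num_rows,
        long long row_stride_samples,   // samples between starts of consecutive rows
        long long row_samples,          // valid samples in each row
        int       tile_samples)         // samples per full tile (TILE_SAMPLES)
    {
        for (long long row = blockIdx.y; row < num_rows; row += gridDim.y)
        {
            long long row_begin   = row * row_stride_samples;
            long long row_end     = row_begin + row_samples;
            long long tile_offset = row_begin + (long long) blockIdx.x * tile_samples;

            while (tile_offset < row_end)
            {
                long long num_remaining = row_end - tile_offset;
                if (num_remaining < tile_samples)
                {
                    ProcessTile(tile_offset, (int) num_remaining);   // trailing partial tile
                    break;
                }
                ProcessTile(tile_offset, tile_samples);              // full tile
                tile_offset += (long long) gridDim.x * tile_samples; // stride by the whole grid's width
            }
        }
    }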
 
    template <typename IteratorT>
    __device__ __forceinline__ SampleT* NativePointer(IteratorT itr)

        SampleIteratorT     d_samples,

            (MEM_PREFERENCE == GMEM) ?

        int blockId = (blockIdx.y * gridDim.x) + blockIdx.x;

        for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)

        bool quad_aligned_rows      = (NUM_CHANNELS == 1) && (SAMPLES_PER_THREAD % 4 == 0) &&
                                        ((num_rows == 1) || ((row_bytes & quad_mask) == 0));

        bool pixel_aligned_rows     = (NUM_CHANNELS > 1) &&
                                        ((row_bytes & pixel_mask) == 0);

        if ((d_native_samples != NULL) && (quad_aligned_rows || pixel_aligned_rows))

            InitSmemBinCounters();

            InitGmemBinCounters();
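Vectorized loading is only attempted when a native sample pointer is available and every row starts on a suitable boundary: single-channel data must be quad-aligned, multi-channel data pixel-aligned, which reduces to masking the row byte stride (and base pointer) with sizeof(quad)-1 or sizeof(pixel)-1. A host-side sketch of the quad case under an assumed unsigned char sample type (the pixel case is analogous with sizeof(pixel)):

    #include <cstddef>
    #include <cstdint>

    // Sketch of the row-alignment test for a single-channel image whose unsigned
    // char samples would be read four at a time as quads.  The concrete sample and
    // quad types are assumptions of this sketch.
    inline bool RowsAreQuadAligned(
        const unsigned char *d_samples,          // base pointer of the sample data
        std::size_t          row_stride_samples, // samples between starts of consecutive rows
        std::size_t          num_rows)
    {
        const std::size_t quad_size = 4 * sizeof(unsigned char);  // size of a 4-sample quad
        const std::size_t quad_mask = quad_size - 1;               // power-of-two size -> low-bit mask
        const std::size_t row_bytes = row_stride_samples * sizeof(unsigned char);

        const bool base_aligned = (reinterpret_cast<std::uintptr_t>(d_samples) & quad_mask) == 0;
        const bool rows_aligned = (num_rows == 1) || ((row_bytes & quad_mask) == 0);
        return base_aligned && rows_aligned;
    }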
 
BlockLoad
The BlockLoad class provides collective data movement methods for loading a linear segment of items from memory into a blocked arrangement across a CUDA thread block.
GridQueue
GridQueue is a descriptor utility for dynamic queue management.
BlockLoadAlgorithm
cub::BlockLoadAlgorithm enumerates alternative algorithms for cub::BlockLoad to read a linear segment of data from memory into a blocked arrangement across a CUDA thread block.
CacheLoadModifier
Enumeration of cache modifiers for memory load operations.
int tiles_per_row
Number of image tiles per row.
GridQueue< int > tile_queue
Drain queue descriptor for dynamically mapping tile data onto thread blocks.
OffsetT num_rows
The number of rows in the region of interest.
OffsetT num_row_pixels
The number of multi-channel pixels per row in the region of interest.
OffsetT row_stride_samples
The number of samples between starts of consecutive rows in the region of interest.
Int2Type< IS_WORK_STEALING > is_work_stealing
Whether to dequeue tiles from a global work queue.
static const BlockLoadAlgorithm LOAD_ALGORITHM
The BlockLoad algorithm to use.
static const CacheLoadModifier LOAD_MODIFIER
Cache load modifier for reading input elements.
BLOCK_THREADS
Threads per thread block.
PIXELS_PER_THREAD
Pixels per thread (per tile of input).
IS_RLE_COMPRESS
Whether to perform localized RLE to compress samples before histogramming.
MEM_PREFERENCE
Whether to prefer privatized shared-memory bins (versus privatized global-memory bins).
IS_WORK_STEALING
Whether to dequeue tiles from a global work queue.
TempStorage
Temporary storage type (unionable).
_TempStorage
Shared memory type required by this thread block.
AgentHistogram
AgentHistogram implements a stateful abstraction of CUDA thread blocks for participating in device-wide histogram.
std::iterator_traits< SampleIteratorT >::value_type SampleT
The sample type of the input iterator.
CubVector< SampleT, NUM_CHANNELS >::Type PixelT
The pixel type of SampleT.
CubVector< SampleT, 4 >::Type QuadT
The quad type of SampleT.
If< IsPointer< SampleIteratorT >::VALUE, CacheModifiedInputIterator< LOAD_MODIFIER, SampleT, OffsetT >, SampleIteratorT >::Type WrappedSampleIteratorT
Input iterator wrapper type (for applying cache modifier).
CacheModifiedInputIterator< LOAD_MODIFIER, PixelT, OffsetT > WrappedPixelIteratorT
Pixel input iterator type (for applying cache modifier).
CacheModifiedInputIterator< LOAD_MODIFIER, QuadT, OffsetT > WrappedQuadIteratorT
Quad input iterator type (for applying cache modifier).
BlockLoad< SampleT, BLOCK_THREADS, SAMPLES_PER_THREAD, AgentHistogramPolicyT::LOAD_ALGORITHM > BlockLoadSampleT
Parameterized BlockLoad type for samples.
BlockLoad< PixelT, BLOCK_THREADS, PIXELS_PER_THREAD, AgentHistogramPolicyT::LOAD_ALGORITHM > BlockLoadPixelT
Parameterized BlockLoad type for pixels.
BlockLoad< QuadT, BLOCK_THREADS, QUADS_PER_THREAD, AgentHistogramPolicyT::LOAD_ALGORITHM > BlockLoadQuadT
Parameterized BlockLoad type for quads.
_TempStorage & temp_storage
Reference to temp_storage.
WrappedSampleIteratorT d_wrapped_samples
Sample input iterator (with cache modifier applied, if possible).
SampleT * d_native_samples
Native pointer for input samples (possibly NULL if unavailable).
int(& num_output_bins)[NUM_ACTIVE_CHANNELS]
The number of output bins for each channel.
int(& num_privatized_bins)[NUM_ACTIVE_CHANNELS]
The number of privatized bins for each channel.
CounterT *(& d_output_histograms)[NUM_ACTIVE_CHANNELS]
Reference to final output histograms (gmem).
CounterT * d_privatized_histograms[NUM_ACTIVE_CHANNELS]
Reference to gmem privatized histograms for each channel.
OutputDecodeOpT(& output_decode_op)[NUM_ACTIVE_CHANNELS]
The transform operator for determining output bin-ids from privatized counter indices, one for each channel.
PrivatizedDecodeOpT(& privatized_decode_op)[NUM_ACTIVE_CHANNELS]
The transform operator for determining privatized counter indices from samples, one for each channel.
bool prefer_smem
Whether to prefer privatized smem counters vs privatized global counters.
__device__ __forceinline__ AgentHistogram(TempStorage &temp_storage, SampleIteratorT d_samples, int(&num_output_bins)[NUM_ACTIVE_CHANNELS], int(&num_privatized_bins)[NUM_ACTIVE_CHANNELS], CounterT *(&d_output_histograms)[NUM_ACTIVE_CHANNELS], CounterT *(&d_privatized_histograms)[NUM_ACTIVE_CHANNELS], OutputDecodeOpT(&output_decode_op)[NUM_ACTIVE_CHANNELS], PrivatizedDecodeOpT(&privatized_decode_op)[NUM_ACTIVE_CHANNELS])
__device__ __forceinline__ void InitBinCounters()
__device__ __forceinline__ void StoreOutput()
__device__ __forceinline__ void AccumulateSmemPixels(SampleT samples[PIXELS_PER_THREAD][NUM_CHANNELS], bool is_valid[PIXELS_PER_THREAD])
__device__ __forceinline__ void AccumulateGmemPixels(SampleT samples[PIXELS_PER_THREAD][NUM_CHANNELS], bool is_valid[PIXELS_PER_THREAD])
__device__ __forceinline__ void ConsumeTiles(OffsetT num_row_pixels, OffsetT num_rows, OffsetT row_stride_samples, int tiles_per_row, GridQueue< int > tile_queue, Int2Type< true > is_work_stealing)
__device__ __forceinline__ void ConsumeTiles(OffsetT num_row_pixels, OffsetT num_rows, OffsetT row_stride_samples, int tiles_per_row, GridQueue< int > tile_queue, Int2Type< false > is_work_stealing)
__device__ __forceinline__ void ConsumeTiles(OffsetT num_row_pixels, OffsetT num_rows, OffsetT row_stride_samples, int tiles_per_row, GridQueue< int > tile_queue)
CubVector
Exposes a member typedef Type that names the corresponding CUDA vector type if one exists.
If
Type selection (IF ? ThenType : ElseType).
Int2Type
Allows for the treatment of an integral constant as a type at compile-time (e.g., to achieve static call dispatch based on constant integral values).
Uninitialized
A storage-backing wrapper that allows types with non-trivial constructors to be aliased in unions.
#define CUB_PTX_ARCH
CUB_PTX_ARCH reflects the PTX version targeted by the active compiler pass (or zero during the host pass).