#include <iterator>

#include "../util_type.cuh"
#include "../block/block_load.cuh"
#include "../grid/grid_queue.cuh"
#include "../iterator/cache_modified_input_iterator.cuh"
#include "../util_namespace.cuh"

/// Optional outer namespace(s)
CUB_NS_PREFIX

/// CUB namespace
namespace cub {
/// Memory preference for privatized histogram bin counters
enum BlockHistogramMemoryPreference
{
    GMEM,
    SMEM,
    BLEND
};
/**
 * Parameterizable tuning policy type for AgentHistogram
 */
template <
    int                             _BLOCK_THREADS,         ///< Threads per thread block
    int                             _PIXELS_PER_THREAD,     ///< Pixels per thread (per tile of input)
    BlockLoadAlgorithm              _LOAD_ALGORITHM,        ///< The BlockLoad algorithm to use
    CacheLoadModifier               _LOAD_MODIFIER,         ///< Cache load modifier for reading input elements
    bool                            _RLE_COMPRESS,          ///< Whether to perform localized RLE to compress samples before histogramming
    BlockHistogramMemoryPreference  _MEM_PREFERENCE,        ///< Whether to prefer privatized shared-memory bins (versus privatized global-memory bins)
    bool                            _WORK_STEALING>         ///< Whether to dequeue tiles from a global work queue
struct AgentHistogramPolicy
{
    // ... (enum constants and static members mirroring the template parameters above)
};
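// Illustrative sketch (not part of the original file): a hypothetical tuning-policy
// instantiation showing how the template parameters above are supplied.  The constants
// below are made-up example values, not the tuned settings CUB ships for any particular
// GPU architecture.
typedef AgentHistogramPolicy<
        256,                    // _BLOCK_THREADS:      threads per thread block
        8,                      // _PIXELS_PER_THREAD:  pixels per thread (per tile of input)
        BLOCK_LOAD_DIRECT,      // _LOAD_ALGORITHM:     BlockLoad algorithm for reading tiles
        LOAD_LDG,               // _LOAD_MODIFIER:      cache modifier for reading input elements
        true,                   // _RLE_COMPRESS:       run-length-compress samples before histogramming
        BLEND,                  // _MEM_PREFERENCE:     split blocks between smem and gmem privatized bins
        true>                   // _WORK_STEALING:      dequeue tiles from a global work queue
    ExampleHistogramPolicy;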
/**
 * AgentHistogram implements a stateful abstraction of CUDA thread blocks for
 * participating in device-wide histogram.
 */
template <
    typename    AgentHistogramPolicyT,  ///< Parameterized AgentHistogramPolicy tuning policy type
    int         PRIVATIZED_SMEM_BINS,   ///< Number of privatized shared-memory histogram bins of any channel (zero indicates privatized counters are maintained in device-accessible memory)
    int         NUM_CHANNELS,           ///< Number of channels interleaved in the input data
    int         NUM_ACTIVE_CHANNELS,    ///< Number of channels actively being histogrammed
    typename    SampleIteratorT,        ///< Random-access input iterator type for reading samples
    typename    CounterT,               ///< Integer type for counting sample occurrences per histogram bin
    typename    PrivatizedDecodeOpT,    ///< Transform operator type for determining privatized counter indices from samples, one for each channel
    typename    OutputDecodeOpT,        ///< Transform operator type for determining output bin-ids from privatized counter indices, one for each channel
    typename    OffsetT>                ///< Signed integer type for global offsets
struct AgentHistogram
{
    //---------------------------------------------------------------------
    // Types and constants
    //---------------------------------------------------------------------

    /// The sample type of the input iterator
    typedef typename std::iterator_traits<SampleIteratorT>::value_type SampleT;

    /// The pixel type of SampleT
    typedef typename CubVector<SampleT, NUM_CHANNELS>::Type PixelT;

    /// The quad type of SampleT
    typedef typename CubVector<SampleT, 4>::Type QuadT;
    /// Constants
    enum
    {
        BLOCK_THREADS       = AgentHistogramPolicyT::BLOCK_THREADS,

        PIXELS_PER_THREAD   = AgentHistogramPolicyT::PIXELS_PER_THREAD,
        SAMPLES_PER_THREAD  = PIXELS_PER_THREAD * NUM_CHANNELS,
        QUADS_PER_THREAD    = SAMPLES_PER_THREAD / 4,

        TILE_PIXELS         = PIXELS_PER_THREAD * BLOCK_THREADS,
        TILE_SAMPLES        = SAMPLES_PER_THREAD * BLOCK_THREADS,

        IS_RLE_COMPRESS     = AgentHistogramPolicyT::IS_RLE_COMPRESS,

        MEM_PREFERENCE      = (PRIVATIZED_SMEM_BINS > 0) ?
                                  AgentHistogramPolicyT::MEM_PREFERENCE :
                                  GMEM,

        IS_WORK_STEALING    = AgentHistogramPolicyT::IS_WORK_STEALING,
    };
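    // Worked example (hypothetical configuration, not a CUB-tuned policy): with
    // BLOCK_THREADS = 256, PIXELS_PER_THREAD = 8, and NUM_CHANNELS = 4 (e.g. RGBA data),
    // SAMPLES_PER_THREAD = 8 * 4 = 32, QUADS_PER_THREAD = 32 / 4 = 8,
    // TILE_PIXELS = 8 * 256 = 2048, and TILE_SAMPLES = 32 * 256 = 8192,
    // i.e. each thread block consumes 8192 samples (2048 pixels) per tile.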
    /// Cache load modifier for reading input elements
    static const CacheLoadModifier LOAD_MODIFIER = AgentHistogramPolicyT::LOAD_MODIFIER;

    /// Input iterator wrapper type (for applying cache modifier)
    typedef typename If<IsPointer<SampleIteratorT>::VALUE,
            CacheModifiedInputIterator<LOAD_MODIFIER, SampleT, OffsetT>,    // Wrap the native input pointer with CacheModifiedInputIterator
            SampleIteratorT>::Type                                          // Directly use the supplied input iterator type
        WrappedSampleIteratorT;

    /// Pixel input iterator type (for applying cache modifier)
    typedef CacheModifiedInputIterator<LOAD_MODIFIER, PixelT, OffsetT> WrappedPixelIteratorT;

    /// Quad input iterator type (for applying cache modifier)
    typedef CacheModifiedInputIterator<LOAD_MODIFIER, QuadT, OffsetT> WrappedQuadIteratorT;

    /// Parameterized BlockLoad type for samples
    typedef BlockLoad<SampleT, BLOCK_THREADS, SAMPLES_PER_THREAD, AgentHistogramPolicyT::LOAD_ALGORITHM> BlockLoadSampleT;

    /// Parameterized BlockLoad type for pixels
    typedef BlockLoad<PixelT, BLOCK_THREADS, PIXELS_PER_THREAD, AgentHistogramPolicyT::LOAD_ALGORITHM> BlockLoadPixelT;

    /// Parameterized BlockLoad type for quads
    typedef BlockLoad<QuadT, BLOCK_THREADS, QUADS_PER_THREAD, AgentHistogramPolicyT::LOAD_ALGORITHM> BlockLoadQuadT;

    /// Shared memory type required by this thread block
    struct _TempStorage
    {
        // Smem needed for the block-privatized smem histogram of each channel (with one word of padding)
        CounterT histograms[NUM_ACTIVE_CHANNELS][PRIVATIZED_SMEM_BINS + 1];

        // Index of the next tile dequeued by this thread block (work-stealing only)
        int tile_idx;

        // Aliasable storage layout
        union Aliasable
        {
            typename BlockLoadSampleT::TempStorage sample_load;     // Smem needed for loading a tile of samples
            typename BlockLoadPixelT::TempStorage  pixel_load;      // Smem needed for loading a tile of pixels
            typename BlockLoadQuadT::TempStorage   quad_load;       // Smem needed for loading a tile of quads
        } aliasable;
    };

    /// Temporary storage type (unionable)
    struct TempStorage : Uninitialized<_TempStorage> {};
    //---------------------------------------------------------------------
    // Per-thread fields
    //---------------------------------------------------------------------

    /// Reference to temp_storage
    _TempStorage &temp_storage;

    /// Sample input iterator (with cache modifier applied, if possible)
    WrappedSampleIteratorT d_wrapped_samples;

    /// Native pointer for input samples (possibly NULL if unavailable)
    SampleT* d_native_samples;

    /// The number of output bins for each channel
    int (&num_output_bins)[NUM_ACTIVE_CHANNELS];

    /// The number of privatized bins for each channel
    int (&num_privatized_bins)[NUM_ACTIVE_CHANNELS];

    /// Reference to gmem privatized histograms for each channel
    CounterT* d_privatized_histograms[NUM_ACTIVE_CHANNELS];

    /// Reference to final output histograms (gmem)
    CounterT* (&d_output_histograms)[NUM_ACTIVE_CHANNELS];

    /// The transform operator for determining output bin-ids from privatized counter indices, one for each channel
    OutputDecodeOpT (&output_decode_op)[NUM_ACTIVE_CHANNELS];

    /// The transform operator for determining privatized counter indices from samples, one for each channel
    PrivatizedDecodeOpT (&privatized_decode_op)[NUM_ACTIVE_CHANNELS];

    /// Whether to prefer privatized smem counters vs privatized global counters
    bool prefer_smem;

    //---------------------------------------------------------------------
    // Initialize privatized bin counters
    //---------------------------------------------------------------------

    // Initialize privatized bin counters
    __device__ __forceinline__ void InitBinCounters(CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS])
    {
        // Initialize histogram bin counts to zeros
        for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
        {
            for (int privatized_bin = threadIdx.x; privatized_bin < num_privatized_bins[CHANNEL]; privatized_bin += BLOCK_THREADS)
            {
                privatized_histograms[CHANNEL][privatized_bin] = 0;
            }
        }

        // Barrier to make sure all threads are done updating counters
        CTA_SYNC();
    }
    // Initialize privatized bin counters.  Specialized for privatized shared-memory counters
    __device__ __forceinline__ void InitSmemBinCounters()
    {
        CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS];

        for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
            privatized_histograms[CHANNEL] = temp_storage.histograms[CHANNEL];

        InitBinCounters(privatized_histograms);
    }
    // Initialize privatized bin counters.  Specialized for privatized global-memory counters
    __device__ __forceinline__ void InitGmemBinCounters()
    {
        InitBinCounters(d_privatized_histograms);
    }
    //---------------------------------------------------------------------
    // Update final output histograms
    //---------------------------------------------------------------------

    // Store privatized histogram to device-accessible memory
    __device__ __forceinline__ void StoreOutput(CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS])
    {
        // Barrier to make sure all threads are done updating counters
        CTA_SYNC();

        // Apply privatized bin transformations to output bin counts for each channel
        for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
        {
            int channel_bins = num_privatized_bins[CHANNEL];
            for (int privatized_bin = threadIdx.x;
                 privatized_bin < channel_bins;
                 privatized_bin += BLOCK_THREADS)
            {
                int      output_bin = -1;
                CounterT count      = privatized_histograms[CHANNEL][privatized_bin];
                bool     is_valid   = count > 0;

                output_decode_op[CHANNEL].template BinSelect<LOAD_MODIFIER>((SampleT) privatized_bin, output_bin, is_valid);

                if (output_bin >= 0)
                    atomicAdd(&d_output_histograms[CHANNEL][output_bin], count);
            }
        }
    }
    // Store privatized histogram to device-accessible memory.  Specialized for privatized shared-memory counters
    __device__ __forceinline__ void StoreSmemOutput()
    {
        CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS];

        for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
            privatized_histograms[CHANNEL] = temp_storage.histograms[CHANNEL];

        StoreOutput(privatized_histograms);
    }
    // Store privatized histogram to device-accessible memory.  Specialized for privatized global-memory counters
    __device__ __forceinline__ void StoreGmemOutput()
    {
        StoreOutput(d_privatized_histograms);
    }
    //---------------------------------------------------------------------
    // Tile accumulation
    //---------------------------------------------------------------------

    // Accumulate pixels.  Specialized for RLE compression.
    __device__ __forceinline__ void AccumulatePixels(
        SampleT         samples[PIXELS_PER_THREAD][NUM_CHANNELS],
        bool            is_valid[PIXELS_PER_THREAD],
        CounterT*       privatized_histograms[NUM_ACTIVE_CHANNELS],
        Int2Type<true>  is_rle_compress)
    {
        for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
        {
            // Bin pixels
            int bins[PIXELS_PER_THREAD];

            for (int PIXEL = 0; PIXEL < PIXELS_PER_THREAD; ++PIXEL)
            {
                bins[PIXEL] = -1;
                privatized_decode_op[CHANNEL].template BinSelect<LOAD_MODIFIER>(samples[PIXEL][CHANNEL], bins[PIXEL], is_valid[PIXEL]);
            }

            // Fold runs of identical bins into a single atomic update
            CounterT accumulator = 1;

            for (int PIXEL = 0; PIXEL < PIXELS_PER_THREAD - 1; ++PIXEL)
            {
                if (bins[PIXEL] != bins[PIXEL + 1])
                {
                    if (bins[PIXEL] >= 0)
                        atomicAdd(privatized_histograms[CHANNEL] + bins[PIXEL], accumulator);

                    accumulator = 0;
                }
                accumulator++;
            }

            // Last pixel
            if (bins[PIXELS_PER_THREAD - 1] >= 0)
                atomicAdd(privatized_histograms[CHANNEL] + bins[PIXELS_PER_THREAD - 1], accumulator);
        }
    }
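    // Worked example of the RLE specialization above (hypothetical values): with
    // PIXELS_PER_THREAD = 4 and a thread whose decoded bins are {5, 5, 5, 9}, the run of
    // three 5s is folded into a single atomicAdd of 3 to bin 5, followed by one atomicAdd
    // of 1 to bin 9 -- two atomic updates instead of four.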
    // Accumulate pixels.  Specialized for individual accumulation of each pixel.
    __device__ __forceinline__ void AccumulatePixels(
        SampleT         samples[PIXELS_PER_THREAD][NUM_CHANNELS],
        bool            is_valid[PIXELS_PER_THREAD],
        CounterT*       privatized_histograms[NUM_ACTIVE_CHANNELS],
        Int2Type<false> is_rle_compress)
    {
        for (int PIXEL = 0; PIXEL < PIXELS_PER_THREAD; ++PIXEL)
        {
            for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
            {
                int bin = -1;
                privatized_decode_op[CHANNEL].template BinSelect<LOAD_MODIFIER>(samples[PIXEL][CHANNEL], bin, is_valid[PIXEL]);
                if (bin >= 0)
                    atomicAdd(privatized_histograms[CHANNEL] + bin, 1);
            }
        }
    }
    // Accumulate pixels.  Specialized for privatized shared-memory counters
    __device__ __forceinline__ void AccumulateSmemPixels(
        SampleT samples[PIXELS_PER_THREAD][NUM_CHANNELS],
        bool    is_valid[PIXELS_PER_THREAD])
    {
        CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS];

        for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
            privatized_histograms[CHANNEL] = temp_storage.histograms[CHANNEL];

        AccumulatePixels(samples, is_valid, privatized_histograms, Int2Type<IS_RLE_COMPRESS>());
    }
    // Accumulate pixels.  Specialized for privatized global-memory counters
    __device__ __forceinline__ void AccumulateGmemPixels(
        SampleT samples[PIXELS_PER_THREAD][NUM_CHANNELS],
        bool    is_valid[PIXELS_PER_THREAD])
    {
        AccumulatePixels(samples, is_valid, d_privatized_histograms, Int2Type<IS_RLE_COMPRESS>());
    }
    //---------------------------------------------------------------------
    // Tile loading
    //---------------------------------------------------------------------

    // Load full, aligned tile using pixel iterator (multi-channel)
    template <int _NUM_ACTIVE_CHANNELS>
    __device__ __forceinline__ void LoadFullAlignedTile(
        OffsetT                         block_offset,
        int                             valid_samples,
        SampleT                         (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS],
        Int2Type<_NUM_ACTIVE_CHANNELS>  num_active_channels)
    {
        typedef PixelT AliasedPixels[PIXELS_PER_THREAD];

        WrappedPixelIteratorT d_wrapped_pixels((PixelT*) (d_native_samples + block_offset));

        // Load using a wrapped pixel iterator
        BlockLoadPixelT(temp_storage.aliasable.pixel_load).Load(
            d_wrapped_pixels,
            reinterpret_cast<AliasedPixels&>(samples));
    }
    // Load full, aligned tile using quad iterator (single-channel)
    __device__ __forceinline__ void LoadFullAlignedTile(
        OffsetT     block_offset,
        int         valid_samples,
        SampleT     (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS],
        Int2Type<1> num_active_channels)
    {
        typedef QuadT AliasedQuads[QUADS_PER_THREAD];

        WrappedQuadIteratorT d_wrapped_quads((QuadT*) (d_native_samples + block_offset));

        // Load using a wrapped quad iterator
        BlockLoadQuadT(temp_storage.aliasable.quad_load).Load(
            d_wrapped_quads,
            reinterpret_cast<AliasedQuads&>(samples));
    }
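    // For example, with single-channel unsigned char samples, QuadT is the 4-byte CUDA
    // vector type uchar4, so a full aligned tile is read as QUADS_PER_THREAD vectorized
    // loads per thread instead of SAMPLES_PER_THREAD scalar loads.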
    // Load tile of data.  Specialized for full tile of aligned samples
    __device__ __forceinline__ void LoadTile(
        OffsetT         block_offset,
        int             valid_samples,
        SampleT         (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS],
        Int2Type<true>  is_full_tile,
        Int2Type<true>  is_aligned)
    {
        LoadFullAlignedTile(block_offset, valid_samples, samples, Int2Type<NUM_ACTIVE_CHANNELS>());
    }
    // Load tile of data.  Specialized for full tile of unaligned samples
    __device__ __forceinline__ void LoadTile(
        OffsetT         block_offset,
        int             valid_samples,
        SampleT         (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS],
        Int2Type<true>  is_full_tile,
        Int2Type<false> is_aligned)
    {
        typedef SampleT AliasedSamples[SAMPLES_PER_THREAD];

        // Load using the wrapped sample iterator
        BlockLoadSampleT(temp_storage.aliasable.sample_load).Load(
            d_wrapped_samples + block_offset,
            reinterpret_cast<AliasedSamples&>(samples));
    }
    // Load tile of data.  Specialized for partial tile of aligned samples
    __device__ __forceinline__ void LoadTile(
        OffsetT         block_offset,
        int             valid_samples,
        SampleT         (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS],
        Int2Type<false> is_full_tile,
        Int2Type<true>  is_aligned)
    {
        typedef PixelT AliasedPixels[PIXELS_PER_THREAD];

        WrappedPixelIteratorT d_wrapped_pixels((PixelT*) (d_native_samples + block_offset));

        int valid_pixels = valid_samples / NUM_CHANNELS;

        // Load using a wrapped pixel iterator, guarded against out-of-bounds pixels
        BlockLoadPixelT(temp_storage.aliasable.pixel_load).Load(
            d_wrapped_pixels,
            reinterpret_cast<AliasedPixels&>(samples),
            valid_pixels);
    }
    // Load tile of data.  Specialized for partial tile of unaligned samples
    __device__ __forceinline__ void LoadTile(
        OffsetT         block_offset,
        int             valid_samples,
        SampleT         (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS],
        Int2Type<false> is_full_tile,
        Int2Type<false> is_aligned)
    {
        typedef SampleT AliasedSamples[SAMPLES_PER_THREAD];

        // Load using the wrapped sample iterator, guarded against out-of-bounds samples
        BlockLoadSampleT(temp_storage.aliasable.sample_load).Load(
            d_wrapped_samples + block_offset,
            reinterpret_cast<AliasedSamples&>(samples),
            valid_samples);
    }
    //---------------------------------------------------------------------
    // Tile consumption
    //---------------------------------------------------------------------

    // Consume a tile of data samples
    template <
        bool IS_ALIGNED,    // Whether the tile offset is aligned (quad-aligned for single-channel, pixel-aligned for multi-channel)
        bool IS_FULL_TILE>  // Whether the tile is full
    __device__ __forceinline__ void ConsumeTile(OffsetT block_offset, int valid_samples)
    {
        SampleT samples[PIXELS_PER_THREAD][NUM_CHANNELS];
        bool    is_valid[PIXELS_PER_THREAD];

        // Load tile
        LoadTile(
            block_offset,
            valid_samples,
            samples,
            Int2Type<IS_FULL_TILE>(),
            Int2Type<IS_ALIGNED>());

        // Set valid flags
        for (int PIXEL = 0; PIXEL < PIXELS_PER_THREAD; ++PIXEL)
            is_valid[PIXEL] = IS_FULL_TILE || (((threadIdx.x * PIXELS_PER_THREAD + PIXEL) * NUM_CHANNELS) < valid_samples);

        // Accumulate samples (shared-memory atomics are unavailable below PTX arch 120)
#if CUB_PTX_ARCH >= 120
        if (prefer_smem)
            AccumulateSmemPixels(samples, is_valid);
        else
            AccumulateGmemPixels(samples, is_valid);
#else
        AccumulateGmemPixels(samples, is_valid);
#endif
    }
    // Consume row tiles.  Specialized for work-stealing from a global tile queue
    template <bool IS_ALIGNED>
    __device__ __forceinline__ void ConsumeTiles(
        OffsetT         num_row_pixels,
        OffsetT         num_rows,
        OffsetT         row_stride_samples,
        int             tiles_per_row,
        GridQueue<int>  tile_queue,
        Int2Type<true>  is_work_stealing)
    {
        int     num_tiles            = num_rows * tiles_per_row;
        int     tile_idx             = (blockIdx.y * gridDim.x) + blockIdx.x;
        OffsetT num_even_share_tiles = gridDim.x * gridDim.y;

        while (tile_idx < num_tiles)
        {
            // Locate this tile within the image
            int     row         = tile_idx / tiles_per_row;
            int     col         = tile_idx - (row * tiles_per_row);
            OffsetT row_offset  = row * row_stride_samples;
            OffsetT col_offset  = (col * TILE_SAMPLES);
            OffsetT tile_offset = row_offset + col_offset;

            if (col == tiles_per_row - 1)
            {
                // Consume the partially-full tile at the end of the row
                OffsetT num_remaining = (num_row_pixels * NUM_CHANNELS) - col_offset;
                ConsumeTile<IS_ALIGNED, false>(tile_offset, num_remaining);
            }
            else
            {
                // Consume a full tile
                ConsumeTile<IS_ALIGNED, true>(tile_offset, TILE_SAMPLES);
            }

            CTA_SYNC();

            // Thread 0 dequeues the next tile index for the whole block
            if (threadIdx.x == 0)
                temp_storage.tile_idx = tile_queue.Drain(1) + num_even_share_tiles;

            CTA_SYNC();

            tile_idx = temp_storage.tile_idx;
        }
    }
    // Consume row tiles.  Specialized for even-share (striped across thread blocks)
    template <bool IS_ALIGNED>
    __device__ __forceinline__ void ConsumeTiles(
        OffsetT         num_row_pixels,
        OffsetT         num_rows,
        OffsetT         row_stride_samples,
        int             tiles_per_row,
        GridQueue<int>  tile_queue,
        Int2Type<false> is_work_stealing)
    {
        for (int row = blockIdx.y; row < num_rows; row += gridDim.y)
        {
            OffsetT row_begin   = row * row_stride_samples;
            OffsetT row_end     = row_begin + (num_row_pixels * NUM_CHANNELS);
            OffsetT tile_offset = row_begin + (blockIdx.x * TILE_SAMPLES);

            while (tile_offset < row_end)
            {
                OffsetT num_remaining = row_end - tile_offset;

                if (num_remaining < TILE_SAMPLES)
                {
                    // Consume the partial tile at the end of the row
                    ConsumeTile<IS_ALIGNED, false>(tile_offset, num_remaining);
                    break;
                }

                // Consume a full tile
                ConsumeTile<IS_ALIGNED, true>(tile_offset, TILE_SAMPLES);
                tile_offset += gridDim.x * TILE_SAMPLES;
            }
        }
    }
    // Return a native sample pointer (specialized for iterator types that do not wrap a native pointer)
    template <typename IteratorT>
    __device__ __forceinline__ SampleT* NativePointer(IteratorT itr)
    {
        return NULL;
    }
    //---------------------------------------------------------------------
    // Interface
    //---------------------------------------------------------------------

    // Constructor
    __device__ __forceinline__ AgentHistogram(
        TempStorage         &temp_storage,                                      ///< Reference to temp_storage
        SampleIteratorT     d_samples,                                          ///< Input sample data
        int                 (&num_output_bins)[NUM_ACTIVE_CHANNELS],            ///< The number of output bins for each channel
        int                 (&num_privatized_bins)[NUM_ACTIVE_CHANNELS],        ///< The number of privatized bins for each channel
        CounterT*           (&d_output_histograms)[NUM_ACTIVE_CHANNELS],        ///< Reference to final output histograms (gmem)
        CounterT*           (&d_privatized_histograms)[NUM_ACTIVE_CHANNELS],    ///< Reference to privatized histograms (gmem)
        OutputDecodeOpT     (&output_decode_op)[NUM_ACTIVE_CHANNELS],           ///< The transform operator for determining output bin-ids from privatized counter indices, one for each channel
        PrivatizedDecodeOpT (&privatized_decode_op)[NUM_ACTIVE_CHANNELS])       ///< The transform operator for determining privatized counter indices from samples, one for each channel
    :
        temp_storage(temp_storage.Alias()),
        d_wrapped_samples(d_samples),
        d_native_samples(NativePointer(d_wrapped_samples)),
        num_output_bins(num_output_bins),
        num_privatized_bins(num_privatized_bins),
        d_output_histograms(d_output_histograms),
        output_decode_op(output_decode_op),
        privatized_decode_op(privatized_decode_op),
        prefer_smem((MEM_PREFERENCE == SMEM) ?
            true :                              // prefer privatized smem histograms
            (MEM_PREFERENCE == GMEM) ?
                false :                         // prefer privatized gmem histograms
                blockIdx.x & 1)                 // BLEND: alternate thread blocks between smem and gmem histograms
    {
        int blockId = (blockIdx.y * gridDim.x) + blockIdx.x;

        // Initialize the locations of this block's privatized histograms
        for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
            this->d_privatized_histograms[CHANNEL] = d_privatized_histograms[CHANNEL] + (blockId * num_privatized_bins[CHANNEL]);
    }
    // Consume image: dispatch to aligned (vectorized) or unaligned tile consumption
    __device__ __forceinline__ void ConsumeTiles(
        OffsetT         num_row_pixels,         ///< The number of multi-channel pixels per row in the region of interest
        OffsetT         num_rows,               ///< The number of rows in the region of interest
        OffsetT         row_stride_samples,     ///< The number of samples between starts of consecutive rows in the region of interest
        int             tiles_per_row,          ///< Number of image tiles per row
        GridQueue<int>  tile_queue)             ///< Drain queue descriptor for dynamically mapping tile data onto thread blocks
    {
        // Check whether all row starting offsets are quad-aligned (single-channel) or pixel-aligned (multi-channel)
        int     quad_mask  = AlignBytes<QuadT>::ALIGN_BYTES - 1;
        int     pixel_mask = AlignBytes<PixelT>::ALIGN_BYTES - 1;
        size_t  row_bytes  = sizeof(SampleT) * row_stride_samples;

        bool quad_aligned_rows  = (NUM_CHANNELS == 1) && (SAMPLES_PER_THREAD % 4 == 0) &&       // Single channel
                                  ((size_t(d_native_samples) & quad_mask) == 0) &&              // Row start is quad-aligned
                                  ((num_rows == 1) || ((row_bytes & quad_mask) == 0));          // Quad-aligned row stride

        bool pixel_aligned_rows = (NUM_CHANNELS > 1) &&                                         // Multi channel
                                  ((size_t(d_native_samples) & pixel_mask) == 0) &&             // Row start is pixel-aligned
                                  ((row_bytes & pixel_mask) == 0);                              // Pixel-aligned row stride

        // Whether rows are aligned and can be vectorized
        if ((d_native_samples != NULL) && (quad_aligned_rows || pixel_aligned_rows))
            ConsumeTiles<true>(num_row_pixels, num_rows, row_stride_samples, tiles_per_row, tile_queue, Int2Type<IS_WORK_STEALING>());
        else
            ConsumeTiles<false>(num_row_pixels, num_rows, row_stride_samples, tiles_per_row, tile_queue, Int2Type<IS_WORK_STEALING>());
    }
    // Initialize privatized bin counters
    __device__ __forceinline__ void InitBinCounters()
    {
        if (prefer_smem)
            InitSmemBinCounters();
        else
            InitGmemBinCounters();
    }

    // Store privatized histogram to device-accessible memory
    __device__ __forceinline__ void StoreOutput()
    {
        if (prefer_smem)
            StoreSmemOutput();
        else
            StoreGmemOutput();
    }
};

}               // CUB namespace
CUB_NS_POSTFIX  // Optional outer namespace(s)
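// Illustrative sketch (not part of the original file): a minimal sweep kernel showing the
// intended call sequence for driving AgentHistogram from device code.  The kernel name
// ExampleHistogramSweepKernel and its parameter packaging are hypothetical; the member
// functions it invokes (the constructor, InitBinCounters, ConsumeTiles, and StoreOutput)
// are the ones defined above.  Fixed-size per-channel arrays are passed by value via
// cub::ArrayWrapper so they can travel through the kernel parameter list.
template <
    typename    AgentHistogramPolicyT,
    int         PRIVATIZED_SMEM_BINS,
    int         NUM_CHANNELS,
    int         NUM_ACTIVE_CHANNELS,
    typename    SampleIteratorT,
    typename    CounterT,
    typename    PrivatizedDecodeOpT,
    typename    OutputDecodeOpT,
    typename    OffsetT>
__global__ void ExampleHistogramSweepKernel(
    SampleIteratorT                                                 d_samples,                  // Input samples
    cub::ArrayWrapper<int, NUM_ACTIVE_CHANNELS>                     num_output_bins,            // Output bins per channel
    cub::ArrayWrapper<int, NUM_ACTIVE_CHANNELS>                     num_privatized_bins,        // Privatized bins per channel
    cub::ArrayWrapper<CounterT*, NUM_ACTIVE_CHANNELS>               d_output_histograms,        // Final output histograms
    cub::ArrayWrapper<CounterT*, NUM_ACTIVE_CHANNELS>               d_privatized_histograms,    // Gmem privatized histograms
    cub::ArrayWrapper<OutputDecodeOpT, NUM_ACTIVE_CHANNELS>         output_decode_op,           // Output bin-id decode ops
    cub::ArrayWrapper<PrivatizedDecodeOpT, NUM_ACTIVE_CHANNELS>     privatized_decode_op,       // Privatized bin-id decode ops
    OffsetT                                                         num_row_pixels,             // Pixels per row
    OffsetT                                                         num_rows,                   // Rows in the region of interest
    OffsetT                                                         row_stride_samples,         // Samples between row starts
    int                                                             tiles_per_row,              // Image tiles per row
    cub::GridQueue<int>                                             tile_queue)                 // Drain queue for work-stealing
{
    // Thread-block histogramming abstraction type
    typedef cub::AgentHistogram<
            AgentHistogramPolicyT, PRIVATIZED_SMEM_BINS, NUM_CHANNELS, NUM_ACTIVE_CHANNELS,
            SampleIteratorT, CounterT, PrivatizedDecodeOpT, OutputDecodeOpT, OffsetT>
        AgentHistogramT;

    // Shared memory for AgentHistogram
    __shared__ typename AgentHistogramT::TempStorage temp_storage;

    AgentHistogramT agent(
        temp_storage,
        d_samples,
        num_output_bins.array,
        num_privatized_bins.array,
        d_output_histograms.array,
        d_privatized_histograms.array,
        output_decode_op.array,
        privatized_decode_op.array);

    // Zero this block's privatized counters, consume the tiles assigned to this block,
    // then fold the privatized counts into the final output histograms
    agent.InitBinCounters();
    agent.ConsumeTiles(num_row_pixels, num_rows, row_stride_samples, tiles_per_row, tile_queue);
    agent.StoreOutput();
}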