OpenFPM_pdata  4.1.0
Project that contains the implementation of distributed structures
 
dispatch_histogram.cuh
Go to the documentation of this file.
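This header is OpenFPM's bundled copy of CUB's histogram dispatch layer: it defines the histogram initialization and sweep kernels together with the DispatchEven/DispatchRange entry points that the public cub::DeviceHistogram front end forwards to. As a minimal usage sketch (not part of this file; the function example() and the 256-bin setup are illustrative assumptions, and the standard cub/cub.cuh umbrella header is assumed), the dispatch below is normally reached through cub::DeviceHistogram with the usual two-phase temporary-storage idiom:

#include <cub/cub.cuh>

// Sketch: build a 256-bin histogram of float samples in [0, 1).
// DeviceHistogram::HistogramEven forwards to DispatchHistogram's DispatchEven path below.
void example(const float* d_samples, int num_samples, unsigned int* d_histogram, cudaStream_t stream)
{
    int   num_levels  = 257;     // 256 bins require 257 boundary levels
    float lower_level = 0.0f;
    float upper_level = 1.0f;

    // First call: query the amount of temporary storage required.
    void*  d_temp_storage     = NULL;
    size_t temp_storage_bytes = 0;
    cub::DeviceHistogram::HistogramEven(d_temp_storage, temp_storage_bytes,
        d_samples, d_histogram, num_levels, lower_level, upper_level, num_samples, stream);

    cudaMalloc(&d_temp_storage, temp_storage_bytes);

    // Second call: launch the init and sweep kernels on the given stream.
    cub::DeviceHistogram::HistogramEven(d_temp_storage, temp_storage_bytes,
        d_samples, d_histogram, num_levels, lower_level, upper_level, num_samples, stream);

    cudaFree(d_temp_storage);
}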
1
2/******************************************************************************
3 * Copyright (c) 2011, Duane Merrill. All rights reserved.
4 * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions are met:
8 * * Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * * Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * * Neither the name of the NVIDIA CORPORATION nor the
14 * names of its contributors may be used to endorse or promote products
15 * derived from this software without specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
19 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
20 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
21 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
22 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
23 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
24 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
26 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 *
28 ******************************************************************************/
29
30/**
31 * \file
32 * cub::DeviceHistogram provides device-wide parallel operations for constructing histogram(s) from a sequence of samples data residing within device-accessible memory.
33 */
34
35#pragma once
36
37#include <stdio.h>
38#include <iterator>
39#include <limits>
40
41#include "../../agent/agent_histogram.cuh"
42#include "../../util_debug.cuh"
43#include "../../util_device.cuh"
44#include "../../thread/thread_search.cuh"
45#include "../../grid/grid_queue.cuh"
46#include "../../util_namespace.cuh"
47
49CUB_NS_PREFIX
50
52namespace cub {
53
54
55
56/******************************************************************************
57 * Histogram kernel entry points
58 *****************************************************************************/
59
63template <
64 int NUM_ACTIVE_CHANNELS,
65 typename CounterT,
66 typename OffsetT>
67__global__ void DeviceHistogramInitKernel(
68 ArrayWrapper<int, NUM_ACTIVE_CHANNELS> num_output_bins_wrapper,
69 ArrayWrapper<CounterT*, NUM_ACTIVE_CHANNELS> d_output_histograms_wrapper,
70 GridQueue<int> tile_queue)
71{
72 if ((threadIdx.x == 0) && (blockIdx.x == 0))
73 tile_queue.ResetDrain();
74
75 int output_bin = (blockIdx.x * blockDim.x) + threadIdx.x;
76
77 #pragma unroll
78 for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
79 {
80 if (output_bin < num_output_bins_wrapper.array[CHANNEL])
81 d_output_histograms_wrapper.array[CHANNEL][output_bin] = 0;
82 }
83}
84
85
89template <
90 typename AgentHistogramPolicyT,
91 int PRIVATIZED_SMEM_BINS,
92 int NUM_CHANNELS,
93 int NUM_ACTIVE_CHANNELS,
94 typename SampleIteratorT,
95 typename CounterT,
96 typename PrivatizedDecodeOpT,
97 typename OutputDecodeOpT,
98 typename OffsetT>
99__launch_bounds__ (int(AgentHistogramPolicyT::BLOCK_THREADS))
100__global__ void DeviceHistogramSweepKernel(
101 SampleIteratorT d_samples,
102 ArrayWrapper<int, NUM_ACTIVE_CHANNELS> num_output_bins_wrapper,
103 ArrayWrapper<int, NUM_ACTIVE_CHANNELS> num_privatized_bins_wrapper,
104 ArrayWrapper<CounterT*, NUM_ACTIVE_CHANNELS> d_output_histograms_wrapper,
105 ArrayWrapper<CounterT*, NUM_ACTIVE_CHANNELS> d_privatized_histograms_wrapper,
106 ArrayWrapper<OutputDecodeOpT, NUM_ACTIVE_CHANNELS> output_decode_op_wrapper,
107 ArrayWrapper<PrivatizedDecodeOpT, NUM_ACTIVE_CHANNELS> privatized_decode_op_wrapper,
108 OffsetT num_row_pixels,
109 OffsetT num_rows,
110 OffsetT row_stride_samples,
111 int tiles_per_row,
112 GridQueue<int> tile_queue)
113{
114 // Thread block type for compositing input tiles
115 typedef AgentHistogram<
116 AgentHistogramPolicyT,
117 PRIVATIZED_SMEM_BINS,
118 NUM_CHANNELS,
119 NUM_ACTIVE_CHANNELS,
120 SampleIteratorT,
121 CounterT,
122 PrivatizedDecodeOpT,
123 OutputDecodeOpT,
124 OffsetT>
125 AgentHistogramT;
126
127 // Shared memory for AgentHistogram
128 __shared__ typename AgentHistogramT::TempStorage temp_storage;
129
130 AgentHistogramT agent(
131 temp_storage,
132 d_samples,
133 num_output_bins_wrapper.array,
134 num_privatized_bins_wrapper.array,
135 d_output_histograms_wrapper.array,
136 d_privatized_histograms_wrapper.array,
137 output_decode_op_wrapper.array,
138 privatized_decode_op_wrapper.array);
139
140 // Initialize counters
141 agent.InitBinCounters();
142
143 // Consume input tiles
144 agent.ConsumeTiles(
145 num_row_pixels,
146 num_rows,
147 row_stride_samples,
148 tiles_per_row,
149 tile_queue);
150
151 // Store output to global (if necessary)
152 agent.StoreOutput();
153
154}
155
156
157
158
159
160
161/******************************************************************************
162 * Dispatch
163 ******************************************************************************/
164
168template <
169 int NUM_CHANNELS,
170 int NUM_ACTIVE_CHANNELS,
171 typename SampleIteratorT,
172 typename CounterT,
173 typename LevelT,
174 typename OffsetT>
176{
177 //---------------------------------------------------------------------
178 // Types and constants
179 //---------------------------------------------------------------------
180
182 typedef typename std::iterator_traits<SampleIteratorT>::value_type SampleT;
183
184 enum
185 {
186 // Maximum number of bins per channel for which we will use a privatized smem strategy
187 MAX_PRIVATIZED_SMEM_BINS = 256
188 };
189
190
191 //---------------------------------------------------------------------
192 // Transform functors for converting samples to bin-ids
193 //---------------------------------------------------------------------
194
195 // Searches for bin given a list of bin-boundary levels
196 template <typename LevelIteratorT>
197 struct SearchTransform
198 {
199 LevelIteratorT d_levels; // Pointer to levels array
200 int num_output_levels; // Number of levels in array
201
202 // Initializer
203 __host__ __device__ __forceinline__ void Init(
204 LevelIteratorT d_levels, // Pointer to levels array
205 int num_output_levels) // Number of levels in array
206 {
207 this->d_levels = d_levels;
208 this->num_output_levels = num_output_levels;
209 }
210
211 // Method for converting samples to bin-ids
212 template <CacheLoadModifier LOAD_MODIFIER, typename _SampleT>
213 __host__ __device__ __forceinline__ void BinSelect(_SampleT sample, int &bin, bool valid)
214 {
215 // Level iterator wrapper type
216 typedef typename If<IsPointer<LevelIteratorT>::VALUE,
217 CacheModifiedInputIterator<LOAD_MODIFIER, LevelT, OffsetT>, // Wrap the native input pointer with CacheModifiedInputIterator
218 LevelIteratorT>::Type // Directly use the supplied input iterator type
219 WrappedLevelIteratorT;
220
221 WrappedLevelIteratorT wrapped_levels(d_levels);
222
223 int num_bins = num_output_levels - 1;
224 if (valid)
225 {
226 bin = UpperBound(wrapped_levels, num_output_levels, (LevelT) sample) - 1;
227 if (bin >= num_bins)
228 bin = -1;
229 }
230 }
231 };
232
233
234 // Scales samples to evenly-spaced bins
235 struct ScaleTransform
236 {
237 int num_bins; // Number of bins (levels - 1)
238 LevelT max; // Max sample level (exclusive)
239 LevelT min; // Min sample level (inclusive)
240 LevelT scale; // Bin scaling factor
241
242 // Initializer
243 template <typename _LevelT>
244 __host__ __device__ __forceinline__ void Init(
245 int num_output_levels, // Number of levels in array
246 _LevelT max, // Max sample level (exclusive)
247 _LevelT min, // Min sample level (inclusive)
248 _LevelT scale) // Bin scaling factor
249 {
250 this->num_bins = num_output_levels - 1;
251 this->max = max;
252 this->min = min;
253 this->scale = scale;
254 }
255
256 // Initializer (float specialization)
257 __host__ __device__ __forceinline__ void Init(
258 int num_output_levels, // Number of levels in array
259 float max, // Max sample level (exclusive)
260 float min, // Min sample level (inclusive)
261 float scale) // Bin scaling factor
262 {
263 this->num_bins = num_output_levels - 1;
264 this->max = max;
265 this->min = min;
266 this->scale = float(1.0) / scale;
267 }
268
269 // Initializer (double specialization)
270 __host__ __device__ __forceinline__ void Init(
271 int num_output_levels, // Number of levels in array
272 double max, // Max sample level (exclusive)
273 double min, // Min sample level (inclusive)
274 double scale) // Bin scaling factor
275 {
276 this->num_bins = num_output_levels - 1;
277 this->max = max;
278 this->min = min;
279 this->scale = double(1.0) / scale;
280 }
281
282 // Method for converting samples to bin-ids
283 template <CacheLoadModifier LOAD_MODIFIER, typename _SampleT>
284 __host__ __device__ __forceinline__ void BinSelect(_SampleT sample, int &bin, bool valid)
285 {
286 LevelT level_sample = (LevelT) sample;
287
288 if (valid && (level_sample >= min) && (level_sample < max))
289 bin = (int) ((level_sample - min) / scale);
290 }
291
292 // Method for converting samples to bin-ids (float specialization)
293 template <CacheLoadModifier LOAD_MODIFIER>
294 __host__ __device__ __forceinline__ void BinSelect(float sample, int &bin, bool valid)
295 {
296 LevelT level_sample = (LevelT) sample;
297
298 if (valid && (level_sample >= min) && (level_sample < max))
299 bin = (int) ((level_sample - min) * scale);
300 }
301
302 // Method for converting samples to bin-ids (double specialization)
303 template <CacheLoadModifier LOAD_MODIFIER>
304 __host__ __device__ __forceinline__ void BinSelect(double sample, int &bin, bool valid)
305 {
306 LevelT level_sample = (LevelT) sample;
307
308 if (valid && (level_sample >= min) && (level_sample < max))
309 bin = (int) ((level_sample - min) * scale);
310 }
311 };
312
313
314 // Pass-through bin transform operator
315 struct PassThruTransform
316 {
317 // Method for converting samples to bin-ids
318 template <CacheLoadModifier LOAD_MODIFIER, typename _SampleT>
319 __host__ __device__ __forceinline__ void BinSelect(_SampleT sample, int &bin, bool valid)
320 {
321 if (valid)
322 bin = (int) sample;
323 }
324 };
325
326
327
328 //---------------------------------------------------------------------
329 // Tuning policies
330 //---------------------------------------------------------------------
331
332 template <int NOMINAL_ITEMS_PER_THREAD>
333 struct TScale
334 {
335 enum
336 {
337 V_SCALE = (sizeof(SampleT) + sizeof(int) - 1) / sizeof(int),
338 VALUE = CUB_MAX((NOMINAL_ITEMS_PER_THREAD / NUM_ACTIVE_CHANNELS / V_SCALE), 1)
339 };
340 };
341
342
344 struct Policy110
345 {
346 // HistogramSweepPolicy
347 typedef AgentHistogramPolicy<
348 512,
349 (NUM_CHANNELS == 1) ? 8 : 2,
350 BLOCK_LOAD_DIRECT,
351 LOAD_DEFAULT,
352 true,
353 GMEM,
354 false>
355 HistogramSweepPolicy;
356 };
357
359 struct Policy200
360 {
361 // HistogramSweepPolicy
362 typedef AgentHistogramPolicy<
363 (NUM_CHANNELS == 1) ? 256 : 128,
364 (NUM_CHANNELS == 1) ? 8 : 3,
365 (NUM_CHANNELS == 1) ? BLOCK_LOAD_DIRECT : BLOCK_LOAD_WARP_TRANSPOSE,
366 LOAD_DEFAULT,
367 true,
368 SMEM,
369 false>
370 HistogramSweepPolicy;
371 };
372
374 struct Policy300
375 {
376 // HistogramSweepPolicy
377 typedef AgentHistogramPolicy<
378 512,
379 (NUM_CHANNELS == 1) ? 8 : 2,
380 BLOCK_LOAD_DIRECT,
381 LOAD_DEFAULT,
382 true,
383 GMEM,
384 false>
385 HistogramSweepPolicy;
386 };
387
389 struct Policy350
390 {
391 // HistogramSweepPolicy
392 typedef AgentHistogramPolicy<
393 128,
394 TScale<8>::VALUE,
395 BLOCK_LOAD_DIRECT,
396 LOAD_LDG,
397 true,
398 BLEND,
399 true>
400 HistogramSweepPolicy;
401 };
402
404 struct Policy500
405 {
406 // HistogramSweepPolicy
407 typedef AgentHistogramPolicy<
408 384,
409 TScale<16>::VALUE,
410 BLOCK_LOAD_DIRECT,
411 LOAD_LDG,
412 true,
413 SMEM,
414 false>
415 HistogramSweepPolicy;
416 };
417
418
419
420 //---------------------------------------------------------------------
421 // Tuning policies of current PTX compiler pass
422 //---------------------------------------------------------------------
423
424#if (CUB_PTX_ARCH >= 500)
425 typedef Policy500 PtxPolicy;
426
427#elif (CUB_PTX_ARCH >= 350)
428 typedef Policy350 PtxPolicy;
429
430#elif (CUB_PTX_ARCH >= 300)
431 typedef Policy300 PtxPolicy;
432
433#elif (CUB_PTX_ARCH >= 200)
434 typedef Policy200 PtxPolicy;
435
436#else
437 typedef Policy110 PtxPolicy;
438
439#endif
440
441 // "Opaque" policies (whose parameterizations aren't reflected in the type signature)
442 struct PtxHistogramSweepPolicy : PtxPolicy::HistogramSweepPolicy {};
443
444
445 //---------------------------------------------------------------------
446 // Utilities
447 //---------------------------------------------------------------------
448
452 template <typename KernelConfig>
453 CUB_RUNTIME_FUNCTION __forceinline__
454 static cudaError_t InitConfigs(
455 int ptx_version,
456 KernelConfig &histogram_sweep_config)
457 {
458 #if (CUB_PTX_ARCH > 0)
459
460 // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy
461 return histogram_sweep_config.template Init<PtxHistogramSweepPolicy>();
462
463 #else
464
465 // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version
466 if (ptx_version >= 500)
467 {
468 return histogram_sweep_config.template Init<typename Policy500::HistogramSweepPolicy>();
469 }
470 else if (ptx_version >= 350)
471 {
472 return histogram_sweep_config.template Init<typename Policy350::HistogramSweepPolicy>();
473 }
474 else if (ptx_version >= 300)
475 {
476 return histogram_sweep_config.template Init<typename Policy300::HistogramSweepPolicy>();
477 }
478 else if (ptx_version >= 200)
479 {
480 return histogram_sweep_config.template Init<typename Policy200::HistogramSweepPolicy>();
481 }
482 else if (ptx_version >= 110)
483 {
484 return histogram_sweep_config.template Init<typename Policy110::HistogramSweepPolicy>();
485 }
486 else
487 {
488 // No global atomic support
489 return cudaErrorNotSupported;
490 }
491
492 #endif
493 }
494
495
499 struct KernelConfig
500 {
501 int block_threads;
502 int pixels_per_thread;
503
504 template <typename BlockPolicy>
505 CUB_RUNTIME_FUNCTION __forceinline__
506 cudaError_t Init()
507 {
508 block_threads = BlockPolicy::BLOCK_THREADS;
509 pixels_per_thread = BlockPolicy::PIXELS_PER_THREAD;
510
511 return cudaSuccess;
512 }
513 };
514
515
516 //---------------------------------------------------------------------
517 // Dispatch entrypoints
518 //---------------------------------------------------------------------
519
523 template <
524 typename PrivatizedDecodeOpT,
525 typename OutputDecodeOpT,
526 typename DeviceHistogramInitKernelT,
527 typename DeviceHistogramSweepKernelT>
528 CUB_RUNTIME_FUNCTION __forceinline__
529 static cudaError_t PrivatizedDispatch(
530 void* d_temp_storage,
531 size_t& temp_storage_bytes,
532 SampleIteratorT d_samples,
533 CounterT* d_output_histograms[NUM_ACTIVE_CHANNELS],
534 int num_privatized_levels[NUM_ACTIVE_CHANNELS],
535 PrivatizedDecodeOpT privatized_decode_op[NUM_ACTIVE_CHANNELS],
536 int num_output_levels[NUM_ACTIVE_CHANNELS],
537 OutputDecodeOpT output_decode_op[NUM_ACTIVE_CHANNELS],
538 int max_num_output_bins,
539 OffsetT num_row_pixels,
540 OffsetT num_rows,
541 OffsetT row_stride_samples,
542 DeviceHistogramInitKernelT histogram_init_kernel,
543 DeviceHistogramSweepKernelT histogram_sweep_kernel,
544 KernelConfig histogram_sweep_config,
545 cudaStream_t stream,
546 bool debug_synchronous)
547 {
548 #ifndef CUB_RUNTIME_ENABLED
549
550 // Kernel launch not supported from this device
551 return CubDebug(cudaErrorNotSupported);
552
553 #else
554
555 cudaError error = cudaSuccess;
556 do
557 {
558 // Get device ordinal
559 int device_ordinal;
560 if (CubDebug(error = cudaGetDevice(&device_ordinal))) break;
561
562 // Get SM count
563 int sm_count;
564 if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break;
565
566 // Get SM occupancy for histogram_sweep_kernel
567 int histogram_sweep_sm_occupancy;
568 if (CubDebug(error = MaxSmOccupancy(
569 histogram_sweep_sm_occupancy,
570 histogram_sweep_kernel,
571 histogram_sweep_config.block_threads))) break;
572
573 // Get device occupancy for histogram_sweep_kernel
574 int histogram_sweep_occupancy = histogram_sweep_sm_occupancy * sm_count;
575
576 if (num_row_pixels * NUM_CHANNELS == row_stride_samples)
577 {
578 // Treat as a single linear array of samples
579 num_row_pixels *= num_rows;
580 num_rows = 1;
581 row_stride_samples = num_row_pixels * NUM_CHANNELS;
582 }
583
584 // Get grid dimensions, trying to keep total blocks ~histogram_sweep_occupancy
585 int pixels_per_tile = histogram_sweep_config.block_threads * histogram_sweep_config.pixels_per_thread;
586 int tiles_per_row = int(num_row_pixels + pixels_per_tile - 1) / pixels_per_tile;
587 int blocks_per_row = CUB_MIN(histogram_sweep_occupancy, tiles_per_row);
588 int blocks_per_col = (blocks_per_row > 0) ?
589 int(CUB_MIN(histogram_sweep_occupancy / blocks_per_row, num_rows)) :
590 0;
591 int num_thread_blocks = blocks_per_row * blocks_per_col;
592
593 dim3 sweep_grid_dims;
594 sweep_grid_dims.x = (unsigned int) blocks_per_row;
595 sweep_grid_dims.y = (unsigned int) blocks_per_col;
596 sweep_grid_dims.z = 1;
597
598 // Temporary storage allocation requirements
599 const int NUM_ALLOCATIONS = NUM_ACTIVE_CHANNELS + 1;
600 void* allocations[NUM_ALLOCATIONS];
601 size_t allocation_sizes[NUM_ALLOCATIONS];
602
603 for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
604 allocation_sizes[CHANNEL] = size_t(num_thread_blocks) * (num_privatized_levels[CHANNEL] - 1) * sizeof(CounterT);
605
606 allocation_sizes[NUM_ALLOCATIONS - 1] = GridQueue<int>::AllocationSize();
607
608 // Alias the temporary allocations from the single storage blob (or compute the necessary size of the blob)
609 if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break;
610 if (d_temp_storage == NULL)
611 {
612 // Return if the caller is simply requesting the size of the storage allocation
613 break;
614 }
615
616 // Construct the grid queue descriptor
617 GridQueue<int> tile_queue(allocations[NUM_ALLOCATIONS - 1]);
618
619 // Setup array wrapper for histogram channel output (because we can't pass static arrays as kernel parameters)
620 ArrayWrapper<CounterT*, NUM_ACTIVE_CHANNELS> d_output_histograms_wrapper;
621 for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
622 d_output_histograms_wrapper.array[CHANNEL] = d_output_histograms[CHANNEL];
623
624 // Setup array wrapper for privatized per-block histogram channel output (because we can't pass static arrays as kernel parameters)
625 ArrayWrapper<CounterT*, NUM_ACTIVE_CHANNELS> d_privatized_histograms_wrapper;
626 for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
627 d_privatized_histograms_wrapper.array[CHANNEL] = (CounterT*) allocations[CHANNEL];
628
629 // Setup array wrapper for sweep bin transforms (because we can't pass static arrays as kernel parameters)
630 ArrayWrapper<PrivatizedDecodeOpT, NUM_ACTIVE_CHANNELS> privatized_decode_op_wrapper;
631 for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
632 privatized_decode_op_wrapper.array[CHANNEL] = privatized_decode_op[CHANNEL];
633
634 // Setup array wrapper for aggregation bin transforms (because we can't pass static arrays as kernel parameters)
635 ArrayWrapper<OutputDecodeOpT, NUM_ACTIVE_CHANNELS> output_decode_op_wrapper;
636 for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
637 output_decode_op_wrapper.array[CHANNEL] = output_decode_op[CHANNEL];
638
639 // Setup array wrapper for num privatized bins (because we can't pass static arrays as kernel parameters)
640 ArrayWrapper<int, NUM_ACTIVE_CHANNELS> num_privatized_bins_wrapper;
641 for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
642 num_privatized_bins_wrapper.array[CHANNEL] = num_privatized_levels[CHANNEL] - 1;
643
644 // Setup array wrapper for num output bins (because we can't pass static arrays as kernel parameters)
645 ArrayWrapper<int, NUM_ACTIVE_CHANNELS> num_output_bins_wrapper;
646 for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
647 num_output_bins_wrapper.array[CHANNEL] = num_output_levels[CHANNEL] - 1;
648
649 int histogram_init_block_threads = 256;
650 int histogram_init_grid_dims = (max_num_output_bins + histogram_init_block_threads - 1) / histogram_init_block_threads;
651
652 // Log DeviceHistogramInitKernel configuration
653 if (debug_synchronous) _CubLog("Invoking DeviceHistogramInitKernel<<<%d, %d, 0, %lld>>>()\n",
654 histogram_init_grid_dims, histogram_init_block_threads, (long long) stream);
655
656 // Invoke histogram_init_kernel
657 histogram_init_kernel<<<histogram_init_grid_dims, histogram_init_block_threads, 0, stream>>>(
658 num_output_bins_wrapper,
659 d_output_histograms_wrapper,
660 tile_queue);
661
662 // Return if empty problem
663 if ((blocks_per_row == 0) || (blocks_per_col == 0))
664 break;
665
666 // Log histogram_sweep_kernel configuration
667 if (debug_synchronous) _CubLog("Invoking histogram_sweep_kernel<<<{%d, %d, %d}, %d, 0, %lld>>>(), %d pixels per thread, %d SM occupancy\n",
668 sweep_grid_dims.x, sweep_grid_dims.y, sweep_grid_dims.z,
669 histogram_sweep_config.block_threads, (long long) stream, histogram_sweep_config.pixels_per_thread, histogram_sweep_sm_occupancy);
670
671 // Invoke histogram_sweep_kernel
672 histogram_sweep_kernel<<<sweep_grid_dims, histogram_sweep_config.block_threads, 0, stream>>>(
673 d_samples,
674 num_output_bins_wrapper,
675 num_privatized_bins_wrapper,
676 d_output_histograms_wrapper,
677 d_privatized_histograms_wrapper,
678 output_decode_op_wrapper,
679 privatized_decode_op_wrapper,
680 num_row_pixels,
681 num_rows,
682 row_stride_samples,
683 tiles_per_row,
684 tile_queue);
685
686 // Check for failure to launch
687 if (CubDebug(error = cudaPeekAtLastError())) break;
688
689 // Sync the stream if specified to flush runtime errors
690 if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
691
692 }
693 while (0);
694
695 return error;
696
697 #endif // CUB_RUNTIME_ENABLED
698 }
699
700
701
705 CUB_RUNTIME_FUNCTION
706 static cudaError_t DispatchRange(
707 void* d_temp_storage,
708 size_t& temp_storage_bytes,
709 SampleIteratorT d_samples,
710 CounterT* d_output_histograms[NUM_ACTIVE_CHANNELS],
711 int num_output_levels[NUM_ACTIVE_CHANNELS],
712 LevelT *d_levels[NUM_ACTIVE_CHANNELS],
713 OffsetT num_row_pixels,
714 OffsetT num_rows,
715 OffsetT row_stride_samples,
716 cudaStream_t stream,
717 bool debug_synchronous,
718 Int2Type<false> is_byte_sample)
719 {
720 cudaError error = cudaSuccess;
721 do
722 {
723 // Get PTX version
724 int ptx_version;
725 #if (CUB_PTX_ARCH == 0)
726 if (CubDebug(error = PtxVersion(ptx_version))) break;
727 #else
728 ptx_version = CUB_PTX_ARCH;
729 #endif
730
731 // Get kernel dispatch configurations
732 KernelConfig histogram_sweep_config;
733 if (CubDebug(error = InitConfigs(ptx_version, histogram_sweep_config)))
734 break;
735
736 // Use the search transform op for converting samples to privatized bins
737 typedef SearchTransform<LevelT*> PrivatizedDecodeOpT;
738
739 // Use the pass-thru transform op for converting privatized bins to output bins
740 typedef PassThruTransform OutputDecodeOpT;
741
742 PrivatizedDecodeOpT privatized_decode_op[NUM_ACTIVE_CHANNELS];
743 OutputDecodeOpT output_decode_op[NUM_ACTIVE_CHANNELS];
744 int max_levels = num_output_levels[0];
745
746 for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
747 {
748 privatized_decode_op[channel].Init(d_levels[channel], num_output_levels[channel]);
749 if (num_output_levels[channel] > max_levels)
750 max_levels = num_output_levels[channel];
751 }
752 int max_num_output_bins = max_levels - 1;
753
754 // Dispatch
755 if (max_num_output_bins > MAX_PRIVATIZED_SMEM_BINS)
756 {
757 // Too many bins to keep in shared memory.
758 const int PRIVATIZED_SMEM_BINS = 0;
759
760 if (CubDebug(error = PrivatizedDispatch(
761 d_temp_storage,
762 temp_storage_bytes,
763 d_samples,
764 d_output_histograms,
765 num_output_levels,
766 privatized_decode_op,
767 num_output_levels,
768 output_decode_op,
769 max_num_output_bins,
770 num_row_pixels,
771 num_rows,
772 row_stride_samples,
773 DeviceHistogramInitKernel<NUM_ACTIVE_CHANNELS, CounterT, OffsetT>,
774 DeviceHistogramSweepKernel<PtxHistogramSweepPolicy, PRIVATIZED_SMEM_BINS, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, PrivatizedDecodeOpT, OutputDecodeOpT, OffsetT>,
775 histogram_sweep_config,
776 stream,
777 debug_synchronous))) break;
778 }
779 else
780 {
781 // Dispatch shared-privatized approach
782 const int PRIVATIZED_SMEM_BINS = MAX_PRIVATIZED_SMEM_BINS;
783
784 if (CubDebug(error = PrivatizedDispatch(
785 d_temp_storage,
786 temp_storage_bytes,
787 d_samples,
788 d_output_histograms,
789 num_output_levels,
790 privatized_decode_op,
791 num_output_levels,
792 output_decode_op,
793 max_num_output_bins,
794 num_row_pixels,
795 num_rows,
796 row_stride_samples,
797 DeviceHistogramInitKernel<NUM_ACTIVE_CHANNELS, CounterT, OffsetT>,
798 DeviceHistogramSweepKernel<PtxHistogramSweepPolicy, PRIVATIZED_SMEM_BINS, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, PrivatizedDecodeOpT, OutputDecodeOpT, OffsetT>,
799 histogram_sweep_config,
800 stream,
801 debug_synchronous))) break;
802 }
803
804 } while (0);
805
806 return error;
807 }
808
809
813 CUB_RUNTIME_FUNCTION
814 static cudaError_t DispatchRange(
815 void* d_temp_storage,
816 size_t& temp_storage_bytes,
817 SampleIteratorT d_samples,
818 CounterT* d_output_histograms[NUM_ACTIVE_CHANNELS],
819 int num_output_levels[NUM_ACTIVE_CHANNELS],
820 LevelT *d_levels[NUM_ACTIVE_CHANNELS],
821 OffsetT num_row_pixels,
822 OffsetT num_rows,
823 OffsetT row_stride_samples,
824 cudaStream_t stream,
825 bool debug_synchronous,
826 Int2Type<true> is_byte_sample)
827 {
828 cudaError error = cudaSuccess;
829 do
830 {
831 // Get PTX version
832 int ptx_version;
833 #if (CUB_PTX_ARCH == 0)
834 if (CubDebug(error = PtxVersion(ptx_version))) break;
835 #else
836 ptx_version = CUB_PTX_ARCH;
837 #endif
838
839 // Get kernel dispatch configurations
840 KernelConfig histogram_sweep_config;
841 if (CubDebug(error = InitConfigs(ptx_version, histogram_sweep_config)))
842 break;
843
844 // Use the pass-thru transform op for converting samples to privatized bins
845 typedef PassThruTransform PrivatizedDecodeOpT;
846
847 // Use the search transform op for converting privatized bins to output bins
848 typedef SearchTransform<LevelT*> OutputDecodeOpT;
849
850 int num_privatized_levels[NUM_ACTIVE_CHANNELS];
851 PrivatizedDecodeOpT privatized_decode_op[NUM_ACTIVE_CHANNELS];
852 OutputDecodeOpT output_decode_op[NUM_ACTIVE_CHANNELS];
853 int max_levels = num_output_levels[0]; // Maximum number of levels in any channel
854
855 for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
856 {
857 num_privatized_levels[channel] = 257;
858 output_decode_op[channel].Init(d_levels[channel], num_output_levels[channel]);
859
860 if (num_output_levels[channel] > max_levels)
861 max_levels = num_output_levels[channel];
862 }
863 int max_num_output_bins = max_levels - 1;
864
865 const int PRIVATIZED_SMEM_BINS = 256;
866
867 if (CubDebug(error = PrivatizedDispatch(
868 d_temp_storage,
869 temp_storage_bytes,
870 d_samples,
871 d_output_histograms,
872 num_privatized_levels,
873 privatized_decode_op,
874 num_output_levels,
875 output_decode_op,
876 max_num_output_bins,
877 num_row_pixels,
878 num_rows,
879 row_stride_samples,
880 DeviceHistogramInitKernel<NUM_ACTIVE_CHANNELS, CounterT, OffsetT>,
881 DeviceHistogramSweepKernel<PtxHistogramSweepPolicy, PRIVATIZED_SMEM_BINS, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, PrivatizedDecodeOpT, OutputDecodeOpT, OffsetT>,
882 histogram_sweep_config,
883 stream,
884 debug_synchronous))) break;
885
886 } while (0);
887
888 return error;
889 }
890
891
895 CUB_RUNTIME_FUNCTION __forceinline__
896 static cudaError_t DispatchEven(
897 void* d_temp_storage,
898 size_t& temp_storage_bytes,
899 SampleIteratorT d_samples,
900 CounterT* d_output_histograms[NUM_ACTIVE_CHANNELS],
901 int num_output_levels[NUM_ACTIVE_CHANNELS],
902 LevelT lower_level[NUM_ACTIVE_CHANNELS],
903 LevelT upper_level[NUM_ACTIVE_CHANNELS],
904 OffsetT num_row_pixels,
905 OffsetT num_rows,
906 OffsetT row_stride_samples,
907 cudaStream_t stream,
908 bool debug_synchronous,
909 Int2Type<false> is_byte_sample)
910 {
911 cudaError error = cudaSuccess;
912 do
913 {
914 // Get PTX version
915 int ptx_version;
916 #if (CUB_PTX_ARCH == 0)
917 if (CubDebug(error = PtxVersion(ptx_version))) break;
918 #else
919 ptx_version = CUB_PTX_ARCH;
920 #endif
921
922 // Get kernel dispatch configurations
923 KernelConfig histogram_sweep_config;
924 if (CubDebug(error = InitConfigs(ptx_version, histogram_sweep_config)))
925 break;
926
927 // Use the scale transform op for converting samples to privatized bins
928 typedef ScaleTransform PrivatizedDecodeOpT;
929
930 // Use the pass-thru transform op for converting privatized bins to output bins
931 typedef PassThruTransform OutputDecodeOpT;
932
933 PrivatizedDecodeOpT privatized_decode_op[NUM_ACTIVE_CHANNELS];
934 OutputDecodeOpT output_decode_op[NUM_ACTIVE_CHANNELS];
935 int max_levels = num_output_levels[0];
936
937 for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
938 {
939 int bins = num_output_levels[channel] - 1;
940 LevelT scale = (upper_level[channel] - lower_level[channel]) / bins;
941
942 privatized_decode_op[channel].Init(num_output_levels[channel], upper_level[channel], lower_level[channel], scale);
943
944 if (num_output_levels[channel] > max_levels)
945 max_levels = num_output_levels[channel];
946 }
947 int max_num_output_bins = max_levels - 1;
948
949 if (max_num_output_bins > MAX_PRIVATIZED_SMEM_BINS)
950 {
951 // Too many bins to keep in shared memory.
952 const int PRIVATIZED_SMEM_BINS = 0;
953
954 if (CubDebug(error = PrivatizedDispatch(
955 d_temp_storage,
956 temp_storage_bytes,
957 d_samples,
958 d_output_histograms,
959 num_output_levels,
960 privatized_decode_op,
961 num_output_levels,
962 output_decode_op,
963 max_num_output_bins,
964 num_row_pixels,
965 num_rows,
966 row_stride_samples,
967 DeviceHistogramInitKernel<NUM_ACTIVE_CHANNELS, CounterT, OffsetT>,
968 DeviceHistogramSweepKernel<PtxHistogramSweepPolicy, PRIVATIZED_SMEM_BINS, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, PrivatizedDecodeOpT, OutputDecodeOpT, OffsetT>,
969 histogram_sweep_config,
970 stream,
971 debug_synchronous))) break;
972 }
973 else
974 {
975 // Dispatch shared-privatized approach
976 const int PRIVATIZED_SMEM_BINS = MAX_PRIVATIZED_SMEM_BINS;
977
978 if (CubDebug(error = PrivatizedDispatch(
979 d_temp_storage,
980 temp_storage_bytes,
981 d_samples,
982 d_output_histograms,
983 num_output_levels,
984 privatized_decode_op,
985 num_output_levels,
986 output_decode_op,
987 max_num_output_bins,
988 num_row_pixels,
989 num_rows,
990 row_stride_samples,
991 DeviceHistogramInitKernel<NUM_ACTIVE_CHANNELS, CounterT, OffsetT>,
992 DeviceHistogramSweepKernel<PtxHistogramSweepPolicy, PRIVATIZED_SMEM_BINS, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, PrivatizedDecodeOpT, OutputDecodeOpT, OffsetT>,
993 histogram_sweep_config,
994 stream,
995 debug_synchronous))) break;
996 }
997 }
998 while (0);
999
1000 return error;
1001 }
1002
1003
1007 CUB_RUNTIME_FUNCTION __forceinline__
1008 static cudaError_t DispatchEven(
1009 void* d_temp_storage,
1010 size_t& temp_storage_bytes,
1011 SampleIteratorT d_samples,
1012 CounterT* d_output_histograms[NUM_ACTIVE_CHANNELS],
1013 int num_output_levels[NUM_ACTIVE_CHANNELS],
1014 LevelT lower_level[NUM_ACTIVE_CHANNELS],
1015 LevelT upper_level[NUM_ACTIVE_CHANNELS],
1016 OffsetT num_row_pixels,
1017 OffsetT num_rows,
1018 OffsetT row_stride_samples,
1019 cudaStream_t stream,
1020 bool debug_synchronous,
1021 Int2Type<true> is_byte_sample)
1022 {
1023 cudaError error = cudaSuccess;
1024 do
1025 {
1026 // Get PTX version
1027 int ptx_version;
1028 #if (CUB_PTX_ARCH == 0)
1029 if (CubDebug(error = PtxVersion(ptx_version))) break;
1030 #else
1031 ptx_version = CUB_PTX_ARCH;
1032 #endif
1033
1034 // Get kernel dispatch configurations
1035 KernelConfig histogram_sweep_config;
1036 if (CubDebug(error = InitConfigs(ptx_version, histogram_sweep_config)))
1037 break;
1038
1039 // Use the pass-thru transform op for converting samples to privatized bins
1040 typedef PassThruTransform PrivatizedDecodeOpT;
1041
1042 // Use the scale transform op for converting privatized bins to output bins
1043 typedef ScaleTransform OutputDecodeOpT;
1044
1045 int num_privatized_levels[NUM_ACTIVE_CHANNELS];
1046 PrivatizedDecodeOpT privatized_decode_op[NUM_ACTIVE_CHANNELS];
1047 OutputDecodeOpT output_decode_op[NUM_ACTIVE_CHANNELS];
1048 int max_levels = num_output_levels[0];
1049
1050 for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
1051 {
1052 num_privatized_levels[channel] = 257;
1053
1054 int bins = num_output_levels[channel] - 1;
1055 LevelT scale = (upper_level[channel] - lower_level[channel]) / bins;
1056 output_decode_op[channel].Init(num_output_levels[channel], upper_level[channel], lower_level[channel], scale);
1057
1058 if (num_output_levels[channel] > max_levels)
1059 max_levels = num_output_levels[channel];
1060 }
1061 int max_num_output_bins = max_levels - 1;
1062
1063 const int PRIVATIZED_SMEM_BINS = 256;
1064
1065 if (CubDebug(error = PrivatizedDispatch(
1066 d_temp_storage,
1067 temp_storage_bytes,
1068 d_samples,
1069 d_output_histograms,
1070 num_privatized_levels,
1071 privatized_decode_op,
1072 num_output_levels,
1073 output_decode_op,
1074 max_num_output_bins,
1075 num_row_pixels,
1076 num_rows,
1077 row_stride_samples,
1078 DeviceHistogramInitKernel<NUM_ACTIVE_CHANNELS, CounterT, OffsetT>,
1079 DeviceHistogramSweepKernel<PtxHistogramSweepPolicy, PRIVATIZED_SMEM_BINS, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, PrivatizedDecodeOpT, OutputDecodeOpT, OffsetT>,
1080 histogram_sweep_config,
1081 stream,
1082 debug_synchronous))) break;
1083
1084 }
1085 while (0);
1086
1087 return error;
1088 }
1089
1090};
1091
1092
1093} // CUB namespace
1094CUB_NS_POSTFIX // Optional outer namespace(s)
1095
1096
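A note on the even-binning path above: DispatchEven computes scale = (upper_level - lower_level) / bins per channel, and ScaleTransform::BinSelect maps each sample to a bin id by a subtract-and-scale (the float and double specializations store the reciprocal of the scale, so the kernel multiplies instead of divides, and out-of-range samples leave the bin id untouched and are not counted). A host-side sketch of the same arithmetic, with even_bin_of as a hypothetical illustrative helper that is not part of this file:

// Mirrors ScaleTransform::BinSelect for floating-point samples.
inline int even_bin_of(float sample, float lower, float upper, int num_bins)
{
    if (sample < lower || sample >= upper)
        return -1;                                   // out of range: sample is not counted
    float inv_scale = num_bins / (upper - lower);    // reciprocal of one bin's width
    return (int)((sample - lower) * inv_scale);      // bin id in 0 .. num_bins-1
}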