doxygen/openfpm/dispatch__reduce__by__key_8cuh_source.html

/******************************************************************************

 * Copyright (c) 2011, Duane Merrill.  All rights reserved.

 * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.

 *

 * Redistribution and use in source and binary forms, with or without

 * modification, are permitted provided that the following conditions are met:

 *     * Redistributions of source code must retain the above copyright

 *       notice, this list of conditions and the following disclaimer.

 *     * Redistributions in binary form must reproduce the above copyright

 *       notice, this list of conditions and the following disclaimer in the

 *       documentation and/or other materials provided with the distribution.

 *     * Neither the name of the NVIDIA CORPORATION nor the

 *       names of its contributors may be used to endorse or promote products

 *       derived from this software without specific prior written permission.

 *

 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND

 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED

 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE

 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY

 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES

 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;

 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND

 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT

 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS

 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 *

 ******************************************************************************/


#pragma once


#include <stdio.h>

#include <iterator>


#include "dispatch_scan.cuh"

#include "../../agent/agent_reduce_by_key.cuh"

#include "../../thread/thread_operators.cuh"

#include "../../grid/grid_queue.cuh"

#include "../../util_device.cuh"

#include "../../util_namespace.cuh"


CUB_NS_PREFIX


namespace cub {


/******************************************************************************

 * Kernel entry points

 *****************************************************************************/


// Reduce-by-key kernel entry point (multi-block).
//
// Every thread block builds an AgentReduceByKey on top of a statically
// declared shared-memory TempStorage and asks it to consume the input range.
// The block size is bounded by AgentReduceByKeyPolicyT::BLOCK_THREADS via
// __launch_bounds__.
//
// Parameters:
//   d_keys_in         input keys iterator (forwarded to the agent)
//   d_unique_out      output iterator for the unique keys (forwarded to the agent)
//   d_values_in       input values iterator (forwarded to the agent)
//   d_aggregates_out  output iterator for the value aggregates (one per run)
//   d_num_runs_out    output iterator for the total number of runs encountered
//   tile_state        tile status interface
//   start_tile        the starting tile for the current grid
//   equality_op       key equality operator
//   reduction_op      binary value reduction functor
//   num_items         total number of input items
template <
    typename            AgentReduceByKeyPolicyT,
    typename            KeysInputIteratorT,
    typename            UniqueOutputIteratorT,
    typename            ValuesInputIteratorT,
    typename            AggregatesOutputIteratorT,
    typename            NumRunsOutputIteratorT,
    typename            ScanTileStateT,
    typename            EqualityOpT,
    typename            ReductionOpT,
    typename            OffsetT>
__launch_bounds__ (int(AgentReduceByKeyPolicyT::BLOCK_THREADS))
__global__ void DeviceReduceByKeyKernel(
    KeysInputIteratorT          d_keys_in,
    UniqueOutputIteratorT       d_unique_out,
    ValuesInputIteratorT        d_values_in,
    AggregatesOutputIteratorT   d_aggregates_out,
    NumRunsOutputIteratorT      d_num_runs_out,
    ScanTileStateT              tile_state,
    int                         start_tile,
    EqualityOpT                 equality_op,
    ReductionOpT                reduction_op,
    OffsetT                     num_items)
{
    // Per-block agent specialization that reduces tiles of value segments
    typedef AgentReduceByKey<
            AgentReduceByKeyPolicyT,
            KeysInputIteratorT,
            UniqueOutputIteratorT,
            ValuesInputIteratorT,
            AggregatesOutputIteratorT,
            NumRunsOutputIteratorT,
            EqualityOpT,
            ReductionOpT,
            OffsetT>
        BlockAgentT;

    // Statically allocated shared memory backing the agent
    __shared__ typename BlockAgentT::TempStorage agent_storage;

    // Bind the agent to its storage, iterators and functors ...
    BlockAgentT block_agent(
        agent_storage,
        d_keys_in,
        d_unique_out,
        d_values_in,
        d_aggregates_out,
        d_num_runs_out,
        equality_op,
        reduction_op);

    // ... and let it process this block's share of the tiles
    block_agent.ConsumeRange(num_items, tile_state, start_tile);
}


/******************************************************************************

 * Dispatch

 ******************************************************************************/


// Dispatch helper for device-wide reduce-by-key.
//
// Bundles (1) the per-architecture tuning policies, (2) the selection of the
// policy matching the PTX target of the current compiler pass, and (3) the
// entry points that size the temporary storage, launch the tile-descriptor
// initialization kernel and then launch the reduce-by-key kernel.
//
// Template parameters:
//   KeysInputIteratorT         iterator type of the input keys
//   UniqueOutputIteratorT      iterator type of the unique output keys
//   ValuesInputIteratorT       iterator type of the input values
//   AggregatesOutputIteratorT  iterator type of the output value aggregates
//   NumRunsOutputIteratorT     iterator type receiving the number of runs
//   EqualityOpT                key equality operator type
//   ReductionOpT               binary value reduction functor type
//   OffsetT                    signed integer type for global offsets
template <
    typename    KeysInputIteratorT,
    typename    UniqueOutputIteratorT,
    typename    ValuesInputIteratorT,
    typename    AggregatesOutputIteratorT,
    typename    NumRunsOutputIteratorT,
    typename    EqualityOpT,
    typename    ReductionOpT,
    typename    OffsetT>
struct DispatchReduceByKey
{
    //-------------------------------------------------------------------------
    // Types and constants
    //-------------------------------------------------------------------------

    // The input keys type
    typedef typename std::iterator_traits<KeysInputIteratorT>::value_type KeyInputT;

    // The output keys type
    typedef typename If<(Equals<typename std::iterator_traits<UniqueOutputIteratorT>::value_type, void>::VALUE),    // KeyOutputT =  (if output iterator's value type is void) ?
        typename std::iterator_traits<KeysInputIteratorT>::value_type,                                              // ... then the input iterator's value type,
        typename std::iterator_traits<UniqueOutputIteratorT>::value_type>::Type KeyOutputT;                         // ... else the output iterator's value type

    // The input values type
    typedef typename std::iterator_traits<ValuesInputIteratorT>::value_type ValueInputT;

    // The output values type
    typedef typename If<(Equals<typename std::iterator_traits<AggregatesOutputIteratorT>::value_type, void>::VALUE),    // ValueOutputT =  (if output iterator's value type is void) ?
        typename std::iterator_traits<ValuesInputIteratorT>::value_type,                                                // ... then the input iterator's value type,
        typename std::iterator_traits<AggregatesOutputIteratorT>::value_type>::Type ValueOutputT;                       // ... else the output iterator's value type

    enum
    {
        // Threads per block used when launching the tile-descriptor init kernel
        INIT_KERNEL_THREADS     = 128,

        // Size of the larger of the output key and output value types
        MAX_INPUT_BYTES         = CUB_MAX(sizeof(KeyOutputT), sizeof(ValueOutputT)),

        // Size of one output key plus one output value
        COMBINED_INPUT_BYTES    = sizeof(KeyOutputT) + sizeof(ValueOutputT),
    };

    // Tile status descriptor interface type
    typedef ReduceByKeyScanTileState<ValueOutputT, OffsetT> ScanTileStateT;


    //-------------------------------------------------------------------------
    // Tuning policies
    //
    // Unless noted otherwise, ITEMS_PER_THREAD is the nominal count scaled by
    // (8 / COMBINED_INPUT_BYTES), rounded up, and clamped to the range
    // [1, NOMINAL_4B_ITEMS_PER_THREAD].
    //-------------------------------------------------------------------------

    // SM35: keeps 6 items per thread whenever both types are at most 8 bytes
    struct Policy350
    {
        enum {
            NOMINAL_4B_ITEMS_PER_THREAD = 6,
            ITEMS_PER_THREAD            = (MAX_INPUT_BYTES <= 8) ? 6 : CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, ((NOMINAL_4B_ITEMS_PER_THREAD * 8) + COMBINED_INPUT_BYTES - 1) / COMBINED_INPUT_BYTES)),
        };

        typedef AgentReduceByKeyPolicy<
                128,
                ITEMS_PER_THREAD,
                BLOCK_LOAD_DIRECT,
                LOAD_LDG,
                BLOCK_SCAN_WARP_SCANS>
            ReduceByKeyPolicyT;
    };

    // SM30
    struct Policy300
    {
        enum {
            NOMINAL_4B_ITEMS_PER_THREAD = 6,
            ITEMS_PER_THREAD            = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, ((NOMINAL_4B_ITEMS_PER_THREAD * 8) + COMBINED_INPUT_BYTES - 1) / COMBINED_INPUT_BYTES)),
        };

        typedef AgentReduceByKeyPolicy<
                128,
                ITEMS_PER_THREAD,
                BLOCK_LOAD_WARP_TRANSPOSE,
                LOAD_DEFAULT,
                BLOCK_SCAN_WARP_SCANS>
            ReduceByKeyPolicyT;
    };

    // SM20
    struct Policy200
    {
        enum {
            NOMINAL_4B_ITEMS_PER_THREAD = 11,
            ITEMS_PER_THREAD            = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, ((NOMINAL_4B_ITEMS_PER_THREAD * 8) + COMBINED_INPUT_BYTES - 1) / COMBINED_INPUT_BYTES)),
        };

        typedef AgentReduceByKeyPolicy<
                128,
                ITEMS_PER_THREAD,
                BLOCK_LOAD_WARP_TRANSPOSE,
                LOAD_DEFAULT,
                BLOCK_SCAN_WARP_SCANS>
            ReduceByKeyPolicyT;
    };

    // SM13
    struct Policy130
    {
        enum {
            NOMINAL_4B_ITEMS_PER_THREAD = 7,
            ITEMS_PER_THREAD            = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, ((NOMINAL_4B_ITEMS_PER_THREAD * 8) + COMBINED_INPUT_BYTES - 1) / COMBINED_INPUT_BYTES)),
        };

        typedef AgentReduceByKeyPolicy<
                128,
                ITEMS_PER_THREAD,
                BLOCK_LOAD_WARP_TRANSPOSE,
                LOAD_DEFAULT,
                BLOCK_SCAN_WARP_SCANS>
            ReduceByKeyPolicyT;
    };

    // SM11: unlike the other policies, the scaled item count is rounded down
    struct Policy110
    {
        enum {
            NOMINAL_4B_ITEMS_PER_THREAD = 5,
            ITEMS_PER_THREAD            = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 8) / COMBINED_INPUT_BYTES)),
        };

        typedef AgentReduceByKeyPolicy<
                64,
                ITEMS_PER_THREAD,
                BLOCK_LOAD_WARP_TRANSPOSE,
                LOAD_DEFAULT,
                BLOCK_SCAN_RAKING>
            ReduceByKeyPolicyT;
    };


    /******************************************************************************
     * Tuning policies of current PTX compiler pass
     ******************************************************************************/

#if (CUB_PTX_ARCH >= 350)
    typedef Policy350 PtxPolicy;

#elif (CUB_PTX_ARCH >= 300)
    typedef Policy300 PtxPolicy;

#elif (CUB_PTX_ARCH >= 200)
    typedef Policy200 PtxPolicy;

#elif (CUB_PTX_ARCH >= 130)
    typedef Policy130 PtxPolicy;

#else
    typedef Policy110 PtxPolicy;

#endif

    // "Opaque" policies (whose parameterizations aren't reflected in the type signature)
    struct PtxReduceByKeyPolicy : PtxPolicy::ReduceByKeyPolicyT {};


    /******************************************************************************
     * Utilities
     ******************************************************************************/

    // Fills reduce_by_key_config with the launch parameters of the policy that
    // applies to the target.
    //
    //   ptx_version           PTX version used to pick a policy in the host
    //                         pass; ignored in device passes
    //   reduce_by_key_config  [out] configuration object; must provide a
    //                         template member Init<PolicyT>()
    template <typename KernelConfig>
    CUB_RUNTIME_FUNCTION __forceinline__
    static void InitConfigs(
        int             ptx_version,
        KernelConfig    &reduce_by_key_config)
    {
    #if (CUB_PTX_ARCH > 0)
        (void)ptx_version;

        // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy
        reduce_by_key_config.template Init<PtxReduceByKeyPolicy>();

    #else

        // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version
        if (ptx_version >= 350)
        {
            reduce_by_key_config.template Init<typename Policy350::ReduceByKeyPolicyT>();
        }
        else if (ptx_version >= 300)
        {
            reduce_by_key_config.template Init<typename Policy300::ReduceByKeyPolicyT>();
        }
        else if (ptx_version >= 200)
        {
            reduce_by_key_config.template Init<typename Policy200::ReduceByKeyPolicyT>();
        }
        else if (ptx_version >= 130)
        {
            reduce_by_key_config.template Init<typename Policy130::ReduceByKeyPolicyT>();
        }
        else
        {
            reduce_by_key_config.template Init<typename Policy110::ReduceByKeyPolicyT>();
        }

    #endif
    }


    // Run-time copy of the launch parameters of a tuning policy
    struct KernelConfig
    {
        int block_threads;      // threads per thread block
        int items_per_thread;   // items processed by each thread
        int tile_items;         // block_threads * items_per_thread

        // Copies BLOCK_THREADS and ITEMS_PER_THREAD from PolicyT and derives tile_items
        template <typename PolicyT>
        CUB_RUNTIME_FUNCTION __forceinline__
        void Init()
        {
            block_threads       = PolicyT::BLOCK_THREADS;
            items_per_thread    = PolicyT::ITEMS_PER_THREAD;
            tile_items          = block_threads * items_per_thread;
        }
    };


    //---------------------------------------------------------------------
    // Dispatch entrypoints
    //---------------------------------------------------------------------

    // Internal dispatch routine: computes the temporary-storage requirement,
    // initializes the tile descriptors with init_kernel and then launches
    // reduce_by_key_kernel, in several epochs if the number of tiles exceeds
    // the device's maximum grid x-dimension.
    //
    //   d_temp_storage        temporary storage blob; when NULL, only the
    //                         storage requirement is computed (through
    //                         AliasTemporaries and temp_storage_bytes) and no
    //                         kernel is launched
    //   temp_storage_bytes    [in,out] size of d_temp_storage, passed by
    //                         reference to AliasTemporaries
    //   d_keys_in             input keys
    //   d_unique_out          output unique keys
    //   d_values_in           input values
    //   d_aggregates_out      output value aggregates (one aggregate per run)
    //   d_num_runs_out        output total number of runs encountered
    //   equality_op           key equality operator
    //   reduction_op          binary value reduction functor
    //   num_items             total number of input items
    //   stream                CUDA stream the kernels are launched into
    //   debug_synchronous     when true, log every launch and synchronize the
    //                         stream after it so runtime errors surface early
    //   (unnamed int)         PTX version; unused here
    //   init_kernel           kernel that initializes the tile descriptors
    //   reduce_by_key_kernel  kernel that performs the reduction
    //   reduce_by_key_config  launch parameters for reduce_by_key_kernel
    //
    // Returns cudaSuccess, or the first error reported by a runtime call or
    // kernel launch. Returns cudaErrorNotSupported when CUB_RUNTIME_ENABLED
    // is not defined.
    template <
        typename                    ScanInitKernelT,
        typename                    ReduceByKeyKernelT>
    CUB_RUNTIME_FUNCTION __forceinline__
    static cudaError_t Dispatch(
        void*                       d_temp_storage,
        size_t&                     temp_storage_bytes,
        KeysInputIteratorT          d_keys_in,
        UniqueOutputIteratorT       d_unique_out,
        ValuesInputIteratorT        d_values_in,
        AggregatesOutputIteratorT   d_aggregates_out,
        NumRunsOutputIteratorT      d_num_runs_out,
        EqualityOpT                 equality_op,
        ReductionOpT                reduction_op,
        OffsetT                     num_items,
        cudaStream_t                stream,
        bool                        debug_synchronous,
        int                         /*ptx_version*/,
        ScanInitKernelT             init_kernel,
        ReduceByKeyKernelT          reduce_by_key_kernel,
        KernelConfig                reduce_by_key_config)
    {

#ifndef CUB_RUNTIME_ENABLED
        // Silence unused-parameter warnings in this configuration
        (void)d_temp_storage;
        (void)temp_storage_bytes;
        (void)d_keys_in;
        (void)d_unique_out;
        (void)d_values_in;
        (void)d_aggregates_out;
        (void)d_num_runs_out;
        (void)equality_op;
        (void)reduction_op;
        (void)num_items;
        (void)stream;
        (void)debug_synchronous;
        (void)init_kernel;
        (void)reduce_by_key_kernel;
        (void)reduce_by_key_config;

        // Kernel launch not supported from this device
        return CubDebug(cudaErrorNotSupported);

#else

        cudaError error = cudaSuccess;
        do
        {
            // Get device ordinal
            int device_ordinal;
            if (CubDebug(error = cudaGetDevice(&device_ordinal))) break;

            // Number of input tiles (ceiling of num_items / tile_size).
            // Written as quotient plus remainder test: the usual
            // (num_items + tile_size - 1) form overflows OffsetT when
            // num_items is close to the largest representable value.
            int tile_size = reduce_by_key_config.block_threads * reduce_by_key_config.items_per_thread;
            int num_tiles = static_cast<int>(num_items / tile_size) + (((num_items % tile_size) != 0) ? 1 : 0);

            // Specify temporary storage allocation requirements
            size_t  allocation_sizes[1];
            if (CubDebug(error = ScanTileStateT::AllocationSize(num_tiles, allocation_sizes[0]))) break;    // bytes needed for tile status descriptors

            // Compute allocation pointers into the single storage blob (or compute the necessary size of the blob)
            void* allocations[1];
            if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break;
            if (d_temp_storage == NULL)
            {
                // Return if the caller is simply requesting the size of the storage allocation
                break;
            }

            // Construct the tile status interface
            ScanTileStateT tile_state;
            if (CubDebug(error = tile_state.Init(num_tiles, allocations[0], allocation_sizes[0]))) break;

            // Log init_kernel configuration (at least one block is always launched)
            int init_grid_size = CUB_MAX(1, (num_tiles + INIT_KERNEL_THREADS - 1) / INIT_KERNEL_THREADS);
            if (debug_synchronous) _CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream);

            // Invoke init_kernel to initialize tile descriptors
            init_kernel<<<init_grid_size, INIT_KERNEL_THREADS, 0, stream>>>(
                tile_state,
                num_tiles,
                d_num_runs_out);

            // Check for failure to launch
            if (CubDebug(error = cudaPeekAtLastError())) break;

            // Sync the stream if specified to flush runtime errors
            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;

            // Return if empty problem
            if (num_items == 0)
                break;

            // Get SM occupancy for reduce_by_key_kernel (only reported in the debug log)
            int reduce_by_key_sm_occupancy;
            if (CubDebug(error = MaxSmOccupancy(
                reduce_by_key_sm_occupancy,            // out
                reduce_by_key_kernel,
                reduce_by_key_config.block_threads))) break;

            // Get max x-dimension of grid
            int max_dim_x;
            if (CubDebug(error = cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal))) break;

            // Run grids in epochs (in case number of tiles exceeds max x-dimension)
            int scan_grid_size = CUB_MIN(num_tiles, max_dim_x);
            for (int start_tile = 0; start_tile < num_tiles; start_tile += scan_grid_size)
            {
                // Log reduce_by_key_kernel configuration
                if (debug_synchronous) _CubLog("Invoking %d reduce_by_key_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
                    start_tile, scan_grid_size, reduce_by_key_config.block_threads, (long long) stream, reduce_by_key_config.items_per_thread, reduce_by_key_sm_occupancy);

                // Invoke reduce_by_key_kernel
                reduce_by_key_kernel<<<scan_grid_size, reduce_by_key_config.block_threads, 0, stream>>>(
                    d_keys_in,
                    d_unique_out,
                    d_values_in,
                    d_aggregates_out,
                    d_num_runs_out,
                    tile_state,
                    start_tile,
                    equality_op,
                    reduction_op,
                    num_items);

                // Check for failure to launch (leaves the epoch loop; error is returned below)
                if (CubDebug(error = cudaPeekAtLastError())) break;

                // Sync the stream if specified to flush runtime errors
                if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
            }
        }
        while (0);

        return error;

#endif  // CUB_RUNTIME_ENABLED
    }


    // Public dispatch routine: determines the PTX version, initializes the
    // kernel configuration for it and forwards to the internal Dispatch with
    // DeviceCompactInitKernel and DeviceReduceByKeyKernel as the kernels.
    //
    // Parameters have the same meaning as in the internal Dispatch overload.
    // Returns cudaSuccess or the first error encountered.
    CUB_RUNTIME_FUNCTION __forceinline__
    static cudaError_t Dispatch(
        void*                       d_temp_storage,
        size_t&                     temp_storage_bytes,
        KeysInputIteratorT          d_keys_in,
        UniqueOutputIteratorT       d_unique_out,
        ValuesInputIteratorT        d_values_in,
        AggregatesOutputIteratorT   d_aggregates_out,
        NumRunsOutputIteratorT      d_num_runs_out,
        EqualityOpT                 equality_op,
        ReductionOpT                reduction_op,
        OffsetT                     num_items,
        cudaStream_t                stream,
        bool                        debug_synchronous)
    {
        cudaError error = cudaSuccess;
        do
        {
            // Get PTX version (queried at run time on the host, fixed at compile time on the device)
            int ptx_version;
    #if (CUB_PTX_ARCH == 0)
            if (CubDebug(error = PtxVersion(ptx_version))) break;
    #else
            ptx_version = CUB_PTX_ARCH;
    #endif

            // Get kernel dispatch configurations
            KernelConfig reduce_by_key_config;
            InitConfigs(ptx_version, reduce_by_key_config);

            // Dispatch
            if (CubDebug(error = Dispatch(
                d_temp_storage,
                temp_storage_bytes,
                d_keys_in,
                d_unique_out,
                d_values_in,
                d_aggregates_out,
                d_num_runs_out,
                equality_op,
                reduction_op,
                num_items,
                stream,
                debug_synchronous,
                ptx_version,
                DeviceCompactInitKernel<ScanTileStateT, NumRunsOutputIteratorT>,
                DeviceReduceByKeyKernel<PtxReduceByKeyPolicy, KeysInputIteratorT, UniqueOutputIteratorT, ValuesInputIteratorT, AggregatesOutputIteratorT, NumRunsOutputIteratorT, ScanTileStateT, EqualityOpT, ReductionOpT, OffsetT>,
                reduce_by_key_config))) break;
        }
        while (0);

        return error;
    }
};


}               // CUB namespace

CUB_NS_POSTFIX  // Optional outer namespace(s)


cub::BlockRadixRank
BlockRadixRank provides operations for ranking unsigned integer types within a CUDA thread block.
Definition block_radix_rank.cuh:98

dispatch_scan.cuh

cub::BLOCK_LOAD_DIRECT
@ BLOCK_LOAD_DIRECT
Definition block_load.cuh:485

cub::BLOCK_LOAD_WARP_TRANSPOSE
@ BLOCK_LOAD_WARP_TRANSPOSE
Definition block_load.cuh:541

cub::LOAD_LDG
@ LOAD_LDG
Cache as texture.
Definition thread_load.cuh:69

cub::LOAD_DEFAULT
@ LOAD_DEFAULT
Default (no modifier)
Definition thread_load.cuh:64

_CubLog
#define _CubLog(format,...)
Log macro for printf statements.
Definition util_debug.cuh:112

cub::PtxVersion
CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t PtxVersion(int &ptx_version)
Retrieves the PTX version that will be used on the current device (major * 100 + minor * 10)
Definition util_device.cuh:118

cub::AliasTemporaries
__host__ __device__ __forceinline__ cudaError_t AliasTemporaries(void *d_temp_storage, size_t &temp_storage_bytes, void *(&allocations)[ALLOCATIONS], size_t(&allocation_sizes)[ALLOCATIONS])
Definition util_device.cuh:62

cub::MaxSmOccupancy
CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t MaxSmOccupancy(int &max_sm_occupancy, KernelPtr kernel_ptr, int block_threads, int dynamic_smem_bytes=0)
Computes maximum SM occupancy in thread blocks for executing the given kernel function pointer kernel...
Definition util_device.cuh:244

CubDebug
#define CubDebug(e)
Debug macro.
Definition util_debug.cuh:94

cub::SyncStream
CUB_RUNTIME_FUNCTION static __forceinline__ cudaError_t SyncStream(cudaStream_t stream)
Definition util_device.cuh:199

CUB_MAX
#define CUB_MAX(a, b)
Select maximum(a, b)
Definition util_macro.cuh:61

CUB_MIN
#define CUB_MIN(a, b)
Select minimum(a, b)
Definition util_macro.cuh:66

cub
Optional outer namespace(s)
Definition agent_histogram.cuh:48

cub::d_unique_out
UniqueOutputIteratorT d_unique_out
Pointer to the output sequence of unique keys (one key per run)
Definition dispatch_reduce_by_key.cuh:74

cub::d_num_runs_out
UniqueOutputIteratorT ValuesInputIteratorT AggregatesOutputIteratorT NumRunsOutputIteratorT d_num_runs_out
Pointer to total number of runs encountered (i.e., the length of d_unique_out)
Definition dispatch_reduce_by_key.cuh:77

cub::num_items
KeyT const ValueT ValueT OffsetT OffsetT num_items
[in] Total number of input data items
Definition dispatch_radix_sort.cuh:168

cub::reduction_op
OutputIteratorT OffsetT GridEvenShare< OffsetT > ReductionOpT reduction_op
[in] Binary reduction functor
Definition dispatch_reduce.cuh:75

cub::__launch_bounds__
__launch_bounds__(int(AgentHistogramPolicyT::BLOCK_THREADS)) __global__ void DeviceHistogramSweepKernel(SampleIteratorT d_samples
< Signed integer type for global offsets

cub::start_tile
UniqueOutputIteratorT ValuesInputIteratorT AggregatesOutputIteratorT NumRunsOutputIteratorT ScanTileStateT int start_tile
The starting tile for the current grid.
Definition dispatch_reduce_by_key.cuh:79

cub::d_values_in
KeyT const ValueT * d_values_in
[in] Input values buffer
Definition dispatch_radix_sort.cuh:165

cub::num_tiles
OffsetsOutputIteratorT LengthsOutputIteratorT NumRunsOutputIteratorT ScanTileStateT EqualityOpT OffsetT int num_tiles
[in] Total number of tiles for the entire problem
Definition dispatch_rle.cuh:84

cub::BLOCK_SCAN_RAKING
@ BLOCK_SCAN_RAKING
Definition block_scan.cuh:78

cub::BLOCK_SCAN_WARP_SCANS
@ BLOCK_SCAN_WARP_SCANS
Definition block_scan.cuh:108

cub::OffsetT
OffsetT OffsetT
[in] Total number of input data items
Definition dispatch_radix_sort.cuh:75

cub::tile_state
UniqueOutputIteratorT ValuesInputIteratorT AggregatesOutputIteratorT NumRunsOutputIteratorT ScanTileStateT tile_state
Tile status interface.
Definition dispatch_reduce_by_key.cuh:78

cub::d_aggregates_out
UniqueOutputIteratorT ValuesInputIteratorT AggregatesOutputIteratorT d_aggregates_out
Pointer to the output sequence of value aggregates (one aggregate per run)
Definition dispatch_reduce_by_key.cuh:76

cub::equality_op
UniqueOutputIteratorT ValuesInputIteratorT AggregatesOutputIteratorT NumRunsOutputIteratorT ScanTileStateT int EqualityOpT equality_op
KeyT equality operator.
Definition dispatch_reduce_by_key.cuh:80

cub::AgentReduceByKeyPolicy
Parameterizable tuning policy type for AgentReduceByKey (its last template parameter is the BlockScan algorithm to use)
Definition agent_reduce_by_key.cuh:68

cub::AgentReduceByKey
AgentReduceByKey implements a stateful abstraction of CUDA thread blocks for participating in device-...
Definition agent_reduce_by_key.cuh:99

cub::DispatchReduceByKey::KernelConfig
Definition dispatch_reduce_by_key.cuh:325

cub::DispatchReduceByKey::Policy110
SM11.
Definition dispatch_reduce_by_key.cuh:233

cub::DispatchReduceByKey::Policy130
SM13.
Definition dispatch_reduce_by_key.cuh:216

cub::DispatchReduceByKey::Policy200
SM20.
Definition dispatch_reduce_by_key.cuh:199

cub::DispatchReduceByKey::Policy300
SM30.
Definition dispatch_reduce_by_key.cuh:182

cub::DispatchReduceByKey::Policy350
SM35.
Definition dispatch_reduce_by_key.cuh:165

cub::DispatchReduceByKey::PtxReduceByKeyPolicy
Definition dispatch_reduce_by_key.cuh:271

cub::DispatchReduceByKey
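Utility class for dispatching the appropriately-tuned kernels for device-wide reduce-by-key (its last template parameter, OffsetT, is the signed integer type for global offsets)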
Definition dispatch_reduce_by_key.cuh:127

cub::DispatchReduceByKey::Dispatch
CUB_RUNTIME_FUNCTION static __forceinline__ cudaError_t Dispatch(void *d_temp_storage, size_t &temp_storage_bytes, KeysInputIteratorT d_keys_in, UniqueOutputIteratorT d_unique_out, ValuesInputIteratorT d_values_in, AggregatesOutputIteratorT d_aggregates_out, NumRunsOutputIteratorT d_num_runs_out, EqualityOpT equality_op, ReductionOpT reduction_op, OffsetT num_items, cudaStream_t stream, bool debug_synchronous)
Definition dispatch_reduce_by_key.cuh:497

cub::DispatchReduceByKey::Dispatch
CUB_RUNTIME_FUNCTION static __forceinline__ cudaError_t Dispatch(void *d_temp_storage, size_t &temp_storage_bytes, KeysInputIteratorT d_keys_in, UniqueOutputIteratorT d_unique_out, ValuesInputIteratorT d_values_in, AggregatesOutputIteratorT d_aggregates_out, NumRunsOutputIteratorT d_num_runs_out, EqualityOpT equality_op, ReductionOpT reduction_op, OffsetT num_items, cudaStream_t stream, bool debug_synchronous, int, ScanInitKernelT init_kernel, ReduceByKeyKernelT reduce_by_key_kernel, KernelConfig reduce_by_key_config)
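Internal dispatch routine that takes the kernels and their launch configuration explicitly (ReduceByKeyKernelT is the function type of cub::DeviceReduceByKeyKernel)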
Definition dispatch_reduce_by_key.cuh:353

cub::DispatchReduceByKey::InitConfigs
CUB_RUNTIME_FUNCTION static __forceinline__ void InitConfigs(int ptx_version, KernelConfig &reduce_by_key_config)
Definition dispatch_reduce_by_key.cuh:283

cub::Equals
Type equality test.
Definition util_type.cuh:99

cub::If
Type selection (IF ? ThenType : ElseType)
Definition util_type.cuh:73

cub::ReduceByKeyScanTileState
Definition single_pass_scan_operators.cuh:450

CUB_PTX_ARCH
#define CUB_PTX_ARCH
CUB_PTX_ARCH reflects the PTX version targeted by the active compiler pass (or zero during the host p...
Definition util_arch.cuh:53