doxygen/openfpm/dispatch__reduce_8cuh_source.html

/******************************************************************************

 * Copyright (c) 2011, Duane Merrill.  All rights reserved.

 * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.

 *

 * Redistribution and use in source and binary forms, with or without

 * modification, are permitted provided that the following conditions are met:

 *     * Redistributions of source code must retain the above copyright

 *       notice, this list of conditions and the following disclaimer.

 *     * Redistributions in binary form must reproduce the above copyright

 *       notice, this list of conditions and the following disclaimer in the

 *       documentation and/or other materials provided with the distribution.

 *     * Neither the name of the NVIDIA CORPORATION nor the

 *       names of its contributors may be used to endorse or promote products

 *       derived from this software without specific prior written permission.

 *

 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND

 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED

 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE

 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY

 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES

 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;

 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND

 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT

 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS

 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 *

 ******************************************************************************/


#pragma once


#include <stdio.h>

#include <iterator>


#include "../../agent/agent_reduce.cuh"

#include "../../iterator/arg_index_input_iterator.cuh"

#include "../../thread/thread_operators.cuh"

#include "../../grid/grid_even_share.cuh"

#include "../../iterator/arg_index_input_iterator.cuh"

#include "../../util_debug.cuh"

#include "../../util_device.cuh"

#include "../../util_namespace.cuh"


CUB_NS_PREFIX


namespace cub {


/******************************************************************************

 * Kernel entry points

 *****************************************************************************/


template <

    typename                ChainedPolicyT,

    typename                InputIteratorT,

    typename                OutputIteratorT,

    typename                OffsetT,

    typename                ReductionOpT>

__launch_bounds__ (int(ChainedPolicyT::ActivePolicy::ReducePolicy::BLOCK_THREADS))

__global__ void DeviceReduceKernel(

    InputIteratorT          d_in,

    OutputIteratorT         d_out,

    OffsetT                 num_items,

    GridEvenShare<OffsetT>  even_share,

    ReductionOpT            reduction_op)

{

    // The output value type

    typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),  // OutputT =  (if output iterator's value type is void) ?

        typename std::iterator_traits<InputIteratorT>::value_type,                                          // ... then the input iterator's value type,

        typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT;                          // ... else the output iterator's value type


    // Thread block type for reducing input tiles

    typedef AgentReduce<

            typename ChainedPolicyT::ActivePolicy::ReducePolicy,

            InputIteratorT,

            OutputIteratorT,

            OffsetT,

            ReductionOpT>

        AgentReduceT;


    // Shared memory storage

    __shared__ typename AgentReduceT::TempStorage temp_storage;


    // Consume input tiles

    OutputT block_aggregate = AgentReduceT(temp_storage, d_in, reduction_op).ConsumeTiles(even_share);


    // Output result

    if (threadIdx.x == 0)

        d_out[blockIdx.x] = block_aggregate;

}


template <

    typename                ChainedPolicyT,

    typename                InputIteratorT,

    typename                OutputIteratorT,

    typename                OffsetT,

    typename                ReductionOpT,

    typename                OuputT>

__launch_bounds__ (int(ChainedPolicyT::ActivePolicy::SingleTilePolicy::BLOCK_THREADS), 1)

__global__ void DeviceReduceSingleTileKernel(

    InputIteratorT          d_in,

    OutputIteratorT         d_out,

    OffsetT                 num_items,

    ReductionOpT            reduction_op,

    OuputT                  init)

{

    // Thread block type for reducing input tiles

    typedef AgentReduce<

            typename ChainedPolicyT::ActivePolicy::SingleTilePolicy,

            InputIteratorT,

            OutputIteratorT,

            OffsetT,

            ReductionOpT>

        AgentReduceT;


    // Shared memory storage

    __shared__ typename AgentReduceT::TempStorage temp_storage;


    // Check if empty problem

    if (num_items == 0)

    {

        if (threadIdx.x == 0)

            *d_out = init;

        return;

    }


    // Consume input tiles

    OuputT block_aggregate = AgentReduceT(temp_storage, d_in, reduction_op).ConsumeRange(

        OffsetT(0),

        num_items);


    // Output result

    if (threadIdx.x == 0)

        *d_out = reduction_op(init, block_aggregate);

}


template <typename T, typename OffsetT, typename IteratorT>

__device__ __forceinline__

void NormalizeReductionOutput(

    T &/*val*/,

    OffsetT /*base_offset*/,

    IteratorT /*itr*/)

{}


template <typename KeyValuePairT, typename OffsetT, typename WrappedIteratorT, typename OutputValueT>

__device__ __forceinline__

void NormalizeReductionOutput(

    KeyValuePairT &val,

    OffsetT base_offset,

    ArgIndexInputIterator<WrappedIteratorT, OffsetT, OutputValueT> /*itr*/)

{

    val.key -= base_offset;

}


template <

    typename                ChainedPolicyT,

    typename                InputIteratorT,

    typename                OutputIteratorT,

    typename                OffsetIteratorT,

    typename                OffsetT,

    typename                ReductionOpT,

    typename                OutputT>

__launch_bounds__ (int(ChainedPolicyT::ActivePolicy::ReducePolicy::BLOCK_THREADS))

__global__ void DeviceSegmentedReduceKernel(

    InputIteratorT          d_in,

    OutputIteratorT         d_out,

    OffsetIteratorT         d_begin_offsets,

    OffsetIteratorT         d_end_offsets,

    int                     /*num_segments*/,

    ReductionOpT            reduction_op,

    OutputT                 init)

{

    // Thread block type for reducing input tiles

    typedef AgentReduce<

            typename ChainedPolicyT::ActivePolicy::ReducePolicy,

            InputIteratorT,

            OutputIteratorT,

            OffsetT,

            ReductionOpT>

        AgentReduceT;


    // Shared memory storage

    __shared__ typename AgentReduceT::TempStorage temp_storage;


    OffsetT segment_begin   = d_begin_offsets[blockIdx.x];

    OffsetT segment_end     = d_end_offsets[blockIdx.x];


    // Check if empty problem

    if (segment_begin == segment_end)

    {

        if (threadIdx.x == 0)

            d_out[blockIdx.x] = init;

        return;

    }


    // Consume input tiles

    OutputT block_aggregate = AgentReduceT(temp_storage, d_in, reduction_op).ConsumeRange(

        segment_begin,

        segment_end);


    // Normalize as needed

    NormalizeReductionOutput(block_aggregate, segment_begin, d_in);


    if (threadIdx.x == 0)

        d_out[blockIdx.x] = reduction_op(init, block_aggregate);;

}


/******************************************************************************

 * Policy

 ******************************************************************************/


template <

    typename OuputT,

    typename OffsetT,

    typename ReductionOpT>

struct DeviceReducePolicy

{

    //------------------------------------------------------------------------------

    // Architecture-specific tuning policies

    //------------------------------------------------------------------------------


    struct Policy130 : ChainedPolicy<130, Policy130, Policy130>

    {

        // ReducePolicy

        typedef AgentReducePolicy<

                CUB_SCALED_GRANULARITIES(128, 8, OuputT),

                2,

                BLOCK_REDUCE_RAKING,

                LOAD_DEFAULT>

            ReducePolicy;


        // SingleTilePolicy

        typedef ReducePolicy SingleTilePolicy;


        // SegmentedReducePolicy

        typedef ReducePolicy SegmentedReducePolicy;

    };


    struct Policy200 : ChainedPolicy<200, Policy200, Policy130>

    {

        // ReducePolicy (GTX 580: 178.9 GB/s @ 48M 4B items, 158.1 GB/s @ 192M 1B items)

        typedef AgentReducePolicy<

                CUB_SCALED_GRANULARITIES(128, 8, OuputT),

                4,

                BLOCK_REDUCE_RAKING,

                LOAD_DEFAULT>

            ReducePolicy;


        // SingleTilePolicy

        typedef ReducePolicy SingleTilePolicy;


        // SegmentedReducePolicy

        typedef ReducePolicy SegmentedReducePolicy;

    };


    struct Policy300 : ChainedPolicy<300, Policy300, Policy200>

    {

        // ReducePolicy (GTX670: 154.0 @ 48M 4B items)

        typedef AgentReducePolicy<

                CUB_SCALED_GRANULARITIES(256, 20, OuputT),

                2,

                BLOCK_REDUCE_WARP_REDUCTIONS,

                LOAD_DEFAULT>

            ReducePolicy;


        // SingleTilePolicy

        typedef ReducePolicy SingleTilePolicy;


        // SegmentedReducePolicy

        typedef ReducePolicy SegmentedReducePolicy;

    };


    struct Policy350 : ChainedPolicy<350, Policy350, Policy300>

    {

        // ReducePolicy (GTX Titan: 255.1 GB/s @ 48M 4B items; 228.7 GB/s @ 192M 1B items)

        typedef AgentReducePolicy<

                CUB_SCALED_GRANULARITIES(256, 20, OuputT),

                4,

                BLOCK_REDUCE_WARP_REDUCTIONS,

                LOAD_LDG>

            ReducePolicy;


        // SingleTilePolicy

        typedef ReducePolicy SingleTilePolicy;


        // SegmentedReducePolicy

        typedef ReducePolicy SegmentedReducePolicy;

    };


    struct Policy600 : ChainedPolicy<600, Policy600, Policy350>

    {

        // ReducePolicy (P100: 591 GB/s @ 64M 4B items; 583 GB/s @ 256M 1B items)

        typedef AgentReducePolicy<

                CUB_SCALED_GRANULARITIES(256, 16, OuputT),

                4,

                BLOCK_REDUCE_WARP_REDUCTIONS,

                LOAD_LDG>

            ReducePolicy;


        // SingleTilePolicy

        typedef ReducePolicy SingleTilePolicy;


        // SegmentedReducePolicy

        typedef ReducePolicy SegmentedReducePolicy;

    };


    typedef Policy600 MaxPolicy;


};


/******************************************************************************

 * Single-problem dispatch

 ******************************************************************************/


template <

    typename InputIteratorT,

    typename OutputIteratorT,

    typename OffsetT,

    typename ReductionOpT>

struct DispatchReduce :

    DeviceReducePolicy<

        typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),  // OutputT =  (if output iterator's value type is void) ?

            typename std::iterator_traits<InputIteratorT>::value_type,                                  // ... then the input iterator's value type,

            typename std::iterator_traits<OutputIteratorT>::value_type>::Type,                          // ... else the output iterator's value type

        OffsetT,

        ReductionOpT>

{

    //------------------------------------------------------------------------------

    // Constants

    //------------------------------------------------------------------------------


    // Data type of output iterator

    typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),  // OutputT =  (if output iterator's value type is void) ?

        typename std::iterator_traits<InputIteratorT>::value_type,                                          // ... then the input iterator's value type,

        typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT;                          // ... else the output iterator's value type


    //------------------------------------------------------------------------------

    // Problem state

    //------------------------------------------------------------------------------


    void                *d_temp_storage;

    size_t              &temp_storage_bytes;

    InputIteratorT      d_in;

    OutputIteratorT     d_out;

    OffsetT             num_items;

    ReductionOpT        reduction_op;

    OutputT             init;

    cudaStream_t        stream;

    bool                debug_synchronous;

    int                 ptx_version;


    //------------------------------------------------------------------------------

    // Constructor

    //------------------------------------------------------------------------------


    CUB_RUNTIME_FUNCTION __forceinline__

    DispatchReduce(

        void*                   d_temp_storage,

        size_t                  &temp_storage_bytes,

        InputIteratorT          d_in,

        OutputIteratorT         d_out,

        OffsetT                 num_items,

        ReductionOpT            reduction_op,

        OutputT                 init,

        cudaStream_t            stream,

        bool                    debug_synchronous,

        int                     ptx_version)

    :

        d_temp_storage(d_temp_storage),

        temp_storage_bytes(temp_storage_bytes),

        d_in(d_in),

        d_out(d_out),

        num_items(num_items),

        reduction_op(reduction_op),

        init(init),

        stream(stream),

        debug_synchronous(debug_synchronous),

        ptx_version(ptx_version)

    {}


    //------------------------------------------------------------------------------

    // Small-problem (single tile) invocation

    //------------------------------------------------------------------------------


    template <

        typename                ActivePolicyT,

        typename                SingleTileKernelT>

    CUB_RUNTIME_FUNCTION __forceinline__

    cudaError_t InvokeSingleTile(

        SingleTileKernelT       single_tile_kernel)

    {

#ifndef CUB_RUNTIME_ENABLED

        (void)single_tile_kernel;


        // Kernel launch not supported from this device

        return CubDebug(cudaErrorNotSupported );

#else

        cudaError error = cudaSuccess;

        do

        {

            // Return if the caller is simply requesting the size of the storage allocation

            if (d_temp_storage == NULL)

            {

                temp_storage_bytes = 1;

                break;

            }


            // Log single_reduce_sweep_kernel configuration

            if (debug_synchronous) _CubLog("Invoking DeviceReduceSingleTileKernel<<<1, %d, 0, %lld>>>(), %d items per thread\n",

                ActivePolicyT::SingleTilePolicy::BLOCK_THREADS,

                (long long) stream,

                ActivePolicyT::SingleTilePolicy::ITEMS_PER_THREAD);


            // Invoke single_reduce_sweep_kernel

            single_tile_kernel<<<1, ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, 0, stream>>>(

                d_in,

                d_out,

                num_items,

                reduction_op,

                init);


            // Check for failure to launch

            if (CubDebug(error = cudaPeekAtLastError())) break;


            // Sync the stream if specified to flush runtime errors

            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;

        }

        while (0);


        return error;


#endif // CUB_RUNTIME_ENABLED

    }


    //------------------------------------------------------------------------------

    // Normal problem size invocation (two-pass)

    //------------------------------------------------------------------------------


    template <

        typename                ActivePolicyT,

        typename                ReduceKernelT,

        typename                SingleTileKernelT>

    CUB_RUNTIME_FUNCTION __forceinline__

    cudaError_t InvokePasses(

        ReduceKernelT           reduce_kernel,

        SingleTileKernelT       single_tile_kernel)

    {

#ifndef CUB_RUNTIME_ENABLED

        (void)                  reduce_kernel;

        (void)                  single_tile_kernel;


        // Kernel launch not supported from this device

        return CubDebug(cudaErrorNotSupported );

#else


        cudaError error = cudaSuccess;

        do

        {

            // Get device ordinal

            int device_ordinal;

            if (CubDebug(error = cudaGetDevice(&device_ordinal))) break;


            // Get SM count

            int sm_count;

            if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break;


            // Init regular kernel configuration

            KernelConfig reduce_config;

            if (CubDebug(error = reduce_config.Init<typename ActivePolicyT::ReducePolicy>(reduce_kernel))) break;

            int reduce_device_occupancy = reduce_config.sm_occupancy * sm_count;


            // Even-share work distribution

            int max_blocks = reduce_device_occupancy * CUB_SUBSCRIPTION_FACTOR(ptx_version);

            GridEvenShare<OffsetT> even_share;

            even_share.DispatchInit(num_items, max_blocks, reduce_config.tile_size);


            // Temporary storage allocation requirements

            void* allocations[1];

            size_t allocation_sizes[1] =

            {

                max_blocks * sizeof(OutputT)    // bytes needed for privatized block reductions

            };


            // Alias the temporary allocations from the single storage blob (or compute the necessary size of the blob)

            if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break;

            if (d_temp_storage == NULL)

            {

                // Return if the caller is simply requesting the size of the storage allocation

                return cudaSuccess;

            }


            // Alias the allocation for the privatized per-block reductions

            OutputT *d_block_reductions = (OutputT*) allocations[0];


            // Get grid size for device_reduce_sweep_kernel

            int reduce_grid_size = even_share.grid_size;


            // Log device_reduce_sweep_kernel configuration

            if (debug_synchronous) _CubLog("Invoking DeviceReduceKernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",

                reduce_grid_size,

                ActivePolicyT::ReducePolicy::BLOCK_THREADS,

                (long long) stream,

                ActivePolicyT::ReducePolicy::ITEMS_PER_THREAD,

                reduce_config.sm_occupancy);


            // Invoke DeviceReduceKernel

            reduce_kernel<<<reduce_grid_size, ActivePolicyT::ReducePolicy::BLOCK_THREADS, 0, stream>>>(

                d_in,

                d_block_reductions,

                num_items,

                even_share,

                reduction_op);


            // Check for failure to launch

            if (CubDebug(error = cudaPeekAtLastError())) break;


            // Sync the stream if specified to flush runtime errors

            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;


            // Log single_reduce_sweep_kernel configuration

            if (debug_synchronous) _CubLog("Invoking DeviceReduceSingleTileKernel<<<1, %d, 0, %lld>>>(), %d items per thread\n",

                ActivePolicyT::SingleTilePolicy::BLOCK_THREADS,

                (long long) stream,

                ActivePolicyT::SingleTilePolicy::ITEMS_PER_THREAD);


            // Invoke DeviceReduceSingleTileKernel

            single_tile_kernel<<<1, ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, 0, stream>>>(

                d_block_reductions,

                d_out,

                reduce_grid_size,

                reduction_op,

                init);


            // Check for failure to launch

            if (CubDebug(error = cudaPeekAtLastError())) break;


            // Sync the stream if specified to flush runtime errors

            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;

        }

        while (0);


        return error;


#endif // CUB_RUNTIME_ENABLED


    }


    //------------------------------------------------------------------------------

    // Chained policy invocation

    //------------------------------------------------------------------------------


    template <typename ActivePolicyT>

    CUB_RUNTIME_FUNCTION __forceinline__

    cudaError_t Invoke()

    {

        typedef typename ActivePolicyT::SingleTilePolicy    SingleTilePolicyT;

        typedef typename DispatchReduce::MaxPolicy          MaxPolicyT;


        // Force kernel code-generation in all compiler passes

        if (num_items <= (SingleTilePolicyT::BLOCK_THREADS * SingleTilePolicyT::ITEMS_PER_THREAD))

        {

            // Small, single tile size

            return InvokeSingleTile<ActivePolicyT>(

                DeviceReduceSingleTileKernel<MaxPolicyT, InputIteratorT, OutputIteratorT, OffsetT, ReductionOpT, OutputT>);

        }

        else

        {

            // Regular size

            return InvokePasses<ActivePolicyT>(

                DeviceReduceKernel<typename DispatchReduce::MaxPolicy, InputIteratorT, OutputT*, OffsetT, ReductionOpT>,

                DeviceReduceSingleTileKernel<MaxPolicyT, OutputT*, OutputIteratorT, OffsetT, ReductionOpT, OutputT>);

        }

    }


    //------------------------------------------------------------------------------

    // Dispatch entrypoints

    //------------------------------------------------------------------------------


    CUB_RUNTIME_FUNCTION __forceinline__

    static cudaError_t Dispatch(

        void            *d_temp_storage,

        size_t          &temp_storage_bytes,

        InputIteratorT  d_in,

        OutputIteratorT d_out,

        OffsetT         num_items,

        ReductionOpT    reduction_op,

        OutputT         init,

        cudaStream_t    stream,

        bool            debug_synchronous)

    {

        typedef typename DispatchReduce::MaxPolicy MaxPolicyT;


        cudaError error = cudaSuccess;

        do

        {

            // Get PTX version

            int ptx_version;

            if (CubDebug(error = PtxVersion(ptx_version))) break;


            // Create dispatch functor

            DispatchReduce dispatch(

                d_temp_storage, temp_storage_bytes,

                d_in, d_out, num_items, reduction_op, init,

                stream, debug_synchronous, ptx_version);


            // Dispatch to chained policy

            if (CubDebug(error = MaxPolicyT::Invoke(ptx_version, dispatch))) break;

        }

        while (0);


        return error;

    }

};


/******************************************************************************

 * Segmented dispatch

 ******************************************************************************/


template <

    typename InputIteratorT,

    typename OutputIteratorT,

    typename OffsetIteratorT,

    typename OffsetT,

    typename ReductionOpT>

struct DispatchSegmentedReduce :

    DeviceReducePolicy<

        typename std::iterator_traits<InputIteratorT>::value_type,

        OffsetT,

        ReductionOpT>

{

    //------------------------------------------------------------------------------

    // Constants

    //------------------------------------------------------------------------------


    typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),  // OutputT =  (if output iterator's value type is void) ?

        typename std::iterator_traits<InputIteratorT>::value_type,                                          // ... then the input iterator's value type,

        typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT;                          // ... else the output iterator's value type


    //------------------------------------------------------------------------------

    // Problem state

    //------------------------------------------------------------------------------


    void                *d_temp_storage;

    size_t              &temp_storage_bytes;

    InputIteratorT      d_in;

    OutputIteratorT     d_out;

    OffsetT             num_segments;

    OffsetIteratorT     d_begin_offsets;

    OffsetIteratorT     d_end_offsets;

    ReductionOpT        reduction_op;

    OutputT             init;

    cudaStream_t        stream;

    bool                debug_synchronous;

    int                 ptx_version;


    //------------------------------------------------------------------------------

    // Constructor

    //------------------------------------------------------------------------------


    CUB_RUNTIME_FUNCTION __forceinline__

    DispatchSegmentedReduce(

        void*                   d_temp_storage,

        size_t                  &temp_storage_bytes,

        InputIteratorT          d_in,

        OutputIteratorT         d_out,

        OffsetT                 num_segments,

        OffsetIteratorT         d_begin_offsets,

        OffsetIteratorT         d_end_offsets,

        ReductionOpT            reduction_op,

        OutputT                 init,

        cudaStream_t            stream,

        bool                    debug_synchronous,

        int                     ptx_version)

    :

        d_temp_storage(d_temp_storage),

        temp_storage_bytes(temp_storage_bytes),

        d_in(d_in),

        d_out(d_out),

        num_segments(num_segments),

        d_begin_offsets(d_begin_offsets),

        d_end_offsets(d_end_offsets),

        reduction_op(reduction_op),

        init(init),

        stream(stream),

        debug_synchronous(debug_synchronous),

        ptx_version(ptx_version)

    {}


    //------------------------------------------------------------------------------

    // Chained policy invocation

    //------------------------------------------------------------------------------


    template <

        typename                        ActivePolicyT,

        typename                        DeviceSegmentedReduceKernelT>

    CUB_RUNTIME_FUNCTION __forceinline__

    cudaError_t InvokePasses(

        DeviceSegmentedReduceKernelT    segmented_reduce_kernel)

    {

#ifndef CUB_RUNTIME_ENABLED

        (void)segmented_reduce_kernel;

        // Kernel launch not supported from this device

        return CubDebug(cudaErrorNotSupported );

#else

        cudaError error = cudaSuccess;

        do

        {

            // Return if the caller is simply requesting the size of the storage allocation

            if (d_temp_storage == NULL)

            {

                temp_storage_bytes = 1;

                return cudaSuccess;

            }


            // Init kernel configuration

            KernelConfig segmented_reduce_config;

            if (CubDebug(error = segmented_reduce_config.Init<typename ActivePolicyT::SegmentedReducePolicy>(segmented_reduce_kernel))) break;


            // Log device_reduce_sweep_kernel configuration

            if (debug_synchronous) _CubLog("Invoking SegmentedDeviceReduceKernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",

                num_segments,

                ActivePolicyT::SegmentedReducePolicy::BLOCK_THREADS,

                (long long) stream,

                ActivePolicyT::SegmentedReducePolicy::ITEMS_PER_THREAD,

                segmented_reduce_config.sm_occupancy);


            // Invoke DeviceReduceKernel

            segmented_reduce_kernel<<<num_segments, ActivePolicyT::SegmentedReducePolicy::BLOCK_THREADS, 0, stream>>>(

                d_in,

                d_out,

                d_begin_offsets,

                d_end_offsets,

                num_segments,

                reduction_op,

                init);


            // Check for failure to launch

            if (CubDebug(error = cudaPeekAtLastError())) break;


            // Sync the stream if specified to flush runtime errors

            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;

        }

        while (0);


        return error;


#endif // CUB_RUNTIME_ENABLED


    }


    template <typename ActivePolicyT>

    CUB_RUNTIME_FUNCTION __forceinline__

    cudaError_t Invoke()

    {

        typedef typename DispatchSegmentedReduce::MaxPolicy MaxPolicyT;


        // Force kernel code-generation in all compiler passes

        return InvokePasses<ActivePolicyT>(

            DeviceSegmentedReduceKernel<MaxPolicyT, InputIteratorT, OutputIteratorT, OffsetIteratorT, OffsetT, ReductionOpT, OutputT>);

    }


    //------------------------------------------------------------------------------

    // Dispatch entrypoints

    //------------------------------------------------------------------------------


    CUB_RUNTIME_FUNCTION __forceinline__

    static cudaError_t Dispatch(

        void            *d_temp_storage,

        size_t          &temp_storage_bytes,

        InputIteratorT  d_in,

        OutputIteratorT d_out,

        int             num_segments,

        OffsetIteratorT d_begin_offsets,

        OffsetIteratorT d_end_offsets,

        ReductionOpT    reduction_op,

        OutputT         init,

        cudaStream_t    stream,

        bool            debug_synchronous)

    {

        typedef typename DispatchSegmentedReduce::MaxPolicy MaxPolicyT;


        if (num_segments <= 0)

            return cudaSuccess;


        cudaError error = cudaSuccess;

        do

        {

            // Get PTX version

            int ptx_version;

            if (CubDebug(error = PtxVersion(ptx_version))) break;


            // Create dispatch functor

            DispatchSegmentedReduce dispatch(

                d_temp_storage, temp_storage_bytes,

                d_in, d_out,

                num_segments, d_begin_offsets, d_end_offsets,

                reduction_op, init,

                stream, debug_synchronous, ptx_version);


            // Dispatch to chained policy

            if (CubDebug(error = MaxPolicyT::Invoke(ptx_version, dispatch))) break;

        }

        while (0);


        return error;

    }

};


}               // CUB namespace

CUB_NS_POSTFIX  // Optional outer namespace(s)


cub::ArgIndexInputIterator
A random-access input wrapper for pairing dereferenced values with their corresponding indices (formi...
Definition arg_index_input_iterator.cuh:114

cub::LOAD_LDG
@ LOAD_LDG
Cache as texture.
Definition thread_load.cuh:69

cub::LOAD_DEFAULT
@ LOAD_DEFAULT
Default (no modifier)
Definition thread_load.cuh:64

_CubLog
#define _CubLog(format,...)
Log macro for printf statements.
Definition util_debug.cuh:112

cub::PtxVersion
CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t PtxVersion(int &ptx_version)
Retrieves the PTX version that will be used on the current device (major * 100 + minor * 10)
Definition util_device.cuh:118

cub::AliasTemporaries
__host__ __device__ __forceinline__ cudaError_t AliasTemporaries(void *d_temp_storage, size_t &temp_storage_bytes, void *(&allocations)[ALLOCATIONS], size_t(&allocation_sizes)[ALLOCATIONS])
Definition util_device.cuh:62

CubDebug
#define CubDebug(e)
Debug macro.
Definition util_debug.cuh:94

cub::SyncStream
CUB_RUNTIME_FUNCTION static __forceinline__ cudaError_t SyncStream(cudaStream_t stream)
Definition util_device.cuh:199

cub
Optional outer namespace(s)
Definition agent_histogram.cuh:48

cub::NormalizeReductionOutput
__device__ __forceinline__ void NormalizeReductionOutput(T &, OffsetT, IteratorT)
Normalize input iterator to segment offset.
Definition dispatch_reduce.cuh:154

cub::init
OutputIteratorT OffsetT ReductionOpT OuputT init
< [in] The initial value of the reduction
Definition dispatch_reduce.cuh:119

cub::num_items
KeyT const ValueT ValueT OffsetT OffsetT num_items
[in] Total number of input data items
Definition dispatch_radix_sort.cuh:168

cub::reduction_op
OutputIteratorT OffsetT GridEvenShare< OffsetT > ReductionOpT reduction_op
< [in] Binary reduction functor
Definition dispatch_reduce.cuh:75

cub::__launch_bounds__
__launch_bounds__(int(AgentHistogramPolicyT::BLOCK_THREADS)) __global__ void DeviceHistogramSweepKernel(SampleIteratorT d_samples
< Signed integer type for global offsets

cub::d_begin_offsets
KeyT const ValueT ValueT OffsetIteratorT d_begin_offsets
[in] Pointer to the sequence of beginning offsets of length num_segments, such that d_begin_offsets[i...
Definition dispatch_radix_sort.cuh:334

cub::even_share
OffsetT int int GridEvenShare< OffsetT > even_share
< [in] Even-share descriptor for mapan equal number of tiles onto each thread block
Definition dispatch_radix_sort.cuh:79

cub::OffsetT
OffsetT OffsetT
[in] Total number of input data items
Definition dispatch_radix_sort.cuh:75

cub::d_end_offsets
KeyT const ValueT ValueT OffsetIteratorT OffsetIteratorT d_end_offsets
[in] Pointer to the sequence of ending offsets of length num_segments, such that d_end_offsets[i]-1 i...
Definition dispatch_radix_sort.cuh:335

cub::BLOCK_REDUCE_WARP_REDUCTIONS
@ BLOCK_REDUCE_WARP_REDUCTIONS
Definition block_reduce.cuh:148

cub::BLOCK_REDUCE_RAKING
@ BLOCK_REDUCE_RAKING
Definition block_reduce.cuh:119

cub::d_out
OutputIteratorT d_out
< [in] Pointer to the input sequence of data items
Definition dispatch_reduce.cuh:71

cub::AgentReducePolicy
< Cache load modifier for reading input elements
Definition agent_reduce.cuh:68

cub::AgentReduce::TempStorage
Alias wrapper allowing storage to be unioned.
Definition agent_reduce.cuh:151

cub::AgentReduce
AgentReduce implements a stateful abstraction of CUDA thread blocks for participating in device-wide ...
Definition agent_reduce.cuh:100

cub::AgentReduce::ConsumeRange
__device__ __forceinline__ OutputT ConsumeRange(GridEvenShare< OffsetT > &even_share, Int2Type< CAN_VECTORIZE > can_vectorize)
Reduce a contiguous segment of input tiles.
Definition agent_reduce.cuh:312

cub::AgentReduce::ConsumeTiles
__device__ __forceinline__ OutputT ConsumeTiles(GridEvenShare< OffsetT > &even_share)
Definition agent_reduce.cuh:368

cub::ChainedPolicy
Helper for dispatching into a policy chain.
Definition util_device.cuh:305

cub::DeviceReducePolicy::Policy130
SM13.
Definition dispatch_reduce.cuh:248

cub::DeviceReducePolicy::Policy130::ReducePolicy
AgentReducePolicy< CUB_SCALED_GRANULARITIES(128, 8, OuputT), 2, BLOCK_REDUCE_RAKING, LOAD_DEFAULT > ReducePolicy
< Cache load modifier
Definition dispatch_reduce.cuh:255

cub::DeviceReducePolicy::Policy200
SM20.
Definition dispatch_reduce.cuh:267

cub::DeviceReducePolicy::Policy200::ReducePolicy
AgentReducePolicy< CUB_SCALED_GRANULARITIES(128, 8, OuputT), 4, BLOCK_REDUCE_RAKING, LOAD_DEFAULT > ReducePolicy
< Cache load modifier
Definition dispatch_reduce.cuh:274

cub::DeviceReducePolicy::Policy300
SM30.
Definition dispatch_reduce.cuh:286

cub::DeviceReducePolicy::Policy300::ReducePolicy
AgentReducePolicy< CUB_SCALED_GRANULARITIES(256, 20, OuputT), 2, BLOCK_REDUCE_WARP_REDUCTIONS, LOAD_DEFAULT > ReducePolicy
< Cache load modifier
Definition dispatch_reduce.cuh:293

cub::DeviceReducePolicy::Policy350
SM35.
Definition dispatch_reduce.cuh:305

cub::DeviceReducePolicy::Policy350::ReducePolicy
AgentReducePolicy< CUB_SCALED_GRANULARITIES(256, 20, OuputT), 4, BLOCK_REDUCE_WARP_REDUCTIONS, LOAD_LDG > ReducePolicy
< Cache load modifier
Definition dispatch_reduce.cuh:312

cub::DeviceReducePolicy::Policy600
SM60.
Definition dispatch_reduce.cuh:323

cub::DeviceReducePolicy::Policy600::ReducePolicy
AgentReducePolicy< CUB_SCALED_GRANULARITIES(256, 16, OuputT), 4, BLOCK_REDUCE_WARP_REDUCTIONS, LOAD_LDG > ReducePolicy
< Cache load modifier
Definition dispatch_reduce.cuh:330

cub::DeviceReducePolicy
< Binary reduction functor type having member T operator()(const T &a, const T &b)
Definition dispatch_reduce.cuh:241

cub::DeviceReducePolicy::MaxPolicy
Policy600 MaxPolicy
MaxPolicy.
Definition dispatch_reduce.cuh:341

cub::DispatchReduce
< Binary reduction functor type having member T operator()(const T &a, const T &b)
Definition dispatch_reduce.cuh:366

cub::DispatchReduce::temp_storage_bytes
size_t & temp_storage_bytes
[in,out] Reference to size in bytes of d_temp_storage allocation
Definition dispatch_reduce.cuh:382

cub::DispatchReduce::d_in
InputIteratorT d_in
[in] Pointer to the input sequence of data items
Definition dispatch_reduce.cuh:383

cub::DispatchReduce::debug_synchronous
bool debug_synchronous
[in] Whether or not to synchronize the stream after every kernel launch to check for errors....
Definition dispatch_reduce.cuh:389

cub::DispatchReduce::d_out
OutputIteratorT d_out
[out] Pointer to the output aggregate
Definition dispatch_reduce.cuh:384

cub::DispatchReduce::InvokeSingleTile
CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t InvokeSingleTile(SingleTileKernelT single_tile_kernel)
Invoke a single block block to reduce in-core.
Definition dispatch_reduce.cuh:432

cub::DispatchReduce::num_items
OffsetT num_items
[in] Total number of input items (i.e., length of d_in)
Definition dispatch_reduce.cuh:385

cub::DispatchReduce::ptx_version
int ptx_version
[in] PTX version
Definition dispatch_reduce.cuh:390

cub::DispatchReduce::Dispatch
CUB_RUNTIME_FUNCTION static __forceinline__ cudaError_t Dispatch(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, OffsetT num_items, ReductionOpT reduction_op, OutputT init, cudaStream_t stream, bool debug_synchronous)
Definition dispatch_reduce.cuh:631

cub::DispatchReduce::init
OutputT init
[in] The initial value of the reduction
Definition dispatch_reduce.cuh:387

cub::DispatchReduce::DispatchReduce
CUB_RUNTIME_FUNCTION __forceinline__ DispatchReduce(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, OffsetT num_items, ReductionOpT reduction_op, OutputT init, cudaStream_t stream, bool debug_synchronous, int ptx_version)
Constructor.
Definition dispatch_reduce.cuh:398

cub::DispatchReduce::stream
cudaStream_t stream
[in] CUDA stream to launch kernels within. Default is stream0.
Definition dispatch_reduce.cuh:388

cub::DispatchReduce::Invoke
CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t Invoke()
Invocation.
Definition dispatch_reduce.cuh:601

cub::DispatchReduce::InvokePasses
CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t InvokePasses(ReduceKernelT reduce_kernel, SingleTileKernelT single_tile_kernel)
Invoke two-passes to reduce.
Definition dispatch_reduce.cuh:489

cub::DispatchReduce::d_temp_storage
void * d_temp_storage
[in] Device-accessible allocation of temporary storage. When NULL, the required allocation size is wr...
Definition dispatch_reduce.cuh:381

cub::DispatchReduce::reduction_op
ReductionOpT reduction_op
[in] Binary reduction functor
Definition dispatch_reduce.cuh:386

cub::DispatchSegmentedReduce
< Binary reduction functor type having member T operator()(const T &a, const T &b)
Definition dispatch_reduce.cuh:686

cub::DispatchSegmentedReduce::temp_storage_bytes
size_t & temp_storage_bytes
[in,out] Reference to size in bytes of d_temp_storage allocation
Definition dispatch_reduce.cuh:702

cub::DispatchSegmentedReduce::d_out
OutputIteratorT d_out
[out] Pointer to the output aggregate
Definition dispatch_reduce.cuh:704

cub::DispatchSegmentedReduce::d_begin_offsets
OffsetIteratorT d_begin_offsets
[in] Pointer to the sequence of beginning offsets of length num_segments, such that d_begin_offsets[i...
Definition dispatch_reduce.cuh:706

cub::DispatchSegmentedReduce::ptx_version
int ptx_version
[in] PTX version
Definition dispatch_reduce.cuh:712

cub::DispatchSegmentedReduce::d_temp_storage
void * d_temp_storage
[in] Device-accessible allocation of temporary storage. When NULL, the required allocation size is wr...
Definition dispatch_reduce.cuh:701

cub::DispatchSegmentedReduce::init
OutputT init
[in] The initial value of the reduction
Definition dispatch_reduce.cuh:709

cub::DispatchSegmentedReduce::InvokePasses
CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t InvokePasses(DeviceSegmentedReduceKernelT segmented_reduce_kernel)
Invocation.
Definition dispatch_reduce.cuh:759

cub::DispatchSegmentedReduce::reduction_op
ReductionOpT reduction_op
[in] Binary reduction functor
Definition dispatch_reduce.cuh:708

cub::DispatchSegmentedReduce::d_in
InputIteratorT d_in
[in] Pointer to the input sequence of data items
Definition dispatch_reduce.cuh:703

cub::DispatchSegmentedReduce::OutputT
If<(Equals< typenamestd::iterator_traits< OutputIteratorT >::value_type, void >::VALUE), typenamestd::iterator_traits< InputIteratorT >::value_type, typenamestd::iterator_traits< OutputIteratorT >::value_type >::Type OutputT
The output value type.
Definition dispatch_reduce.cuh:694

cub::DispatchSegmentedReduce::d_end_offsets
OffsetIteratorT d_end_offsets
[in] Pointer to the sequence of ending offsets of length num_segments, such that d_end_offsets[i]-1 i...
Definition dispatch_reduce.cuh:707

cub::DispatchSegmentedReduce::Invoke
CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t Invoke()
Invocation.
Definition dispatch_reduce.cuh:817

cub::DispatchSegmentedReduce::debug_synchronous
bool debug_synchronous
[in] Whether or not to synchronize the stream after every kernel launch to check for errors....
Definition dispatch_reduce.cuh:711

cub::DispatchSegmentedReduce::stream
cudaStream_t stream
[in] CUDA stream to launch kernels within. Default is stream0.
Definition dispatch_reduce.cuh:710

cub::DispatchSegmentedReduce::num_segments
OffsetT num_segments
[in] The number of segments that comprise the sorting data
Definition dispatch_reduce.cuh:705

cub::DispatchSegmentedReduce::DispatchSegmentedReduce
CUB_RUNTIME_FUNCTION __forceinline__ DispatchSegmentedReduce(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, OffsetT num_segments, OffsetIteratorT d_begin_offsets, OffsetIteratorT d_end_offsets, ReductionOpT reduction_op, OutputT init, cudaStream_t stream, bool debug_synchronous, int ptx_version)
Constructor.
Definition dispatch_reduce.cuh:720

cub::DispatchSegmentedReduce::Dispatch
CUB_RUNTIME_FUNCTION static __forceinline__ cudaError_t Dispatch(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_segments, OffsetIteratorT d_begin_offsets, OffsetIteratorT d_end_offsets, ReductionOpT reduction_op, OutputT init, cudaStream_t stream, bool debug_synchronous)
Definition dispatch_reduce.cuh:835

cub::Equals
Type equality test.
Definition util_type.cuh:99

cub::GridEvenShare
GridEvenShare is a descriptor utility for distributing input among CUDA thread blocks in an "even-sha...
Definition grid_even_share.cuh:75

cub::GridEvenShare::DispatchInit
__host__ __device__ __forceinline__ void DispatchInit(OffsetT num_items, int max_grid_size, int tile_items)
Dispatch initializer. To be called prior prior to kernel launch.
Definition grid_even_share.cuh:122

cub::If
Type selection (IF ? ThenType : ElseType)
Definition util_type.cuh:73

cub::KernelConfig
Definition util_device.cuh:279

CUB_SUBSCRIPTION_FACTOR
#define CUB_SUBSCRIPTION_FACTOR(arch)
Oversubscription factor.
Definition util_arch.cuh:99

CUB_SCALED_GRANULARITIES
#define CUB_SCALED_GRANULARITIES(NOMINAL_4B_BLOCK_THREADS, NOMINAL_4B_ITEMS_PER_THREAD, T)
Define both nominal threads-per-block and items-per-thread.
Definition util_arch.cuh:141