doxygen/openfpm/dispatch__spmv__orig_8cuh_source.html

/******************************************************************************

 * Copyright (c) 2011, Duane Merrill.  All rights reserved.

 * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.

 *

 * Redistribution and use in source and binary forms, with or without

 * modification, are permitted provided that the following conditions are met:

 *     * Redistributions of source code must retain the above copyright

 *       notice, this list of conditions and the following disclaimer.

 *     * Redistributions in binary form must reproduce the above copyright

 *       notice, this list of conditions and the following disclaimer in the

 *       documentation and/or other materials provided with the distribution.

 *     * Neither the name of the NVIDIA CORPORATION nor the

 *       names of its contributors may be used to endorse or promote products

 *       derived from this software without specific prior written permission.

 *

 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND

 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED

 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE

 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY

 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES

 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;

 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND

 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT

 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS

 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 *

 ******************************************************************************/


#pragma once


#include <stdio.h>

#include <iterator>


#include "../../agent/single_pass_scan_operators.cuh"

#include "../../agent/agent_segment_fixup.cuh"

#include "../../agent/agent_spmv_orig.cuh"

#include "../../util_type.cuh"

#include "../../util_debug.cuh"

#include "../../util_device.cuh"

#include "../../thread/thread_search.cuh"

#include "../../grid/grid_queue.cuh"

#include "../../util_namespace.cuh"


CUB_NS_PREFIX


namespace cub {


/******************************************************************************

 * SpMV kernel entry points

 *****************************************************************************/


/**
 * SpMV kernel for the degenerate case dispatched when the matrix has a single
 * column. One thread handles one row.
 *
 * For a row index below spmv_params.num_rows the thread reads the offsets
 * d_row_end_offsets[row - 1] and d_row_end_offsets[row]. When they differ, the
 * value stored at the first of those offsets is multiplied by the x entry
 * selected by the matching column index; when they are equal the result is
 * zero. The result overwrites d_vector_y[row]. No other field of spmv_params
 * is read by this kernel.
 *
 * NOTE(review): row 0 reads d_row_end_offsets[-1], so the pointer presumably
 * refers to the second element of a row-offsets array -- confirm against the
 * code that fills SpmvParams.
 * NOTE(review): only the first stored entry of each row is consumed, which
 * presumably relies on a one-column matrix holding at most one entry per row.
 *
 * \tparam AgentSpmvPolicyT  Policy type; only VECTOR_VALUES_LOAD_MODIFIER is used here
 * \tparam ValueT            Matrix and vector value type
 * \tparam OffsetT           Integer type for sequence offsets
 *
 * \param spmv_params        [in] SpMV input parameter bundle
 */
template <
    typename    AgentSpmvPolicyT,
    typename    ValueT,
    typename    OffsetT>
__global__ void DeviceSpmv1ColKernel(
    SpmvParams<ValueT, OffsetT> spmv_params)
{
    // Iterator that loads x through the policy's cache-load modifier
    typedef CacheModifiedInputIterator<
            AgentSpmvPolicyT::VECTOR_VALUES_LOAD_MODIFIER,
            ValueT,
            OffsetT>
        VectorValueIteratorT;

    VectorValueIteratorT x_in(spmv_params.d_vector_x);

    // Threads past the last row (tail of the final block) have nothing to do
    int row = (blockIdx.x * blockDim.x) + threadIdx.x;
    if (row >= spmv_params.num_rows)
        return;

    // Half-open range [row_begin, row_end) of this row's stored entries
    OffsetT row_begin = spmv_params.d_row_end_offsets[row - 1];
    OffsetT row_end   = spmv_params.d_row_end_offsets[row];

    // Empty rows produce zero; otherwise use the row's first stored entry
    ValueT product = 0.0;
    if (row_begin != row_end)
    {
        product = spmv_params.d_values[row_begin] * x_in[spmv_params.d_column_indices[row_begin]];
    }

    spmv_params.d_vector_y[row] = product;
}


/**
 * Merge-path search kernel: one thread per tile boundary.
 *
 * Thread t (for t in [0, num_merge_tiles]) searches the merge path along the
 * diagonal t * TILE_ITEMS, where TILE_ITEMS is the policy's
 * BLOCK_THREADS * ITEMS_PER_THREAD, and stores the resulting coordinate in
 * d_tile_coordinates[t]. That yields the starting coordinate of every tile
 * plus the end coordinate of the last one (num_merge_tiles + 1 entries).
 *
 * \tparam SpmvPolicyT   Policy supplying BLOCK_THREADS, ITEMS_PER_THREAD and
 *                       ROW_OFFSETS_SEARCH_LOAD_MODIFIER
 * \tparam OffsetT       Integer type for sequence offsets
 * \tparam CoordinateT   Merge-path coordinate type
 * \tparam SpmvParamsT   SpmvParams type
 *
 * \param num_merge_tiles     [in]  Number of SpMV merge tiles
 * \param d_tile_coordinates  [out] Array receiving num_merge_tiles + 1 coordinates
 * \param spmv_params         [in]  SpMV input parameter bundle
 */
template <
    typename    SpmvPolicyT,
    typename    OffsetT,
    typename    CoordinateT,
    typename    SpmvParamsT>
__global__ void DeviceSpmvSearchKernel(
    int             num_merge_tiles,
    CoordinateT*    d_tile_coordinates,
    SpmvParamsT     spmv_params)
{
    enum
    {
        BLOCK_THREADS           = SpmvPolicyT::BLOCK_THREADS,
        ITEMS_PER_THREAD        = SpmvPolicyT::ITEMS_PER_THREAD,
        TILE_ITEMS              = BLOCK_THREADS * ITEMS_PER_THREAD,
    };

    // Row end-offsets are read through the policy's search cache-load modifier
    typedef CacheModifiedInputIterator<
            SpmvPolicyT::ROW_OFFSETS_SEARCH_LOAD_MODIFIER,
            OffsetT,
            OffsetT>
        RowOffsetsSearchIteratorT;

    // Valid indices are 0..num_merge_tiles inclusive (the extra one is the end coordinate)
    int tile = (blockIdx.x * blockDim.x) + threadIdx.x;
    if (tile > num_merge_tiles)
        return;

    // Diagonal at which this tile begins
    OffsetT                         diagonal = (tile * TILE_ITEMS);

    // The two sequences being merged: row end-offsets and the nonzero indices 0, 1, 2, ...
    RowOffsetsSearchIteratorT       row_end_offsets(spmv_params.d_row_end_offsets);
    CountingInputIterator<OffsetT>  nonzero_indices(0);

    CoordinateT                     coordinate;
    MergePathSearch(
        diagonal,
        row_end_offsets,
        nonzero_indices,
        spmv_params.num_rows,
        spmv_params.num_nonzeros,
        coordinate);

    // Publish this tile's starting coordinate
    d_tile_coordinates[tile] = coordinate;
}


/**
 * Main SpMV kernel. Each thread block builds an AgentSpmv over its shared
 * memory and the parameter bundle, lets it consume a tile, and then
 * initializes the status descriptors that the segment-fixup kernel uses.
 *
 * Launch bounds: SpmvPolicyT::BLOCK_THREADS threads per block.
 * Shared memory: one statically allocated AgentSpmvT::TempStorage per block.
 *
 * \tparam SpmvPolicyT     SpMV tuning policy
 * \tparam ScanTileStateT  Tile status interface type
 * \tparam ValueT          Matrix and vector value type
 * \tparam OffsetT         Integer type for sequence offsets
 * \tparam CoordinateT     Merge-path coordinate type
 * \tparam HAS_ALPHA       Forwarded to AgentSpmv
 * \tparam HAS_BETA        Forwarded to AgentSpmv
 *
 * \param spmv_params              [in]  SpMV input parameter bundle
 * \param d_tile_coordinates       [in]  Temporary array of tile starting coordinates
 *                                       (the dispatcher passes NULL when no search kernel ran)
 * \param d_tile_carry_pairs       [out] Temporary array of per-block carry-out pairs
 * \param num_tiles                [in]  Number of merge tiles
 * \param tile_state               [in]  Tile status interface for the fixup pass
 * \param num_segment_fixup_tiles  [in]  Number of fixup tiles
 */
template <
    typename        SpmvPolicyT,
    typename        ScanTileStateT,
    typename        ValueT,
    typename        OffsetT,
    typename        CoordinateT,
    bool            HAS_ALPHA,
    bool            HAS_BETA>
__launch_bounds__ (int(SpmvPolicyT::BLOCK_THREADS))
__global__ void DeviceSpmvKernel(
    SpmvParams<ValueT, OffsetT>     spmv_params,
    CoordinateT*                    d_tile_coordinates,
    KeyValuePair<OffsetT,ValueT>*   d_tile_carry_pairs,
    int                             num_tiles,
    ScanTileStateT                  tile_state,
    int                             num_segment_fixup_tiles)
{
    // Agent specialization for this policy and these value/offset types
    typedef AgentSpmv<SpmvPolicyT, ValueT, OffsetT, HAS_ALPHA, HAS_BETA> AgentSpmvT;

    // Block-wide scratch space required by the agent
    __shared__ typename AgentSpmvT::TempStorage temp_storage;

    // The agent lives only for the duration of the tile consumption
    {
        AgentSpmvT agent(temp_storage, spmv_params);
        agent.ConsumeTile(d_tile_coordinates, d_tile_carry_pairs, num_tiles);
    }

    // Prepare the status descriptors consumed by the fixup kernel
    tile_state.InitializeStatus(num_segment_fixup_tiles);
}


/**
 * Segment-fixup kernel. Each thread block builds an AgentSegmentFixup
 * (specialized with cub::Equality and cub::Sum) and lets it consume the range
 * of input pairs, writing aggregates through d_aggregates_out.
 *
 * Launch bounds: AgentSegmentFixupPolicyT::BLOCK_THREADS threads per block.
 * Shared memory: one statically allocated AgentSegmentFixupT::TempStorage per block.
 *
 * \tparam AgentSegmentFixupPolicyT   Fixup tuning policy
 * \tparam PairsInputIteratorT        Input iterator type for the pairs
 * \tparam AggregatesOutputIteratorT  Output iterator type for the aggregates
 * \tparam OffsetT                    Integer type for global offsets
 * \tparam ScanTileStateT             Tile status interface type
 *
 * \param d_pairs_in        [in]  Input pairs
 * \param d_aggregates_out  [out] Destination of the aggregates
 * \param num_items         [in]  Total number of input items
 * \param num_tiles         [in]  Total number of tiles for the entire problem
 * \param tile_state        [in]  Tile status interface
 */
template <
    typename    AgentSegmentFixupPolicyT,
    typename    PairsInputIteratorT,
    typename    AggregatesOutputIteratorT,
    typename    OffsetT,
    typename    ScanTileStateT>
__launch_bounds__ (int(AgentSegmentFixupPolicyT::BLOCK_THREADS))
__global__ void DeviceSegmentFixupKernel(
    PairsInputIteratorT         d_pairs_in,
    AggregatesOutputIteratorT   d_aggregates_out,
    OffsetT                     num_items,
    int                         num_tiles,
    ScanTileStateT              tile_state)
{
    // Agent specialization: equality comparison with sum reduction
    typedef AgentSegmentFixup<
            AgentSegmentFixupPolicyT,
            PairsInputIteratorT,
            AggregatesOutputIteratorT,
            cub::Equality,
            cub::Sum,
            OffsetT>
        AgentSegmentFixupT;

    // Block-wide scratch space required by the agent
    __shared__ typename AgentSegmentFixupT::TempStorage temp_storage;

    // Functors are declared before the agent so they outlive it
    cub::Equality   equality_op;
    cub::Sum        reduction_op;

    // Process tiles
    {
        AgentSegmentFixupT agent(temp_storage, d_pairs_in, d_aggregates_out, equality_op, reduction_op);
        agent.ConsumeRange(num_items, num_tiles, tile_state);
    }
}


/******************************************************************************

 * Dispatch

 ******************************************************************************/


template <

    typename    ValueT,

    typename    OffsetT>

struct DispatchSpmv

{

    //---------------------------------------------------------------------

    // Constants and Types

    //---------------------------------------------------------------------


    enum
    {
        // Threads per block used by Dispatch for the single-column kernel and
        // for the merge-path search kernel
        INIT_KERNEL_THREADS = 128
    };

    // SpmvParams bundle type
    typedef SpmvParams<ValueT, OffsetT> SpmvParamsT;

    // 2D merge path coordinate type (the CUDA vector type CubVector maps
    // <OffsetT, 2> to)
    typedef typename CubVector<OffsetT, 2>::Type CoordinateT;

    // Tile status descriptor interface type
    typedef ReduceByKeyScanTileState<ValueT, OffsetT> ScanTileStateT;

    // Tuple type for scanning (pairs accumulated segment-value with segment-index)
    typedef KeyValuePair<OffsetT, ValueT> KeyValuePairT;


    //---------------------------------------------------------------------

    // Tuning policies

    //---------------------------------------------------------------------


    // SM11 tuning policy: the fallback chosen when the PTX version is below 200
    // (see the PtxPolicy selection and InitConfigs).
    // NOTE(review): the meaning of each positional argument is defined by
    // AgentSpmvPolicy / AgentSegmentFixupPolicy, which are declared outside this
    // file; the first two are presumably threads-per-block and items-per-thread
    // (read back as BLOCK_THREADS / ITEMS_PER_THREAD) -- confirm there.
    struct Policy110
    {
        // SpMV agent policy: 128, 1, five default cache-load modifiers,
        // a boolean flag (false), and the warp-scans BlockScan algorithm
        typedef AgentSpmvPolicy<
                128,
                1,
                LOAD_DEFAULT,
                LOAD_DEFAULT,
                LOAD_DEFAULT,
                LOAD_DEFAULT,
                LOAD_DEFAULT,
                false,
                BLOCK_SCAN_WARP_SCANS>
            SpmvPolicyT;

        // Segment-fixup agent policy: 128, 4, vectorized block loads,
        // default cache-load modifier, warp-scans BlockScan algorithm
        typedef AgentSegmentFixupPolicy<
                128,
                4,
                BLOCK_LOAD_VECTORIZE,
                LOAD_DEFAULT,
                BLOCK_SCAN_WARP_SCANS>
            SegmentFixupPolicyT;
    };


    // SM20 tuning policy: chosen when the PTX version is in [200, 300)
    // (see the PtxPolicy selection and InitConfigs).
    // NOTE(review): argument meanings come from AgentSpmvPolicy /
    // AgentSegmentFixupPolicy, declared outside this file -- confirm there.
    struct Policy200
    {
        // SpMV agent policy: 96, 18, five default cache-load modifiers,
        // a boolean flag (false), and the raking BlockScan algorithm
        typedef AgentSpmvPolicy<
                96,
                18,
                LOAD_DEFAULT,
                LOAD_DEFAULT,
                LOAD_DEFAULT,
                LOAD_DEFAULT,
                LOAD_DEFAULT,
                false,
                BLOCK_SCAN_RAKING>
            SpmvPolicyT;

        // Segment-fixup agent policy: 128, 4, vectorized block loads,
        // default cache-load modifier, warp-scans BlockScan algorithm
        typedef AgentSegmentFixupPolicy<
                128,
                4,
                BLOCK_LOAD_VECTORIZE,
                LOAD_DEFAULT,
                BLOCK_SCAN_WARP_SCANS>
            SegmentFixupPolicyT;

    };


    // SM30 tuning policy: chosen when the PTX version is in [300, 350)
    // (see the PtxPolicy selection and InitConfigs).
    // NOTE(review): argument meanings come from AgentSpmvPolicy /
    // AgentSegmentFixupPolicy, declared outside this file -- confirm there.
    struct Policy300
    {
        // SpMV agent policy: 96, 6, five default cache-load modifiers,
        // a boolean flag (false), and the warp-scans BlockScan algorithm
        typedef AgentSpmvPolicy<
                96,
                6,
                LOAD_DEFAULT,
                LOAD_DEFAULT,
                LOAD_DEFAULT,
                LOAD_DEFAULT,
                LOAD_DEFAULT,
                false,
                BLOCK_SCAN_WARP_SCANS>
            SpmvPolicyT;

        // Segment-fixup agent policy: 128, 4, vectorized block loads,
        // default cache-load modifier, warp-scans BlockScan algorithm
        typedef AgentSegmentFixupPolicy<
                128,
                4,
                BLOCK_LOAD_VECTORIZE,
                LOAD_DEFAULT,
                BLOCK_SCAN_WARP_SCANS>
            SegmentFixupPolicyT;

    };


    // SM35 tuning policy: chosen when the PTX version is in [350, 370)
    // (see the PtxPolicy selection and InitConfigs).
    // Several arguments depend on whether ValueT is wider than 4 bytes.
    // NOTE(review): argument meanings come from AgentSpmvPolicy /
    // AgentSegmentFixupPolicy, declared outside this file -- confirm there.
    struct Policy350
    {
        // SpMV agent policy: 96 x 4 for ValueT wider than 4 bytes, else 128 x 7;
        // LOAD_LDG for every cache-load modifier except the second (LOAD_CA);
        // the boolean flag is true only for ValueT wider than 4 bytes
        typedef AgentSpmvPolicy<
                (sizeof(ValueT) > 4) ? 96 : 128,
                (sizeof(ValueT) > 4) ? 4 : 7,
                LOAD_LDG,
                LOAD_CA,
                LOAD_LDG,
                LOAD_LDG,
                LOAD_LDG,
                (sizeof(ValueT) > 4) ? true : false,
                BLOCK_SCAN_WARP_SCANS>
            SpmvPolicyT;

        // Segment-fixup agent policy: 128, 3, vectorized block loads,
        // LOAD_LDG cache-load modifier, warp-scans BlockScan algorithm
        typedef AgentSegmentFixupPolicy<
                128,
                3,
                BLOCK_LOAD_VECTORIZE,
                LOAD_LDG,
                BLOCK_SCAN_WARP_SCANS>
            SegmentFixupPolicyT;
    };


    // Tuning policy chosen when the PTX version is in [370, 500)
    // (see the PtxPolicy selection and InitConfigs).
    // NOTE(review): argument meanings come from AgentSpmvPolicy /
    // AgentSegmentFixupPolicy, declared outside this file -- confirm there.
    struct Policy370
    {

        // SpMV agent policy: first argument is 128 for every ValueT (both arms
        // of the conditional are identical); second is 9 for ValueT wider than
        // 4 bytes, else 14; LOAD_LDG for every cache-load modifier except the
        // second (LOAD_CA); boolean flag false
        typedef AgentSpmvPolicy<
                (sizeof(ValueT) > 4) ? 128 : 128,
                (sizeof(ValueT) > 4) ? 9 : 14,
                LOAD_LDG,
                LOAD_CA,
                LOAD_LDG,
                LOAD_LDG,
                LOAD_LDG,
                false,
                BLOCK_SCAN_WARP_SCANS>
            SpmvPolicyT;

        // Segment-fixup agent policy: 128, 3, vectorized block loads,
        // LOAD_LDG cache-load modifier, warp-scans BlockScan algorithm
        typedef AgentSegmentFixupPolicy<
                128,
                3,
                BLOCK_LOAD_VECTORIZE,
                LOAD_LDG,
                BLOCK_SCAN_WARP_SCANS>
            SegmentFixupPolicyT;
    };


    // Tuning policy chosen when the PTX version is in [500, 600)
    // (see the PtxPolicy selection and InitConfigs).
    // Most arguments depend on whether ValueT is wider than 4 bytes.
    // NOTE(review): argument meanings come from AgentSpmvPolicy /
    // AgentSegmentFixupPolicy, declared outside this file -- confirm there.
    struct Policy500
    {
        // SpMV agent policy: 64 x 6 for ValueT wider than 4 bytes, else 128 x 7;
        // the third and fourth cache-load modifiers, the boolean flag and the
        // BlockScan algorithm also switch on the width of ValueT
        typedef AgentSpmvPolicy<
                (sizeof(ValueT) > 4) ? 64 : 128,
                (sizeof(ValueT) > 4) ? 6 : 7,
                LOAD_LDG,
                LOAD_DEFAULT,
                (sizeof(ValueT) > 4) ? LOAD_LDG : LOAD_DEFAULT,
                (sizeof(ValueT) > 4) ? LOAD_LDG : LOAD_DEFAULT,
                LOAD_LDG,
                (sizeof(ValueT) > 4) ? true : false,
                (sizeof(ValueT) > 4) ? BLOCK_SCAN_WARP_SCANS : BLOCK_SCAN_RAKING_MEMOIZE>
            SpmvPolicyT;

        // Segment-fixup agent policy: 128, 3, vectorized block loads,
        // LOAD_LDG cache-load modifier, memoized raking BlockScan algorithm
        typedef AgentSegmentFixupPolicy<
                128,
                3,
                BLOCK_LOAD_VECTORIZE,
                LOAD_LDG,
                BLOCK_SCAN_RAKING_MEMOIZE>
            SegmentFixupPolicyT;
    };


    // Tuning policy chosen when the PTX version is 600 or higher
    // (see the PtxPolicy selection and InitConfigs).
    // NOTE(review): argument meanings come from AgentSpmvPolicy /
    // AgentSegmentFixupPolicy, declared outside this file -- confirm there.
    struct Policy600
    {
        // SpMV agent policy: 64 x 5 for ValueT wider than 4 bytes, else 128 x 7;
        // five default cache-load modifiers, boolean flag false,
        // warp-scans BlockScan algorithm
        typedef AgentSpmvPolicy<
                (sizeof(ValueT) > 4) ? 64 : 128,
                (sizeof(ValueT) > 4) ? 5 : 7,
                LOAD_DEFAULT,
                LOAD_DEFAULT,
                LOAD_DEFAULT,
                LOAD_DEFAULT,
                LOAD_DEFAULT,
                false,
                BLOCK_SCAN_WARP_SCANS>
            SpmvPolicyT;

        // Segment-fixup agent policy: 128, 3, direct block loads (the only
        // policy here that does not use BLOCK_LOAD_VECTORIZE), LOAD_LDG
        // cache-load modifier, warp-scans BlockScan algorithm
        typedef AgentSegmentFixupPolicy<
                128,
                3,
                BLOCK_LOAD_DIRECT,
                LOAD_LDG,
                BLOCK_SCAN_WARP_SCANS>
            SegmentFixupPolicyT;
    };


    //---------------------------------------------------------------------

    // Tuning policies of current PTX compiler pass

    //---------------------------------------------------------------------


    // Select, at preprocessing time, the tuning policy matching the architecture
    // of the current compiler pass. The thresholds mirror the runtime chain in
    // InitConfigs; when CUB_PTX_ARCH is below 200 (including the value 0 that
    // the rest of this file treats as the host pass) Policy110 is used.
#if (CUB_PTX_ARCH >= 600)
    typedef Policy600 PtxPolicy;

#elif (CUB_PTX_ARCH >= 500)
    typedef Policy500 PtxPolicy;

#elif (CUB_PTX_ARCH >= 370)
    typedef Policy370 PtxPolicy;

#elif (CUB_PTX_ARCH >= 350)
    typedef Policy350 PtxPolicy;

#elif (CUB_PTX_ARCH >= 300)
    typedef Policy300 PtxPolicy;

#elif (CUB_PTX_ARCH >= 200)
    typedef Policy200 PtxPolicy;

#else
    typedef Policy110 PtxPolicy;

#endif

    // "Opaque" policies (whose parameterizations aren't reflected in the type signature).
    // These are the policy types used as kernel template arguments by the
    // public Dispatch overload.
    struct PtxSpmvPolicyT : PtxPolicy::SpmvPolicyT {};
    struct PtxSegmentFixupPolicy : PtxPolicy::SegmentFixupPolicyT {};


    //---------------------------------------------------------------------

    // Utilities

    //---------------------------------------------------------------------


    /**
     * Helper for InitConfigs: initializes both kernel configurations from the
     * SpmvPolicyT and SegmentFixupPolicyT members of one tuning policy.
     */
    template <typename PolicyT, typename ConfigT>
    CUB_RUNTIME_FUNCTION __forceinline__
    static void InitConfigsFromPolicy(
        ConfigT     &spmv_config,
        ConfigT     &segment_fixup_config)
    {
        spmv_config.template            Init<typename PolicyT::SpmvPolicyT>();
        segment_fixup_config.template   Init<typename PolicyT::SegmentFixupPolicyT>();
    }


    /**
     * Fills in the SpMV and segment-fixup kernel configurations.
     *
     * In a device compiler pass (CUB_PTX_ARCH > 0) the configurations come from
     * the opaque policies of the current pass and \p ptx_version is not
     * consulted. In the host pass the policy is picked at run time from
     * \p ptx_version using the thresholds 600, 500, 370, 350, 300 and 200,
     * falling back to Policy110 below 200.
     *
     * \param ptx_version           [in]  PTX version of the target device
     * \param spmv_config           [out] Configuration for the SpMV kernel
     * \param segment_fixup_config  [out] Configuration for the segment-fixup kernel
     */
    template <typename KernelConfig>
    CUB_RUNTIME_FUNCTION __forceinline__
    static void InitConfigs(
        int             ptx_version,
        KernelConfig    &spmv_config,
        KernelConfig    &segment_fixup_config)
    {
    #if (CUB_PTX_ARCH > 0)

        // Device pass: use the policy this pass was compiled with
        spmv_config.template Init<PtxSpmvPolicyT>();
        segment_fixup_config.template Init<PtxSegmentFixupPolicy>();

    #else

        // Host pass: pick the newest policy whose threshold the device meets
        if (ptx_version >= 600)
            InitConfigsFromPolicy<Policy600>(spmv_config, segment_fixup_config);
        else if (ptx_version >= 500)
            InitConfigsFromPolicy<Policy500>(spmv_config, segment_fixup_config);
        else if (ptx_version >= 370)
            InitConfigsFromPolicy<Policy370>(spmv_config, segment_fixup_config);
        else if (ptx_version >= 350)
            InitConfigsFromPolicy<Policy350>(spmv_config, segment_fixup_config);
        else if (ptx_version >= 300)
            InitConfigsFromPolicy<Policy300>(spmv_config, segment_fixup_config);
        else if (ptx_version >= 200)
            InitConfigsFromPolicy<Policy200>(spmv_config, segment_fixup_config);
        else
            InitConfigsFromPolicy<Policy110>(spmv_config, segment_fixup_config);

    #endif
    }


    /**
     * Run-time copy of the launch-relevant constants of a tuning policy.
     */
    struct KernelConfig
    {
        int block_threads;      // PolicyT::BLOCK_THREADS
        int items_per_thread;   // PolicyT::ITEMS_PER_THREAD
        int tile_items;         // block_threads * items_per_thread

        /**
         * Captures BLOCK_THREADS and ITEMS_PER_THREAD from \p PolicyT and
         * derives the tile size from them.
         */
        template <typename PolicyT>
        CUB_RUNTIME_FUNCTION __forceinline__
        void Init()
        {
            const int threads   = PolicyT::BLOCK_THREADS;
            const int items     = PolicyT::ITEMS_PER_THREAD;

            block_threads       = threads;
            items_per_thread    = items;
            tile_items          = threads * items;
        }
    };


    //---------------------------------------------------------------------

    // Dispatch entrypoints

    //---------------------------------------------------------------------


    /**
     * Internal dispatch routine: computes launch configurations, carves the
     * temporary storage blob into its sub-allocations, and launches the SpMV
     * kernels on \p stream.
     *
     * Behavior:
     *   - Without CUB_RUNTIME_ENABLED, returns cudaErrorNotSupported.
     *   - A negative spmv_params.num_rows yields cudaErrorInvalidValue.
     *   - spmv_params.num_rows == 0 is a no-op that returns cudaSuccess (a size
     *     query reports 1 byte) instead of attempting zero-sized launches.
     *   - spmv_params.num_cols == 1 launches only \p spmv_1col_kernel
     *     (a size query reports 1 byte).
     *   - Otherwise: when \p d_temp_storage is NULL only \p temp_storage_bytes
     *     is written; else the optional search kernel, the SpMV kernel and
     *     (if there is more than one merge tile) the fixup kernel are launched.
     *   - In the host pass the x-vector texture is bound before the launches
     *     and is unbound on every exit path, including error exits.
     *
     * \param d_temp_storage        [in]     Device temporary storage, or NULL to query its size
     * \param temp_storage_bytes    [in,out] Size in bytes of \p d_temp_storage
     * \param spmv_params           [in]     SpMV input parameter bundle
     * \param stream                [in]     CUDA stream to launch kernels within
     * \param debug_synchronous     [in]     Log each launch and synchronize the stream after it
     * \param spmv_1col_kernel      [in]     Kernel pointer for the single-column case
     * \param spmv_search_kernel    [in]     Kernel pointer for the merge-path search
     * \param spmv_kernel           [in]     Kernel pointer for the main SpMV pass
     * \param segment_fixup_kernel  [in]     Kernel pointer for the segment fixup pass
     * \param spmv_config           [in]     Launch configuration of \p spmv_kernel
     * \param segment_fixup_config  [in]     Launch configuration of \p segment_fixup_kernel
     *
     * \return cudaSuccess, or the first CUDA error encountered.
     */
    template <
        typename                Spmv1ColKernelT,
        typename                SpmvSearchKernelT,
        typename                SpmvKernelT,
        typename                SegmentFixupKernelT>
    CUB_RUNTIME_FUNCTION __forceinline__
    static cudaError_t Dispatch(
        void*                   d_temp_storage,
        size_t&                 temp_storage_bytes,
        SpmvParamsT&            spmv_params,
        cudaStream_t            stream,
        bool                    debug_synchronous,
        Spmv1ColKernelT         spmv_1col_kernel,
        SpmvSearchKernelT       spmv_search_kernel,
        SpmvKernelT             spmv_kernel,
        SegmentFixupKernelT     segment_fixup_kernel,
        KernelConfig            spmv_config,
        KernelConfig            segment_fixup_config)
    {
#ifndef CUB_RUNTIME_ENABLED

        // Kernel launch not supported from this device
        return CubDebug(cudaErrorNotSupported );

#else
        cudaError error = cudaSuccess;

#if (CUB_PTX_ARCH == 0)
        // Set once t_vector_x has been bound, so that it is released on every
        // exit path below (including error exits)
        bool texture_bound = false;
#endif

        do
        {
            // A negative row count cannot describe a valid problem
            if (spmv_params.num_rows < 0)
            {
                error = CubDebug(cudaErrorInvalidValue);
                break;
            }

            // Empty problem: nothing to compute. Launching with a zero-sized grid
            // would fail, so treat this as a successful no-op.
            if (spmv_params.num_rows == 0)
            {
                if (d_temp_storage == NULL)
                {
                    // Report a minimal non-zero size for the storage query
                    temp_storage_bytes = 1;
                }
                break;
            }

            if (spmv_params.num_cols == 1)
            {
                if (d_temp_storage == NULL)
                {
                    // Return if the caller is simply requesting the size of the storage allocation
                    temp_storage_bytes = 1;
                    break;
                }

                // Get single-column kernel grid dims (one thread per row)
                int degen_col_kernel_block_size     = INIT_KERNEL_THREADS;
                int degen_col_kernel_grid_size      = (spmv_params.num_rows + degen_col_kernel_block_size - 1) / degen_col_kernel_block_size;

                if (debug_synchronous) _CubLog("Invoking spmv_1col_kernel<<<%d, %d, 0, %lld>>>()\n",
                    degen_col_kernel_grid_size, degen_col_kernel_block_size, (long long) stream);

                // Invoke spmv_1col_kernel
                spmv_1col_kernel<<<degen_col_kernel_grid_size, degen_col_kernel_block_size, 0, stream>>>(
                    spmv_params);

                // Check for failure to launch
                if (CubDebug(error = cudaPeekAtLastError())) break;

                // Sync the stream if specified to flush runtime errors
                if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;

                break;
            }

            // Get device ordinal
            int device_ordinal;
            if (CubDebug(error = cudaGetDevice(&device_ordinal))) break;

            // Get SM count
            int sm_count;
            if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break;

            // Get max x-dimension of grid
            int max_dim_x;
            if (CubDebug(error = cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal))) break;

            // Total number of spmv work items
            int num_merge_items = spmv_params.num_rows + spmv_params.num_nonzeros;

            // Tile sizes of kernels
            int merge_tile_size              = spmv_config.block_threads * spmv_config.items_per_thread;
            int segment_fixup_tile_size     = segment_fixup_config.block_threads * segment_fixup_config.items_per_thread;

            // Number of tiles for kernels
            unsigned int num_merge_tiles            = (num_merge_items + merge_tile_size - 1) / merge_tile_size;
            unsigned int num_segment_fixup_tiles    = (num_merge_tiles + segment_fixup_tile_size - 1) / segment_fixup_tile_size;

            // Get SM occupancy for kernels
            int spmv_sm_occupancy;
            if (CubDebug(error = MaxSmOccupancy(
                spmv_sm_occupancy,
                spmv_kernel,
                spmv_config.block_threads))) break;

            int segment_fixup_sm_occupancy;
            if (CubDebug(error = MaxSmOccupancy(
                segment_fixup_sm_occupancy,
                segment_fixup_kernel,
                segment_fixup_config.block_threads))) break;

            // Get grid dimensions (tiles spill into the y-dimension once the
            // x-dimension limit of the device is reached)
            dim3 spmv_grid_size(
                CUB_MIN(num_merge_tiles, max_dim_x),
                (num_merge_tiles + max_dim_x - 1) / max_dim_x,
                1);

            dim3 segment_fixup_grid_size(
                CUB_MIN(num_segment_fixup_tiles, max_dim_x),
                (num_segment_fixup_tiles + max_dim_x - 1) / max_dim_x,
                1);

            // Get the temporary storage allocation requirements
            size_t allocation_sizes[3];
            if (CubDebug(error = ScanTileStateT::AllocationSize(num_segment_fixup_tiles, allocation_sizes[0]))) break;    // bytes needed for reduce-by-key tile status descriptors
            allocation_sizes[1] = num_merge_tiles * sizeof(KeyValuePairT);       // bytes needed for block carry-out pairs
            allocation_sizes[2] = (num_merge_tiles + 1) * sizeof(CoordinateT);   // bytes needed for tile starting coordinates

            // Alias the temporary allocations from the single storage blob (or compute the necessary size of the blob)
            void* allocations[3];
            if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break;
            if (d_temp_storage == NULL)
            {
                // Return if the caller is simply requesting the size of the storage allocation
                break;
            }

            // Construct the tile status interface
            ScanTileStateT tile_state;
            if (CubDebug(error = tile_state.Init(num_segment_fixup_tiles, allocations[0], allocation_sizes[0]))) break;

            // Alias the other allocations
            KeyValuePairT*  d_tile_carry_pairs      = (KeyValuePairT*) allocations[1];  // Agent carry-out pairs
            CoordinateT*    d_tile_coordinates      = (CoordinateT*) allocations[2];    // Agent starting coordinates

            // Get search/init grid dims (one thread per tile boundary)
            int search_block_size   = INIT_KERNEL_THREADS;
            int search_grid_size    = (num_merge_tiles + 1 + search_block_size - 1) / search_block_size;

#if (CUB_PTX_ARCH == 0)
            // Init textures
            if (CubDebug(error = spmv_params.t_vector_x.BindTexture(spmv_params.d_vector_x))) break;
            texture_bound = true;
#endif

            // Alternative heuristic considered upstream:
            //     num_merge_tiles < spmv_sm_occupancy * sm_count
            if (search_grid_size < sm_count)
            {
                // Not enough spmv tiles to saturate the device: have spmv blocks search their own starting coords
                d_tile_coordinates = NULL;
            }
            else
            {
                // Use separate search kernel if we have enough spmv tiles to saturate the device

                // Log spmv_search_kernel configuration
                if (debug_synchronous) _CubLog("Invoking spmv_search_kernel<<<%d, %d, 0, %lld>>>()\n",
                    search_grid_size, search_block_size, (long long) stream);

                // Invoke spmv_search_kernel
                spmv_search_kernel<<<search_grid_size, search_block_size, 0, stream>>>(
                    num_merge_tiles,
                    d_tile_coordinates,
                    spmv_params);

                // Check for failure to launch
                if (CubDebug(error = cudaPeekAtLastError())) break;

                // Sync the stream if specified to flush runtime errors
                if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
            }

            // Log spmv_kernel configuration
            if (debug_synchronous) _CubLog("Invoking spmv_kernel<<<{%d,%d,%d}, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
                spmv_grid_size.x, spmv_grid_size.y, spmv_grid_size.z, spmv_config.block_threads, (long long) stream, spmv_config.items_per_thread, spmv_sm_occupancy);

            // Invoke spmv_kernel
            spmv_kernel<<<spmv_grid_size, spmv_config.block_threads, 0, stream>>>(
                spmv_params,
                d_tile_coordinates,
                d_tile_carry_pairs,
                num_merge_tiles,
                tile_state,
                num_segment_fixup_tiles);

            // Check for failure to launch
            if (CubDebug(error = cudaPeekAtLastError())) break;

            // Sync the stream if specified to flush runtime errors
            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;

            // Run reduce-by-key fixup if necessary
            if (num_merge_tiles > 1)
            {
                // Log segment_fixup_kernel configuration
                if (debug_synchronous) _CubLog("Invoking segment_fixup_kernel<<<{%d,%d,%d}, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
                    segment_fixup_grid_size.x, segment_fixup_grid_size.y, segment_fixup_grid_size.z, segment_fixup_config.block_threads, (long long) stream, segment_fixup_config.items_per_thread, segment_fixup_sm_occupancy);

                // Invoke segment_fixup_kernel
                segment_fixup_kernel<<<segment_fixup_grid_size, segment_fixup_config.block_threads, 0, stream>>>(
                    d_tile_carry_pairs,
                    spmv_params.d_vector_y,
                    num_merge_tiles,
                    num_segment_fixup_tiles,
                    tile_state);

                // Check for failure to launch
                if (CubDebug(error = cudaPeekAtLastError())) break;

                // Sync the stream if specified to flush runtime errors
                if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
            }
        }
        while (0);

#if (CUB_PTX_ARCH == 0)
        // Free textures. This runs after the loop so that an error exit above
        // does not leave the texture bound.
        if (texture_bound)
        {
            cudaError unbind_error = cudaSuccess;
            if (CubDebug(unbind_error = spmv_params.t_vector_x.UnbindTexture()))
            {
                // Report the unbind failure only if nothing failed earlier
                if (error == cudaSuccess) error = unbind_error;
            }
        }
#endif

        return error;

#endif // CUB_RUNTIME_ENABLED
    }


    /**
     * Public dispatch entry point. Determines the PTX version, derives the
     * kernel configurations for it, and forwards to the kernel-pointer
     * overload of Dispatch with the kernels instantiated for the opaque PTX
     * policies (HAS_ALPHA and HAS_BETA both false).
     *
     * \param d_temp_storage      [in]     Device temporary storage, or NULL to query its size
     * \param temp_storage_bytes  [in,out] Size in bytes of \p d_temp_storage
     * \param spmv_params         [in]     SpMV input parameter bundle
     * \param stream              [in]     CUDA stream to launch kernels within (default 0)
     * \param debug_synchronous   [in]     Forwarded to the kernel-pointer overload (default false)
     *
     * \return cudaSuccess, or the first CUDA error encountered.
     */
    CUB_RUNTIME_FUNCTION __forceinline__
    static cudaError_t Dispatch(
        void*                   d_temp_storage,
        size_t&                 temp_storage_bytes,
        SpmvParamsT&            spmv_params,
        cudaStream_t            stream                  = 0,
        bool                    debug_synchronous       = false)
    {
        cudaError status = cudaSuccess;

        // Get PTX version: queried at run time on the host, fixed at compile
        // time in a device pass
    #if (CUB_PTX_ARCH == 0)
        int ptx_version;
        if (CubDebug(status = PtxVersion(ptx_version))) return status;
    #else
        int ptx_version = CUB_PTX_ARCH;
    #endif

        // Get kernel dispatch configurations
        KernelConfig spmv_config;
        KernelConfig segment_fixup_config;
        InitConfigs(ptx_version, spmv_config, segment_fixup_config);

        // Forward to the kernel-pointer overload; its result is the result
        CubDebug(status = Dispatch(
            d_temp_storage, temp_storage_bytes, spmv_params, stream, debug_synchronous,
            DeviceSpmv1ColKernel<PtxSpmvPolicyT, ValueT, OffsetT>,
            DeviceSpmvSearchKernel<PtxSpmvPolicyT, OffsetT, CoordinateT, SpmvParamsT>,
            DeviceSpmvKernel<PtxSpmvPolicyT, ScanTileStateT, ValueT, OffsetT, CoordinateT, false, false>,
            DeviceSegmentFixupKernel<PtxSegmentFixupPolicy, KeyValuePairT*, ValueT*, OffsetT, ScanTileStateT>,
            spmv_config, segment_fixup_config));

        return status;
    }

};


}               // CUB namespace

CUB_NS_POSTFIX  // Optional outer namespace(s)


cub::CacheModifiedInputIterator
A random-access input wrapper for dereferencing array values using a PTX cache load modifier.
Definition cache_modified_input_iterator.cuh:108

cub::CountingInputIterator
A random-access input generator for dereferencing a sequence of incrementing integer values.
Definition counting_input_iterator.cuh:95

cub::BLOCK_LOAD_DIRECT
@ BLOCK_LOAD_DIRECT
Definition block_load.cuh:485

cub::BLOCK_LOAD_VECTORIZE
@ BLOCK_LOAD_VECTORIZE
Definition block_load.cuh:505

cub::LOAD_LDG
@ LOAD_LDG
Cache as texture.
Definition thread_load.cuh:69

cub::LOAD_CA
@ LOAD_CA
Cache at all levels.
Definition thread_load.cuh:65

cub::LOAD_DEFAULT
@ LOAD_DEFAULT
Default (no modifier)
Definition thread_load.cuh:64

_CubLog
#define _CubLog(format,...)
Log macro for printf statements.
Definition util_debug.cuh:112

cub::PtxVersion
CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t PtxVersion(int &ptx_version)
Retrieves the PTX version that will be used on the current device (major * 100 + minor * 10)
Definition util_device.cuh:118

cub::AliasTemporaries
__host__ __device__ __forceinline__ cudaError_t AliasTemporaries(void *d_temp_storage, size_t &temp_storage_bytes, void *(&allocations)[ALLOCATIONS], size_t(&allocation_sizes)[ALLOCATIONS])
Definition util_device.cuh:62

cub::MaxSmOccupancy
CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t MaxSmOccupancy(int &max_sm_occupancy, KernelPtr kernel_ptr, int block_threads, int dynamic_smem_bytes=0)
Computes maximum SM occupancy in thread blocks for executing the given kernel function pointer kernel...
Definition util_device.cuh:244

CubDebug
#define CubDebug(e)
Debug macro.
Definition util_debug.cuh:94

cub::SyncStream
CUB_RUNTIME_FUNCTION static __forceinline__ cudaError_t SyncStream(cudaStream_t stream)
Definition util_device.cuh:199

CUB_MIN
#define CUB_MIN(a, b)
Select minimum(a, b)
Definition util_macro.cuh:66

cub
Optional outer namespace(s)
Definition agent_histogram.cuh:48

cub::spmv_params
OffsetT spmv_params
[in] SpMV input parameter bundle
Definition dispatch_spmv_orig.cuh:159

cub::DeviceSpmvSearchKernel
__global__ void DeviceSpmvSearchKernel(int num_merge_tiles, CoordinateT *d_tile_coordinates, SpmvParamsT spmv_params)
< SpmvParams type
Definition dispatch_spmv_orig.cuh:104

cub::DeviceSpmv1ColKernel
__global__ void DeviceSpmv1ColKernel(SpmvParams< ValueT, OffsetT > spmv_params)
< Signed integer type for sequence offsets
Definition dispatch_spmv_orig.cuh:68

cub::num_items
KeyT const ValueT ValueT OffsetT OffsetT num_items
[in] Total number of input data items
Definition dispatch_radix_sort.cuh:168

cub::num_segment_fixup_tiles
OffsetT CoordinateT KeyValuePair< OffsetT, ValueT > int ScanTileStateT int num_segment_fixup_tiles
< [in] Number of reduce-by-key tiles (fixup grid size)
Definition dispatch_spmv_orig.cuh:165

cub::d_tile_coordinates
OffsetT CoordinateT * d_tile_coordinates
[in] Pointer to the temporary array of tile starting coordinates
Definition dispatch_spmv_orig.cuh:160

cub::__launch_bounds__
__launch_bounds__(int(AgentHistogramPolicyT::BLOCK_THREADS)) __global__ void DeviceHistogramSweepKernel(SampleIteratorT d_samples
< Signed integer type for global offsets

cub::d_tile_carry_pairs
OffsetT CoordinateT KeyValuePair< OffsetT, ValueT > * d_tile_carry_pairs
[out] Pointer to the temporary array carry-out dot product row-ids, one per block
Definition dispatch_spmv_orig.cuh:161

cub::num_tiles
OffsetsOutputIteratorT LengthsOutputIteratorT NumRunsOutputIteratorT ScanTileStateT EqualityOpT OffsetT int num_tiles
< [in] Total number of tiles for the entire problem
Definition dispatch_rle.cuh:84

cub::BLOCK_SCAN_RAKING
@ BLOCK_SCAN_RAKING
Definition block_scan.cuh:78

cub::BLOCK_SCAN_WARP_SCANS
@ BLOCK_SCAN_WARP_SCANS
Definition block_scan.cuh:108

cub::BLOCK_SCAN_RAKING_MEMOIZE
@ BLOCK_SCAN_RAKING_MEMOIZE
Definition block_scan.cuh:88

cub::OffsetT
OffsetT OffsetT
[in] Total number of input data items
Definition dispatch_radix_sort.cuh:75

cub::MergePathSearch
__host__ __device__ __forceinline__ void MergePathSearch(OffsetT diagonal, AIteratorT a, BIteratorT b, OffsetT a_len, OffsetT b_len, CoordinateT &path_coordinate)
Definition thread_search.cuh:53

cub::tile_state
UniqueOutputIteratorT ValuesInputIteratorT AggregatesOutputIteratorT NumRunsOutputIteratorT ScanTileStateT tile_state
Tile status interface.
Definition dispatch_reduce_by_key.cuh:78

cub::d_aggregates_out
UniqueOutputIteratorT ValuesInputIteratorT AggregatesOutputIteratorT d_aggregates_out
Pointer to the output sequence of value aggregates (one aggregate per run)
Definition dispatch_reduce_by_key.cuh:76

cub::AgentSegmentFixupPolicy
The BlockScan algorithm to use
Definition agent_segment_fixup.cuh:68

cub::AgentSegmentFixup
AgentSegmentFixup implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduce-value-by-key.
Definition agent_segment_fixup.cuh:96

cub::AgentSpmvPolicy
The BlockScan algorithm to use
Definition agent_spmv_orig.cuh:74

cub::AgentSpmv
AgentSpmv implements a stateful abstraction of CUDA thread blocks for participating in device-wide SpMV.
Definition agent_spmv_orig.cuh:127

cub::CubVector
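Exposes a member typedef Type that names the corresponding CUDA vector type if one exists. Otherwise Type refers to the CubVector structure itself, which wraps the corresponding x, y, etc. vector fields.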
Definition util_type.cuh:454

cub::DispatchSpmv::KernelConfig
Definition dispatch_spmv_orig.cuh:545

cub::DispatchSpmv::Policy110
SM11.
Definition dispatch_spmv_orig.cuh:267

cub::DispatchSpmv::Policy200
SM20.
Definition dispatch_spmv_orig.cuh:291

cub::DispatchSpmv::Policy300
SM30.
Definition dispatch_spmv_orig.cuh:318

cub::DispatchSpmv::Policy350
SM35.
Definition dispatch_spmv_orig.cuh:344

cub::DispatchSpmv::Policy370
SM37.
Definition dispatch_spmv_orig.cuh:369

cub::DispatchSpmv::Policy500
SM50.
Definition dispatch_spmv_orig.cuh:394

cub::DispatchSpmv::Policy600
SM60.
Definition dispatch_spmv_orig.cuh:420

cub::DispatchSpmv::PtxSegmentFixupPolicy
Definition dispatch_spmv_orig.cuh:474

cub::DispatchSpmv::PtxSpmvPolicyT
Definition dispatch_spmv_orig.cuh:473

cub::DispatchSpmv
Signed integer type for global offsets
Definition dispatch_spmv_orig.cuh:238

cub::DispatchSpmv::Dispatch
CUB_RUNTIME_FUNCTION static __forceinline__ cudaError_t Dispatch(void *d_temp_storage, size_t &temp_storage_bytes, SpmvParamsT &spmv_params, cudaStream_t stream=0, bool debug_synchronous=false)
Definition dispatch_spmv_orig.cuh:793

cub::DispatchSpmv::Dispatch
CUB_RUNTIME_FUNCTION static __forceinline__ cudaError_t Dispatch(void *d_temp_storage, size_t &temp_storage_bytes, SpmvParamsT &spmv_params, cudaStream_t stream, bool debug_synchronous, Spmv1ColKernelT spmv_1col_kernel, SpmvSearchKernelT spmv_search_kernel, SpmvKernelT spmv_kernel, SegmentFixupKernelT segment_fixup_kernel, KernelConfig spmv_config, KernelConfig segment_fixup_config)
Function type of cub::DeviceSegmentFixupKernelT
Definition dispatch_spmv_orig.cuh:578

cub::DispatchSpmv::InitConfigs
CUB_RUNTIME_FUNCTION static __forceinline__ void InitConfigs(int ptx_version, KernelConfig &spmv_config, KernelConfig &segment_fixup_config)
Definition dispatch_spmv_orig.cuh:486

cub::Equality
Default equality functor.
Definition thread_operators.cuh:60

cub::KeyValuePair
A key identifier paired with a corresponding value.
Definition util_type.cuh:667

cub::ReduceByKeyScanTileState
Definition single_pass_scan_operators.cuh:450

cub::SpmvParams
Signed integer type for sequence offsets
Definition agent_spmv_orig.cuh:100

cub::Sum
Default sum functor.
Definition thread_operators.cuh:110

CUB_PTX_ARCH
#define CUB_PTX_ARCH
CUB_PTX_ARCH reflects the PTX version targeted by the active compiler pass (or zero during the host pass).
Definition util_arch.cuh:53