38#include "../util_type.cuh"
39#include "../block/block_reduce.cuh"
40#include "../block/block_scan.cuh"
41#include "../block/block_exchange.cuh"
42#include "../thread/thread_search.cuh"
43#include "../thread/thread_operators.cuh"
44#include "../iterator/cache_modified_input_iterator.cuh"
45#include "../iterator/counting_input_iterator.cuh"
46#include "../iterator/tex_ref_input_iterator.cuh"
47#include "../util_namespace.cuh"
65 int _ITEMS_PER_THREAD,
71 bool _DIRECT_LOAD_NONZEROS,
112 TexRefInputIterator<ValueT, 66778899, OffsetT> t_vector_x;
120 typename AgentSpmvPolicyT,
135 BLOCK_THREADS = AgentSpmvPolicyT::BLOCK_THREADS,
136 ITEMS_PER_THREAD = AgentSpmvPolicyT::ITEMS_PER_THREAD,
137 TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD,
146 AgentSpmvPolicyT::ROW_OFFSETS_SEARCH_LOAD_MODIFIER,
152 AgentSpmvPolicyT::ROW_OFFSETS_LOAD_MODIFIER,
158 AgentSpmvPolicyT::COLUMN_INDICES_LOAD_MODIFIER,
164 AgentSpmvPolicyT::VALUES_LOAD_MODIFIER,
170 AgentSpmvPolicyT::VECTOR_VALUES_LOAD_MODIFIER,
192 AgentSpmvPolicyT::SCAN_ALGORITHM>
199 AgentSpmvPolicyT::SCAN_ALGORITHM>
227 MergeItem merge_items[ITEMS_PER_THREAD + TILE_ITEMS + 1];
275 temp_storage(temp_storage.Alias()),
296 int tile_num_rows = tile_end_coord.x - tile_start_coord.x;
297 int tile_num_nonzeros = tile_end_coord.y - tile_start_coord.y;
298 OffsetT* s_tile_row_end_offsets = &temp_storage.aliasable.merge_items[0].row_end_offset;
301 for (
int item = threadIdx.x; item <= tile_num_rows; item += BLOCK_THREADS)
313 OffsetT(threadIdx.x * ITEMS_PER_THREAD),
314 s_tile_row_end_offsets,
315 tile_nonzero_indices,
323 CoordinateT thread_current_coord = thread_start_coord;
326 ValueT running_total = 0.0;
329 for (
int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
335 ValueT vector_value =
spmv_params.t_vector_x[column_idx];
336#if (CUB_PTX_ARCH >= 350)
339 ValueT nonzero = value * vector_value;
341 OffsetT row_end_offset = s_tile_row_end_offsets[thread_current_coord.x];
343 if (tile_nonzero_indices[thread_current_coord.y] < row_end_offset)
346 running_total += nonzero;
347 scan_segment[ITEM].
value = running_total;
348 scan_segment[ITEM].
key = tile_num_rows;
349 ++thread_current_coord.y;
354 scan_segment[ITEM].
value = running_total;
355 scan_segment[ITEM].
key = thread_current_coord.x;
357 ++thread_current_coord.x;
368 scan_item.
value = running_total;
369 scan_item.
key = thread_current_coord.x;
373 if (tile_num_rows > 0)
375 if (threadIdx.x == 0)
380 for (
int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
382 if (scan_segment[ITEM].key < tile_num_rows)
384 if (scan_item.
key == scan_segment[ITEM].
key)
396 scan_segment[ITEM].
value += addend;
400 spmv_params.d_vector_y[tile_start_coord.x + scan_segment[ITEM].
key] = scan_segment[ITEM].
value;
420 int tile_num_rows = tile_end_coord.x - tile_start_coord.x;
421 int tile_num_nonzeros = tile_end_coord.y - tile_start_coord.y;
423#if (CUB_PTX_ARCH >= 520)
425 OffsetT* s_tile_row_end_offsets = &temp_storage.aliasable.merge_items[0].row_end_offset;
426 ValueT* s_tile_nonzeros = &temp_storage.aliasable.merge_items[tile_num_rows + ITEMS_PER_THREAD].nonzero;
430 for (
int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
432 int nonzero_idx = threadIdx.x + (ITEM * BLOCK_THREADS);
436 ValueT* s = s_tile_nonzeros + nonzero_idx;
438 if (nonzero_idx < tile_num_nonzeros)
444 ValueT vector_value =
spmv_params.t_vector_x[column_idx];
447 ValueT nonzero = value * vector_value;
456 OffsetT* s_tile_row_end_offsets = &temp_storage.aliasable.merge_items[0].row_end_offset;
457 ValueT* s_tile_nonzeros = &temp_storage.aliasable.merge_items[tile_num_rows + ITEMS_PER_THREAD].nonzero;
460 if (tile_num_nonzeros > 0)
463 for (
int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
465 int nonzero_idx = threadIdx.x + (ITEM * BLOCK_THREADS);
466 nonzero_idx =
CUB_MIN(nonzero_idx, tile_num_nonzeros - 1);
469 ValueT value =
wd_values[tile_start_coord.y + nonzero_idx];
471 ValueT vector_value =
spmv_params.t_vector_x[column_idx];
472#if (CUB_PTX_ARCH >= 350)
475 ValueT nonzero = value * vector_value;
477 s_tile_nonzeros[nonzero_idx] = nonzero;
485 for (
int item = threadIdx.x; item <= tile_num_rows; item += BLOCK_THREADS)
497 OffsetT(threadIdx.x * ITEMS_PER_THREAD),
498 s_tile_row_end_offsets,
499 tile_nonzero_indices,
507 CoordinateT thread_current_coord = thread_start_coord;
509 ValueT running_total = 0.0;
511 OffsetT row_end_offset = s_tile_row_end_offsets[thread_current_coord.x];
512 ValueT nonzero = s_tile_nonzeros[thread_current_coord.y];
515 for (
int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
517 if (tile_nonzero_indices[thread_current_coord.y] < row_end_offset)
520 scan_segment[ITEM].
value = nonzero;
521 running_total += nonzero;
522 ++thread_current_coord.y;
523 nonzero = s_tile_nonzeros[thread_current_coord.y];
528 scan_segment[ITEM].
value = 0.0;
530 ++thread_current_coord.x;
531 row_end_offset = s_tile_row_end_offsets[thread_current_coord.x];
534 scan_segment[ITEM].
key = thread_current_coord.x;
544 scan_item.
value = running_total;
545 scan_item.
key = thread_current_coord.x;
549 if (threadIdx.x == 0)
551 scan_item.
key = thread_start_coord.x;
552 scan_item.
value = 0.0;
555 if (tile_num_rows > 0)
561 ValueT* s_partials = &temp_storage.aliasable.merge_items[0].nonzero;
563 if (scan_item.
key != scan_segment[0].
key)
565 s_partials[scan_item.
key] = scan_item.
value;
573 for (
int ITEM = 1; ITEM < ITEMS_PER_THREAD; ++ITEM)
575 if (scan_segment[ITEM - 1].key != scan_segment[ITEM].key)
577 s_partials[scan_segment[ITEM - 1].
key] = scan_segment[ITEM - 1].value;
581 scan_segment[ITEM].
value += scan_segment[ITEM - 1].
value;
588 for (
int item = threadIdx.x; item < tile_num_rows; item += BLOCK_THREADS)
590 spmv_params.d_vector_y[tile_start_coord.x + item] = s_partials[item];
607 int tile_idx = (blockIdx.x * gridDim.y) + blockIdx.y;
609 if (tile_idx >= num_merge_tiles)
618 OffsetT diagonal = (tile_idx + threadIdx.x) * TILE_ITEMS;
631 temp_storage.tile_coords[threadIdx.x] = tile_coord;
641 CoordinateT tile_start_coord = temp_storage.tile_coords[0];
642 CoordinateT tile_end_coord = temp_storage.tile_coords[1];
652 if (threadIdx.x == 0)
657 tile_carry.
key += tile_start_coord.x;
The BlockExchange class provides collective methods for rearranging data partitioned across a CUDA thread block.
BlockRadixRank provides operations for ranking unsigned integer types within a CUDA thread block.
The BlockReduce class provides collective methods for computing a parallel reduction of items partitioned across a CUDA thread block.
The BlockScan class provides collective methods for computing a parallel prefix sum/scan of items partitioned across a CUDA thread block.
__device__ __forceinline__ void ExclusiveScan(T input, T &output, T initial_value, ScanOp scan_op)
Computes an exclusive block-wide prefix scan using the specified binary scan_op functor....
CacheLoadModifier
Enumeration of cache modifiers for memory load operations.
#define CUB_MIN(a, b)
Select minimum(a, b)
Optional outer namespace(s)
OffsetT CoordinateT * d_tile_coordinates
[in] Pointer to the temporary array of tile starting coordinates
OffsetT CoordinateT KeyValuePair< OffsetT, ValueT > * d_tile_carry_pairs
[out] Pointer to the temporary array carry-out dot product row-ids, one per block
BlockScanAlgorithm
BlockScanAlgorithm enumerates alternative algorithms for cub::BlockScan to compute a parallel prefix scan across a CUDA thread block.
OffsetT OffsetT
[in] Total number of input data items
__host__ __device__ __forceinline__ void MergePathSearch(OffsetT diagonal, AIteratorT a, BIteratorT b, OffsetT a_len, OffsetT b_len, CoordinateT &path_coordinate)
@ BLOCK_REDUCE_WARP_REDUCTIONS
OutputIteratorT ScanTileStateT int ScanOpT scan_op
Binary scan functor.
< The BlockScan algorithm to use
static const CacheLoadModifier ROW_OFFSETS_LOAD_MODIFIER
Cache load modifier for reading CSR row-offsets.
static const CacheLoadModifier VECTOR_VALUES_LOAD_MODIFIER
Cache load modifier for reading vector values.
static const CacheLoadModifier COLUMN_INDICES_LOAD_MODIFIER
Cache load modifier for reading CSR column-indices.
static const CacheLoadModifier ROW_OFFSETS_SEARCH_LOAD_MODIFIER
Cache load modifier for reading CSR row-offsets during the merge-path search phase.
@ ITEMS_PER_THREAD
Items per thread (per tile of input)
@ DIRECT_LOAD_NONZEROS
Whether to load nonzeros directly from global during sequential merging (vs. pre-staged through shared memory).
@ BLOCK_THREADS
Threads per thread block.
static const BlockScanAlgorithm SCAN_ALGORITHM
The BlockScan algorithm to use.
static const CacheLoadModifier VALUES_LOAD_MODIFIER
Cache load modifier for reading CSR values.
Temporary storage type (unionable)
Shared memory type required by this thread block.
AgentSpmv implements a stateful abstraction of CUDA thread blocks for participating in device-wide SpMV (sparse-matrix * dense-vector multiplication).
ValueIteratorT wd_values
Wrapped pointer to the array of num_nonzeros values of the corresponding nonzero elements of matrix A...
__device__ __forceinline__ AgentSpmv(TempStorage &temp_storage, SpmvParams< ValueT, OffsetT > &spmv_params)
ColumnIndicesIteratorT wd_column_indices
Wrapped Pointer to the array of num_nonzeros column-indices of the corresponding nonzero elements of ...
SpmvParams< ValueT, OffsetT > & spmv_params
Reference to the SpMV input parameter bundle.
CacheModifiedInputIterator< AgentSpmvPolicyT::ROW_OFFSETS_SEARCH_LOAD_MODIFIER, OffsetT, OffsetT > RowOffsetsSearchIteratorT
Input iterator wrapper types (for applying cache modifiers)
__device__ __forceinline__ KeyValuePairT ConsumeTile(int tile_idx, CoordinateT tile_start_coord, CoordinateT tile_end_coord, Int2Type< true > is_direct_load)
VectorValueIteratorT wd_vector_x
Wrapped Pointer to the array of num_cols values corresponding to the dense input vector x
VectorValueIteratorT wd_vector_y
Wrapped pointer to the array of num_rows values corresponding to the dense output vector y
RowOffsetsIteratorT wd_row_end_offsets
Wrapped Pointer to the array of m offsets demarcating the end of every row in d_column_indices and d_...
CubVector< OffsetT, 2 >::Type CoordinateT
2D merge path coordinate type
__device__ __forceinline__ KeyValuePairT ConsumeTile(int tile_idx, CoordinateT tile_start_coord, CoordinateT tile_end_coord, Int2Type< false > is_direct_load)
__device__ __forceinline__ void ConsumeTile(CoordinateT *d_tile_coordinates, KeyValuePairT *d_tile_carry_pairs, int num_merge_tiles)
\smemstorage{BlockExchange}
\smemstorage{BlockReduce}
Exposes a member typedef Type that names the corresponding CUDA vector type if one exists....
Allows for the treatment of an integral constant as a type at compile-time (e.g., to achieve static c...
A key identifier paired with a corresponding value.
< Binary reduction operator to apply to values
< Signed integer type for sequence offsets
ValueT * d_vector_y
Pointer to the array of num_rows values corresponding to the dense output vector y
OffsetT * d_row_end_offsets
Pointer to the array of m offsets demarcating the end of every row in d_column_indices and d_values.
int num_nonzeros
Number of nonzero elements of matrix A.
int num_cols
Number of columns of matrix A.
ValueT * d_vector_x
Pointer to the array of num_cols values corresponding to the dense input vector x
ValueT beta
Beta addend-multiplicand.
int num_rows
Number of rows of matrix A.
OffsetT * d_column_indices
Pointer to the array of num_nonzeros column-indices of the corresponding nonzero elements of matrix A...
ValueT alpha
Alpha multiplicand.
ValueT * d_values
Pointer to the array of num_nonzeros values of the corresponding nonzero elements of matrix A.
A storage-backing wrapper that allows types with non-trivial constructors to be aliased in unions.
Merge item type (either a non-zero value or a row-end offset)
#define CUB_PTX_ARCH
CUB_PTX_ARCH reflects the PTX version targeted by the active compiler pass (or zero during the host p...