38#include "../util_type.cuh"
39#include "../block/block_reduce.cuh"
40#include "../block/block_scan.cuh"
41#include "../block/block_exchange.cuh"
42#include "../thread/thread_search.cuh"
43#include "../thread/thread_operators.cuh"
44#include "../iterator/cache_modified_input_iterator.cuh"
45#include "../iterator/counting_input_iterator.cuh"
46#include "../iterator/tex_ref_input_iterator.cuh"
47#include "../util_namespace.cuh"
65 int _ITEMS_PER_THREAD,
71 bool _DIRECT_LOAD_NONZEROS,
112 TexRefInputIterator<ValueT, 66778899, OffsetT> t_vector_x;
120 typename AgentSpmvPolicyT,
135 BLOCK_THREADS = AgentSpmvPolicyT::BLOCK_THREADS,
136 ITEMS_PER_THREAD = AgentSpmvPolicyT::ITEMS_PER_THREAD,
137 TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD,
146 AgentSpmvPolicyT::ROW_OFFSETS_SEARCH_LOAD_MODIFIER,
152 AgentSpmvPolicyT::ROW_OFFSETS_LOAD_MODIFIER,
158 AgentSpmvPolicyT::COLUMN_INDICES_LOAD_MODIFIER,
164 AgentSpmvPolicyT::VALUES_LOAD_MODIFIER,
170 AgentSpmvPolicyT::VECTOR_VALUES_LOAD_MODIFIER,
192 AgentSpmvPolicyT::SCAN_ALGORITHM>
199 AgentSpmvPolicyT::SCAN_ALGORITHM>
227 MergeItem merge_items[ITEMS_PER_THREAD + TILE_ITEMS + 1];
275 temp_storage(temp_storage.Alias()),
296 int tile_num_rows = tile_end_coord.x - tile_start_coord.x;
297 int tile_num_nonzeros = tile_end_coord.y - tile_start_coord.y;
298 OffsetT* s_tile_row_end_offsets = &temp_storage.aliasable.merge_items[0].row_end_offset;
301 for (
int item = threadIdx.x; item <= tile_num_rows; item += BLOCK_THREADS)
313 OffsetT(threadIdx.x * ITEMS_PER_THREAD),
314 s_tile_row_end_offsets,
315 tile_nonzero_indices,
323 CoordinateT thread_current_coord = thread_start_coord;
326 ValueT running_total = 0.0;
329 for (
int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
335 ValueT vector_value =
spmv_params.t_vector_x[column_idx];
336#if (CUB_PTX_ARCH >= 350)
339 ValueT nonzero = value * vector_value;
341 OffsetT row_end_offset = s_tile_row_end_offsets[thread_current_coord.x];
343 if (tile_nonzero_indices[thread_current_coord.y] < row_end_offset)
346 running_total += nonzero;
347 scan_segment[ITEM].
value = running_total;
348 scan_segment[ITEM].
key = tile_num_rows;
349 ++thread_current_coord.y;
354 scan_segment[ITEM].
value = running_total;
355 scan_segment[ITEM].
key = thread_current_coord.x;
357 ++thread_current_coord.x;
368 scan_item.
value = running_total;
369 scan_item.
key = thread_current_coord.x;
373 if (tile_num_rows > 0)
375 if (threadIdx.x == 0)
380 for (
int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
382 if (scan_segment[ITEM].key < tile_num_rows)
384 if (scan_item.
key == scan_segment[ITEM].
key)
396 scan_segment[ITEM].
value += addend;
400 spmv_params.d_vector_y[tile_start_coord.x + scan_segment[ITEM].
key] = scan_segment[ITEM].
value;
420 int tile_num_rows = tile_end_coord.x - tile_start_coord.x;
421 int tile_num_nonzeros = tile_end_coord.y - tile_start_coord.y;
423#if (CUB_PTX_ARCH >= 520)
425 OffsetT* s_tile_row_end_offsets = &temp_storage.aliasable.merge_items[0].row_end_offset;
426 ValueT* s_tile_nonzeros = &temp_storage.aliasable.merge_items[tile_num_rows + ITEMS_PER_THREAD].nonzero;
430 for (
int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
432 int nonzero_idx = threadIdx.x + (ITEM * BLOCK_THREADS);
436 ValueT* s = s_tile_nonzeros + nonzero_idx;
438 if (nonzero_idx < tile_num_nonzeros)
444 ValueT vector_value =
spmv_params.t_vector_x[column_idx];
447 ValueT nonzero = value * vector_value;
456 OffsetT* s_tile_row_end_offsets = &temp_storage.aliasable.merge_items[0].row_end_offset;
457 ValueT* s_tile_nonzeros = &temp_storage.aliasable.merge_items[tile_num_rows + ITEMS_PER_THREAD].nonzero;
460 if (tile_num_nonzeros > 0)
463 for (
int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
465 int nonzero_idx = threadIdx.x + (ITEM * BLOCK_THREADS);
466 nonzero_idx =
CUB_MIN(nonzero_idx, tile_num_nonzeros - 1);
469 ValueT value =
wd_values[tile_start_coord.y + nonzero_idx];
471 ValueT vector_value =
spmv_params.t_vector_x[column_idx];
472#if (CUB_PTX_ARCH >= 350)
475 ValueT nonzero = value * vector_value;
477 s_tile_nonzeros[nonzero_idx] = nonzero;
485 for (
int item = threadIdx.x; item <= tile_num_rows; item += BLOCK_THREADS)
497 OffsetT(threadIdx.x * ITEMS_PER_THREAD),
498 s_tile_row_end_offsets,
499 tile_nonzero_indices,
507 CoordinateT thread_current_coord = thread_start_coord;
509 ValueT running_total = 0.0;
511 OffsetT row_end_offset = s_tile_row_end_offsets[thread_current_coord.x];
512 ValueT nonzero = s_tile_nonzeros[thread_current_coord.y];
515 for (
int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
517 if (tile_nonzero_indices[thread_current_coord.y] < row_end_offset)
520 scan_segment[ITEM].
value = nonzero;
521 running_total += nonzero;
522 ++thread_current_coord.y;
523 nonzero = s_tile_nonzeros[thread_current_coord.y];
528 scan_segment[ITEM].
value = 0.0;
530 ++thread_current_coord.x;
531 row_end_offset = s_tile_row_end_offsets[thread_current_coord.x];
534 scan_segment[ITEM].
key = thread_current_coord.x;
544 scan_item.
value = running_total;
545 scan_item.
key = thread_current_coord.x;
549 if (threadIdx.x == 0)
551 scan_item.
key = thread_start_coord.x;
552 scan_item.
value = 0.0;
555 if (tile_num_rows > 0)
561 ValueT* s_partials = &temp_storage.aliasable.merge_items[0].nonzero;
563 if (scan_item.
key != scan_segment[0].
key)
565 s_partials[scan_item.
key] = scan_item.
value;
573 for (
int ITEM = 1; ITEM < ITEMS_PER_THREAD; ++ITEM)
575 if (scan_segment[ITEM - 1].key != scan_segment[ITEM].key)
577 s_partials[scan_segment[ITEM - 1].
key] = scan_segment[ITEM - 1].value;
581 scan_segment[ITEM].
value += scan_segment[ITEM - 1].
value;
588 for (
int item = threadIdx.x; item < tile_num_rows; item += BLOCK_THREADS)
590 spmv_params.d_vector_y[tile_start_coord.x + item] = s_partials[item];
607 int tile_idx = (blockIdx.x * gridDim.y) + blockIdx.y;
609 if (tile_idx >= num_merge_tiles)
618 OffsetT diagonal = (tile_idx + threadIdx.x) * TILE_ITEMS;
631 temp_storage.tile_coords[threadIdx.x] = tile_coord;
641 CoordinateT tile_start_coord = temp_storage.tile_coords[0];
642 CoordinateT tile_end_coord = temp_storage.tile_coords[1];
652 if (threadIdx.x == 0)
657 tile_carry.
key += tile_start_coord.x;
The BlockExchange class provides collective methods for rearranging data partitioned across a CUDA thread block.
BlockRadixRank provides operations for ranking unsigned integer types within a CUDA thread block.
The BlockReduce class provides collective methods for computing a parallel reduction of items partitioned across a CUDA thread block.
The BlockScan class provides collective methods for computing a parallel prefix sum/scan of items partitioned across a CUDA thread block.
__device__ __forceinline__ void ExclusiveScan(T input, T &output, T initial_value, ScanOp scan_op)
Computes an exclusive block-wide prefix scan using the specified binary scan_op functor....
CacheLoadModifier
Enumeration of cache modifiers for memory load operations.
#define CUB_MIN(a, b)
Select minimum(a, b)
Optional outer namespace(s)
OffsetT CoordinateT * d_tile_coordinates
[in] Pointer to the temporary array of tile starting coordinates
OffsetT CoordinateT KeyValuePair< OffsetT, ValueT > * d_tile_carry_pairs
[out] Pointer to the temporary array carry-out dot product row-ids, one per block
BlockScanAlgorithm
BlockScanAlgorithm enumerates alternative algorithms for cub::BlockScan to compute a parallel prefix scan across a CUDA thread block.
OffsetT OffsetT
[in] Total number of input data items
__host__ __device__ __forceinline__ void MergePathSearch(OffsetT diagonal, AIteratorT a, BIteratorT b, OffsetT a_len, OffsetT b_len, CoordinateT &path_coordinate)
@ BLOCK_REDUCE_WARP_REDUCTIONS
OutputIteratorT ScanTileStateT int ScanOpT scan_op
Binary scan functor.
< The BlockScan algorithm to use
static const CacheLoadModifier ROW_OFFSETS_LOAD_MODIFIER
Cache load modifier for reading CSR row-offsets.
static const CacheLoadModifier VECTOR_VALUES_LOAD_MODIFIER
Cache load modifier for reading vector values.
static const CacheLoadModifier COLUMN_INDICES_LOAD_MODIFIER
Cache load modifier for reading CSR column-indices.
static const CacheLoadModifier ROW_OFFSETS_SEARCH_LOAD_MODIFIER
Cache load modifier for reading CSR row-offsets during the merge-path search phase.
@ ITEMS_PER_THREAD
Items per thread (per tile of input)
@ DIRECT_LOAD_NONZEROS
Whether to load nonzeros directly from global during sequential merging (vs. pre-staged through shared memory).
@ BLOCK_THREADS
Threads per thread block.
static const BlockScanAlgorithm SCAN_ALGORITHM
The BlockScan algorithm to use.
static const CacheLoadModifier VALUES_LOAD_MODIFIER
Cache load modifier for reading CSR values.
Temporary storage type (unionable)
Shared memory type required by this thread block.
AgentSpmv implements a stateful abstraction of CUDA thread blocks for participating in device-wide SpMV (sparse-matrix * dense-vector multiplication).
ValueIteratorT wd_values
Wrapped pointer to the array of num_nonzeros values of the corresponding nonzero elements of matrix A...
__device__ __forceinline__ AgentSpmv(TempStorage &temp_storage, SpmvParams< ValueT, OffsetT > &spmv_params)
ColumnIndicesIteratorT wd_column_indices
Wrapped Pointer to the array of num_nonzeros column-indices of the corresponding nonzero elements of ...
SpmvParams< ValueT, OffsetT > & spmv_params
Reference to the SpMV input parameter bundle.
CacheModifiedInputIterator< AgentSpmvPolicyT::ROW_OFFSETS_SEARCH_LOAD_MODIFIER, OffsetT, OffsetT > RowOffsetsSearchIteratorT
Input iterator wrapper types (for applying cache modifiers)
__device__ __forceinline__ KeyValuePairT ConsumeTile(int tile_idx, CoordinateT tile_start_coord, CoordinateT tile_end_coord, Int2Type< true > is_direct_load)
VectorValueIteratorT wd_vector_x
Wrapped Pointer to the array of num_cols values corresponding to the dense input vector x
VectorValueIteratorT wd_vector_y
Wrapped pointer to the array of num_rows values corresponding to the dense output vector y
RowOffsetsIteratorT wd_row_end_offsets
Wrapped Pointer to the array of m offsets demarcating the end of every row in d_column_indices and d_...
CubVector< OffsetT, 2 >::Type CoordinateT
2D merge path coordinate type
__device__ __forceinline__ KeyValuePairT ConsumeTile(int tile_idx, CoordinateT tile_start_coord, CoordinateT tile_end_coord, Int2Type< false > is_direct_load)
__device__ __forceinline__ void ConsumeTile(CoordinateT *d_tile_coordinates, KeyValuePairT *d_tile_carry_pairs, int num_merge_tiles)
\smemstorage{BlockExchange}
\smemstorage{BlockReduce}
Exposes a member typedef Type that names the corresponding CUDA vector type if one exists....
Allows for the treatment of an integral constant as a type at compile-time (e.g., to achieve static c...
A key identifier paired with a corresponding value.
< Binary reduction operator to apply to values
< Signed integer type for sequence offsets
ValueT * d_vector_y
Pointer to the array of num_rows values corresponding to the dense output vector y
OffsetT * d_row_end_offsets
Pointer to the array of m offsets demarcating the end of every row in d_column_indices and d_values.
int num_nonzeros
Number of nonzero elements of matrix A.
int num_cols
Number of columns of matrix A.
ValueT * d_vector_x
Pointer to the array of num_cols values corresponding to the dense input vector x
ValueT beta
Beta addend-multiplicand.
int num_rows
Number of rows of matrix A.
OffsetT * d_column_indices
Pointer to the array of num_nonzeros column-indices of the corresponding nonzero elements of matrix A...
ValueT alpha
Alpha multiplicand.
ValueT * d_values
Pointer to the array of num_nonzeros values of the corresponding nonzero elements of matrix A.
A storage-backing wrapper that allows types with non-trivial constructors to be aliased in unions.
Merge item type (either a non-zero value or a row-end offset)
#define CUB_PTX_ARCH
CUB_PTX_ARCH reflects the PTX version targeted by the active compiler pass (or zero during the host p...