OpenFPM_pdata  4.1.0
Project that contain the implementation of distributed structures
agent_spmv_orig.cuh
Go to the documentation of this file.
1 /******************************************************************************
2  * Copyright (c) 2011, Duane Merrill. All rights reserved.
3  * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  * * Redistributions of source code must retain the above copyright
8  * notice, this list of conditions and the following disclaimer.
9  * * Redistributions in binary form must reproduce the above copyright
10  * notice, this list of conditions and the following disclaimer in the
11  * documentation and/or other materials provided with the distribution.
12  * * Neither the name of the NVIDIA CORPORATION nor the
13  * names of its contributors may be used to endorse or promote products
14  * derived from this software without specific prior written permission.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  *
27  ******************************************************************************/
28 
34 #pragma once
35 
36 #include <iterator>
37 
38 #include "../util_type.cuh"
39 #include "../block/block_reduce.cuh"
40 #include "../block/block_scan.cuh"
41 #include "../block/block_exchange.cuh"
42 #include "../thread/thread_search.cuh"
43 #include "../thread/thread_operators.cuh"
44 #include "../iterator/cache_modified_input_iterator.cuh"
45 #include "../iterator/counting_input_iterator.cuh"
46 #include "../iterator/tex_ref_input_iterator.cuh"
47 #include "../util_namespace.cuh"
48 
50 CUB_NS_PREFIX
51 
53 namespace cub {
54 
55 
56 /******************************************************************************
57  * Tuning policy
58  ******************************************************************************/
59 
/**
 * Parameterizable tuning policy type for AgentSpmv
 */
template <
    int                 _BLOCK_THREADS,                         ///< Threads per thread block
    int                 _ITEMS_PER_THREAD,                      ///< Items per thread (per tile of input)
    CacheLoadModifier   _ROW_OFFSETS_SEARCH_LOAD_MODIFIER,      ///< Cache load modifier for reading CSR row-offsets during search
    CacheLoadModifier   _ROW_OFFSETS_LOAD_MODIFIER,             ///< Cache load modifier for reading CSR row-offsets
    CacheLoadModifier   _COLUMN_INDICES_LOAD_MODIFIER,          ///< Cache load modifier for reading CSR column-indices
    CacheLoadModifier   _VALUES_LOAD_MODIFIER,                  ///< Cache load modifier for reading CSR values
    CacheLoadModifier   _VECTOR_VALUES_LOAD_MODIFIER,           ///< Cache load modifier for reading vector values
    bool                _DIRECT_LOAD_NONZEROS,                  ///< Whether to load nonzeros directly from global during sequential merging (vs. pre-staged through shared memory)
    BlockScanAlgorithm  _SCAN_ALGORITHM>                        ///< The BlockScan algorithm to use
struct AgentSpmvPolicy
{
    enum
    {
        BLOCK_THREADS        = _BLOCK_THREADS,          ///< Threads per thread block
        ITEMS_PER_THREAD     = _ITEMS_PER_THREAD,       ///< Items per thread (per tile of input)
        DIRECT_LOAD_NONZEROS = _DIRECT_LOAD_NONZEROS,   ///< Whether to load nonzeros directly from global during sequential merging
    };

    static const CacheLoadModifier  ROW_OFFSETS_SEARCH_LOAD_MODIFIER = _ROW_OFFSETS_SEARCH_LOAD_MODIFIER;   ///< Cache load modifier for reading CSR row-offsets during search
    static const CacheLoadModifier  ROW_OFFSETS_LOAD_MODIFIER        = _ROW_OFFSETS_LOAD_MODIFIER;          ///< Cache load modifier for reading CSR row-offsets
    static const CacheLoadModifier  COLUMN_INDICES_LOAD_MODIFIER     = _COLUMN_INDICES_LOAD_MODIFIER;       ///< Cache load modifier for reading CSR column-indices
    static const CacheLoadModifier  VALUES_LOAD_MODIFIER             = _VALUES_LOAD_MODIFIER;               ///< Cache load modifier for reading CSR values
    static const CacheLoadModifier  VECTOR_VALUES_LOAD_MODIFIER      = _VECTOR_VALUES_LOAD_MODIFIER;        ///< Cache load modifier for reading vector values
    static const BlockScanAlgorithm SCAN_ALGORITHM                   = _SCAN_ALGORITHM;                     ///< The BlockScan algorithm to use

};
90 
91 
92 /******************************************************************************
93  * Thread block abstractions
94  ******************************************************************************/
95 
/**
 * SpMV input parameter bundle (CSR matrix A, dense vectors x and y, and the
 * scalars alpha/beta for y = alpha * A * x + beta * y)
 */
template <
    typename ValueT,    ///< Matrix and vector value type
    typename OffsetT>   ///< Signed integer type for sequence offsets
struct SpmvParams
{
    ValueT*     d_values;               ///< Pointer to the array of \p num_nonzeros values of the corresponding nonzero elements of matrix <b>A</b>.
    OffsetT*    d_row_end_offsets;      ///< Pointer to the array of \p m offsets demarcating the end of every row in \p d_column_indices and \p d_values
    OffsetT*    d_column_indices;       ///< Pointer to the array of \p num_nonzeros column-indices of the corresponding nonzero elements of matrix <b>A</b>.
    ValueT*     d_vector_x;             ///< Pointer to the array of \p num_cols values corresponding to the dense input vector <em>x</em>
    ValueT*     d_vector_y;             ///< Pointer to the array of \p num_rows values corresponding to the dense output vector <em>y</em>
    int         num_rows;               ///< Number of rows of matrix <b>A</b>.
    int         num_cols;               ///< Number of columns of matrix <b>A</b>.
    int         num_nonzeros;           ///< Number of nonzero elements of matrix <b>A</b>.
    ValueT      alpha;                  ///< Alpha multiplicand
    ValueT      beta;                   ///< Beta addend-multiplicand

    TexRefInputIterator<ValueT, 66778899, OffsetT>  t_vector_x;     ///< Texture-reference iterator over the dense input vector <em>x</em> (pre-sm35 fallback path)
};
114 
115 
/**
 * AgentSpmv implements a stateful abstraction of CUDA thread blocks for
 * participating in device-wide SpMV across a merge-path decomposition of the
 * CSR matrix (each tile covers TILE_ITEMS merge-path items: row-endings plus
 * nonzeros).
 */
template <
    typename    AgentSpmvPolicyT,           ///< Parameterized AgentSpmvPolicy tuning policy type
    typename    ValueT,                     ///< Matrix and vector value type
    typename    OffsetT,                    ///< Signed integer type for sequence offsets
    bool        HAS_ALPHA,                  ///< Whether the input parameter \p alpha is non-unity
    bool        HAS_BETA,                   ///< Whether the input parameter \p beta is non-zero
    int         PTX_ARCH = CUB_PTX_ARCH>    ///< PTX compute capability
struct AgentSpmv
{
    //---------------------------------------------------------------------
    // Types and constants
    //---------------------------------------------------------------------

    /// Constants
    enum
    {
        BLOCK_THREADS       = AgentSpmvPolicyT::BLOCK_THREADS,
        ITEMS_PER_THREAD    = AgentSpmvPolicyT::ITEMS_PER_THREAD,
        TILE_ITEMS          = BLOCK_THREADS * ITEMS_PER_THREAD,
    };

    /// 2D merge path coordinate type (x = row index, y = nonzero index)
    typedef typename CubVector<OffsetT, 2>::Type CoordinateT;

    /// Input iterator wrapper types (for applying cache modifiers)

    typedef CacheModifiedInputIterator<
            AgentSpmvPolicyT::ROW_OFFSETS_SEARCH_LOAD_MODIFIER,
            OffsetT,
            OffsetT>
        RowOffsetsSearchIteratorT;

    typedef CacheModifiedInputIterator<
            AgentSpmvPolicyT::ROW_OFFSETS_LOAD_MODIFIER,
            OffsetT,
            OffsetT>
        RowOffsetsIteratorT;

    typedef CacheModifiedInputIterator<
            AgentSpmvPolicyT::COLUMN_INDICES_LOAD_MODIFIER,
            OffsetT,
            OffsetT>
        ColumnIndicesIteratorT;

    typedef CacheModifiedInputIterator<
            AgentSpmvPolicyT::VALUES_LOAD_MODIFIER,
            ValueT,
            OffsetT>
        ValueIteratorT;

    typedef CacheModifiedInputIterator<
            AgentSpmvPolicyT::VECTOR_VALUES_LOAD_MODIFIER,
            ValueT,
            OffsetT>
        VectorValueIteratorT;

    // Tuple type for scanning (pairs accumulated segment-value with segment-index)
    typedef KeyValuePair<OffsetT, ValueT> KeyValuePairT;

    // Reduce-value-by-segment scan operator
    typedef ReduceByKeyOp<cub::Sum> ReduceBySegmentOpT;

    // BlockReduce specialization
    typedef BlockReduce<
            ValueT,
            BLOCK_THREADS,
            BLOCK_REDUCE_WARP_REDUCTIONS>
        BlockReduceT;

    // BlockScan specialization (segmented key-value scan)
    typedef BlockScan<
            KeyValuePairT,
            BLOCK_THREADS,
            AgentSpmvPolicyT::SCAN_ALGORITHM>
        BlockScanT;

    // BlockScan specialization (plain value prefix sum)
    typedef BlockScan<
            ValueT,
            BLOCK_THREADS,
            AgentSpmvPolicyT::SCAN_ALGORITHM>
        BlockPrefixSumT;

    // BlockExchange specialization
    typedef BlockExchange<
            ValueT,
            BLOCK_THREADS,
            ITEMS_PER_THREAD>
        BlockExchangeT;

    /// Merge item type (either a non-zero value or a row-end offset)
    union MergeItem
    {
        // Value type to pair with index type OffsetT (NullType if loading values directly during merge)
        typedef typename If<AgentSpmvPolicyT::DIRECT_LOAD_NONZEROS, NullType, ValueT>::Type MergeValueT;

        OffsetT     row_end_offset;
        MergeValueT nonzero;
    };

    /// Shared memory type required by this thread block
    struct _TempStorage
    {
        CoordinateT tile_coords[2];

        union Aliasable
        {
            // Smem needed for tile of merge items
            MergeItem merge_items[ITEMS_PER_THREAD + TILE_ITEMS + 1];

            // Smem needed for block exchange
            typename BlockExchangeT::TempStorage exchange;

            // Smem needed for block-wide reduction
            typename BlockReduceT::TempStorage reduce;

            // Smem needed for tile scanning
            typename BlockScanT::TempStorage scan;

            // Smem needed for tile prefix sum
            typename BlockPrefixSumT::TempStorage prefix_sum;

        } aliasable;
    };

    /// Temporary storage type (unionable)
    struct TempStorage : Uninitialized<_TempStorage> {};


    //---------------------------------------------------------------------
    // Per-thread fields
    //---------------------------------------------------------------------

    _TempStorage&                   temp_storage;           ///< Reference to temp_storage

    SpmvParams<ValueT, OffsetT>&    spmv_params;            ///< Reference to the SpMV input parameter bundle

    ValueIteratorT                  wd_values;              ///< Wrapped pointer to the array of \p num_nonzeros values of the corresponding nonzero elements of matrix <b>A</b>.
    RowOffsetsIteratorT             wd_row_end_offsets;     ///< Wrapped pointer to the array of \p m offsets demarcating the end of every row in \p d_column_indices and \p d_values
    ColumnIndicesIteratorT          wd_column_indices;      ///< Wrapped pointer to the array of \p num_nonzeros column-indices of the corresponding nonzero elements of matrix <b>A</b>.
    VectorValueIteratorT            wd_vector_x;            ///< Wrapped pointer to the array of \p num_cols values corresponding to the dense input vector <em>x</em>
    VectorValueIteratorT            wd_vector_y;            ///< Wrapped pointer to the array of \p num_rows values corresponding to the dense output vector <em>y</em>


    //---------------------------------------------------------------------
    // Interface
    //---------------------------------------------------------------------

    /**
     * Constructor
     */
    __device__ __forceinline__ AgentSpmv(
        TempStorage&                    temp_storage,       ///< Reference to temp_storage
        SpmvParams<ValueT, OffsetT>&    spmv_params)        ///< SpMV input parameter bundle
    :
        temp_storage(temp_storage.Alias()),
        spmv_params(spmv_params),
        wd_values(spmv_params.d_values),
        wd_row_end_offsets(spmv_params.d_row_end_offsets),
        wd_column_indices(spmv_params.d_column_indices),
        wd_vector_x(spmv_params.d_vector_x),
        wd_vector_y(spmv_params.d_vector_y)
    {}


    /**
     * Consume a merge tile, specialized for direct-load of nonzeros from
     * global memory during the sequential merge.
     */
    __device__ __forceinline__ KeyValuePairT ConsumeTile(
        int             tile_idx,
        CoordinateT     tile_start_coord,
        CoordinateT     tile_end_coord,
        Int2Type<true>  is_direct_load)     ///< Marker type indicating whether to load nonzeros directly during path-discovery or beforehand in batch
    {
        int         tile_num_rows           = tile_end_coord.x - tile_start_coord.x;
        int         tile_num_nonzeros       = tile_end_coord.y - tile_start_coord.y;
        OffsetT*    s_tile_row_end_offsets  = &temp_storage.aliasable.merge_items[0].row_end_offset;

        // Gather the row end-offsets for the merge tile into shared memory
        for (int item = threadIdx.x; item <= tile_num_rows; item += BLOCK_THREADS)
        {
            s_tile_row_end_offsets[item] = wd_row_end_offsets[tile_start_coord.x + item];
        }

        CTA_SYNC();

        // Search for the thread's starting coordinate within the merge tile
        CountingInputIterator<OffsetT>  tile_nonzero_indices(tile_start_coord.y);
        CoordinateT                     thread_start_coord;

        MergePathSearch(
            OffsetT(threadIdx.x * ITEMS_PER_THREAD),    // Diagonal
            s_tile_row_end_offsets,                     // List A
            tile_nonzero_indices,                       // List B
            tile_num_rows,
            tile_num_nonzeros,
            thread_start_coord);

        CTA_SYNC();            // Perf-sync

        // Compute the thread's merge path segment
        CoordinateT     thread_current_coord = thread_start_coord;
        KeyValuePairT   scan_segment[ITEMS_PER_THREAD];

        ValueT          running_total = 0.0;

        #pragma unroll
        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
        {
            // Clamp so threads past the last nonzero read a valid (repeated) element
            OffsetT nonzero_idx         = CUB_MIN(tile_nonzero_indices[thread_current_coord.y], spmv_params.num_nonzeros - 1);
            OffsetT column_idx          = wd_column_indices[nonzero_idx];
            ValueT  value               = wd_values[nonzero_idx];

            ValueT  vector_value        = spmv_params.t_vector_x[column_idx];
#if (CUB_PTX_ARCH >= 350)
            vector_value                = wd_vector_x[column_idx];
#endif
            ValueT  nonzero             = value * vector_value;

            OffsetT row_end_offset      = s_tile_row_end_offsets[thread_current_coord.x];

            if (tile_nonzero_indices[thread_current_coord.y] < row_end_offset)
            {
                // Move down (accumulate)
                running_total += nonzero;
                scan_segment[ITEM].value    = running_total;
                scan_segment[ITEM].key      = tile_num_rows;
                ++thread_current_coord.y;
            }
            else
            {
                // Move right (reset)
                scan_segment[ITEM].value    = running_total;
                scan_segment[ITEM].key      = thread_current_coord.x;
                running_total               = 0.0;
                ++thread_current_coord.x;
            }
        }

        CTA_SYNC();

        // Block-wide reduce-value-by-segment
        KeyValuePairT       tile_carry;
        ReduceBySegmentOpT  scan_op;
        KeyValuePairT       scan_item;

        scan_item.value = running_total;
        scan_item.key   = thread_current_coord.x;

        BlockScanT(temp_storage.aliasable.scan).ExclusiveScan(scan_item, scan_item, scan_op, tile_carry);

        if (tile_num_rows > 0)
        {
            if (threadIdx.x == 0)
                scan_item.key = -1;

            // Direct scatter
            #pragma unroll
            for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
            {
                if (scan_segment[ITEM].key < tile_num_rows)
                {
                    if (scan_item.key == scan_segment[ITEM].key)
                        scan_segment[ITEM].value = scan_item.value + scan_segment[ITEM].value;

                    if (HAS_ALPHA)
                    {
                        scan_segment[ITEM].value *= spmv_params.alpha;
                    }

                    if (HAS_BETA)
                    {
                        // Update the output vector element
                        ValueT addend = spmv_params.beta * wd_vector_y[tile_start_coord.x + scan_segment[ITEM].key];
                        scan_segment[ITEM].value += addend;
                    }

                    // Set the output vector element
                    spmv_params.d_vector_y[tile_start_coord.x + scan_segment[ITEM].key] = scan_segment[ITEM].value;
                }
            }
        }

        // Return the tile's running carry-out
        return tile_carry;
    }


    /**
     * Consume a merge tile, specialized for indirect load of nonzeros
     * (pre-staged through shared memory before the sequential merge).
     */
    __device__ __forceinline__ KeyValuePairT ConsumeTile(
        int             tile_idx,
        CoordinateT     tile_start_coord,
        CoordinateT     tile_end_coord,
        Int2Type<false> is_direct_load)     ///< Marker type indicating whether to load nonzeros directly during path-discovery or beforehand in batch
    {
        int tile_num_rows       = tile_end_coord.x - tile_start_coord.x;
        int tile_num_nonzeros   = tile_end_coord.y - tile_start_coord.y;

#if (CUB_PTX_ARCH >= 520)

        OffsetT*    s_tile_row_end_offsets  = &temp_storage.aliasable.merge_items[0].row_end_offset;
        ValueT*     s_tile_nonzeros         = &temp_storage.aliasable.merge_items[tile_num_rows + ITEMS_PER_THREAD].nonzero;

        // Gather the nonzeros for the merge tile into shared memory
        #pragma unroll
        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
        {
            int nonzero_idx = threadIdx.x + (ITEM * BLOCK_THREADS);

            ValueIteratorT          a   = wd_values + tile_start_coord.y + nonzero_idx;
            ColumnIndicesIteratorT  ci  = wd_column_indices + tile_start_coord.y + nonzero_idx;
            ValueT*                 s   = s_tile_nonzeros + nonzero_idx;

            if (nonzero_idx < tile_num_nonzeros)
            {

                OffsetT column_idx      = *ci;
                ValueT  value           = *a;

                ValueT  vector_value    = spmv_params.t_vector_x[column_idx];
                vector_value            = wd_vector_x[column_idx];

                ValueT  nonzero         = value * vector_value;

                *s = nonzero;
            }
        }


#else

        OffsetT*    s_tile_row_end_offsets  = &temp_storage.aliasable.merge_items[0].row_end_offset;
        ValueT*     s_tile_nonzeros         = &temp_storage.aliasable.merge_items[tile_num_rows + ITEMS_PER_THREAD].nonzero;

        // Gather the nonzeros for the merge tile into shared memory
        if (tile_num_nonzeros > 0)
        {
            #pragma unroll
            for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
            {
                int nonzero_idx = threadIdx.x + (ITEM * BLOCK_THREADS);
                nonzero_idx     = CUB_MIN(nonzero_idx, tile_num_nonzeros - 1);

                OffsetT column_idx      = wd_column_indices[tile_start_coord.y + nonzero_idx];
                ValueT  value           = wd_values[tile_start_coord.y + nonzero_idx];

                ValueT  vector_value    = spmv_params.t_vector_x[column_idx];
#if (CUB_PTX_ARCH >= 350)
                vector_value            = wd_vector_x[column_idx];
#endif
                ValueT  nonzero         = value * vector_value;

                s_tile_nonzeros[nonzero_idx] = nonzero;
            }
        }

#endif

        // Gather the row end-offsets for the merge tile into shared memory
        #pragma unroll 1
        for (int item = threadIdx.x; item <= tile_num_rows; item += BLOCK_THREADS)
        {
            s_tile_row_end_offsets[item] = wd_row_end_offsets[tile_start_coord.x + item];
        }

        CTA_SYNC();

        // Search for the thread's starting coordinate within the merge tile
        CountingInputIterator<OffsetT>  tile_nonzero_indices(tile_start_coord.y);
        CoordinateT                     thread_start_coord;

        MergePathSearch(
            OffsetT(threadIdx.x * ITEMS_PER_THREAD),    // Diagonal
            s_tile_row_end_offsets,                     // List A
            tile_nonzero_indices,                       // List B
            tile_num_rows,
            tile_num_nonzeros,
            thread_start_coord);

        CTA_SYNC();            // Perf-sync

        // Compute the thread's merge path segment
        CoordinateT     thread_current_coord = thread_start_coord;
        KeyValuePairT   scan_segment[ITEMS_PER_THREAD];
        ValueT          running_total = 0.0;

        OffsetT row_end_offset  = s_tile_row_end_offsets[thread_current_coord.x];
        ValueT  nonzero         = s_tile_nonzeros[thread_current_coord.y];

        #pragma unroll
        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
        {
            if (tile_nonzero_indices[thread_current_coord.y] < row_end_offset)
            {
                // Move down (accumulate)
                scan_segment[ITEM].value    = nonzero;
                running_total               += nonzero;
                ++thread_current_coord.y;
                nonzero                     = s_tile_nonzeros[thread_current_coord.y];
            }
            else
            {
                // Move right (reset)
                scan_segment[ITEM].value    = 0.0;
                running_total               = 0.0;
                ++thread_current_coord.x;
                row_end_offset              = s_tile_row_end_offsets[thread_current_coord.x];
            }

            scan_segment[ITEM].key = thread_current_coord.x;
        }

        CTA_SYNC();

        // Block-wide reduce-value-by-segment
        KeyValuePairT       tile_carry;
        ReduceBySegmentOpT  scan_op;
        KeyValuePairT       scan_item;

        scan_item.value = running_total;
        scan_item.key   = thread_current_coord.x;

        BlockScanT(temp_storage.aliasable.scan).ExclusiveScan(scan_item, scan_item, scan_op, tile_carry);

        if (threadIdx.x == 0)
        {
            scan_item.key   = thread_start_coord.x;
            scan_item.value = 0.0;
        }

        if (tile_num_rows > 0)
        {

            CTA_SYNC();

            // Scan downsweep and scatter
            ValueT* s_partials = &temp_storage.aliasable.merge_items[0].nonzero;

            if (scan_item.key != scan_segment[0].key)
            {
                s_partials[scan_item.key] = scan_item.value;
            }
            else
            {
                scan_segment[0].value += scan_item.value;
            }

            #pragma unroll
            for (int ITEM = 1; ITEM < ITEMS_PER_THREAD; ++ITEM)
            {
                if (scan_segment[ITEM - 1].key != scan_segment[ITEM].key)
                {
                    s_partials[scan_segment[ITEM - 1].key] = scan_segment[ITEM - 1].value;
                }
                else
                {
                    scan_segment[ITEM].value += scan_segment[ITEM - 1].value;
                }
            }

            CTA_SYNC();

            #pragma unroll 1
            for (int item = threadIdx.x; item < tile_num_rows; item += BLOCK_THREADS)
            {
                spmv_params.d_vector_y[tile_start_coord.x + item] = s_partials[item];
            }
        }

        // Return the tile's running carry-out
        return tile_carry;
    }


    /**
     * Consume this block's input tile: locate its merge-path coordinates,
     * dispatch to the appropriate tile-consumption specialization, and record
     * the tile's carry-out partial sum for the downstream fixup pass.
     */
    __device__ __forceinline__ void ConsumeTile(
        CoordinateT*    d_tile_coordinates,     ///< [in] Pointer to the temporary array of tile starting coordinates
        KeyValuePairT*  d_tile_carry_pairs,     ///< [out] Pointer to the temporary array carry-out dot product row-ids, one per block
        int             num_merge_tiles)        ///< [in] Number of merge tiles
    {
        int tile_idx = (blockIdx.x * gridDim.y) + blockIdx.y;   // Current tile index

        if (tile_idx >= num_merge_tiles)
            return;

        // Read our starting coordinates
        if (threadIdx.x < 2)
        {
            if (d_tile_coordinates == NULL)
            {
                // Search our starting coordinates
                OffsetT                         diagonal = (tile_idx + threadIdx.x) * TILE_ITEMS;
                CoordinateT                     tile_coord;
                CountingInputIterator<OffsetT>  nonzero_indices(0);

                // Search the merge path
                MergePathSearch(
                    diagonal,
                    RowOffsetsSearchIteratorT(spmv_params.d_row_end_offsets),
                    nonzero_indices,
                    spmv_params.num_rows,
                    spmv_params.num_nonzeros,
                    tile_coord);

                temp_storage.tile_coords[threadIdx.x] = tile_coord;
            }
            else
            {
                temp_storage.tile_coords[threadIdx.x] = d_tile_coordinates[tile_idx + threadIdx.x];
            }
        }

        CTA_SYNC();

        CoordinateT tile_start_coord    = temp_storage.tile_coords[0];
        CoordinateT tile_end_coord      = temp_storage.tile_coords[1];

        // Consume multi-segment tile
        KeyValuePairT tile_carry = ConsumeTile(
            tile_idx,
            tile_start_coord,
            tile_end_coord,
            Int2Type<AgentSpmvPolicyT::DIRECT_LOAD_NONZEROS>());

        // Output the tile's carry-out
        if (threadIdx.x == 0)
        {
            if (HAS_ALPHA)
                tile_carry.value *= spmv_params.alpha;

            tile_carry.key += tile_start_coord.x;
            d_tile_carry_pairs[tile_idx] = tile_carry;
        }
    }


};
664 
665 
666 
667 
668 } // CUB namespace
669 CUB_NS_POSTFIX // Optional outer namespace(s)
670 
ValueT * d_values
Pointer to the array of num_nonzeros values of the corresponding nonzero elements of matrix A.
CubVector< OffsetT, 2 >::Type CoordinateT
2D merge path coordinate type
Key key
Item key.
Definition: util_type.cuh:671
int num_rows
Number of rows of matrix A.
ValueT alpha
Alpha multiplicand.
__device__ __forceinline__ void ConsumeTile(CoordinateT *d_tile_coordinates, KeyValuePairT *d_tile_carry_pairs, int num_merge_tiles)
CacheLoadModifier
Enumeration of cache modifiers for memory load operations.
Definition: thread_load.cuh:62
static const CacheLoadModifier VALUES_LOAD_MODIFIER
Cache load modifier for reading CSR values.
OffsetT CoordinateT KeyValuePair< OffsetT, ValueT > * d_tile_carry_pairs
[out] Pointer to the temporary array carry-out dot product row-ids, one per block
Value value
Item value.
Definition: util_type.cuh:672
A random-access input generator for dereferencing a sequence of incrementing integer values.
Optional outer namespace(s)
static const CacheLoadModifier COLUMN_INDICES_LOAD_MODIFIER
Cache load modifier for reading CSR column-indices.
static const CacheLoadModifier VECTOR_VALUES_LOAD_MODIFIER
Cache load modifier for reading vector values.
\smemstorage{BlockExchange}
The BlockExchange class provides collective methods for rearranging data partitioned across a CUDA th...
The BlockReduce class provides collective methods for computing a parallel reduction of items partiti...
static const BlockScanAlgorithm SCAN_ALGORITHM
The BlockScan algorithm to use.
SpmvParams< ValueT, OffsetT > & spmv_params
Reference to the SpMV input parameter bundle.
__device__ __forceinline__ KeyValuePairT ConsumeTile(int tile_idx, CoordinateT tile_start_coord, CoordinateT tile_end_coord, Int2Type< false > is_direct_load)
#define CUB_PTX_ARCH
CUB_PTX_ARCH reflects the PTX version targeted by the active compiler pass (or zero during the host p...
Definition: util_arch.cuh:53
< The BlockScan algorithm to use
A key identifier paired with a corresponding value.
Definition: util_type.cuh:666
OffsetT * d_column_indices
Pointer to the array of num_nonzeros column-indices of the corresponding nonzero elements of matrix A...
CTA_SYNC()
Definition: util_ptx.cuh:255
AgentSpmv implements a stateful abstraction of CUDA thread blocks for participating in device-wide Sp...
ValueT * d_vector_y
Pointer to the array of num_rows values corresponding to the dense output vector y
ValueT beta
Beta addend-multiplicand.
Shared memory type required by this thread block.
Items per thread (per tile of input)
static const CacheLoadModifier ROW_OFFSETS_LOAD_MODIFIER
Cache load modifier for reading CSR row-offsets.
OffsetT OffsetT
[in] Total number of input data items
OutputIteratorT ScanTileStateT int ScanOpT scan_op
Binary scan functor.
__host__ __device__ __forceinline__ void MergePathSearch(OffsetT diagonal, AIteratorT a, BIteratorT b, OffsetT a_len, OffsetT b_len, CoordinateT &path_coordinate)
OffsetT CoordinateT * d_tile_coordinates
[in] Pointer to the temporary array of tile starting coordinates
Temporary storage type (unionable)
VectorValueIteratorT wd_vector_x
Wrapped Pointer to the array of num_cols values corresponding to the dense input vector x
int num_nonzeros
Number of nonzero elements of matrix A.
ColumnIndicesIteratorT wd_column_indices
Wrapped Pointer to the array of num_nonzeros column-indices of the corresponding nonzero elements of ...
Allows for the treatment of an integral constant as a type at compile-time (e.g., to achieve static c...
Definition: util_type.cuh:275
RowOffsetsIteratorT wd_row_end_offsets
Wrapped Pointer to the array of m offsets demarcating the end of every row in d_column_indices and d_...
A storage-backing wrapper that allows types with non-trivial constructors to be aliased in unions.
Definition: util_type.cuh:634
__device__ __forceinline__ void ExclusiveScan(T input, T &output, T initial_value, ScanOp scan_op)
Computes an exclusive block-wide prefix scan using the specified binary scan_op functor....
Definition: block_scan.cuh:728
__device__ __forceinline__ KeyValuePairT ConsumeTile(int tile_idx, CoordinateT tile_start_coord, CoordinateT tile_end_coord, Int2Type< true > is_direct_load)
int num_cols
Number of columns of matrix A.
Type selection (IF ? ThenType : ElseType)
Definition: util_type.cuh:72
CacheModifiedInputIterator< AgentSpmvPolicyT::ROW_OFFSETS_SEARCH_LOAD_MODIFIER, OffsetT, OffsetT > RowOffsetsSearchIteratorT
Input iterator wrapper types (for applying cache modifiers)
#define CUB_MIN(a, b)
Select minimum(a, b)
Definition: util_macro.cuh:66
The BlockScan class provides collective methods for computing a parallel prefix sum/scan of items par...
Definition: block_scan.cuh:193
VectorValueIteratorT wd_vector_y
Wrapped pointer to the array of num_rows values corresponding to the dense output vector y
Exposes a member typedef Type that names the corresponding CUDA vector type if one exists....
Definition: util_type.cuh:454
__device__ __forceinline__ AgentSpmv(TempStorage &temp_storage, SpmvParams< ValueT, OffsetT > &spmv_params)
ValueIteratorT wd_values
Wrapped pointer to the array of num_nonzeros values of the corresponding nonzero elements of matrix A...
\smemstorage{BlockReduce}
OffsetT * d_row_end_offsets
Pointer to the array of m offsets demarcating the end of every row in d_column_indices and d_values.
Merge item type (either a non-zero value or a row-end offset)
\smemstorage{BlockScan}
Definition: block_scan.cuh:260
A random-access input wrapper for dereferencing array values using a PTX cache load modifier.
< Signed integer type for sequence offsets
BlockScanAlgorithm
BlockScanAlgorithm enumerates alternative algorithms for cub::BlockScan to compute a parallel prefix ...
Definition: block_scan.cuh:57
ValueT * d_vector_x
Pointer to the array of num_cols values corresponding to the dense input vector x
static const CacheLoadModifier ROW_OFFSETS_SEARCH_LOAD_MODIFIER
Cache load modifier for reading CSR row-offsets.
Whether to load nonzeros directly from global during sequential merging (pre-staged through shared me...