OpenFPM_pdata  4.1.0
Project that contains the implementation of distributed structures
 
Loading...
Searching...
No Matches
agent_spmv_orig.cuh
Go to the documentation of this file.
1/******************************************************************************
2 * Copyright (c) 2011, Duane Merrill. All rights reserved.
3 * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 * * Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * * Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
12 * * Neither the name of the NVIDIA CORPORATION nor the
13 * names of its contributors may be used to endorse or promote products
14 * derived from this software without specific prior written permission.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 *
27 ******************************************************************************/
28
34#pragma once
35
36#include <iterator>
37
38#include "../util_type.cuh"
39#include "../block/block_reduce.cuh"
40#include "../block/block_scan.cuh"
41#include "../block/block_exchange.cuh"
42#include "../thread/thread_search.cuh"
43#include "../thread/thread_operators.cuh"
44#include "../iterator/cache_modified_input_iterator.cuh"
45#include "../iterator/counting_input_iterator.cuh"
46#include "../iterator/tex_ref_input_iterator.cuh"
47#include "../util_namespace.cuh"
48
50CUB_NS_PREFIX
51
53namespace cub {
54
55
56/******************************************************************************
57 * Tuning policy
58 ******************************************************************************/
59
63template <
64 int _BLOCK_THREADS,
65 int _ITEMS_PER_THREAD,
66 CacheLoadModifier _ROW_OFFSETS_SEARCH_LOAD_MODIFIER,
67 CacheLoadModifier _ROW_OFFSETS_LOAD_MODIFIER,
68 CacheLoadModifier _COLUMN_INDICES_LOAD_MODIFIER,
69 CacheLoadModifier _VALUES_LOAD_MODIFIER,
70 CacheLoadModifier _VECTOR_VALUES_LOAD_MODIFIER,
71 bool _DIRECT_LOAD_NONZEROS,
72 BlockScanAlgorithm _SCAN_ALGORITHM>
74{
75 enum
76 {
77 BLOCK_THREADS = _BLOCK_THREADS,
78 ITEMS_PER_THREAD = _ITEMS_PER_THREAD,
79 DIRECT_LOAD_NONZEROS = _DIRECT_LOAD_NONZEROS,
80 };
81
82 static const CacheLoadModifier ROW_OFFSETS_SEARCH_LOAD_MODIFIER = _ROW_OFFSETS_SEARCH_LOAD_MODIFIER;
83 static const CacheLoadModifier ROW_OFFSETS_LOAD_MODIFIER = _ROW_OFFSETS_LOAD_MODIFIER;
84 static const CacheLoadModifier COLUMN_INDICES_LOAD_MODIFIER = _COLUMN_INDICES_LOAD_MODIFIER;
85 static const CacheLoadModifier VALUES_LOAD_MODIFIER = _VALUES_LOAD_MODIFIER;
86 static const CacheLoadModifier VECTOR_VALUES_LOAD_MODIFIER = _VECTOR_VALUES_LOAD_MODIFIER;
87 static const BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM;
88
89};
90
91
92/******************************************************************************
93 * Thread block abstractions
94 ******************************************************************************/
95
96template <
97 typename ValueT,
98 typename OffsetT>
100{
101 ValueT* d_values;
104 ValueT* d_vector_x;
105 ValueT* d_vector_y;
109 ValueT alpha;
110 ValueT beta;
111
112 TexRefInputIterator<ValueT, 66778899, OffsetT> t_vector_x;
113};
114
115
119template <
120 typename AgentSpmvPolicyT,
121 typename ValueT,
122 typename OffsetT,
123 bool HAS_ALPHA,
124 bool HAS_BETA,
125 int PTX_ARCH = CUB_PTX_ARCH>
127{
128 //---------------------------------------------------------------------
129 // Types and constants
130 //---------------------------------------------------------------------
131
133 enum
134 {
135 BLOCK_THREADS = AgentSpmvPolicyT::BLOCK_THREADS,
136 ITEMS_PER_THREAD = AgentSpmvPolicyT::ITEMS_PER_THREAD,
137 TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD,
138 };
139
142
144
146 AgentSpmvPolicyT::ROW_OFFSETS_SEARCH_LOAD_MODIFIER,
147 OffsetT,
148 OffsetT>
150
152 AgentSpmvPolicyT::ROW_OFFSETS_LOAD_MODIFIER,
153 OffsetT,
154 OffsetT>
156
158 AgentSpmvPolicyT::COLUMN_INDICES_LOAD_MODIFIER,
159 OffsetT,
160 OffsetT>
162
164 AgentSpmvPolicyT::VALUES_LOAD_MODIFIER,
165 ValueT,
166 OffsetT>
168
170 AgentSpmvPolicyT::VECTOR_VALUES_LOAD_MODIFIER,
171 ValueT,
172 OffsetT>
174
175 // Tuple type for scanning (pairs accumulated segment-value with segment-index)
177
178 // Reduce-value-by-segment scan operator
180
181 // BlockReduce specialization
182 typedef BlockReduce<
183 ValueT,
184 BLOCK_THREADS,
187
188 // BlockScan specialization
189 typedef BlockScan<
191 BLOCK_THREADS,
192 AgentSpmvPolicyT::SCAN_ALGORITHM>
194
195 // BlockScan specialization
196 typedef BlockScan<
197 ValueT,
198 BLOCK_THREADS,
199 AgentSpmvPolicyT::SCAN_ALGORITHM>
201
202 // BlockExchange specialization
203 typedef BlockExchange<
204 ValueT,
205 BLOCK_THREADS,
206 ITEMS_PER_THREAD>
208
211 {
212 // Value type to pair with index type OffsetT (NullType if loading values directly during merge)
214
215 OffsetT row_end_offset;
216 MergeValueT nonzero;
217 };
218
221 {
222 CoordinateT tile_coords[2];
223
225 {
226 // Smem needed for tile of merge items
227 MergeItem merge_items[ITEMS_PER_THREAD + TILE_ITEMS + 1];
228
229 // Smem needed for block exchange
230 typename BlockExchangeT::TempStorage exchange;
231
232 // Smem needed for block-wide reduction
233 typename BlockReduceT::TempStorage reduce;
234
235 // Smem needed for tile scanning
236 typename BlockScanT::TempStorage scan;
237
238 // Smem needed for tile prefix sum
239 typename BlockPrefixSumT::TempStorage prefix_sum;
240
241 } aliasable;
242 };
243
245 struct TempStorage : Uninitialized<_TempStorage> {};
246
247
248 //---------------------------------------------------------------------
249 // Per-thread fields
250 //---------------------------------------------------------------------
251
252
253 _TempStorage& temp_storage;
254
256
262
263
264 //---------------------------------------------------------------------
265 // Interface
266 //---------------------------------------------------------------------
267
271 __device__ __forceinline__ AgentSpmv(
272 TempStorage& temp_storage,
274 :
275 temp_storage(temp_storage.Alias()),
277 wd_values(spmv_params.d_values),
278 wd_row_end_offsets(spmv_params.d_row_end_offsets),
279 wd_column_indices(spmv_params.d_column_indices),
280 wd_vector_x(spmv_params.d_vector_x),
281 wd_vector_y(spmv_params.d_vector_y)
282 {}
283
284
285
286
290 __device__ __forceinline__ KeyValuePairT ConsumeTile(
291 int tile_idx,
292 CoordinateT tile_start_coord,
293 CoordinateT tile_end_coord,
294 Int2Type<true> is_direct_load)
295 {
296 int tile_num_rows = tile_end_coord.x - tile_start_coord.x;
297 int tile_num_nonzeros = tile_end_coord.y - tile_start_coord.y;
298 OffsetT* s_tile_row_end_offsets = &temp_storage.aliasable.merge_items[0].row_end_offset;
299
300 // Gather the row end-offsets for the merge tile into shared memory
301 for (int item = threadIdx.x; item <= tile_num_rows; item += BLOCK_THREADS)
302 {
303 s_tile_row_end_offsets[item] = wd_row_end_offsets[tile_start_coord.x + item];
304 }
305
306 CTA_SYNC();
307
308 // Search for the thread's starting coordinate within the merge tile
309 CountingInputIterator<OffsetT> tile_nonzero_indices(tile_start_coord.y);
310 CoordinateT thread_start_coord;
311
313 OffsetT(threadIdx.x * ITEMS_PER_THREAD), // Diagonal
314 s_tile_row_end_offsets, // List A
315 tile_nonzero_indices, // List B
316 tile_num_rows,
317 tile_num_nonzeros,
318 thread_start_coord);
319
320 CTA_SYNC(); // Perf-sync
321
322 // Compute the thread's merge path segment
323 CoordinateT thread_current_coord = thread_start_coord;
324 KeyValuePairT scan_segment[ITEMS_PER_THREAD];
325
326 ValueT running_total = 0.0;
327
328 #pragma unroll
329 for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
330 {
331 OffsetT nonzero_idx = CUB_MIN(tile_nonzero_indices[thread_current_coord.y], spmv_params.num_nonzeros - 1);
332 OffsetT column_idx = wd_column_indices[nonzero_idx];
333 ValueT value = wd_values[nonzero_idx];
334
335 ValueT vector_value = spmv_params.t_vector_x[column_idx];
336#if (CUB_PTX_ARCH >= 350)
337 vector_value = wd_vector_x[column_idx];
338#endif
339 ValueT nonzero = value * vector_value;
340
341 OffsetT row_end_offset = s_tile_row_end_offsets[thread_current_coord.x];
342
343 if (tile_nonzero_indices[thread_current_coord.y] < row_end_offset)
344 {
345 // Move down (accumulate)
346 running_total += nonzero;
347 scan_segment[ITEM].value = running_total;
348 scan_segment[ITEM].key = tile_num_rows;
349 ++thread_current_coord.y;
350 }
351 else
352 {
353 // Move right (reset)
354 scan_segment[ITEM].value = running_total;
355 scan_segment[ITEM].key = thread_current_coord.x;
356 running_total = 0.0;
357 ++thread_current_coord.x;
358 }
359 }
360
361 CTA_SYNC();
362
363 // Block-wide reduce-value-by-segment
364 KeyValuePairT tile_carry;
366 KeyValuePairT scan_item;
367
368 scan_item.value = running_total;
369 scan_item.key = thread_current_coord.x;
370
371 BlockScanT(temp_storage.aliasable.scan).ExclusiveScan(scan_item, scan_item, scan_op, tile_carry);
372
373 if (tile_num_rows > 0)
374 {
375 if (threadIdx.x == 0)
376 scan_item.key = -1;
377
378 // Direct scatter
379 #pragma unroll
380 for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
381 {
382 if (scan_segment[ITEM].key < tile_num_rows)
383 {
384 if (scan_item.key == scan_segment[ITEM].key)
385 scan_segment[ITEM].value = scan_item.value + scan_segment[ITEM].value;
386
387 if (HAS_ALPHA)
388 {
389 scan_segment[ITEM].value *= spmv_params.alpha;
390 }
391
392 if (HAS_BETA)
393 {
394 // Update the output vector element
395 ValueT addend = spmv_params.beta * wd_vector_y[tile_start_coord.x + scan_segment[ITEM].key];
396 scan_segment[ITEM].value += addend;
397 }
398
399 // Set the output vector element
400 spmv_params.d_vector_y[tile_start_coord.x + scan_segment[ITEM].key] = scan_segment[ITEM].value;
401 }
402 }
403 }
404
405 // Return the tile's running carry-out
406 return tile_carry;
407 }
408
409
410
414 __device__ __forceinline__ KeyValuePairT ConsumeTile(
415 int tile_idx,
416 CoordinateT tile_start_coord,
417 CoordinateT tile_end_coord,
418 Int2Type<false> is_direct_load)
419 {
420 int tile_num_rows = tile_end_coord.x - tile_start_coord.x;
421 int tile_num_nonzeros = tile_end_coord.y - tile_start_coord.y;
422
423#if (CUB_PTX_ARCH >= 520)
424
425 OffsetT* s_tile_row_end_offsets = &temp_storage.aliasable.merge_items[0].row_end_offset;
426 ValueT* s_tile_nonzeros = &temp_storage.aliasable.merge_items[tile_num_rows + ITEMS_PER_THREAD].nonzero;
427
428 // Gather the nonzeros for the merge tile into shared memory
429 #pragma unroll
430 for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
431 {
432 int nonzero_idx = threadIdx.x + (ITEM * BLOCK_THREADS);
433
434 ValueIteratorT a = wd_values + tile_start_coord.y + nonzero_idx;
435 ColumnIndicesIteratorT ci = wd_column_indices + tile_start_coord.y + nonzero_idx;
436 ValueT* s = s_tile_nonzeros + nonzero_idx;
437
438 if (nonzero_idx < tile_num_nonzeros)
439 {
440
441 OffsetT column_idx = *ci;
442 ValueT value = *a;
443
444 ValueT vector_value = spmv_params.t_vector_x[column_idx];
445 vector_value = wd_vector_x[column_idx];
446
447 ValueT nonzero = value * vector_value;
448
449 *s = nonzero;
450 }
451 }
452
453
454#else
455
456 OffsetT* s_tile_row_end_offsets = &temp_storage.aliasable.merge_items[0].row_end_offset;
457 ValueT* s_tile_nonzeros = &temp_storage.aliasable.merge_items[tile_num_rows + ITEMS_PER_THREAD].nonzero;
458
459 // Gather the nonzeros for the merge tile into shared memory
460 if (tile_num_nonzeros > 0)
461 {
462 #pragma unroll
463 for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
464 {
465 int nonzero_idx = threadIdx.x + (ITEM * BLOCK_THREADS);
466 nonzero_idx = CUB_MIN(nonzero_idx, tile_num_nonzeros - 1);
467
468 OffsetT column_idx = wd_column_indices[tile_start_coord.y + nonzero_idx];
469 ValueT value = wd_values[tile_start_coord.y + nonzero_idx];
470
471 ValueT vector_value = spmv_params.t_vector_x[column_idx];
472#if (CUB_PTX_ARCH >= 350)
473 vector_value = wd_vector_x[column_idx];
474#endif
475 ValueT nonzero = value * vector_value;
476
477 s_tile_nonzeros[nonzero_idx] = nonzero;
478 }
479 }
480
481#endif
482
483 // Gather the row end-offsets for the merge tile into shared memory
484 #pragma unroll 1
485 for (int item = threadIdx.x; item <= tile_num_rows; item += BLOCK_THREADS)
486 {
487 s_tile_row_end_offsets[item] = wd_row_end_offsets[tile_start_coord.x + item];
488 }
489
490 CTA_SYNC();
491
492 // Search for the thread's starting coordinate within the merge tile
493 CountingInputIterator<OffsetT> tile_nonzero_indices(tile_start_coord.y);
494 CoordinateT thread_start_coord;
495
497 OffsetT(threadIdx.x * ITEMS_PER_THREAD), // Diagonal
498 s_tile_row_end_offsets, // List A
499 tile_nonzero_indices, // List B
500 tile_num_rows,
501 tile_num_nonzeros,
502 thread_start_coord);
503
504 CTA_SYNC(); // Perf-sync
505
506 // Compute the thread's merge path segment
507 CoordinateT thread_current_coord = thread_start_coord;
508 KeyValuePairT scan_segment[ITEMS_PER_THREAD];
509 ValueT running_total = 0.0;
510
511 OffsetT row_end_offset = s_tile_row_end_offsets[thread_current_coord.x];
512 ValueT nonzero = s_tile_nonzeros[thread_current_coord.y];
513
514 #pragma unroll
515 for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
516 {
517 if (tile_nonzero_indices[thread_current_coord.y] < row_end_offset)
518 {
519 // Move down (accumulate)
520 scan_segment[ITEM].value = nonzero;
521 running_total += nonzero;
522 ++thread_current_coord.y;
523 nonzero = s_tile_nonzeros[thread_current_coord.y];
524 }
525 else
526 {
527 // Move right (reset)
528 scan_segment[ITEM].value = 0.0;
529 running_total = 0.0;
530 ++thread_current_coord.x;
531 row_end_offset = s_tile_row_end_offsets[thread_current_coord.x];
532 }
533
534 scan_segment[ITEM].key = thread_current_coord.x;
535 }
536
537 CTA_SYNC();
538
539 // Block-wide reduce-value-by-segment
540 KeyValuePairT tile_carry;
542 KeyValuePairT scan_item;
543
544 scan_item.value = running_total;
545 scan_item.key = thread_current_coord.x;
546
547 BlockScanT(temp_storage.aliasable.scan).ExclusiveScan(scan_item, scan_item, scan_op, tile_carry);
548
549 if (threadIdx.x == 0)
550 {
551 scan_item.key = thread_start_coord.x;
552 scan_item.value = 0.0;
553 }
554
555 if (tile_num_rows > 0)
556 {
557
558 CTA_SYNC();
559
560 // Scan downsweep and scatter
561 ValueT* s_partials = &temp_storage.aliasable.merge_items[0].nonzero;
562
563 if (scan_item.key != scan_segment[0].key)
564 {
565 s_partials[scan_item.key] = scan_item.value;
566 }
567 else
568 {
569 scan_segment[0].value += scan_item.value;
570 }
571
572 #pragma unroll
573 for (int ITEM = 1; ITEM < ITEMS_PER_THREAD; ++ITEM)
574 {
575 if (scan_segment[ITEM - 1].key != scan_segment[ITEM].key)
576 {
577 s_partials[scan_segment[ITEM - 1].key] = scan_segment[ITEM - 1].value;
578 }
579 else
580 {
581 scan_segment[ITEM].value += scan_segment[ITEM - 1].value;
582 }
583 }
584
585 CTA_SYNC();
586
587 #pragma unroll 1
588 for (int item = threadIdx.x; item < tile_num_rows; item += BLOCK_THREADS)
589 {
590 spmv_params.d_vector_y[tile_start_coord.x + item] = s_partials[item];
591 }
592 }
593
594 // Return the tile's running carry-out
595 return tile_carry;
596 }
597
598
602 __device__ __forceinline__ void ConsumeTile(
605 int num_merge_tiles)
606 {
607 int tile_idx = (blockIdx.x * gridDim.y) + blockIdx.y; // Current tile index
608
609 if (tile_idx >= num_merge_tiles)
610 return;
611
612 // Read our starting coordinates
613 if (threadIdx.x < 2)
614 {
615 if (d_tile_coordinates == NULL)
616 {
617 // Search our starting coordinates
618 OffsetT diagonal = (tile_idx + threadIdx.x) * TILE_ITEMS;
619 CoordinateT tile_coord;
620 CountingInputIterator<OffsetT> nonzero_indices(0);
621
622 // Search the merge path
624 diagonal,
625 RowOffsetsSearchIteratorT(spmv_params.d_row_end_offsets),
626 nonzero_indices,
627 spmv_params.num_rows,
628 spmv_params.num_nonzeros,
629 tile_coord);
630
631 temp_storage.tile_coords[threadIdx.x] = tile_coord;
632 }
633 else
634 {
635 temp_storage.tile_coords[threadIdx.x] = d_tile_coordinates[tile_idx + threadIdx.x];
636 }
637 }
638
639 CTA_SYNC();
640
641 CoordinateT tile_start_coord = temp_storage.tile_coords[0];
642 CoordinateT tile_end_coord = temp_storage.tile_coords[1];
643
644 // Consume multi-segment tile
645 KeyValuePairT tile_carry = ConsumeTile(
646 tile_idx,
647 tile_start_coord,
648 tile_end_coord,
650
651 // Output the tile's carry-out
652 if (threadIdx.x == 0)
653 {
654 if (HAS_ALPHA)
655 tile_carry.value *= spmv_params.alpha;
656
657 tile_carry.key += tile_start_coord.x;
658 d_tile_carry_pairs[tile_idx] = tile_carry;
659 }
660 }
661
662
663};
664
665
666
667
668} // CUB namespace
669CUB_NS_POSTFIX // Optional outer namespace(s)
670
The BlockExchange class provides collective methods for rearranging data partitioned across a CUDA th...
BlockRadixRank provides operations for ranking unsigned integer types within a CUDA thread block.
The BlockReduce class provides collective methods for computing a parallel reduction of items partiti...
The BlockScan class provides collective methods for computing a parallel prefix sum/scan of items par...
__device__ __forceinline__ void ExclusiveScan(T input, T &output, T initial_value, ScanOp scan_op)
Computes an exclusive block-wide prefix scan using the specified binary scan_op functor....
A random-access input wrapper for dereferencing array values using a PTX cache load modifier.
A random-access input generator for dereferencing a sequence of incrementing integer values.
CacheLoadModifier
Enumeration of cache modifiers for memory load operations.
#define CUB_MIN(a, b)
Select minimum(a, b)
CTA_SYNC()
Definition util_ptx.cuh:255
Optional outer namespace(s)
OffsetT CoordinateT * d_tile_coordinates
[in] Pointer to the temporary array of tile starting coordinates
OffsetT CoordinateT KeyValuePair< OffsetT, ValueT > * d_tile_carry_pairs
[out] Pointer to the temporary array carry-out dot product row-ids, one per block
BlockScanAlgorithm
BlockScanAlgorithm enumerates alternative algorithms for cub::BlockScan to compute a parallel prefix ...
OffsetT OffsetT
[in] Total number of input data items
__host__ __device__ __forceinline__ void MergePathSearch(OffsetT diagonal, AIteratorT a, BIteratorT b, OffsetT a_len, OffsetT b_len, CoordinateT &path_coordinate)
@ BLOCK_REDUCE_WARP_REDUCTIONS
OutputIteratorT ScanTileStateT int ScanOpT scan_op
Binary scan functor.
< The BlockScan algorithm to use
static const CacheLoadModifier ROW_OFFSETS_LOAD_MODIFIER
Cache load modifier for reading CSR row-offsets.
static const CacheLoadModifier VECTOR_VALUES_LOAD_MODIFIER
Cache load modifier for reading vector values.
static const CacheLoadModifier COLUMN_INDICES_LOAD_MODIFIER
Cache load modifier for reading CSR column-indices.
static const CacheLoadModifier ROW_OFFSETS_SEARCH_LOAD_MODIFIER
Cache load modifier for reading CSR row-offsets.
@ ITEMS_PER_THREAD
Items per thread (per tile of input)
@ DIRECT_LOAD_NONZEROS
Whether to load nonzeros directly from global during sequential merging (pre-staged through shared me...
@ BLOCK_THREADS
Threads per thread block.
static const BlockScanAlgorithm SCAN_ALGORITHM
The BlockScan algorithm to use.
static const CacheLoadModifier VALUES_LOAD_MODIFIER
Cache load modifier for reading CSR values.
Temporary storage type (unionable)
Shared memory type required by this thread block.
AgentSpmv implements a stateful abstraction of CUDA thread blocks for participating in device-wide Sp...
ValueIteratorT wd_values
Wrapped pointer to the array of num_nonzeros values of the corresponding nonzero elements of matrix A...
__device__ __forceinline__ AgentSpmv(TempStorage &temp_storage, SpmvParams< ValueT, OffsetT > &spmv_params)
ColumnIndicesIteratorT wd_column_indices
Wrapped Pointer to the array of num_nonzeros column-indices of the corresponding nonzero elements of ...
SpmvParams< ValueT, OffsetT > & spmv_params
Reference to the SpMV input parameter bundle (matrix arrays, dense vectors, and the alpha/beta scalars).
CacheModifiedInputIterator< AgentSpmvPolicyT::ROW_OFFSETS_SEARCH_LOAD_MODIFIER, OffsetT, OffsetT > RowOffsetsSearchIteratorT
Input iterator wrapper types (for applying cache modifiers)
__device__ __forceinline__ KeyValuePairT ConsumeTile(int tile_idx, CoordinateT tile_start_coord, CoordinateT tile_end_coord, Int2Type< true > is_direct_load)
VectorValueIteratorT wd_vector_x
Wrapped Pointer to the array of num_cols values corresponding to the dense input vector x
VectorValueIteratorT wd_vector_y
Wrapped pointer to the array of num_rows values corresponding to the dense output vector y
RowOffsetsIteratorT wd_row_end_offsets
Wrapped Pointer to the array of m offsets demarcating the end of every row in d_column_indices and d_...
CubVector< OffsetT, 2 >::Type CoordinateT
2D merge path coordinate type
__device__ __forceinline__ KeyValuePairT ConsumeTile(int tile_idx, CoordinateT tile_start_coord, CoordinateT tile_end_coord, Int2Type< false > is_direct_load)
__device__ __forceinline__ void ConsumeTile(CoordinateT *d_tile_coordinates, KeyValuePairT *d_tile_carry_pairs, int num_merge_tiles)
\smemstorage{BlockExchange}
\smemstorage{BlockReduce}
\smemstorage{BlockScan}
Exposes a member typedef Type that names the corresponding CUDA vector type if one exists....
Allows for the treatment of an integral constant as a type at compile-time (e.g., to achieve static c...
A key identifier paired with a corresponding value.
Value value
Item value.
Key key
Item key.
< Binary reduction operator to apply to values
< Signed integer type for sequence offsets
ValueT * d_vector_y
Pointer to the array of num_rows values corresponding to the dense output vector y
OffsetT * d_row_end_offsets
Pointer to the array of m offsets demarcating the end of every row in d_column_indices and d_values.
int num_nonzeros
Number of nonzero elements of matrix A.
int num_cols
Number of columns of matrix A.
ValueT * d_vector_x
Pointer to the array of num_cols values corresponding to the dense input vector x
ValueT beta
Beta addend-multiplicand.
int num_rows
Number of rows of matrix A.
OffsetT * d_column_indices
Pointer to the array of num_nonzeros column-indices of the corresponding nonzero elements of matrix A...
ValueT alpha
Alpha multiplicand.
ValueT * d_values
Pointer to the array of num_nonzeros values of the corresponding nonzero elements of matrix A.
A storage-backing wrapper that allows types with non-trivial constructors to be aliased in unions.
Merge item type (either a non-zero value or a row-end offset)
#define CUB_PTX_ARCH
CUB_PTX_ARCH reflects the PTX version targeted by the active compiler pass (or zero during the host p...
Definition util_arch.cuh:53