OpenFPM_pdata  4.1.0
Project that contains the implementation of distributed structures
 
dispatch_spmv_orig.cuh
/******************************************************************************
 * Copyright (c) 2011, Duane Merrill. All rights reserved.
 * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

/**
 * \file
 * cub::DeviceSpmv provides device-wide parallel operations for performing
 * sparse-matrix * vector multiplication (SpMV).
 */

#pragma once

#include <stdio.h>
#include <iterator>

#include "../../agent/single_pass_scan_operators.cuh"
#include "../../agent/agent_segment_fixup.cuh"
#include "../../agent/agent_spmv_orig.cuh"
#include "../../util_type.cuh"
#include "../../util_debug.cuh"
#include "../../util_device.cuh"
#include "../../thread/thread_search.cuh"
#include "../../grid/grid_queue.cuh"
#include "../../util_namespace.cuh"

/// Optional outer namespace(s)
CUB_NS_PREFIX

/// CUB namespace
namespace cub {

/******************************************************************************
 * SpMV kernel entry points
 *****************************************************************************/

/**
 * Degenerate SpMV kernel for the single-column case (one thread per row)
 */
template <
    typename    AgentSpmvPolicyT,           ///< Parameterized SpmvPolicy tuning policy type
    typename    ValueT,                     ///< Matrix and vector value type
    typename    OffsetT>                    ///< Signed integer type for sequence offsets
__global__ void DeviceSpmv1ColKernel(
    SpmvParams<ValueT, OffsetT> spmv_params)    ///< [in] SpMV input parameter bundle
{
    typedef CacheModifiedInputIterator<
            AgentSpmvPolicyT::VECTOR_VALUES_LOAD_MODIFIER,
            ValueT,
            OffsetT>
        VectorValueIteratorT;

    VectorValueIteratorT wrapped_vector_x(spmv_params.d_vector_x);

    int row_idx = (blockIdx.x * blockDim.x) + threadIdx.x;
    if (row_idx < spmv_params.num_rows)
    {
        OffsetT end_nonzero_idx = spmv_params.d_row_end_offsets[row_idx];
        OffsetT nonzero_idx     = spmv_params.d_row_end_offsets[row_idx - 1];

        ValueT value = 0.0;
        if (end_nonzero_idx != nonzero_idx)
        {
            value = spmv_params.d_values[nonzero_idx] * wrapped_vector_x[spmv_params.d_column_indices[nonzero_idx]];
        }

        spmv_params.d_vector_y[row_idx] = value;
    }
}
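
// A minimal serial sketch of what the kernel above computes (illustrative
// host-side pseudocode over the same SpmvParams fields): when num_cols == 1,
// each CSR row holds at most one nonzero, so no merge-path machinery is needed.
//
//   for (int r = 0; r < params.num_rows; ++r)
//   {
//       OffsetT begin = params.d_row_end_offsets[r - 1];  // offsets array is shifted by one
//       OffsetT end   = params.d_row_end_offsets[r];
//       params.d_vector_y[r] = (begin != end)
//           ? params.d_values[begin] * params.d_vector_x[params.d_column_indices[begin]]
//           : ValueT(0);
//   }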


/**
 * SpMV search kernel. Identifies merge path starting coordinates for each tile.
 */
template <
    typename    SpmvPolicyT,                ///< Parameterized SpmvPolicy tuning policy type
    typename    OffsetT,                    ///< Signed integer type for sequence offsets
    typename    CoordinateT,                ///< Merge path coordinate type
    typename    SpmvParamsT>                ///< SpmvParams type
__global__ void DeviceSpmvSearchKernel(
    int             num_merge_tiles,        ///< [in] Number of SpMV merge tiles (spmv grid size)
    CoordinateT*    d_tile_coordinates,     ///< [out] Pointer to the temporary array of tile starting coordinates
    SpmvParamsT     spmv_params)            ///< [in] SpMV input parameter bundle
{
    /// Constants
    enum
    {
        BLOCK_THREADS       = SpmvPolicyT::BLOCK_THREADS,
        ITEMS_PER_THREAD    = SpmvPolicyT::ITEMS_PER_THREAD,
        TILE_ITEMS          = BLOCK_THREADS * ITEMS_PER_THREAD,
    };

    typedef CacheModifiedInputIterator<
            SpmvPolicyT::ROW_OFFSETS_SEARCH_LOAD_MODIFIER,
            OffsetT,
            OffsetT>
        RowOffsetsSearchIteratorT;

    // Find the starting coordinate for all tiles (plus the end coordinate of the last one)
    int tile_idx = (blockIdx.x * blockDim.x) + threadIdx.x;
    if (tile_idx < num_merge_tiles + 1)
    {
        OffsetT                         diagonal = (tile_idx * TILE_ITEMS);
        CoordinateT                     tile_coordinate;
        CountingInputIterator<OffsetT>  nonzero_indices(0);

        // Search the merge path
        MergePathSearch(
            diagonal,
            RowOffsetsSearchIteratorT(spmv_params.d_row_end_offsets),
            nonzero_indices,
            spmv_params.num_rows,
            spmv_params.num_nonzeros,
            tile_coordinate);

        // Output starting offset
        d_tile_coordinates[tile_idx] = tile_coordinate;
    }
}
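
// For reference, a sketch of the diagonal binary search performed by the
// MergePathSearch call above (mirroring cub::MergePathSearch declared in
// thread/thread_search.cuh): it finds the split (x, y) with x + y == diagonal
// at which the merge of the row-end offsets (list a) and the counting sequence
// of nonzero indices (list b) crosses the diagonal.
//
//   OffsetT split_min = CUB_MAX(diagonal - b_len, 0);
//   OffsetT split_max = CUB_MIN(diagonal, a_len);
//   while (split_min < split_max)
//   {
//       OffsetT pivot = (split_min + split_max) >> 1;
//       if (a[pivot] <= b[diagonal - pivot - 1])
//           split_min = pivot + 1;    // split moves up list a, down list b
//       else
//           split_max = pivot;        // split moves up list b, down list a
//   }
//   path_coordinate.x = CUB_MIN(split_min, a_len);    // row index
//   path_coordinate.y = diagonal - split_min;         // nonzero index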


/**
 * SpMV agent entry point
 */
template <
    typename        SpmvPolicyT,                ///< Parameterized SpmvPolicy tuning policy type
    typename        ScanTileStateT,             ///< Tile status interface type
    typename        ValueT,                     ///< Matrix and vector value type
    typename        OffsetT,                    ///< Signed integer type for sequence offsets
    typename        CoordinateT,                ///< Merge path coordinate type
    bool            HAS_ALPHA,                  ///< Whether the input parameter alpha is 1
    bool            HAS_BETA>                   ///< Whether the input parameter beta is 0
__launch_bounds__ (int(SpmvPolicyT::BLOCK_THREADS))
__global__ void DeviceSpmvKernel(
    SpmvParams<ValueT, OffsetT>     spmv_params,                ///< [in] SpMV input parameter bundle
    CoordinateT*                    d_tile_coordinates,         ///< [in] Pointer to the temporary array of tile starting coordinates
    KeyValuePair<OffsetT, ValueT>*  d_tile_carry_pairs,         ///< [out] Pointer to the temporary array of carry-out dot-product row-ids, one per block
    int                             num_tiles,                  ///< [in] Number of merge tiles
    ScanTileStateT                  tile_state,                 ///< [in] Tile status interface for the fixup reduce-by-key kernel
    int                             num_segment_fixup_tiles)    ///< [in] Number of reduce-by-key tiles (fixup grid size)
{
    // Spmv agent type specialization
    typedef AgentSpmv<
            SpmvPolicyT,
            ValueT,
            OffsetT,
            HAS_ALPHA,
            HAS_BETA>
        AgentSpmvT;

    // Shared memory for AgentSpmv
    __shared__ typename AgentSpmvT::TempStorage temp_storage;

    AgentSpmvT(temp_storage, spmv_params).ConsumeTile(
        d_tile_coordinates,
        d_tile_carry_pairs,
        num_tiles);

    // Initialize fixup tile status
    tile_state.InitializeStatus(num_segment_fixup_tiles);
}


/**
 * Multi-block reduce-by-key sweep kernel entry point
 */
template <
    typename    AgentSegmentFixupPolicyT,       ///< Parameterized AgentSegmentFixupPolicy tuning policy type
    typename    PairsInputIteratorT,            ///< Random-access input iterator type for keys
    typename    AggregatesOutputIteratorT,      ///< Random-access output iterator type for values
    typename    OffsetT,                        ///< Signed integer type for global offsets
    typename    ScanTileStateT>                 ///< Tile status interface type
__launch_bounds__ (int(AgentSegmentFixupPolicyT::BLOCK_THREADS))
__global__ void DeviceSegmentFixupKernel(
    PairsInputIteratorT         d_pairs_in,         ///< [in] Pointer to the array of carry-out dot-product row-ids, one per spmv block
    AggregatesOutputIteratorT   d_aggregates_out,   ///< [in,out] Output value aggregates
    OffsetT                     num_items,          ///< [in] Total number of items to select from
    int                         num_tiles,          ///< [in] Total number of tiles for the entire problem
    ScanTileStateT              tile_state)         ///< [in] Tile status interface
{
    // Thread block type for reducing tiles of value segments
    typedef AgentSegmentFixup<
            AgentSegmentFixupPolicyT,
            PairsInputIteratorT,
            AggregatesOutputIteratorT,
            cub::Equality,
            cub::Sum,
            OffsetT>
        AgentSegmentFixupT;

    // Shared memory for AgentSegmentFixup
    __shared__ typename AgentSegmentFixupT::TempStorage temp_storage;

    // Process tiles
    AgentSegmentFixupT(temp_storage, d_pairs_in, d_aggregates_out, cub::Equality(), cub::Sum()).ConsumeRange(
        num_items,
        num_tiles,
        tile_state);
}


/******************************************************************************
 * Dispatch
 ******************************************************************************/

/**
 * Utility class for dispatching the appropriately-tuned kernels for DeviceSpmv
 */
template <
    typename    ValueT,                     ///< Matrix and vector value type
    typename    OffsetT>                    ///< Signed integer type for global offsets
struct DispatchSpmv
{
    //---------------------------------------------------------------------
    // Constants and Types
    //---------------------------------------------------------------------

    enum
    {
        INIT_KERNEL_THREADS = 128
    };

    // SpmvParams bundle type
    typedef SpmvParams<ValueT, OffsetT> SpmvParamsT;

    // 2D merge path coordinate type
    typedef typename CubVector<OffsetT, 2>::Type CoordinateT;

    // Tile status descriptor interface type
    typedef ReduceByKeyScanTileState<ValueT, OffsetT> ScanTileStateT;

    // Tuple type for scanning (pairs accumulated segment-value with segment-index)
    typedef KeyValuePair<OffsetT, ValueT> KeyValuePairT;

    //---------------------------------------------------------------------
    // Tuning policies
    //---------------------------------------------------------------------

    /// SM11
    struct Policy110
    {
        typedef AgentSpmvPolicy<
                128,
                1,
                LOAD_DEFAULT,
                LOAD_DEFAULT,
                LOAD_DEFAULT,
                LOAD_DEFAULT,
                LOAD_DEFAULT,
                false,
                BLOCK_SCAN_WARP_SCANS>
            SpmvPolicyT;

        typedef AgentSegmentFixupPolicy<
                128,
                4,
                BLOCK_LOAD_VECTORIZE,
                LOAD_DEFAULT,
                BLOCK_SCAN_WARP_SCANS>
            SegmentFixupPolicyT;
    };

    /// SM20
    struct Policy200
    {
        typedef AgentSpmvPolicy<
                96,
                18,
                LOAD_DEFAULT,
                LOAD_DEFAULT,
                LOAD_DEFAULT,
                LOAD_DEFAULT,
                LOAD_DEFAULT,
                false,
                BLOCK_SCAN_RAKING>
            SpmvPolicyT;

        typedef AgentSegmentFixupPolicy<
                128,
                4,
                BLOCK_LOAD_VECTORIZE,
                LOAD_DEFAULT,
                BLOCK_SCAN_WARP_SCANS>
            SegmentFixupPolicyT;
    };

    /// SM30
    struct Policy300
    {
        typedef AgentSpmvPolicy<
                96,
                6,
                LOAD_DEFAULT,
                LOAD_DEFAULT,
                LOAD_DEFAULT,
                LOAD_DEFAULT,
                LOAD_DEFAULT,
                false,
                BLOCK_SCAN_WARP_SCANS>
            SpmvPolicyT;

        typedef AgentSegmentFixupPolicy<
                128,
                4,
                BLOCK_LOAD_VECTORIZE,
                LOAD_DEFAULT,
                BLOCK_SCAN_WARP_SCANS>
            SegmentFixupPolicyT;
    };

    /// SM35
    struct Policy350
    {
        typedef AgentSpmvPolicy<
                (sizeof(ValueT) > 4) ? 96 : 128,
                (sizeof(ValueT) > 4) ? 4 : 7,
                LOAD_LDG,
                LOAD_CA,
                LOAD_LDG,
                LOAD_LDG,
                LOAD_LDG,
                (sizeof(ValueT) > 4) ? true : false,
                BLOCK_SCAN_WARP_SCANS>
            SpmvPolicyT;

        typedef AgentSegmentFixupPolicy<
                128,
                3,
                BLOCK_LOAD_VECTORIZE,
                LOAD_LDG,
                BLOCK_SCAN_WARP_SCANS>
            SegmentFixupPolicyT;
    };

    /// SM37
    struct Policy370
    {
        typedef AgentSpmvPolicy<
                (sizeof(ValueT) > 4) ? 128 : 128,
                (sizeof(ValueT) > 4) ? 9 : 14,
                LOAD_LDG,
                LOAD_CA,
                LOAD_LDG,
                LOAD_LDG,
                LOAD_LDG,
                false,
                BLOCK_SCAN_WARP_SCANS>
            SpmvPolicyT;

        typedef AgentSegmentFixupPolicy<
                128,
                3,
                BLOCK_LOAD_VECTORIZE,
                LOAD_LDG,
                BLOCK_SCAN_WARP_SCANS>
            SegmentFixupPolicyT;
    };

    /// SM50
    struct Policy500
    {
        typedef AgentSpmvPolicy<
                (sizeof(ValueT) > 4) ? 64 : 128,
                (sizeof(ValueT) > 4) ? 6 : 7,
                LOAD_LDG,
                LOAD_DEFAULT,
                (sizeof(ValueT) > 4) ? LOAD_LDG : LOAD_DEFAULT,
                (sizeof(ValueT) > 4) ? LOAD_LDG : LOAD_DEFAULT,
                LOAD_LDG,
                (sizeof(ValueT) > 4) ? true : false,
                (sizeof(ValueT) > 4) ? BLOCK_SCAN_WARP_SCANS : BLOCK_SCAN_RAKING_MEMOIZE>
            SpmvPolicyT;

        typedef AgentSegmentFixupPolicy<
                128,
                3,
                BLOCK_LOAD_VECTORIZE,
                LOAD_LDG,
                BLOCK_SCAN_RAKING_MEMOIZE>
            SegmentFixupPolicyT;
    };

    /// SM60
    struct Policy600
    {
        typedef AgentSpmvPolicy<
                (sizeof(ValueT) > 4) ? 64 : 128,
                (sizeof(ValueT) > 4) ? 5 : 7,
                LOAD_DEFAULT,
                LOAD_DEFAULT,
                LOAD_DEFAULT,
                LOAD_DEFAULT,
                LOAD_DEFAULT,
                false,
                BLOCK_SCAN_WARP_SCANS>
            SpmvPolicyT;

        typedef AgentSegmentFixupPolicy<
                128,
                3,
                BLOCK_LOAD_DIRECT,
                LOAD_LDG,
                BLOCK_SCAN_WARP_SCANS>
            SegmentFixupPolicyT;
    };


    //---------------------------------------------------------------------
    // Tuning policies of current PTX compiler pass
    //---------------------------------------------------------------------

#if (CUB_PTX_ARCH >= 600)
    typedef Policy600 PtxPolicy;

#elif (CUB_PTX_ARCH >= 500)
    typedef Policy500 PtxPolicy;

#elif (CUB_PTX_ARCH >= 370)
    typedef Policy370 PtxPolicy;

#elif (CUB_PTX_ARCH >= 350)
    typedef Policy350 PtxPolicy;

#elif (CUB_PTX_ARCH >= 300)
    typedef Policy300 PtxPolicy;

#elif (CUB_PTX_ARCH >= 200)
    typedef Policy200 PtxPolicy;

#else
    typedef Policy110 PtxPolicy;

#endif

    // "Opaque" policies (whose parameterizations aren't reflected in the type signature)
    struct PtxSpmvPolicyT : PtxPolicy::SpmvPolicyT {};
    struct PtxSegmentFixupPolicy : PtxPolicy::SegmentFixupPolicyT {};

    //---------------------------------------------------------------------
    // Utilities
    //---------------------------------------------------------------------

    /**
     * Initialize kernel dispatch configurations with the policies corresponding to the PTX assembly we will use
     */
    template <typename KernelConfig>
    CUB_RUNTIME_FUNCTION __forceinline__
    static void InitConfigs(
        int             ptx_version,
        KernelConfig    &spmv_config,
        KernelConfig    &segment_fixup_config)
    {
    #if (CUB_PTX_ARCH > 0)

        // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy
        spmv_config.template Init<PtxSpmvPolicyT>();
        segment_fixup_config.template Init<PtxSegmentFixupPolicy>();

    #else

        // We're on the host, so look up and initialize the kernel dispatch configurations with the policies that match the device's PTX version
        if (ptx_version >= 600)
        {
            spmv_config.template Init<typename Policy600::SpmvPolicyT>();
            segment_fixup_config.template Init<typename Policy600::SegmentFixupPolicyT>();
        }
        else if (ptx_version >= 500)
        {
            spmv_config.template Init<typename Policy500::SpmvPolicyT>();
            segment_fixup_config.template Init<typename Policy500::SegmentFixupPolicyT>();
        }
        else if (ptx_version >= 370)
        {
            spmv_config.template Init<typename Policy370::SpmvPolicyT>();
            segment_fixup_config.template Init<typename Policy370::SegmentFixupPolicyT>();
        }
        else if (ptx_version >= 350)
        {
            spmv_config.template Init<typename Policy350::SpmvPolicyT>();
            segment_fixup_config.template Init<typename Policy350::SegmentFixupPolicyT>();
        }
        else if (ptx_version >= 300)
        {
            spmv_config.template Init<typename Policy300::SpmvPolicyT>();
            segment_fixup_config.template Init<typename Policy300::SegmentFixupPolicyT>();
        }
        else if (ptx_version >= 200)
        {
            spmv_config.template Init<typename Policy200::SpmvPolicyT>();
            segment_fixup_config.template Init<typename Policy200::SegmentFixupPolicyT>();
        }
        else
        {
            spmv_config.template Init<typename Policy110::SpmvPolicyT>();
            segment_fixup_config.template Init<typename Policy110::SegmentFixupPolicyT>();
        }

    #endif
    }


    /**
     * Kernel dispatch configuration
     */
    struct KernelConfig
    {
        int block_threads;
        int items_per_thread;
        int tile_items;

        template <typename PolicyT>
        CUB_RUNTIME_FUNCTION __forceinline__
        void Init()
        {
            block_threads       = PolicyT::BLOCK_THREADS;
            items_per_thread    = PolicyT::ITEMS_PER_THREAD;
            tile_items          = block_threads * items_per_thread;
        }
    };


    //---------------------------------------------------------------------
    // Dispatch entrypoints
    //---------------------------------------------------------------------

    /**
     * Internal dispatch routine for invoking device-wide SpMV using the
     * specified kernel functions
     */
    template <
        typename                Spmv1ColKernelT,                    ///< Function type of cub::DeviceSpmv1ColKernel
        typename                SpmvSearchKernelT,                  ///< Function type of cub::DeviceSpmvSearchKernel
        typename                SpmvKernelT,                        ///< Function type of cub::DeviceSpmvKernel
        typename                SegmentFixupKernelT>                ///< Function type of cub::DeviceSegmentFixupKernel
    CUB_RUNTIME_FUNCTION __forceinline__
    static cudaError_t Dispatch(
        void*                   d_temp_storage,                     ///< [in] Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to temp_storage_bytes and no work is done.
        size_t&                 temp_storage_bytes,                 ///< [in,out] Reference to size in bytes of the d_temp_storage allocation
        SpmvParamsT&            spmv_params,                        ///< SpMV input parameter bundle
        cudaStream_t            stream,                             ///< [in] CUDA stream to launch kernels within
        bool                    debug_synchronous,                  ///< [in] Whether to synchronize the stream after every kernel launch to check for errors (also prints launch configurations to the console)
        Spmv1ColKernelT         spmv_1col_kernel,                   ///< [in] Kernel function pointer to parameterization of DeviceSpmv1ColKernel
        SpmvSearchKernelT       spmv_search_kernel,                 ///< [in] Kernel function pointer to parameterization of DeviceSpmvSearchKernel
        SpmvKernelT             spmv_kernel,                        ///< [in] Kernel function pointer to parameterization of DeviceSpmvKernel
        SegmentFixupKernelT     segment_fixup_kernel,               ///< [in] Kernel function pointer to parameterization of DeviceSegmentFixupKernel
        KernelConfig            spmv_config,                        ///< [in] Dispatch parameters that match the policy that spmv_kernel was compiled for
        KernelConfig            segment_fixup_config)               ///< [in] Dispatch parameters that match the policy that segment_fixup_kernel was compiled for
    {
#ifndef CUB_RUNTIME_ENABLED

        // Kernel launch not supported from this device
        return CubDebug(cudaErrorNotSupported);

#else
        cudaError error = cudaSuccess;
        do
        {
            if (spmv_params.num_cols == 1)
            {
                if (d_temp_storage == NULL)
                {
                    // Return if the caller is simply requesting the size of the storage allocation
                    temp_storage_bytes = 1;
                    break;
                }

                // Get search/init grid dims
                int degen_col_kernel_block_size = INIT_KERNEL_THREADS;
                int degen_col_kernel_grid_size  = (spmv_params.num_rows + degen_col_kernel_block_size - 1) / degen_col_kernel_block_size;

                if (debug_synchronous) _CubLog("Invoking spmv_1col_kernel<<<%d, %d, 0, %lld>>>()\n",
                    degen_col_kernel_grid_size, degen_col_kernel_block_size, (long long) stream);

                // Invoke spmv_1col_kernel
                spmv_1col_kernel<<<degen_col_kernel_grid_size, degen_col_kernel_block_size, 0, stream>>>(
                    spmv_params);

                // Check for failure to launch
                if (CubDebug(error = cudaPeekAtLastError())) break;

                // Sync the stream if specified to flush runtime errors
                if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;

                break;
            }

            // Get device ordinal
            int device_ordinal;
            if (CubDebug(error = cudaGetDevice(&device_ordinal))) break;

            // Get SM count
            int sm_count;
            if (CubDebug(error = cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break;

            // Get max x-dimension of grid
            int max_dim_x;
            if (CubDebug(error = cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal))) break;

            // Total number of spmv work items
            int num_merge_items = spmv_params.num_rows + spmv_params.num_nonzeros;

            // Tile sizes of kernels
            int merge_tile_size         = spmv_config.block_threads * spmv_config.items_per_thread;
            int segment_fixup_tile_size = segment_fixup_config.block_threads * segment_fixup_config.items_per_thread;

            // Number of tiles for kernels
            unsigned int num_merge_tiles         = (num_merge_items + merge_tile_size - 1) / merge_tile_size;
            unsigned int num_segment_fixup_tiles = (num_merge_tiles + segment_fixup_tile_size - 1) / segment_fixup_tile_size;

            // Get SM occupancy for kernels
            int spmv_sm_occupancy;
            if (CubDebug(error = MaxSmOccupancy(
                spmv_sm_occupancy,
                spmv_kernel,
                spmv_config.block_threads))) break;

            int segment_fixup_sm_occupancy;
            if (CubDebug(error = MaxSmOccupancy(
                segment_fixup_sm_occupancy,
                segment_fixup_kernel,
                segment_fixup_config.block_threads))) break;

            // Get grid dimensions
            dim3 spmv_grid_size(
                CUB_MIN(num_merge_tiles, max_dim_x),
                (num_merge_tiles + max_dim_x - 1) / max_dim_x,
                1);

            dim3 segment_fixup_grid_size(
                CUB_MIN(num_segment_fixup_tiles, max_dim_x),
                (num_segment_fixup_tiles + max_dim_x - 1) / max_dim_x,
                1);

            // Get the temporary storage allocation requirements
            size_t allocation_sizes[3];
            if (CubDebug(error = ScanTileStateT::AllocationSize(num_segment_fixup_tiles, allocation_sizes[0]))) break;    // bytes needed for reduce-by-key tile status descriptors
            allocation_sizes[1] = num_merge_tiles * sizeof(KeyValuePairT);          // bytes needed for block carry-out pairs
            allocation_sizes[2] = (num_merge_tiles + 1) * sizeof(CoordinateT);      // bytes needed for tile starting coordinates

            // Alias the temporary allocations from the single storage blob (or compute the necessary size of the blob)
            void* allocations[3];
            if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break;
            if (d_temp_storage == NULL)
            {
                // Return if the caller is simply requesting the size of the storage allocation
                break;
            }

            // Construct the tile status interface
            ScanTileStateT tile_state;
            if (CubDebug(error = tile_state.Init(num_segment_fixup_tiles, allocations[0], allocation_sizes[0]))) break;

            // Alias the other allocations
            KeyValuePairT* d_tile_carry_pairs = (KeyValuePairT*) allocations[1];    // Agent carry-out pairs
            CoordinateT*   d_tile_coordinates = (CoordinateT*) allocations[2];      // Agent starting coordinates

            // Get search/init grid dims
            int search_block_size = INIT_KERNEL_THREADS;
            int search_grid_size  = (num_merge_tiles + 1 + search_block_size - 1) / search_block_size;

#if (CUB_PTX_ARCH == 0)
            // Init textures
            if (CubDebug(error = spmv_params.t_vector_x.BindTexture(spmv_params.d_vector_x))) break;
#endif

            if (search_grid_size < sm_count)
//          if (num_merge_tiles < spmv_sm_occupancy * sm_count)
            {
                // Not enough spmv tiles to saturate the device: have the spmv blocks search their own starting coordinates
                d_tile_coordinates = NULL;
            }
            else
            {
                // Use a separate search kernel if we have enough spmv tiles to saturate the device

                // Log spmv_search_kernel configuration
                if (debug_synchronous) _CubLog("Invoking spmv_search_kernel<<<%d, %d, 0, %lld>>>()\n",
                    search_grid_size, search_block_size, (long long) stream);

                // Invoke spmv_search_kernel
                spmv_search_kernel<<<search_grid_size, search_block_size, 0, stream>>>(
                    num_merge_tiles,
                    d_tile_coordinates,
                    spmv_params);

                // Check for failure to launch
                if (CubDebug(error = cudaPeekAtLastError())) break;

                // Sync the stream if specified to flush runtime errors
                if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
            }

            // Log spmv_kernel configuration
            if (debug_synchronous) _CubLog("Invoking spmv_kernel<<<{%d,%d,%d}, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
                spmv_grid_size.x, spmv_grid_size.y, spmv_grid_size.z, spmv_config.block_threads, (long long) stream, spmv_config.items_per_thread, spmv_sm_occupancy);

            // Invoke spmv_kernel
            spmv_kernel<<<spmv_grid_size, spmv_config.block_threads, 0, stream>>>(
                spmv_params,
                d_tile_coordinates,
                d_tile_carry_pairs,
                num_merge_tiles,
                tile_state,
                num_segment_fixup_tiles);

            // Check for failure to launch
            if (CubDebug(error = cudaPeekAtLastError())) break;

            // Sync the stream if specified to flush runtime errors
            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;

            // Run reduce-by-key fixup if necessary
            if (num_merge_tiles > 1)
            {
                // Log segment_fixup_kernel configuration
                if (debug_synchronous) _CubLog("Invoking segment_fixup_kernel<<<{%d,%d,%d}, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
                    segment_fixup_grid_size.x, segment_fixup_grid_size.y, segment_fixup_grid_size.z, segment_fixup_config.block_threads, (long long) stream, segment_fixup_config.items_per_thread, segment_fixup_sm_occupancy);

                // Invoke segment_fixup_kernel
                segment_fixup_kernel<<<segment_fixup_grid_size, segment_fixup_config.block_threads, 0, stream>>>(
                    d_tile_carry_pairs,
                    spmv_params.d_vector_y,
                    num_merge_tiles,
                    num_segment_fixup_tiles,
                    tile_state);

                // Check for failure to launch
                if (CubDebug(error = cudaPeekAtLastError())) break;

                // Sync the stream if specified to flush runtime errors
                if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
            }

#if (CUB_PTX_ARCH == 0)
            // Free textures
            if (CubDebug(error = spmv_params.t_vector_x.UnbindTexture())) break;
#endif
        }
        while (0);

        return error;

#endif // CUB_RUNTIME_ENABLED
    }


    /**
     * Internal dispatch routine for invoking device-wide SpMV using the
     * default tuning policies
     */
    CUB_RUNTIME_FUNCTION __forceinline__
    static cudaError_t Dispatch(
        void*                   d_temp_storage,                     ///< [in] Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to temp_storage_bytes and no work is done.
        size_t&                 temp_storage_bytes,                 ///< [in,out] Reference to size in bytes of the d_temp_storage allocation
        SpmvParamsT&            spmv_params,                        ///< SpMV input parameter bundle
        cudaStream_t            stream              = 0,            ///< [in] CUDA stream to launch kernels within. Default is stream-0.
        bool                    debug_synchronous   = false)        ///< [in] Whether to synchronize the stream after every kernel launch to check for errors. Default is false.
    {
        cudaError error = cudaSuccess;
        do
        {
            // Get PTX version
            int ptx_version;
    #if (CUB_PTX_ARCH == 0)
            if (CubDebug(error = PtxVersion(ptx_version))) break;
    #else
            ptx_version = CUB_PTX_ARCH;
    #endif

            // Get kernel dispatch configurations
            KernelConfig spmv_config, segment_fixup_config;
            InitConfigs(ptx_version, spmv_config, segment_fixup_config);

            if (CubDebug(error = Dispatch(
                d_temp_storage, temp_storage_bytes, spmv_params, stream, debug_synchronous,
                DeviceSpmv1ColKernel<PtxSpmvPolicyT, ValueT, OffsetT>,
                DeviceSpmvSearchKernel<PtxSpmvPolicyT, OffsetT, CoordinateT, SpmvParamsT>,
                DeviceSpmvKernel<PtxSpmvPolicyT, ScanTileStateT, ValueT, OffsetT, CoordinateT, false, false>,
                DeviceSegmentFixupKernel<PtxSegmentFixupPolicy, KeyValuePairT*, ValueT*, OffsetT, ScanTileStateT>,
                spmv_config, segment_fixup_config))) break;

        }
        while (0);

        return error;
    }
};


}               // CUB namespace
CUB_NS_POSTFIX  // Optional outer namespace(s)

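// A minimal usage sketch of the two-phase dispatch pattern above (hypothetical
// host code; error checking omitted and the variable names are illustrative):
//
//   cub::SpmvParams<float, int> params;
//   // ... fill d_values, d_row_end_offsets, d_column_indices, d_vector_x,
//   //     d_vector_y, num_rows, num_cols, num_nonzeros, alpha, beta ...
//
//   void*  d_temp_storage     = NULL;
//   size_t temp_storage_bytes = 0;
//
//   // First call with d_temp_storage == NULL only computes temp_storage_bytes
//   cub::DispatchSpmv<float, int>::Dispatch(d_temp_storage, temp_storage_bytes, params);
//   cudaMalloc(&d_temp_storage, temp_storage_bytes);
//
//   // Second call runs the search, spmv, and (if more than one tile) fixup kernels
//   cub::DispatchSpmv<float, int>::Dispatch(d_temp_storage, temp_storage_bytes, params);
//   cudaFree(d_temp_storage);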