OpenFPM_pdata  4.1.0
Project that contain the implementation of distributed structures
dispatch_spmv_orig.cuh
Go to the documentation of this file.
1 
2 /******************************************************************************
3  * Copyright (c) 2011, Duane Merrill. All rights reserved.
4  * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions are met:
8  * * Redistributions of source code must retain the above copyright
9  * notice, this list of conditions and the following disclaimer.
10  * * Redistributions in binary form must reproduce the above copyright
11  * notice, this list of conditions and the following disclaimer in the
12  * documentation and/or other materials provided with the distribution.
13  * * Neither the name of the NVIDIA CORPORATION nor the
14  * names of its contributors may be used to endorse or promote products
15  * derived from this software without specific prior written permission.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
19  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
20  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
21  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
22  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
23  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
24  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
26  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27  *
28  ******************************************************************************/
29 
35 #pragma once
36 
37 #include <stdio.h>
38 #include <iterator>
39 
40 #include "../../agent/single_pass_scan_operators.cuh"
41 #include "../../agent/agent_segment_fixup.cuh"
42 #include "../../agent/agent_spmv_orig.cuh"
43 #include "../../util_type.cuh"
44 #include "../../util_debug.cuh"
45 #include "../../util_device.cuh"
46 #include "../../thread/thread_search.cuh"
47 #include "../../grid/grid_queue.cuh"
48 #include "../../util_namespace.cuh"
49 
51 CUB_NS_PREFIX
52 
54 namespace cub {
55 
56 
57 /******************************************************************************
58  * SpMV kernel entry points
59  *****************************************************************************/
60 
/**
 * Degenerate SpMV kernel for matrices with a single column: one thread per
 * row, each computing at most one multiply (y[row] = A[row,0] * x[0]).
 *
 * Grid layout: 1D grid of 1D blocks, at least num_rows threads total.
 * NOTE(review): the extracted source had dropped the parameter list and the
 * typedef opening; restored from the signature shown in the documentation
 * residue (DeviceSpmv1ColKernel(SpmvParams<ValueT, OffsetT> spmv_params)).
 */
template <
    typename    AgentSpmvPolicyT,           ///< Parameterized SpmvPolicy tuning policy type
    typename    ValueT,                     ///< Matrix and vector value type
    typename    OffsetT>                    ///< Signed integer type for sequence offsets
__global__ void DeviceSpmv1ColKernel(
    SpmvParams<ValueT, OffsetT> spmv_params)    ///< [in] SpMV input parameter bundle
{
    // Wrap the dense-vector pointer so reads use the policy's cache modifier
    typedef CacheModifiedInputIterator<
            AgentSpmvPolicyT::VECTOR_VALUES_LOAD_MODIFIER,
            ValueT,
            OffsetT>
        VectorValueIteratorT;

    VectorValueIteratorT wrapped_vector_x(spmv_params.d_vector_x);

    int row_idx = (blockIdx.x * blockDim.x) + threadIdx.x;
    if (row_idx < spmv_params.num_rows)
    {
        // CSR row extent: [d_row_end_offsets[row_idx - 1], d_row_end_offsets[row_idx])
        // (d_row_end_offsets points one past the leading zero of the offsets array)
        OffsetT     end_nonzero_idx = spmv_params.d_row_end_offsets[row_idx];
        OffsetT     nonzero_idx     = spmv_params.d_row_end_offsets[row_idx - 1];

        ValueT value = 0.0;
        if (end_nonzero_idx != nonzero_idx)
        {
            // Single-column matrix: at most one nonzero per row
            value = spmv_params.d_values[nonzero_idx] * wrapped_vector_x[spmv_params.d_column_indices[nonzero_idx]];
        }

        spmv_params.d_vector_y[row_idx] = value;
    }
}
94 
95 
/**
 * Search kernel: computes the merge-path starting coordinate for each SpMV
 * merge tile (plus the end coordinate of the last tile), writing
 * num_merge_tiles + 1 coordinates to d_tile_coordinates.
 *
 * Grid layout: 1D grid of 1D blocks, at least num_merge_tiles + 1 threads.
 * NOTE(review): the extracted source had dropped the typedef opening and the
 * MergePathSearch( call line; restored from the MergePathSearch signature in
 * the documentation residue.
 */
template <
    typename    SpmvPolicyT,                ///< Parameterized SpmvPolicy tuning policy type
    typename    OffsetT,                    ///< Signed integer type for sequence offsets
    typename    CoordinateT,               ///< Merge path coordinate type
    typename    SpmvParamsT>                ///< SpmvParams type
__global__ void DeviceSpmvSearchKernel(
    int             num_merge_tiles,        ///< [in] Number of SpMV merge tiles (spmv grid size)
    CoordinateT*    d_tile_coordinates,     ///< [out] Pointer to the temporary array of tile starting coordinates
    SpmvParamsT     spmv_params)            ///< [in] SpMV input parameter bundle
{
    /// Constants
    enum
    {
        BLOCK_THREADS           = SpmvPolicyT::BLOCK_THREADS,
        ITEMS_PER_THREAD        = SpmvPolicyT::ITEMS_PER_THREAD,
        TILE_ITEMS              = BLOCK_THREADS * ITEMS_PER_THREAD,
    };

    // Wrap the row-end-offsets pointer so search reads use the policy's cache modifier
    typedef CacheModifiedInputIterator<
            SpmvPolicyT::ROW_OFFSETS_SEARCH_LOAD_MODIFIER,
            OffsetT,
            OffsetT>
        RowOffsetsSearchIteratorT;

    // Find the starting coordinate for all tiles (plus the end coordinate of the last one)
    int tile_idx = (blockIdx.x * blockDim.x) + threadIdx.x;
    if (tile_idx < num_merge_tiles + 1)
    {
        OffsetT                         diagonal = (tile_idx * TILE_ITEMS);
        CoordinateT                     tile_coordinate;
        CountingInputIterator<OffsetT>  nonzero_indices(0);

        // Search the merge path: rows (list A) vs. nonzero indices (list B)
        MergePathSearch(
            diagonal,
            RowOffsetsSearchIteratorT(spmv_params.d_row_end_offsets),
            nonzero_indices,
            spmv_params.num_rows,
            spmv_params.num_nonzeros,
            tile_coordinate);

        // Output starting offset
        d_tile_coordinates[tile_idx] = tile_coordinate;
    }
}
144 
145 
/**
 * Main SpMV kernel: each thread block consumes one (or more, via a 2D grid)
 * merge tile, producing per-block carry-out partial dot products that are
 * later reconciled by DeviceSegmentFixupKernel.
 *
 * NOTE(review): the extracted source had dropped three parameters
 * (spmv_params, d_tile_carry_pairs, num_segment_fixup_tiles) and the
 * ConsumeTile argument lines; restored from the parameter descriptions in
 * the documentation residue.
 */
template <
    typename        SpmvPolicyT,            ///< Parameterized SpmvPolicy tuning policy type
    typename        ScanTileStateT,         ///< Tile status interface type
    typename        ValueT,                 ///< Matrix and vector value type
    typename        OffsetT,                ///< Signed integer type for sequence offsets
    typename        CoordinateT,            ///< Merge path coordinate type
    bool            HAS_ALPHA,              ///< Whether the input parameter Alpha is 1
    bool            HAS_BETA>               ///< Whether the input parameter Beta is 0
__launch_bounds__ (int(SpmvPolicyT::BLOCK_THREADS))
__global__ void DeviceSpmvKernel(
    SpmvParams<ValueT, OffsetT>     spmv_params,                ///< [in] SpMV input parameter bundle
    CoordinateT*                    d_tile_coordinates,         ///< [in] Pointer to the temporary array of tile starting coordinates (NULL => blocks search their own)
    KeyValuePair<OffsetT, ValueT>*  d_tile_carry_pairs,         ///< [out] Pointer to the temporary array carry-out dot product row-ids, one per block
    int                             num_tiles,                  ///< [in] Number of merge tiles
    ScanTileStateT                  tile_state,                 ///< [in] Tile status interface for the fixup reduce-by-key kernel
    int                             num_segment_fixup_tiles)    ///< [in] Number of reduce-by-key tiles (fixup grid size)
{
    // Spmv agent type specialization
    typedef AgentSpmv<
            SpmvPolicyT,
            ValueT,
            OffsetT,
            HAS_ALPHA,
            HAS_BETA>
        AgentSpmvT;

    // Shared memory for AgentSpmv
    __shared__ typename AgentSpmvT::TempStorage temp_storage;

    AgentSpmvT(temp_storage, spmv_params).ConsumeTile(
        d_tile_coordinates,
        d_tile_carry_pairs,
        num_tiles);

    // Initialize fixup tile status (piggy-backed here to avoid a separate init kernel)
    tile_state.InitializeStatus(num_segment_fixup_tiles);
}
187 
188 
/**
 * Fixup kernel: performs a reduce-by-key over the per-block carry-out pairs
 * produced by DeviceSpmvKernel, accumulating partial dot products that
 * straddled block boundaries into the output vector.
 *
 * NOTE(review): the extracted source had dropped the OffsetT num_items
 * parameter and the cub::Equality line of the agent typedef; restored from
 * the parameter descriptions in the documentation residue.
 */
template <
    typename    AgentSegmentFixupPolicyT,       ///< Parameterized AgentSegmentFixupPolicy tuning policy type
    typename    PairsInputIteratorT,            ///< Random-access input iterator type for keys
    typename    AggregatesOutputIteratorT,      ///< Random-access output iterator type for values
    typename    OffsetT,                        ///< Signed integer type for global offsets
    typename    ScanTileStateT>                 ///< Tile status interface type
__launch_bounds__ (int(AgentSegmentFixupPolicyT::BLOCK_THREADS))
__global__ void DeviceSegmentFixupKernel(
    PairsInputIteratorT         d_pairs_in,         ///< [in] Pointer to the array carry-out dot product row-ids, one per spmv block
    AggregatesOutputIteratorT   d_aggregates_out,   ///< [in,out] Output value aggregates
    OffsetT                     num_items,          ///< [in] Total number of items to select from
    int                         num_tiles,          ///< [in] Total number of tiles for the entire problem
    ScanTileStateT              tile_state)         ///< [in] Tile status interface
{
    // Thread block type for reducing tiles of value segments
    typedef AgentSegmentFixup<
            AgentSegmentFixupPolicyT,
            PairsInputIteratorT,
            AggregatesOutputIteratorT,
            cub::Equality,
            cub::Sum,
            OffsetT>
        AgentSegmentFixupT;

    // Shared memory for AgentSegmentFixup
    __shared__ typename AgentSegmentFixupT::TempStorage temp_storage;

    // Process tiles
    AgentSegmentFixupT(temp_storage, d_pairs_in, d_aggregates_out, cub::Equality(), cub::Sum()).ConsumeRange(
        num_items,
        num_tiles,
        tile_state);
}
225 
226 
227 /******************************************************************************
228  * Dispatch
229  ******************************************************************************/
230 
/**
 * Utility class for dispatching the appropriately-tuned kernels for DeviceSpmv.
 *
 * NOTE(review): the extracted source had dropped the struct declaration line,
 * the SpmvParamsT/ScanTileStateT/KeyValuePairT typedefs, the opaque PTX policy
 * typedefs, the SegmentFixupPolicyT tails of each tuning policy, and several
 * parameter/argument lines in both Dispatch overloads. All were restored from
 * the visible lines plus the signatures preserved in the documentation
 * residue; confirm against the upstream CUB dispatch_spmv_orig.cuh.
 * Also fixed: stray `break;;`, and a comment that mislabeled the 1-column
 * launch as spmv_search_kernel.
 */
template <
    typename    ValueT,                     ///< Matrix and vector value type
    typename    OffsetT>                    ///< Signed integer type for global offsets
struct DispatchSpmv
{
    //---------------------------------------------------------------------
    // Constants and Types
    //---------------------------------------------------------------------

    enum
    {
        INIT_KERNEL_THREADS = 128           // block size for the search / degenerate-column kernels
    };

    // SpmvParams bundle type
    typedef SpmvParams<ValueT, OffsetT> SpmvParamsT;

    // 2D merge path coordinate type
    typedef typename CubVector<OffsetT, 2>::Type CoordinateT;

    // Tile status descriptor interface type
    typedef ReduceByKeyScanTileState<ValueT, OffsetT> ScanTileStateT;

    // Tuple type for scanning (pairs accumulated segment-value with segment-index)
    typedef KeyValuePair<OffsetT, ValueT> KeyValuePairT;


    //---------------------------------------------------------------------
    // Tuning policies
    //---------------------------------------------------------------------

    /// SM11
    struct Policy110
    {
        typedef AgentSpmvPolicy<
                128,
                1,
                LOAD_DEFAULT,
                LOAD_DEFAULT,
                LOAD_DEFAULT,
                LOAD_DEFAULT,
                LOAD_DEFAULT,
                false,
                BLOCK_SCAN_WARP_SCANS>
            SpmvPolicyT;

        typedef AgentSegmentFixupPolicy<
                128,
                4,
                BLOCK_LOAD_VECTORIZE,
                LOAD_DEFAULT,
                BLOCK_SCAN_WARP_SCANS>
            SegmentFixupPolicyT;
    };

    /// SM20
    struct Policy200
    {
        typedef AgentSpmvPolicy<
                96,
                18,
                LOAD_DEFAULT,
                LOAD_DEFAULT,
                LOAD_DEFAULT,
                LOAD_DEFAULT,
                LOAD_DEFAULT,
                false,
                BLOCK_SCAN_RAKING>
            SpmvPolicyT;

        typedef AgentSegmentFixupPolicy<
                128,
                4,
                BLOCK_LOAD_VECTORIZE,
                LOAD_DEFAULT,
                BLOCK_SCAN_WARP_SCANS>
            SegmentFixupPolicyT;
    };

    /// SM30
    struct Policy300
    {
        typedef AgentSpmvPolicy<
                96,
                6,
                LOAD_DEFAULT,
                LOAD_DEFAULT,
                LOAD_DEFAULT,
                LOAD_DEFAULT,
                LOAD_DEFAULT,
                false,
                BLOCK_SCAN_WARP_SCANS>
            SpmvPolicyT;

        typedef AgentSegmentFixupPolicy<
                128,
                4,
                BLOCK_LOAD_VECTORIZE,
                LOAD_DEFAULT,
                BLOCK_SCAN_WARP_SCANS>
            SegmentFixupPolicyT;
    };

    /// SM35
    struct Policy350
    {
        typedef AgentSpmvPolicy<
                (sizeof(ValueT) > 4) ? 96 : 128,
                (sizeof(ValueT) > 4) ? 4 : 7,
                LOAD_LDG,
                LOAD_CA,
                LOAD_LDG,
                LOAD_LDG,
                LOAD_LDG,
                (sizeof(ValueT) > 4) ? true : false,
                BLOCK_SCAN_WARP_SCANS>
            SpmvPolicyT;

        typedef AgentSegmentFixupPolicy<
                128,
                3,
                BLOCK_LOAD_VECTORIZE,
                LOAD_LDG,
                BLOCK_SCAN_WARP_SCANS>
            SegmentFixupPolicyT;
    };

    /// SM37
    struct Policy370
    {
        typedef AgentSpmvPolicy<
                (sizeof(ValueT) > 4) ? 128 : 128,
                (sizeof(ValueT) > 4) ? 9 : 14,
                LOAD_LDG,
                LOAD_CA,
                LOAD_LDG,
                LOAD_LDG,
                LOAD_LDG,
                false,
                BLOCK_SCAN_WARP_SCANS>
            SpmvPolicyT;

        typedef AgentSegmentFixupPolicy<
                128,
                3,
                BLOCK_LOAD_VECTORIZE,
                LOAD_LDG,
                BLOCK_SCAN_WARP_SCANS>
            SegmentFixupPolicyT;
    };

    /// SM50
    struct Policy500
    {
        typedef AgentSpmvPolicy<
                (sizeof(ValueT) > 4) ? 64 : 128,
                (sizeof(ValueT) > 4) ? 6 : 7,
                LOAD_LDG,
                LOAD_DEFAULT,
                (sizeof(ValueT) > 4) ? LOAD_LDG : LOAD_DEFAULT,
                (sizeof(ValueT) > 4) ? LOAD_LDG : LOAD_DEFAULT,
                LOAD_LDG,
                (sizeof(ValueT) > 4) ? true : false,
                (sizeof(ValueT) > 4) ? BLOCK_SCAN_WARP_SCANS : BLOCK_SCAN_RAKING_MEMOIZE>
            SpmvPolicyT;

        typedef AgentSegmentFixupPolicy<
                128,
                3,
                BLOCK_LOAD_VECTORIZE,
                LOAD_LDG,
                BLOCK_SCAN_RAKING_MEMOIZE>
            SegmentFixupPolicyT;
    };

    /// SM60
    struct Policy600
    {
        typedef AgentSpmvPolicy<
                (sizeof(ValueT) > 4) ? 64 : 128,
                (sizeof(ValueT) > 4) ? 5 : 7,
                LOAD_DEFAULT,
                LOAD_DEFAULT,
                LOAD_DEFAULT,
                LOAD_DEFAULT,
                LOAD_DEFAULT,
                false,
                BLOCK_SCAN_WARP_SCANS>
            SpmvPolicyT;

        typedef AgentSegmentFixupPolicy<
                128,
                3,
                BLOCK_LOAD_DIRECT,
                LOAD_LDG,
                BLOCK_SCAN_WARP_SCANS>
            SegmentFixupPolicyT;
    };


    //---------------------------------------------------------------------
    // Tuning policies of current PTX compiler pass
    //---------------------------------------------------------------------

#if (CUB_PTX_ARCH >= 600)
    typedef Policy600 PtxPolicy;

#elif (CUB_PTX_ARCH >= 500)
    typedef Policy500 PtxPolicy;

#elif (CUB_PTX_ARCH >= 370)
    typedef Policy370 PtxPolicy;

#elif (CUB_PTX_ARCH >= 350)
    typedef Policy350 PtxPolicy;

#elif (CUB_PTX_ARCH >= 300)
    typedef Policy300 PtxPolicy;

#elif (CUB_PTX_ARCH >= 200)
    typedef Policy200 PtxPolicy;

#else
    typedef Policy110 PtxPolicy;

#endif

    // "Opaque" policies (whose parameterizations aren't reflected in the type signature)
    struct PtxSpmvPolicyT : PtxPolicy::SpmvPolicyT {};
    struct PtxSegmentFixupPolicy : PtxPolicy::SegmentFixupPolicyT {};


    //---------------------------------------------------------------------
    // Utilities
    //---------------------------------------------------------------------

    /**
     * Initialize kernel dispatch configurations with the policies corresponding
     * to the PTX assembly we will use
     */
    template <typename KernelConfig>
    CUB_RUNTIME_FUNCTION __forceinline__
    static void InitConfigs(
        int             ptx_version,
        KernelConfig    &spmv_config,
        KernelConfig    &segment_fixup_config)
    {
    #if (CUB_PTX_ARCH > 0)

        // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy
        spmv_config.template Init<PtxSpmvPolicyT>();
        segment_fixup_config.template Init<PtxSegmentFixupPolicy>();

    #else

        // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version
        if (ptx_version >= 600)
        {
            spmv_config.template Init<typename Policy600::SpmvPolicyT>();
            segment_fixup_config.template Init<typename Policy600::SegmentFixupPolicyT>();
        }
        else if (ptx_version >= 500)
        {
            spmv_config.template Init<typename Policy500::SpmvPolicyT>();
            segment_fixup_config.template Init<typename Policy500::SegmentFixupPolicyT>();
        }
        else if (ptx_version >= 370)
        {
            spmv_config.template Init<typename Policy370::SpmvPolicyT>();
            segment_fixup_config.template Init<typename Policy370::SegmentFixupPolicyT>();
        }
        else if (ptx_version >= 350)
        {
            spmv_config.template Init<typename Policy350::SpmvPolicyT>();
            segment_fixup_config.template Init<typename Policy350::SegmentFixupPolicyT>();
        }
        else if (ptx_version >= 300)
        {
            spmv_config.template Init<typename Policy300::SpmvPolicyT>();
            segment_fixup_config.template Init<typename Policy300::SegmentFixupPolicyT>();
        }
        else if (ptx_version >= 200)
        {
            spmv_config.template Init<typename Policy200::SpmvPolicyT>();
            segment_fixup_config.template Init<typename Policy200::SegmentFixupPolicyT>();
        }
        else
        {
            spmv_config.template Init<typename Policy110::SpmvPolicyT>();
            segment_fixup_config.template Init<typename Policy110::SegmentFixupPolicyT>();
        }

    #endif
    }


    /**
     * Kernel dispatch configuration (block size / items-per-thread / tile size
     * extracted from a tuning policy at runtime)
     */
    struct KernelConfig
    {
        int block_threads;
        int items_per_thread;
        int tile_items;

        template <typename PolicyT>
        CUB_RUNTIME_FUNCTION __forceinline__
        void Init()
        {
            block_threads       = PolicyT::BLOCK_THREADS;
            items_per_thread    = PolicyT::ITEMS_PER_THREAD;
            tile_items          = block_threads * items_per_thread;
        }
    };


    //---------------------------------------------------------------------
    // Dispatch entrypoints
    //---------------------------------------------------------------------

    /**
     * Internal dispatch routine for computing a device-wide SpMV using the
     * specified kernel functions.
     *
     * If the input is larger than a single tile, this method uses two-passes of
     * kernel invocations.
     */
    template <
        typename                Spmv1ColKernelT,                    ///< Function type of cub::DeviceSpmv1ColKernel
        typename                SpmvSearchKernelT,                  ///< Function type of cub::AgentSpmvSearchKernel
        typename                SpmvKernelT,                        ///< Function type of cub::AgentSpmvKernel
        typename                SegmentFixupKernelT>                ///< Function type of cub::DeviceSegmentFixupKernelT
    CUB_RUNTIME_FUNCTION __forceinline__
    static cudaError_t Dispatch(
        void*                   d_temp_storage,                     ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
        size_t&                 temp_storage_bytes,                 ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        SpmvParamsT&            spmv_params,                        ///< SpMV input parameter bundle
        cudaStream_t            stream,                             ///< [in] CUDA stream to launch kernels within.
        bool                    debug_synchronous,                  ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors.
        Spmv1ColKernelT         spmv_1col_kernel,                   ///< [in] Kernel function pointer to parameterization of DeviceSpmv1ColKernel
        SpmvSearchKernelT       spmv_search_kernel,                 ///< [in] Kernel function pointer to parameterization of AgentSpmvSearchKernel
        SpmvKernelT             spmv_kernel,                        ///< [in] Kernel function pointer to parameterization of AgentSpmvKernel
        SegmentFixupKernelT     segment_fixup_kernel,               ///< [in] Kernel function pointer to parameterization of cub::DeviceSegmentFixupKernel
        KernelConfig            spmv_config,                        ///< [in] Dispatch parameters that match the policy that \p spmv_kernel was compiled for
        KernelConfig            segment_fixup_config)               ///< [in] Dispatch parameters that match the policy that \p segment_fixup_kernel was compiled for
    {
#ifndef CUB_RUNTIME_ENABLED

        // Kernel launch not supported from this device
        return CubDebug(cudaErrorNotSupported );

#else
        cudaError error = cudaSuccess;
        do
        {
            // Degenerate case: single-column matrix, handled by a dedicated one-thread-per-row kernel
            if (spmv_params.num_cols == 1)
            {
                if (d_temp_storage == NULL)
                {
                    // Return if the caller is simply requesting the size of the storage allocation
                    temp_storage_bytes = 1;
                    break;
                }

                // Get search/init grid dims
                int degen_col_kernel_block_size = INIT_KERNEL_THREADS;
                int degen_col_kernel_grid_size = (spmv_params.num_rows + degen_col_kernel_block_size - 1) / degen_col_kernel_block_size;

                if (debug_synchronous) _CubLog("Invoking spmv_1col_kernel<<<%d, %d, 0, %lld>>>()\n",
                    degen_col_kernel_grid_size, degen_col_kernel_block_size, (long long) stream);

                // Invoke spmv_1col_kernel
                spmv_1col_kernel<<<degen_col_kernel_grid_size, degen_col_kernel_block_size, 0, stream>>>(
                    spmv_params);

                // Check for failure to launch
                if (CubDebug(error = cudaPeekAtLastError())) break;

                // Sync the stream if specified to flush runtime errors
                if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;

                break;
            }

            // Get device ordinal
            int device_ordinal;
            if (CubDebug(error = cudaGetDevice(&device_ordinal))) break;

            // Get SM count
            int sm_count;
            if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break;

            // Get max x-dimension of grid
            int max_dim_x;
            if (CubDebug(error = cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal))) break;

            // Total number of spmv work items (merge-path length: one decision per row plus one per nonzero)
            int num_merge_items = spmv_params.num_rows + spmv_params.num_nonzeros;

            // Tile sizes of kernels
            int merge_tile_size            = spmv_config.block_threads * spmv_config.items_per_thread;
            int segment_fixup_tile_size    = segment_fixup_config.block_threads * segment_fixup_config.items_per_thread;

            // Number of tiles for kernels
            unsigned int num_merge_tiles            = (num_merge_items + merge_tile_size - 1) / merge_tile_size;
            unsigned int num_segment_fixup_tiles    = (num_merge_tiles + segment_fixup_tile_size - 1) / segment_fixup_tile_size;

            // Get SM occupancy for kernels
            int spmv_sm_occupancy;
            if (CubDebug(error = MaxSmOccupancy(
                spmv_sm_occupancy,
                spmv_kernel,
                spmv_config.block_threads))) break;

            int segment_fixup_sm_occupancy;
            if (CubDebug(error = MaxSmOccupancy(
                segment_fixup_sm_occupancy,
                segment_fixup_kernel,
                segment_fixup_config.block_threads))) break;

            // Get grid dimensions (fold tile counts exceeding max_dim_x into the y dimension)
            dim3 spmv_grid_size(
                CUB_MIN(num_merge_tiles, max_dim_x),
                (num_merge_tiles + max_dim_x - 1) / max_dim_x,
                1);

            dim3 segment_fixup_grid_size(
                CUB_MIN(num_segment_fixup_tiles, max_dim_x),
                (num_segment_fixup_tiles + max_dim_x - 1) / max_dim_x,
                1);

            // Get the temporary storage allocation requirements
            size_t allocation_sizes[3];
            if (CubDebug(error = ScanTileStateT::AllocationSize(num_segment_fixup_tiles, allocation_sizes[0]))) break;    // bytes needed for reduce-by-key tile status descriptors
            allocation_sizes[1] = num_merge_tiles * sizeof(KeyValuePairT);       // bytes needed for block carry-out pairs
            allocation_sizes[2] = (num_merge_tiles + 1) * sizeof(CoordinateT);   // bytes needed for tile starting coordinates

            // Alias the temporary allocations from the single storage blob (or compute the necessary size of the blob)
            void* allocations[3];
            if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break;
            if (d_temp_storage == NULL)
            {
                // Return if the caller is simply requesting the size of the storage allocation
                break;
            }

            // Construct the tile status interface
            ScanTileStateT tile_state;
            if (CubDebug(error = tile_state.Init(num_segment_fixup_tiles, allocations[0], allocation_sizes[0]))) break;

            // Alias the other allocations
            KeyValuePairT*  d_tile_carry_pairs      = (KeyValuePairT*) allocations[1];  // Agent carry-out pairs
            CoordinateT*    d_tile_coordinates      = (CoordinateT*) allocations[2];    // Agent starting coordinates

            // Get search/init grid dims
            int search_block_size   = INIT_KERNEL_THREADS;
            int search_grid_size    = (num_merge_tiles + 1 + search_block_size - 1) / search_block_size;

#if (CUB_PTX_ARCH == 0)
            // Init textures
            if (CubDebug(error = spmv_params.t_vector_x.BindTexture(spmv_params.d_vector_x))) break;
#endif

            if (search_grid_size < sm_count)
//            if (num_merge_tiles < spmv_sm_occupancy * sm_count)
            {
                // Not enough spmv tiles to saturate the device: have spmv blocks search their own starting coords
                d_tile_coordinates = NULL;
            }
            else
            {
                // Use separate search kernel if we have enough spmv tiles to saturate the device

                // Log spmv_search_kernel configuration
                if (debug_synchronous) _CubLog("Invoking spmv_search_kernel<<<%d, %d, 0, %lld>>>()\n",
                    search_grid_size, search_block_size, (long long) stream);

                // Invoke spmv_search_kernel
                spmv_search_kernel<<<search_grid_size, search_block_size, 0, stream>>>(
                    num_merge_tiles,
                    d_tile_coordinates,
                    spmv_params);

                // Check for failure to launch
                if (CubDebug(error = cudaPeekAtLastError())) break;

                // Sync the stream if specified to flush runtime errors
                if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
            }

            // Log spmv_kernel configuration
            if (debug_synchronous) _CubLog("Invoking spmv_kernel<<<{%d,%d,%d}, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
                spmv_grid_size.x, spmv_grid_size.y, spmv_grid_size.z, spmv_config.block_threads, (long long) stream, spmv_config.items_per_thread, spmv_sm_occupancy);

            // Invoke spmv_kernel
            spmv_kernel<<<spmv_grid_size, spmv_config.block_threads, 0, stream>>>(
                spmv_params,
                d_tile_coordinates,
                d_tile_carry_pairs,
                num_merge_tiles,
                tile_state,
                num_segment_fixup_tiles);

            // Check for failure to launch
            if (CubDebug(error = cudaPeekAtLastError())) break;

            // Sync the stream if specified to flush runtime errors
            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;

            // Run reduce-by-key fixup if necessary (carry-out pairs only exist across tile boundaries)
            if (num_merge_tiles > 1)
            {
                // Log segment_fixup_kernel configuration
                if (debug_synchronous) _CubLog("Invoking segment_fixup_kernel<<<{%d,%d,%d}, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
                    segment_fixup_grid_size.x, segment_fixup_grid_size.y, segment_fixup_grid_size.z, segment_fixup_config.block_threads, (long long) stream, segment_fixup_config.items_per_thread, segment_fixup_sm_occupancy);

                // Invoke segment_fixup_kernel
                segment_fixup_kernel<<<segment_fixup_grid_size, segment_fixup_config.block_threads, 0, stream>>>(
                    d_tile_carry_pairs,
                    spmv_params.d_vector_y,
                    num_merge_tiles,
                    num_segment_fixup_tiles,
                    tile_state);

                // Check for failure to launch
                if (CubDebug(error = cudaPeekAtLastError())) break;

                // Sync the stream if specified to flush runtime errors
                if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
            }

#if (CUB_PTX_ARCH == 0)
            // Free textures
            if (CubDebug(error = spmv_params.t_vector_x.UnbindTexture())) break;
#endif
        }
        while (0);

        return error;

#endif // CUB_RUNTIME_ENABLED
    }


    /**
     * Public dispatch routine: determines the PTX version, selects matching
     * tuning policies, and forwards to the internal Dispatch above.
     */
    CUB_RUNTIME_FUNCTION __forceinline__
    static cudaError_t Dispatch(
        void*                   d_temp_storage,                     ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
        size_t&                 temp_storage_bytes,                 ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        SpmvParamsT&            spmv_params,                        ///< SpMV input parameter bundle
        cudaStream_t            stream                  = 0,        ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
        bool                    debug_synchronous       = false)    ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.
    {
        cudaError error = cudaSuccess;
        do
        {
            // Get PTX version
            int ptx_version;
    #if (CUB_PTX_ARCH == 0)
            if (CubDebug(error = PtxVersion(ptx_version))) break;
    #else
            ptx_version = CUB_PTX_ARCH;
    #endif

            // Get kernel dispatch configurations
            KernelConfig spmv_config, segment_fixup_config;
            InitConfigs(ptx_version, spmv_config, segment_fixup_config);

            if (CubDebug(error = Dispatch(
                d_temp_storage, temp_storage_bytes, spmv_params, stream, debug_synchronous,
                DeviceSpmv1ColKernel<PtxSpmvPolicyT, ValueT, OffsetT>,
                DeviceSpmvSearchKernel<PtxSpmvPolicyT, OffsetT, CoordinateT, SpmvParamsT>,
                DeviceSpmvKernel<PtxSpmvPolicyT, ScanTileStateT, ValueT, OffsetT, CoordinateT, false, false>,
                DeviceSegmentFixupKernel<PtxSegmentFixupPolicy, KeyValuePairT*, ValueT*, OffsetT, ScanTileStateT>,
                spmv_config, segment_fixup_config))) break;

        }
        while (0);

        return error;
    }
};
829 
830 
831 } // CUB namespace
832 CUB_NS_POSTFIX // Optional outer namespace(s)
833 
834 
Cache as texture.
Definition: thread_load.cuh:69
Cache at all levels.
Definition: thread_load.cuh:65
Default (no modifier)
Definition: thread_load.cuh:64
OffsetT CoordinateT KeyValuePair< OffsetT, ValueT > * d_tile_carry_pairs
[out] Pointer to the temporary array carry-out dot product row-ids, one per block
__host__ __device__ __forceinline__ cudaError_t AliasTemporaries(void *d_temp_storage, size_t &temp_storage_bytes, void *(&allocations)[ALLOCATIONS], size_t(&allocation_sizes)[ALLOCATIONS])
Definition: util_device.cuh:62
KeyT const ValueT ValueT OffsetT OffsetT num_items
[in] Total number of input data items
A random-access input generator for dereferencing a sequence of incrementing integer values.
CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t MaxSmOccupancy(int &max_sm_occupancy, KernelPtr kernel_ptr, int block_threads, int dynamic_smem_bytes=0)
Computes the maximum SM occupancy in thread blocks for executing the given kernel function pointer kernel_ptr.
Optional outer namespace(s)
#define CUB_PTX_ARCH
CUB_PTX_ARCH reflects the PTX version targeted by the active compiler pass (or zero during the host pass).
Definition: util_arch.cuh:53
CUB_RUNTIME_FUNCTION static __forceinline__ cudaError_t Dispatch(void *d_temp_storage, size_t &temp_storage_bytes, SpmvParamsT &spmv_params, cudaStream_t stream=0, bool debug_synchronous=false)
< The BlockScan algorithm to use
A key identifier paired with a corresponding value.
Definition: util_type.cuh:666
Default equality functor.
AgentSpmv implements a stateful abstraction of CUDA thread blocks for participating in device-wide SpMV.
UniqueOutputIteratorT ValuesInputIteratorT AggregatesOutputIteratorT NumRunsOutputIteratorT ScanTileStateT tile_state
Tile status interface.
#define _CubLog(format,...)
Log macro for printf statements.
Definition: util_debug.cuh:112
OffsetT OffsetT
[in] Total number of input data items
__host__ __device__ __forceinline__ void MergePathSearch(OffsetT diagonal, AIteratorT a, BIteratorT b, OffsetT a_len, OffsetT b_len, CoordinateT &path_coordinate)
OffsetT CoordinateT * d_tile_coordinates
[in] Pointer to the temporary array of tile starting coordinates
__global__ void DeviceSpmv1ColKernel(SpmvParams< ValueT, OffsetT > spmv_params)
< Signed integer type for sequence offsets
UniqueOutputIteratorT ValuesInputIteratorT AggregatesOutputIteratorT d_aggregates_out
Pointer to the output sequence of value aggregates (one aggregate per run)
__global__ void DeviceSpmvSearchKernel(int num_merge_tiles, CoordinateT *d_tile_coordinates, SpmvParamsT spmv_params)
< SpmvParams type
CUB_RUNTIME_FUNCTION static __forceinline__ cudaError_t Dispatch(void *d_temp_storage, size_t &temp_storage_bytes, SpmvParamsT &spmv_params, cudaStream_t stream, bool debug_synchronous, Spmv1ColKernelT spmv_1col_kernel, SpmvSearchKernelT spmv_search_kernel, SpmvKernelT spmv_kernel, SegmentFixupKernelT segment_fixup_kernel, KernelConfig spmv_config, KernelConfig segment_fixup_config)
< Function type of cub::DeviceSegmentFixupKernelT
< The BlockScan algorithm to use
CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t PtxVersion(int &ptx_version)
Retrieves the PTX version that will be used on the current device (major * 100 + minor * 10)
CUB_RUNTIME_FUNCTION static __forceinline__ cudaError_t SyncStream(cudaStream_t stream)
OffsetsOutputIteratorT LengthsOutputIteratorT NumRunsOutputIteratorT ScanTileStateT EqualityOpT OffsetT int num_tiles
< [in] Total number of tiles for the entire problem
#define CUB_MIN(a, b)
Select minimum(a, b)
Definition: util_macro.cuh:66
Exposes a member typedef Type that names the corresponding CUDA vector type if one exists....
Definition: util_type.cuh:454
OffsetT spmv_params
[in] SpMV input parameter bundle
< Signed integer type for global offsets
Default sum functor.
#define CubDebug(e)
Debug macro.
Definition: util_debug.cuh:94
CUB_RUNTIME_FUNCTION static __forceinline__ void InitConfigs(int ptx_version, KernelConfig &spmv_config, KernelConfig &segment_fixup_config)
__launch_bounds__(int(AgentHistogramPolicyT::BLOCK_THREADS)) __global__ void DeviceHistogramSweepKernel(SampleIteratorT d_samples
< Signed integer type for global offsets
A random-access input wrapper for dereferencing array values using a PTX cache load modifier.
< Signed integer type for sequence offsets
OffsetT CoordinateT KeyValuePair< OffsetT, ValueT > int ScanTileStateT int num_segment_fixup_tiles
< [in] Number of reduce-by-key tiles (fixup grid size)