40#include "../../agent/single_pass_scan_operators.cuh"
41#include "../../agent/agent_segment_fixup.cuh"
42#include "../../agent/agent_spmv_orig.cuh"
43#include "../../util_type.cuh"
44#include "../../util_debug.cuh"
45#include "../../util_device.cuh"
46#include "../../thread/thread_search.cuh"
47#include "../../grid/grid_queue.cuh"
48#include "../../util_namespace.cuh"
65 typename AgentSpmvPolicyT,
72 AgentSpmvPolicyT::VECTOR_VALUES_LOAD_MODIFIER,
77 VectorValueIteratorT wrapped_vector_x(
spmv_params.d_vector_x);
79 int row_idx = (blockIdx.x * blockDim.x) + threadIdx.x;
86 if (end_nonzero_idx != nonzero_idx)
100 typename SpmvPolicyT,
102 typename CoordinateT,
103 typename SpmvParamsT>
112 BLOCK_THREADS = SpmvPolicyT::BLOCK_THREADS,
113 ITEMS_PER_THREAD = SpmvPolicyT::ITEMS_PER_THREAD,
114 TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD,
118 SpmvPolicyT::ROW_OFFSETS_SEARCH_LOAD_MODIFIER,
121 RowOffsetsSearchIteratorT;
124 int tile_idx = (blockIdx.x * blockDim.x) + threadIdx.x;
125 if (tile_idx < num_merge_tiles + 1)
127 OffsetT diagonal = (tile_idx * TILE_ITEMS);
128 CoordinateT tile_coordinate;
134 RowOffsetsSearchIteratorT(
spmv_params.d_row_end_offsets),
150 typename SpmvPolicyT,
151 typename ScanTileStateT,
154 typename CoordinateT,
158__global__
void DeviceSpmvKernel(
176 __shared__
typename AgentSpmvT::TempStorage temp_storage;
193 typename AgentSegmentFixupPolicyT,
194 typename PairsInputIteratorT,
195 typename AggregatesOutputIteratorT,
197 typename ScanTileStateT>
199__global__
void DeviceSegmentFixupKernel(
200 PairsInputIteratorT d_pairs_in,
208 AgentSegmentFixupPolicyT,
210 AggregatesOutputIteratorT,
217 __shared__
typename AgentSegmentFixupT::TempStorage temp_storage;
245 INIT_KERNEL_THREADS = 128
346 (
sizeof(ValueT) > 4) ? 96 : 128,
347 (
sizeof(ValueT) > 4) ? 4 : 7,
353 (
sizeof(ValueT) > 4) ? true :
false,
372 (
sizeof(ValueT) > 4) ? 128 : 128,
373 (
sizeof(ValueT) > 4) ? 9 : 14,
396 (
sizeof(ValueT) > 4) ? 64 : 128,
397 (
sizeof(ValueT) > 4) ? 6 : 7,
403 (
sizeof(ValueT) > 4) ? true :
false,
422 (
sizeof(ValueT) > 4) ? 64 : 128,
423 (
sizeof(ValueT) > 4) ? 5 : 7,
449#if (CUB_PTX_ARCH >= 600)
452#elif (CUB_PTX_ARCH >= 500)
455#elif (CUB_PTX_ARCH >= 370)
458#elif (CUB_PTX_ARCH >= 350)
461#elif (CUB_PTX_ARCH >= 300)
464#elif (CUB_PTX_ARCH >= 200)
484 template <
typename KernelConfig>
485 CUB_RUNTIME_FUNCTION __forceinline__
491 #if (CUB_PTX_ARCH > 0)
494 spmv_config.template Init<PtxSpmvPolicyT>();
495 segment_fixup_config.template Init<PtxSegmentFixupPolicy>();
500 if (ptx_version >= 600)
502 spmv_config.template Init<typename Policy600::SpmvPolicyT>();
503 segment_fixup_config.template Init<typename Policy600::SegmentFixupPolicyT>();
505 else if (ptx_version >= 500)
507 spmv_config.template Init<typename Policy500::SpmvPolicyT>();
508 segment_fixup_config.template Init<typename Policy500::SegmentFixupPolicyT>();
510 else if (ptx_version >= 370)
512 spmv_config.template Init<typename Policy370::SpmvPolicyT>();
513 segment_fixup_config.template Init<typename Policy370::SegmentFixupPolicyT>();
515 else if (ptx_version >= 350)
517 spmv_config.template Init<typename Policy350::SpmvPolicyT>();
518 segment_fixup_config.template Init<typename Policy350::SegmentFixupPolicyT>();
520 else if (ptx_version >= 300)
522 spmv_config.template Init<typename Policy300::SpmvPolicyT>();
523 segment_fixup_config.template Init<typename Policy300::SegmentFixupPolicyT>();
526 else if (ptx_version >= 200)
528 spmv_config.template Init<typename Policy200::SpmvPolicyT>();
529 segment_fixup_config.template Init<typename Policy200::SegmentFixupPolicyT>();
533 spmv_config.template Init<typename Policy110::SpmvPolicyT>();
534 segment_fixup_config.template Init<typename Policy110::SegmentFixupPolicyT>();
547 int items_per_thread;
550 template <
typename PolicyT>
551 CUB_RUNTIME_FUNCTION __forceinline__
554 block_threads = PolicyT::BLOCK_THREADS;
555 items_per_thread = PolicyT::ITEMS_PER_THREAD;
556 tile_items = block_threads * items_per_thread;
573 typename Spmv1ColKernelT,
574 typename SpmvSearchKernelT,
575 typename SpmvKernelT,
576 typename SegmentFixupKernelT>
577 CUB_RUNTIME_FUNCTION __forceinline__
579 void* d_temp_storage,
580 size_t& temp_storage_bytes,
583 bool debug_synchronous,
584 Spmv1ColKernelT spmv_1col_kernel,
585 SpmvSearchKernelT spmv_search_kernel,
586 SpmvKernelT spmv_kernel,
587 SegmentFixupKernelT segment_fixup_kernel,
591#ifndef CUB_RUNTIME_ENABLED
594 return CubDebug(cudaErrorNotSupported );
597 cudaError error = cudaSuccess;
602 if (d_temp_storage == NULL)
605 temp_storage_bytes = 1;
610 int degen_col_kernel_block_size = INIT_KERNEL_THREADS;
611 int degen_col_kernel_grid_size = (
spmv_params.num_rows + degen_col_kernel_block_size - 1) / degen_col_kernel_block_size;
613 if (debug_synchronous)
_CubLog(
"Invoking spmv_1col_kernel<<<%d, %d, 0, %lld>>>()\n",
614 degen_col_kernel_grid_size, degen_col_kernel_block_size, (
long long) stream);
617 spmv_1col_kernel<<<degen_col_kernel_grid_size, degen_col_kernel_block_size, 0, stream>>>(
621 if (
CubDebug(error = cudaPeekAtLastError()))
break;
631 if (
CubDebug(error = cudaGetDevice(&device_ordinal)))
break;
635 if (
CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal)))
break;
639 if (
CubDebug(error = cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal)))
break;;
645 int merge_tile_size = spmv_config.block_threads * spmv_config.items_per_thread;
646 int segment_fixup_tile_size = segment_fixup_config.block_threads * segment_fixup_config.items_per_thread;
649 unsigned int num_merge_tiles = (num_merge_items + merge_tile_size - 1) / merge_tile_size;
650 unsigned int num_segment_fixup_tiles = (num_merge_tiles + segment_fixup_tile_size - 1) / segment_fixup_tile_size;
653 int spmv_sm_occupancy;
657 spmv_config.block_threads)))
break;
659 int segment_fixup_sm_occupancy;
661 segment_fixup_sm_occupancy,
662 segment_fixup_kernel,
663 segment_fixup_config.block_threads)))
break;
667 CUB_MIN(num_merge_tiles, max_dim_x),
668 (num_merge_tiles + max_dim_x - 1) / max_dim_x,
671 dim3 segment_fixup_grid_size(
677 size_t allocation_sizes[3];
679 allocation_sizes[1] = num_merge_tiles *
sizeof(
KeyValuePairT);
680 allocation_sizes[2] = (num_merge_tiles + 1) *
sizeof(CoordinateT);
683 void* allocations[3];
685 if (d_temp_storage == NULL)
700 int search_block_size = INIT_KERNEL_THREADS;
701 int search_grid_size = (num_merge_tiles + 1 + search_block_size - 1) / search_block_size;
703#if (CUB_PTX_ARCH == 0)
708 if (search_grid_size < sm_count)
719 if (debug_synchronous)
_CubLog(
"Invoking spmv_search_kernel<<<%d, %d, 0, %lld>>>()\n",
720 search_grid_size, search_block_size, (
long long) stream);
723 spmv_search_kernel<<<search_grid_size, search_block_size, 0, stream>>>(
729 if (
CubDebug(error = cudaPeekAtLastError()))
break;
736 if (debug_synchronous)
_CubLog(
"Invoking spmv_kernel<<<{%d,%d,%d}, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
737 spmv_grid_size.x, spmv_grid_size.y, spmv_grid_size.z, spmv_config.block_threads, (
long long) stream, spmv_config.items_per_thread, spmv_sm_occupancy);
740 spmv_kernel<<<spmv_grid_size, spmv_config.block_threads, 0, stream>>>(
749 if (
CubDebug(error = cudaPeekAtLastError()))
break;
755 if (num_merge_tiles > 1)
758 if (debug_synchronous)
_CubLog(
"Invoking segment_fixup_kernel<<<{%d,%d,%d}, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
759 segment_fixup_grid_size.x, segment_fixup_grid_size.y, segment_fixup_grid_size.z, segment_fixup_config.block_threads, (
long long) stream, segment_fixup_config.items_per_thread, segment_fixup_sm_occupancy);
762 segment_fixup_kernel<<<segment_fixup_grid_size, segment_fixup_config.block_threads, 0, stream>>>(
770 if (
CubDebug(error = cudaPeekAtLastError()))
break;
776#if (CUB_PTX_ARCH == 0)
792 CUB_RUNTIME_FUNCTION __forceinline__
794 void* d_temp_storage,
795 size_t& temp_storage_bytes,
797 cudaStream_t stream = 0,
798 bool debug_synchronous =
false)
800 cudaError error = cudaSuccess;
805 #if (CUB_PTX_ARCH == 0)
813 InitConfigs(ptx_version, spmv_config, segment_fixup_config);
816 d_temp_storage, temp_storage_bytes,
spmv_params, stream, debug_synchronous,
817 DeviceSpmv1ColKernel<PtxSpmvPolicyT, ValueT, OffsetT>,
818 DeviceSpmvSearchKernel<PtxSpmvPolicyT, OffsetT, CoordinateT, SpmvParamsT>,
819 DeviceSpmvKernel<PtxSpmvPolicyT, ScanTileStateT, ValueT, OffsetT, CoordinateT, false, false>,
820 DeviceSegmentFixupKernel<PtxSegmentFixupPolicy, KeyValuePairT*, ValueT*, OffsetT, ScanTileStateT>,
821 spmv_config, segment_fixup_config)))
break;
@ LOAD_LDG
Cache as texture.
@ LOAD_CA
Cache at all levels.
@ LOAD_DEFAULT
Default (no modifier)
#define _CubLog(format,...)
Log macro for printf statements.
CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t PtxVersion(int &ptx_version)
Retrieves the PTX version that will be used on the current device (major * 100 + minor * 10)
__host__ __device__ __forceinline__ cudaError_t AliasTemporaries(void *d_temp_storage, size_t &temp_storage_bytes, void *(&allocations)[ALLOCATIONS], size_t(&allocation_sizes)[ALLOCATIONS])
CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t MaxSmOccupancy(int &max_sm_occupancy, KernelPtr kernel_ptr, int block_threads, int dynamic_smem_bytes=0)
Computes maximum SM occupancy in thread blocks for executing the given kernel function pointer kernel...
#define CubDebug(e)
Debug macro.
CUB_RUNTIME_FUNCTION static __forceinline__ cudaError_t SyncStream(cudaStream_t stream)
#define CUB_MIN(a, b)
Select minimum(a, b)
Optional outer namespace(s)
OffsetT spmv_params
[in] SpMV input parameter bundle
__global__ void DeviceSpmvSearchKernel(int num_merge_tiles, CoordinateT *d_tile_coordinates, SpmvParamsT spmv_params)
< SpmvParams type
__global__ void DeviceSpmv1ColKernel(SpmvParams< ValueT, OffsetT > spmv_params)
< Signed integer type for sequence offsets
KeyT const ValueT ValueT OffsetT OffsetT num_items
[in] Total number of input data items
OffsetT CoordinateT KeyValuePair< OffsetT, ValueT > int ScanTileStateT int num_segment_fixup_tiles
< [in] Number of reduce-by-key tiles (fixup grid size)
OffsetT CoordinateT * d_tile_coordinates
[in] Pointer to the temporary array of tile starting coordinates
__launch_bounds__(int(AgentHistogramPolicyT::BLOCK_THREADS)) __global__ void DeviceHistogramSweepKernel(SampleIteratorT d_samples
< Signed integer type for global offsets
OffsetT CoordinateT KeyValuePair< OffsetT, ValueT > * d_tile_carry_pairs
[out] Pointer to the temporary array carry-out dot product row-ids, one per block
OffsetsOutputIteratorT LengthsOutputIteratorT NumRunsOutputIteratorT ScanTileStateT EqualityOpT OffsetT int num_tiles
< [in] Total number of tiles for the entire problem
@ BLOCK_SCAN_RAKING_MEMOIZE
OffsetT OffsetT
[in] Total number of input data items
__host__ __device__ __forceinline__ void MergePathSearch(OffsetT diagonal, AIteratorT a, BIteratorT b, OffsetT a_len, OffsetT b_len, CoordinateT &path_coordinate)
UniqueOutputIteratorT ValuesInputIteratorT AggregatesOutputIteratorT NumRunsOutputIteratorT ScanTileStateT tile_state
Tile status interface.
UniqueOutputIteratorT ValuesInputIteratorT AggregatesOutputIteratorT d_aggregates_out
Pointer to the output sequence of value aggregates (one aggregate per run)
< The BlockScan algorithm to use
AgentSegmentFixup implements a stateful abstraction of CUDA thread blocks for participating in device...
< The BlockScan algorithm to use
AgentSpmv implements a stateful abstraction of CUDA thread blocks for participating in device-wide Sp...
Exposes a member typedef Type that names the corresponding CUDA vector type if one exists....
< Signed integer type for global offsets
CUB_RUNTIME_FUNCTION static __forceinline__ cudaError_t Dispatch(void *d_temp_storage, size_t &temp_storage_bytes, SpmvParamsT &spmv_params, cudaStream_t stream=0, bool debug_synchronous=false)
CUB_RUNTIME_FUNCTION static __forceinline__ cudaError_t Dispatch(void *d_temp_storage, size_t &temp_storage_bytes, SpmvParamsT &spmv_params, cudaStream_t stream, bool debug_synchronous, Spmv1ColKernelT spmv_1col_kernel, SpmvSearchKernelT spmv_search_kernel, SpmvKernelT spmv_kernel, SegmentFixupKernelT segment_fixup_kernel, KernelConfig spmv_config, KernelConfig segment_fixup_config)
< Function type of cub::DeviceSegmentFixupKernelT
CUB_RUNTIME_FUNCTION static __forceinline__ void InitConfigs(int ptx_version, KernelConfig &spmv_config, KernelConfig &segment_fixup_config)
Default equality functor.
A key identifier paired with a corresponding value.
< Signed integer type for sequence offsets
#define CUB_PTX_ARCH
CUB_PTX_ARCH reflects the PTX version targeted by the active compiler pass (or zero during the host p...