40 #include "../../agent/agent_scan.cuh" 41 #include "../../thread/thread_operators.cuh" 42 #include "../../grid/grid_queue.cuh" 43 #include "../../util_arch.cuh" 44 #include "../../util_debug.cuh" 45 #include "../../util_device.cuh" 46 #include "../../util_namespace.cuh" 63 typename ScanTileStateT>
76 typename ScanTileStateT,
77 typename NumSelectedIteratorT>
87 if ((blockIdx.x == 0) && (threadIdx.x == 0))
97 typename InputIteratorT,
98 typename OutputIteratorT,
99 typename ScanTileStateT,
104 __global__
void DeviceScanKernel(
123 __shared__
typename AgentScanT::TempStorage temp_storage;
144 typename InputIteratorT,
145 typename OutputIteratorT,
157 INIT_KERNEL_THREADS = 128
162 typename std::iterator_traits<InputIteratorT>::value_type,
163 typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT;
267 #if (CUB_PTX_ARCH >= 600) 270 #elif (CUB_PTX_ARCH >= 520) 273 #elif (CUB_PTX_ARCH >= 350) 276 #elif (CUB_PTX_ARCH >= 300) 279 #elif (CUB_PTX_ARCH >= 200) 282 #elif (CUB_PTX_ARCH >= 130) 301 template <
typename KernelConfig>
302 CUB_RUNTIME_FUNCTION __forceinline__
307 #if (CUB_PTX_ARCH > 0) 311 scan_kernel_config.template Init<PtxAgentScanPolicy>();
316 if (ptx_version >= 600)
318 scan_kernel_config.template Init<typename Policy600::ScanPolicyT>();
320 else if (ptx_version >= 520)
322 scan_kernel_config.template Init<typename Policy520::ScanPolicyT>();
324 else if (ptx_version >= 350)
326 scan_kernel_config.template Init<typename Policy350::ScanPolicyT>();
328 else if (ptx_version >= 300)
330 scan_kernel_config.template Init<typename Policy300::ScanPolicyT>();
332 else if (ptx_version >= 200)
334 scan_kernel_config.template Init<typename Policy200::ScanPolicyT>();
336 else if (ptx_version >= 130)
338 scan_kernel_config.template Init<typename Policy130::ScanPolicyT>();
342 scan_kernel_config.template Init<typename Policy100::ScanPolicyT>();
355 int items_per_thread;
358 template <
typename PolicyT>
359 CUB_RUNTIME_FUNCTION __forceinline__
362 block_threads = PolicyT::BLOCK_THREADS;
363 items_per_thread = PolicyT::ITEMS_PER_THREAD;
364 tile_items = block_threads * items_per_thread;
378 typename ScanInitKernelPtrT,
379 typename ScanSweepKernelPtrT>
380 CUB_RUNTIME_FUNCTION __forceinline__
382 void* d_temp_storage,
383 size_t& temp_storage_bytes,
385 OutputIteratorT
d_out,
390 bool debug_synchronous,
392 ScanInitKernelPtrT init_kernel,
393 ScanSweepKernelPtrT scan_kernel,
397 #ifndef CUB_RUNTIME_ENABLED 398 (void)d_temp_storage;
399 (void)temp_storage_bytes;
406 (void)debug_synchronous;
409 (void)scan_kernel_config;
412 return CubDebug(cudaErrorNotSupported);
415 cudaError error = cudaSuccess;
420 if (
CubDebug(error = cudaGetDevice(&device_ordinal)))
break;
424 if (
CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal)))
break;
427 int tile_size = scan_kernel_config.block_threads * scan_kernel_config.items_per_thread;
431 size_t allocation_sizes[1];
432 if (
CubDebug(error = ScanTileStateT::AllocationSize(
num_tiles, allocation_sizes[0])))
break;
435 void* allocations[1];
437 if (d_temp_storage == NULL)
452 int init_grid_size = (
num_tiles + INIT_KERNEL_THREADS - 1) / INIT_KERNEL_THREADS;
453 if (debug_synchronous)
_CubLog(
"Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (
long long) stream);
456 init_kernel<<<init_grid_size, INIT_KERNEL_THREADS, 0, stream>>>(
461 if (
CubDebug(error = cudaPeekAtLastError()))
break;
467 int scan_sm_occupancy;
471 scan_kernel_config.block_threads)))
break;
475 if (
CubDebug(error = cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal)))
break;;
482 if (debug_synchronous)
_CubLog(
"Invoking %d scan_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
483 start_tile, scan_grid_size, scan_kernel_config.block_threads, (
long long) stream, scan_kernel_config.items_per_thread, scan_sm_occupancy);
486 scan_kernel<<<scan_grid_size, scan_kernel_config.block_threads, 0, stream>>>(
496 if (
CubDebug(error = cudaPeekAtLastError()))
break;
506 #endif // CUB_RUNTIME_ENABLED 513 CUB_RUNTIME_FUNCTION __forceinline__
515 void* d_temp_storage,
516 size_t& temp_storage_bytes,
518 OutputIteratorT
d_out,
523 bool debug_synchronous)
525 cudaError error = cudaSuccess;
548 DeviceScanInitKernel<ScanTileStateT>,
549 DeviceScanKernel<PtxAgentScanPolicy, InputIteratorT, OutputIteratorT, ScanTileStateT, ScanOpT, InitValueT, OffsetT>,
550 scan_kernel_config)))
break;
FlagsInputIteratorT SelectedOutputIteratorT NumSelectedIteratorT d_num_selected_out
[out] Pointer to the total number of items selected (i.e., length of d_selected_out)
#define CUB_SCALED_GRANULARITIES(NOMINAL_4B_BLOCK_THREADS, NOMINAL_4B_ITEMS_PER_THREAD, T)
Define both nominal threads-per-block and items-per-thread.
AgentScan implements a stateful abstraction of CUDA thread blocks for participating in device-wide pr...
__host__ __device__ __forceinline__ cudaError_t AliasTemporaries(void *d_temp_storage, size_t &temp_storage_bytes, void *(&allocations)[ALLOCATIONS], size_t(&allocation_sizes)[ALLOCATIONS])
KeyT const ValueT ValueT OffsetT OffsetT num_items
[in] Total number of input data items
CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t MaxSmOccupancy(int &max_sm_occupancy, KernelPtr kernel_ptr, int block_threads, int dynamic_smem_bytes=0)
Computes maximum SM occupancy in thread blocks for executing the given kernel function pointer kernel...
Optional outer namespace(s)
__global__ void DeviceCompactInitKernel(ScanTileStateT tile_state, int num_tiles, NumSelectedIteratorT d_num_selected_out)
< Output iterator type for recording the number of items selected
CUB_RUNTIME_FUNCTION static __forceinline__ void InitConfigs(int ptx_version, KernelConfig &scan_kernel_config)
UniqueOutputIteratorT ValuesInputIteratorT AggregatesOutputIteratorT NumRunsOutputIteratorT ScanTileStateT tile_state
Tile status interface.
#define _CubLog(format,...)
Log macro for printf statements.
OffsetT OffsetT
[in] Total number of input data items
OutputIteratorT ScanTileStateT int ScanOpT scan_op
Binary scan functor.
< The BlockScan algorithm to use
CUB_RUNTIME_FUNCTION static __forceinline__ cudaError_t Dispatch(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, ScanOpT scan_op, InitValueT init_value, OffsetT num_items, cudaStream_t stream, bool debug_synchronous, int, ScanInitKernelPtrT init_kernel, ScanSweepKernelPtrT scan_kernel, KernelConfig scan_kernel_config)
< Function type of cub::DeviceScanKernelPtrT
CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t PtxVersion(int &ptx_version)
Retrieves the PTX version that will be used on the current device (major * 100 + minor * 10)
CUB_RUNTIME_FUNCTION static __forceinline__ cudaError_t SyncStream(cudaStream_t stream)
Type selection (IF ? ThenType : ElseType)
OffsetsOutputIteratorT LengthsOutputIteratorT NumRunsOutputIteratorT ScanTileStateT EqualityOpT OffsetT int num_tiles
< [in] Total number of tiles for the entire problem
OutputIteratorT ScanTileStateT int ScanOpT InitValueT init_value
Initial value to seed the exclusive scan.
#define CUB_MIN(a, b)
Select minimum(a, b)
UniqueOutputIteratorT ValuesInputIteratorT AggregatesOutputIteratorT NumRunsOutputIteratorT ScanTileStateT int start_tile
The starting tile for the current grid.
#define CubDebug(e)
Debug macro.
__global__ void DeviceScanInitKernel(ScanTileStateT tile_state, int num_tiles)
< Tile status interface type
__launch_bounds__(int(AgentHistogramPolicyT::BLOCK_THREADS)) __global__ void DeviceHistogramSweepKernel(SampleIteratorT d_samples
< Signed integer type for global offsets
OutputIteratorT d_out
< [in] Pointer to the input sequence of data items
< Signed integer type for global offsets
CUB_RUNTIME_FUNCTION static __forceinline__ cudaError_t Dispatch(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, ScanOpT scan_op, InitValueT init_value, OffsetT num_items, cudaStream_t stream, bool debug_synchronous)