40#include "../../agent/agent_scan.cuh"
41#include "../../thread/thread_operators.cuh"
42#include "../../grid/grid_queue.cuh"
43#include "../../util_arch.cuh"
44#include "../../util_debug.cuh"
45#include "../../util_device.cuh"
46#include "../../util_namespace.cuh"
63 typename ScanTileStateT>
76 typename ScanTileStateT,
77 typename NumSelectedIteratorT>
87 if ((blockIdx.x == 0) && (threadIdx.x == 0))
97 typename InputIteratorT,
98 typename OutputIteratorT,
99 typename ScanTileStateT,
104__global__
void DeviceScanKernel(
106 OutputIteratorT
d_out,
123 __shared__
typename AgentScanT::TempStorage temp_storage;
144 typename InputIteratorT,
145 typename OutputIteratorT,
157 INIT_KERNEL_THREADS = 128
162 typename std::iterator_traits<InputIteratorT>::value_type,
163 typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT;
267#if (CUB_PTX_ARCH >= 600)
270#elif (CUB_PTX_ARCH >= 520)
273#elif (CUB_PTX_ARCH >= 350)
276#elif (CUB_PTX_ARCH >= 300)
279#elif (CUB_PTX_ARCH >= 200)
282#elif (CUB_PTX_ARCH >= 130)
301 template <
typename KernelConfig>
302 CUB_RUNTIME_FUNCTION __forceinline__
307 #if (CUB_PTX_ARCH > 0)
311 scan_kernel_config.template Init<PtxAgentScanPolicy>();
316 if (ptx_version >= 600)
318 scan_kernel_config.template Init<typename Policy600::ScanPolicyT>();
320 else if (ptx_version >= 520)
322 scan_kernel_config.template Init<typename Policy520::ScanPolicyT>();
324 else if (ptx_version >= 350)
326 scan_kernel_config.template Init<typename Policy350::ScanPolicyT>();
328 else if (ptx_version >= 300)
330 scan_kernel_config.template Init<typename Policy300::ScanPolicyT>();
332 else if (ptx_version >= 200)
334 scan_kernel_config.template Init<typename Policy200::ScanPolicyT>();
336 else if (ptx_version >= 130)
338 scan_kernel_config.template Init<typename Policy130::ScanPolicyT>();
342 scan_kernel_config.template Init<typename Policy100::ScanPolicyT>();
355 int items_per_thread;
358 template <
typename PolicyT>
359 CUB_RUNTIME_FUNCTION __forceinline__
362 block_threads = PolicyT::BLOCK_THREADS;
363 items_per_thread = PolicyT::ITEMS_PER_THREAD;
364 tile_items = block_threads * items_per_thread;
378 typename ScanInitKernelPtrT,
379 typename ScanSweepKernelPtrT>
380 CUB_RUNTIME_FUNCTION __forceinline__
382 void* d_temp_storage,
383 size_t& temp_storage_bytes,
385 OutputIteratorT
d_out,
390 bool debug_synchronous,
392 ScanInitKernelPtrT init_kernel,
393 ScanSweepKernelPtrT scan_kernel,
397#ifndef CUB_RUNTIME_ENABLED
398 (void)d_temp_storage;
399 (void)temp_storage_bytes;
406 (void)debug_synchronous;
409 (void)scan_kernel_config;
412 return CubDebug(cudaErrorNotSupported);
415 cudaError error = cudaSuccess;
420 if (
CubDebug(error = cudaGetDevice(&device_ordinal)))
break;
424 if (
CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal)))
break;
427 int tile_size = scan_kernel_config.block_threads * scan_kernel_config.items_per_thread;
431 size_t allocation_sizes[1];
432 if (
CubDebug(error = ScanTileStateT::AllocationSize(
num_tiles, allocation_sizes[0])))
break;
435 void* allocations[1];
437 if (d_temp_storage == NULL)
452 int init_grid_size = (
num_tiles + INIT_KERNEL_THREADS - 1) / INIT_KERNEL_THREADS;
453 if (debug_synchronous)
_CubLog(
"Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (
long long) stream);
456 init_kernel<<<init_grid_size, INIT_KERNEL_THREADS, 0, stream>>>(
461 if (
CubDebug(error = cudaPeekAtLastError()))
break;
467 int scan_sm_occupancy;
471 scan_kernel_config.block_threads)))
break;
475 if (
CubDebug(error = cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal)))
break;;
482 if (debug_synchronous)
_CubLog(
"Invoking %d scan_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
483 start_tile, scan_grid_size, scan_kernel_config.block_threads, (
long long) stream, scan_kernel_config.items_per_thread, scan_sm_occupancy);
486 scan_kernel<<<scan_grid_size, scan_kernel_config.block_threads, 0, stream>>>(
496 if (
CubDebug(error = cudaPeekAtLastError()))
break;
513 CUB_RUNTIME_FUNCTION __forceinline__
515 void* d_temp_storage,
516 size_t& temp_storage_bytes,
518 OutputIteratorT
d_out,
523 bool debug_synchronous)
525 cudaError error = cudaSuccess;
548 DeviceScanInitKernel<ScanTileStateT>,
549 DeviceScanKernel<PtxAgentScanPolicy, InputIteratorT, OutputIteratorT, ScanTileStateT, ScanOpT, InitValueT, OffsetT>,
550 scan_kernel_config)))
break;
@ BLOCK_STORE_WARP_TRANSPOSE
@ BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED
@ BLOCK_LOAD_WARP_TRANSPOSE
@ LOAD_LDG
Cache as texture.
@ LOAD_DEFAULT
Default (no modifier)
#define _CubLog(format,...)
Log macro for printf statements.
CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t PtxVersion(int &ptx_version)
Retrieves the PTX version that will be used on the current device (major * 100 + minor * 10)
__host__ __device__ __forceinline__ cudaError_t AliasTemporaries(void *d_temp_storage, size_t &temp_storage_bytes, void *(&allocations)[ALLOCATIONS], size_t(&allocation_sizes)[ALLOCATIONS])
CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t MaxSmOccupancy(int &max_sm_occupancy, KernelPtr kernel_ptr, int block_threads, int dynamic_smem_bytes=0)
Computes maximum SM occupancy in thread blocks for executing the given kernel function pointer kernel...
#define CubDebug(e)
Debug macro.
CUB_RUNTIME_FUNCTION static __forceinline__ cudaError_t SyncStream(cudaStream_t stream)
#define CUB_MIN(a, b)
Select minimum(a, b)
Optional outer namespace(s)
__global__ void DeviceScanInitKernel(ScanTileStateT tile_state, int num_tiles)
< Tile status interface type
KeyT const ValueT ValueT OffsetT OffsetT num_items
[in] Total number of input data items
OutputIteratorT ScanTileStateT int ScanOpT InitValueT init_value
Initial value to seed the exclusive scan.
FlagsInputIteratorT SelectedOutputIteratorT NumSelectedIteratorT d_num_selected_out
[out] Pointer to the total number of items selected (i.e., length of d_selected_out)
__launch_bounds__(int(AgentHistogramPolicyT::BLOCK_THREADS)) __global__ void DeviceHistogramSweepKernel(SampleIteratorT d_samples
< Signed integer type for global offsets
UniqueOutputIteratorT ValuesInputIteratorT AggregatesOutputIteratorT NumRunsOutputIteratorT ScanTileStateT int start_tile
The starting tile for the current grid.
OffsetsOutputIteratorT LengthsOutputIteratorT NumRunsOutputIteratorT ScanTileStateT EqualityOpT OffsetT int num_tiles
< [in] Total number of tiles for the entire problem
@ BLOCK_SCAN_RAKING_MEMOIZE
OffsetT OffsetT
[in] Total number of input data items
__global__ void DeviceCompactInitKernel(ScanTileStateT tile_state, int num_tiles, NumSelectedIteratorT d_num_selected_out)
< Output iterator type for recording the number of items selected
UniqueOutputIteratorT ValuesInputIteratorT AggregatesOutputIteratorT NumRunsOutputIteratorT ScanTileStateT tile_state
Tile status interface.
OutputIteratorT ScanTileStateT int ScanOpT scan_op
Binary scan functor.
OutputIteratorT d_out
< [in] Pointer to the input sequence of data items
< The BlockScan algorithm to use
AgentScan implements a stateful abstraction of CUDA thread blocks for participating in device-wide pr...
< Signed integer type for global offsets
CUB_RUNTIME_FUNCTION static __forceinline__ cudaError_t Dispatch(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, ScanOpT scan_op, InitValueT init_value, OffsetT num_items, cudaStream_t stream, bool debug_synchronous)
CUB_RUNTIME_FUNCTION static __forceinline__ cudaError_t Dispatch(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, ScanOpT scan_op, InitValueT init_value, OffsetT num_items, cudaStream_t stream, bool debug_synchronous, int, ScanInitKernelPtrT init_kernel, ScanSweepKernelPtrT scan_kernel, KernelConfig scan_kernel_config)
< Function type of cub::DeviceScanKernelPtrT
CUB_RUNTIME_FUNCTION static __forceinline__ void InitConfigs(int ptx_version, KernelConfig &scan_kernel_config)
Type selection (IF ? ThenType : ElseType)
#define CUB_SCALED_GRANULARITIES(NOMINAL_4B_BLOCK_THREADS, NOMINAL_4B_ITEMS_PER_THREAD, T)
Define both nominal threads-per-block and items-per-thread.