41#include "../../agent/agent_select_if.cuh"
42#include "../../thread/thread_operators.cuh"
43#include "../../grid/grid_queue.cuh"
44#include "../../util_device.cuh"
45#include "../../util_namespace.cuh"
65 typename AgentSelectIfPolicyT,
66 typename InputIteratorT,
67 typename FlagsInputIteratorT,
68 typename SelectedOutputIteratorT,
69 typename NumSelectedIteratorT,
70 typename ScanTileStateT,
76__global__
void DeviceSelectSweepKernel(
92 SelectedOutputIteratorT,
96 KEEP_REJECTS> AgentSelectIfT;
99 __shared__
typename AgentSelectIfT::TempStorage temp_storage;
119 typename InputIteratorT,
120 typename FlagsInputIteratorT,
121 typename SelectedOutputIteratorT,
122 typename NumSelectedIteratorT,
124 typename EqualityOpT,
135 typename std::iterator_traits<InputIteratorT>::value_type,
136 typename std::iterator_traits<SelectedOutputIteratorT>::value_type>::Type
OutputT;
139 typedef typename std::iterator_traits<FlagsInputIteratorT>::value_type FlagT;
143 INIT_KERNEL_THREADS = 128,
158 NOMINAL_4B_ITEMS_PER_THREAD = 10,
159 ITEMS_PER_THREAD =
CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD,
CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 /
sizeof(
OutputT)))),
175 NOMINAL_4B_ITEMS_PER_THREAD = 7,
176 ITEMS_PER_THREAD =
CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD,
CUB_MAX(3, (NOMINAL_4B_ITEMS_PER_THREAD * 4 /
sizeof(
OutputT)))),
192 NOMINAL_4B_ITEMS_PER_THREAD = (KEEP_REJECTS) ? 7 : 15,
193 ITEMS_PER_THREAD =
CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD,
CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 /
sizeof(
OutputT)))),
209 NOMINAL_4B_ITEMS_PER_THREAD = 9,
210 ITEMS_PER_THREAD =
CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD,
CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 /
sizeof(
OutputT)))),
226 NOMINAL_4B_ITEMS_PER_THREAD = 9,
227 ITEMS_PER_THREAD =
CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD,
CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 /
sizeof(
OutputT)))),
244#if (CUB_PTX_ARCH >= 350)
247#elif (CUB_PTX_ARCH >= 300)
250#elif (CUB_PTX_ARCH >= 200)
253#elif (CUB_PTX_ARCH >= 130)
272 template <
typename KernelConfig>
273 CUB_RUNTIME_FUNCTION __forceinline__
278 #if (CUB_PTX_ARCH > 0)
282 select_if_config.template Init<PtxSelectIfPolicyT>();
287 if (ptx_version >= 350)
289 select_if_config.template Init<typename Policy350::SelectIfPolicyT>();
291 else if (ptx_version >= 300)
293 select_if_config.template Init<typename Policy300::SelectIfPolicyT>();
295 else if (ptx_version >= 200)
297 select_if_config.template Init<typename Policy200::SelectIfPolicyT>();
299 else if (ptx_version >= 130)
301 select_if_config.template Init<typename Policy130::SelectIfPolicyT>();
305 select_if_config.template Init<typename Policy100::SelectIfPolicyT>();
318 int items_per_thread;
321 template <
typename PolicyT>
322 CUB_RUNTIME_FUNCTION __forceinline__
325 block_threads = PolicyT::BLOCK_THREADS;
326 items_per_thread = PolicyT::ITEMS_PER_THREAD;
327 tile_items = block_threads * items_per_thread;
341 typename ScanInitKernelPtrT,
342 typename SelectIfKernelPtrT>
343 CUB_RUNTIME_FUNCTION __forceinline__
345 void* d_temp_storage,
346 size_t& temp_storage_bytes,
355 bool debug_synchronous,
357 ScanInitKernelPtrT scan_init_kernel,
358 SelectIfKernelPtrT select_if_kernel,
362#ifndef CUB_RUNTIME_ENABLED
363 (void)d_temp_storage;
364 (void)temp_storage_bytes;
373 (void)debug_synchronous;
374 (void)scan_init_kernel;
375 (void)select_if_kernel;
376 (void)select_if_config;
379 return CubDebug(cudaErrorNotSupported);
383 cudaError error = cudaSuccess;
388 if (
CubDebug(error = cudaGetDevice(&device_ordinal)))
break;
392 if (
CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal)))
break;
395 int tile_size = select_if_config.block_threads * select_if_config.items_per_thread;
399 size_t allocation_sizes[1];
400 if (
CubDebug(error = ScanTileStateT::AllocationSize(
num_tiles, allocation_sizes[0])))
break;
403 void* allocations[1];
405 if (d_temp_storage == NULL)
416 int init_grid_size =
CUB_MAX(1, (
num_tiles + INIT_KERNEL_THREADS - 1) / INIT_KERNEL_THREADS);
417 if (debug_synchronous)
_CubLog(
"Invoking scan_init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (
long long) stream);
420 scan_init_kernel<<<init_grid_size, INIT_KERNEL_THREADS, 0, stream>>>(
426 if (
CubDebug(error = cudaPeekAtLastError()))
break;
436 int range_select_sm_occupancy;
438 range_select_sm_occupancy,
440 select_if_config.block_threads)))
break;
444 if (
CubDebug(error = cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal)))
break;;
448 scan_grid_size.z = 1;
449 scan_grid_size.y = ((
unsigned int)
num_tiles + max_dim_x - 1) / max_dim_x;
453 if (debug_synchronous)
_CubLog(
"Invoking select_if_kernel<<<{%d,%d,%d}, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
454 scan_grid_size.x, scan_grid_size.y, scan_grid_size.z, select_if_config.block_threads, (
long long) stream, select_if_config.items_per_thread, range_select_sm_occupancy);
457 select_if_kernel<<<scan_grid_size, select_if_config.block_threads, 0, stream>>>(
469 if (
CubDebug(error = cudaPeekAtLastError()))
break;
485 CUB_RUNTIME_FUNCTION __forceinline__
487 void* d_temp_storage,
488 size_t& temp_storage_bytes,
497 bool debug_synchronous)
499 cudaError error = cudaSuccess;
504 #if (CUB_PTX_ARCH == 0)
528 DeviceCompactInitKernel<ScanTileStateT, NumSelectedIteratorT>,
529 DeviceSelectSweepKernel<PtxSelectIfPolicyT, InputIteratorT, FlagsInputIteratorT, SelectedOutputIteratorT, NumSelectedIteratorT, ScanTileStateT, SelectOpT, EqualityOpT, OffsetT, KEEP_REJECTS>,
530 select_if_config)))
break;
BlockRadixRank provides operations for ranking unsigned integer types within a CUDA thread block.
@ BLOCK_LOAD_WARP_TRANSPOSE
@ LOAD_LDG
Cache as texture.
@ LOAD_DEFAULT
Default (no modifier)
#define _CubLog(format,...)
Log macro for printf statements.
CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t PtxVersion(int &ptx_version)
Retrieves the PTX version that will be used on the current device (major * 100 + minor * 10)
__host__ __device__ __forceinline__ cudaError_t AliasTemporaries(void *d_temp_storage, size_t &temp_storage_bytes, void *(&allocations)[ALLOCATIONS], size_t(&allocation_sizes)[ALLOCATIONS])
CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t MaxSmOccupancy(int &max_sm_occupancy, KernelPtr kernel_ptr, int block_threads, int dynamic_smem_bytes=0)
Computes maximum SM occupancy in thread blocks for executing the given kernel function pointer kernel...
#define CubDebug(e)
Debug macro.
CUB_RUNTIME_FUNCTION static __forceinline__ cudaError_t SyncStream(cudaStream_t stream)
#define CUB_MAX(a, b)
Select maximum(a, b)
#define CUB_MIN(a, b)
Select minimum(a, b)
Optional outer namespace(s)
KeyT const ValueT ValueT OffsetIteratorT OffsetIteratorT int
[in] The number of segments that comprise the sorting data
FlagsInputIteratorT SelectedOutputIteratorT NumSelectedIteratorT ScanTileStateT SelectOpT select_op
[in] Selection operator
KeyT const ValueT ValueT OffsetT OffsetT num_items
[in] Total number of input data items
FlagsInputIteratorT SelectedOutputIteratorT NumSelectedIteratorT d_num_selected_out
[out] Pointer to the total number of items selected (i.e., length of d_selected_out)
__launch_bounds__(int(AgentHistogramPolicyT::BLOCK_THREADS)) __global__ void DeviceHistogramSweepKernel(SampleIteratorT d_samples
< Signed integer type for global offsets
FlagsInputIteratorT SelectedOutputIteratorT d_selected_out
[out] Pointer to the output sequence of selected data items
OffsetsOutputIteratorT LengthsOutputIteratorT NumRunsOutputIteratorT ScanTileStateT EqualityOpT OffsetT int num_tiles
< [in] Total number of tiles for the entire problem
@ BLOCK_SCAN_RAKING_MEMOIZE
FlagsInputIteratorT d_flags
< [in] Pointer to the input sequence of data items
OffsetT OffsetT
[in] Total number of input data items
OffsetsOutputIteratorT LengthsOutputIteratorT NumRunsOutputIteratorT ScanTileStateT tile_status
[in] Tile status interface
UniqueOutputIteratorT ValuesInputIteratorT AggregatesOutputIteratorT NumRunsOutputIteratorT ScanTileStateT int EqualityOpT equality_op
KeyT equality operator.
< The BlockScan algorithm to use
AgentSelectIf implements a stateful abstraction of CUDA thread blocks for participating in device-wid...
< Whether or not we push rejected items to the back of the output
CUB_RUNTIME_FUNCTION static __forceinline__ cudaError_t Dispatch(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, FlagsInputIteratorT d_flags, SelectedOutputIteratorT d_selected_out, NumSelectedIteratorT d_num_selected_out, SelectOpT select_op, EqualityOpT equality_op, OffsetT num_items, cudaStream_t stream, bool debug_synchronous)
CUB_RUNTIME_FUNCTION static __forceinline__ void InitConfigs(int ptx_version, KernelConfig &select_if_config)
CUB_RUNTIME_FUNCTION static __forceinline__ cudaError_t Dispatch(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, FlagsInputIteratorT d_flags, SelectedOutputIteratorT d_selected_out, NumSelectedIteratorT d_num_selected_out, SelectOpT select_op, EqualityOpT equality_op, OffsetT num_items, cudaStream_t stream, bool debug_synchronous, int, ScanInitKernelPtrT scan_init_kernel, SelectIfKernelPtrT select_if_kernel, KernelConfig select_if_config)
< Function type of cub::SelectIfKernelPtrT
Type selection (IF ? ThenType : ElseType)
#define CUB_PTX_ARCH
CUB_PTX_ARCH reflects the PTX version targeted by the active compiler pass (or zero during the host p...