#include <stdio.h>
#include <iterator>

#include "../../agent/agent_rle.cuh"
#include "../../thread/thread_operators.cuh"
#include "../../grid/grid_queue.cuh"
#include "../../util_device.cuh"
#include "../../util_namespace.cuh"
/**
 * Multi-block run-length-encode sweep kernel entry point
 */
template <
    typename                    AgentRlePolicyT,            ///< Parameterized tuning policy type
    typename                    InputIteratorT,             ///< Random-access input iterator type for reading input items
    typename                    OffsetsOutputIteratorT,     ///< Random-access output iterator type for writing run-offset values
    typename                    LengthsOutputIteratorT,     ///< Random-access output iterator type for writing run-length values
    typename                    NumRunsOutputIteratorT,     ///< Output iterator type for recording the number of runs encountered
    typename                    ScanTileStateT,             ///< Tile status interface type
    typename                    EqualityOpT,                ///< T equality operator type
    typename                    OffsetT>                    ///< Signed integer type for global offsets
__launch_bounds__ (int(AgentRlePolicyT::BLOCK_THREADS))
__global__ void DeviceRleSweepKernel(
    InputIteratorT              d_in,                       ///< [in] Pointer to input sequence of data items
    OffsetsOutputIteratorT      d_offsets_out,              ///< [out] Pointer to output sequence of run-offsets
    LengthsOutputIteratorT      d_lengths_out,              ///< [out] Pointer to output sequence of run-lengths
    NumRunsOutputIteratorT      d_num_runs_out,             ///< [out] Pointer to total number of runs encountered
    ScanTileStateT              tile_status,                ///< [in] Tile status interface
    EqualityOpT                 equality_op,                ///< [in] Equality operator for input items
    OffsetT                     num_items,                  ///< [in] Total number of input data items
    int                         num_tiles)                  ///< [in] Total number of tiles for the entire problem
{
    // Thread block type for run-length-encoding input tiles
    typedef AgentRle<
        AgentRlePolicyT,
        InputIteratorT,
        OffsetsOutputIteratorT,
        LengthsOutputIteratorT,
        EqualityOpT,
        OffsetT> AgentRleT;

    // Shared memory for AgentRle
    __shared__ typename AgentRleT::TempStorage temp_storage;

    // Process tiles
    AgentRleT(temp_storage, d_in, d_offsets_out, d_lengths_out, equality_op, num_items).ConsumeRange(
        num_tiles,
        tile_status,
        d_num_runs_out);
}
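// Each thread block consumes one or more input tiles, publishing its running
// (run-offset, run-length) aggregate through the ScanTileStateT interface so
// that later tiles can resolve their global prefixes: CUB's decoupled
// look-back pattern for single-pass device-wide scans.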
/******************************************************************************
 * Dispatch
 ******************************************************************************/

/**
 * Utility class for dispatching the appropriately-tuned kernels for
 * device-wide run-length encoding
 */
template <
    typename    InputIteratorT,             ///< Random-access input iterator type for reading input items
    typename    OffsetsOutputIteratorT,     ///< Random-access output iterator type for writing run-offset values
    typename    LengthsOutputIteratorT,     ///< Random-access output iterator type for writing run-length values
    typename    NumRunsOutputIteratorT,     ///< Output iterator type for recording the number of runs encountered
    typename    EqualityOpT,                ///< T equality operator type
    typename    OffsetT>                    ///< Signed integer type for global offsets
struct DeviceRleDispatch
{
    /******************************************************************************
     * Types and constants
     ******************************************************************************/

    // The input value type
    typedef typename std::iterator_traits<InputIteratorT>::value_type T;

    // The lengths output value type: if the output iterator's value type is void,
    // fall back to OffsetT; otherwise use the iterator's own value type
    typedef typename If<(Equals<typename std::iterator_traits<LengthsOutputIteratorT>::value_type, void>::VALUE),
        OffsetT,
        typename std::iterator_traits<LengthsOutputIteratorT>::value_type>::Type LengthT;
    enum
    {
        INIT_KERNEL_THREADS = 128,
    };

    // Tile status descriptor interface type
    typedef ReduceByKeyScanTileState<LengthT, OffsetT> ScanTileStateT;


    /******************************************************************************
     * Tuning policies
     ******************************************************************************/
    /// SM35
    struct Policy350
    {
        enum {
            NOMINAL_4B_ITEMS_PER_THREAD = 15,
            ITEMS_PER_THREAD            = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD,
                                                  CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))),
        };

        // ... (AgentRlePolicy<...> RleSweepPolicy typedef elided)
    };
    /// SM30
    struct Policy300
    {
        enum {
            NOMINAL_4B_ITEMS_PER_THREAD = 5,
            ITEMS_PER_THREAD            = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD,
                                                  CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))),
        };

        // ... (AgentRlePolicy<...> RleSweepPolicy typedef elided)
    };
    /// SM20
    struct Policy200
    {
        enum {
            NOMINAL_4B_ITEMS_PER_THREAD = 15,
            ITEMS_PER_THREAD            = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD,
                                                  CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))),
        };

        // ... (AgentRlePolicy<...> RleSweepPolicy typedef elided)
    };
    /// SM13
    struct Policy130
    {
        enum {
            NOMINAL_4B_ITEMS_PER_THREAD = 9,
            ITEMS_PER_THREAD            = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD,
                                                  CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))),
        };

        // ... (AgentRlePolicy<...> RleSweepPolicy typedef elided)
    };
    /// SM10
    struct Policy100
    {
        enum {
            NOMINAL_4B_ITEMS_PER_THREAD = 9,
            ITEMS_PER_THREAD            = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD,
                                                  CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))),
        };

        // ... (AgentRlePolicy<...> RleSweepPolicy typedef elided)
    };
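    // Worked example of the ITEMS_PER_THREAD scaling above: the nominal grain is
    // tuned for 4-byte items, so for T = int, Policy350 yields
    // min(15, max(1, 15 * 4 / 4)) = 15 items per thread, while for T = double it
    // yields min(15, max(1, 15 * 4 / 8)) = 7, keeping each thread's
    // bytes-in-flight roughly constant across value types.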
    /******************************************************************************
     * Tuning policies of current PTX compiler pass
     ******************************************************************************/

#if (CUB_PTX_ARCH >= 350)
    typedef Policy350 PtxPolicy;

#elif (CUB_PTX_ARCH >= 300)
    typedef Policy300 PtxPolicy;

#elif (CUB_PTX_ARCH >= 200)
    typedef Policy200 PtxPolicy;

#elif (CUB_PTX_ARCH >= 130)
    typedef Policy130 PtxPolicy;

#else
    typedef Policy100 PtxPolicy;

#endif

    // "Opaque" pass-through policy whose parameterization isn't reflected in the type signature
    struct PtxRleSweepPolicy : PtxPolicy::RleSweepPolicy {};
    /******************************************************************************
     * Utilities
     ******************************************************************************/

    /**
     * Initialize kernel dispatch configurations with the policies corresponding
     * to the PTX assembly we will use
     */
    template <typename KernelConfig>
    CUB_RUNTIME_FUNCTION __forceinline__
    static void InitConfigs(
        int             ptx_version,
        KernelConfig&   device_rle_config)
    {
    #if (CUB_PTX_ARCH > 0)

        // We're on the device: initialize the dispatch configuration with the current PTX policy
        device_rle_config.template Init<PtxRleSweepPolicy>();

    #else

        // We're on the host: look up the configuration matching the device's PTX version
        if (ptx_version >= 350)
            device_rle_config.template Init<typename Policy350::RleSweepPolicy>();
        else if (ptx_version >= 300)
            device_rle_config.template Init<typename Policy300::RleSweepPolicy>();
        else if (ptx_version >= 200)
            device_rle_config.template Init<typename Policy200::RleSweepPolicy>();
        else if (ptx_version >= 130)
            device_rle_config.template Init<typename Policy130::RleSweepPolicy>();
        else
            device_rle_config.template Init<typename Policy100::RleSweepPolicy>();

    #endif
    }
    /**
     * Kernel dispatch configuration
     */
    struct KernelConfig
    {
        int                     block_threads;
        int                     items_per_thread;
        BlockLoadAlgorithm      load_policy;
        bool                    store_warp_time_slicing;
        BlockScanAlgorithm      scan_algorithm;

        template <typename AgentRlePolicyT>
        CUB_RUNTIME_FUNCTION __forceinline__
        void Init()
        {
            block_threads           = AgentRlePolicyT::BLOCK_THREADS;
            items_per_thread        = AgentRlePolicyT::ITEMS_PER_THREAD;
            load_policy             = AgentRlePolicyT::LOAD_ALGORITHM;
            store_warp_time_slicing = AgentRlePolicyT::STORE_WARP_TIME_SLICING;
            scan_algorithm          = AgentRlePolicyT::SCAN_ALGORITHM;
        }
        CUB_RUNTIME_FUNCTION __forceinline__
        void Print()
        {
            printf("%d, %d, %d, %d, %d",
                block_threads,
                items_per_thread,
                load_policy,
                store_warp_time_slicing,
                scan_algorithm);
        }
    };
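    // Host-side sketch (illustrative only) of how a KernelConfig gets populated:
    //
    //     int ptx_version;
    //     CubDebug(PtxVersion(ptx_version));              // e.g. 350 on an sm_35 device
    //     KernelConfig device_rle_config;
    //     InitConfigs(ptx_version, device_rle_config);    // selects Policy350::RleSweepPolicy
    //     device_rle_config.Print();                      // dumps the five tuning fields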
    /******************************************************************************
     * Dispatch entrypoints
     ******************************************************************************/

    /**
     * Internal dispatch routine for invoking device-wide run-length encoding
     * using the specified kernel functions
     */
    template <
        typename                    DeviceScanInitKernelPtr,    ///< Function type of cub::DeviceScanInitKernel
        typename                    DeviceRleSweepKernelPtr>    ///< Function type of cub::DeviceRleSweepKernel
    CUB_RUNTIME_FUNCTION __forceinline__
    static cudaError_t Dispatch(
        void*                       d_temp_storage,             ///< [in] Device-accessible allocation of temporary storage; when NULL, the required size is written to temp_storage_bytes and no work is done
        size_t&                     temp_storage_bytes,         ///< [in,out] Reference to size in bytes of the d_temp_storage allocation
        InputIteratorT              d_in,                       ///< [in] Pointer to input sequence of data items
        OffsetsOutputIteratorT      d_offsets_out,              ///< [out] Pointer to output sequence of run-offsets
        LengthsOutputIteratorT      d_lengths_out,              ///< [out] Pointer to output sequence of run-lengths
        NumRunsOutputIteratorT      d_num_runs_out,             ///< [out] Pointer to total number of runs encountered
        EqualityOpT                 equality_op,                ///< [in] Equality operator for input items
        OffsetT                     num_items,                  ///< [in] Total number of input data items
        cudaStream_t                stream,                     ///< [in] CUDA stream to launch kernels within
        bool                        debug_synchronous,          ///< [in] Whether to synchronize the stream after every kernel launch to check for errors
        int                         ptx_version,                ///< [in] PTX version of dispatch kernels
        DeviceScanInitKernelPtr     device_scan_init_kernel,    ///< [in] Kernel function pointer to parameterization of cub::DeviceScanInitKernel
        DeviceRleSweepKernelPtr     device_rle_sweep_kernel,    ///< [in] Kernel function pointer to parameterization of cub::DeviceRleSweepKernel
        KernelConfig                device_rle_config)          ///< [in] Dispatch configuration matching the policy the kernels were compiled for
    {
#ifndef CUB_RUNTIME_ENABLED

        // Kernel launch not supported from this device
        return CubDebug(cudaErrorNotSupported);

#else

        cudaError error = cudaSuccess;
        do
        {
            // Get device ordinal
            int device_ordinal;
            if (CubDebug(error = cudaGetDevice(&device_ordinal))) break;

            // Get SM count
            int sm_count;
            if (CubDebug(error = cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break;
            // Number of input tiles
            int tile_size = device_rle_config.block_threads * device_rle_config.items_per_thread;
            int num_tiles = (num_items + tile_size - 1) / tile_size;
            // Specify temporary storage allocation requirements
            size_t allocation_sizes[1];
            if (CubDebug(error = ScanTileStateT::AllocationSize(num_tiles, allocation_sizes[0]))) break;    // bytes needed for tile status descriptors

            // Compute allocation pointers into the single storage blob (or compute the necessary size of the blob)
            void* allocations[1];
            if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break;
            if (d_temp_storage == NULL)
            {
                // Return if the caller is simply requesting the size of the storage allocation
                break;
            }

            // Construct the tile status interface
            ScanTileStateT tile_status;
            if (CubDebug(error = tile_status.Init(num_tiles, allocations[0], allocation_sizes[0]))) break;
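            // Note: the NULL check above implements CUB's two-phase temporary-storage
            // convention: callers first invoke Dispatch with d_temp_storage == NULL to
            // have the required size written to temp_storage_bytes, then allocate that
            // much device memory and invoke Dispatch again to do the actual work.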
            // Log device_scan_init_kernel configuration
            int init_grid_size = CUB_MAX(1, (num_tiles + INIT_KERNEL_THREADS - 1) / INIT_KERNEL_THREADS);
            if (debug_synchronous) _CubLog("Invoking device_scan_init_kernel<<<%d, %d, 0, %lld>>>()\n",
                init_grid_size, INIT_KERNEL_THREADS, (long long) stream);

            // Invoke device_scan_init_kernel to initialize tile descriptors
            device_scan_init_kernel<<<init_grid_size, INIT_KERNEL_THREADS, 0, stream>>>(
                tile_status,
                num_tiles,
                d_num_runs_out);

            // Check for failure to launch
            if (CubDebug(error = cudaPeekAtLastError())) break;

            // Sync the stream if specified to flush runtime errors
            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
            // Get SM occupancy for device_rle_sweep_kernel
            int device_rle_kernel_sm_occupancy;
            if (CubDebug(error = MaxSmOccupancy(
                device_rle_kernel_sm_occupancy,     // out
                device_rle_sweep_kernel,
                device_rle_config.block_threads))) break;
            // Get max x-dimension of grid
            int max_dim_x;
            if (CubDebug(error = cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal))) break;
            // Get grid size for scanning tiles
            dim3 scan_grid_size;
            scan_grid_size.z = 1;
            scan_grid_size.y = ((unsigned int) num_tiles + max_dim_x - 1) / max_dim_x;
            scan_grid_size.x = CUB_MIN(num_tiles, max_dim_x);
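            // The sweep grid is virtualized over x and y so that problems with more
            // tiles than the device's maximum x-dimension still fit: e.g., for
            // num_tiles = 100000 on a device with max_dim_x = 65535, this yields
            // scan_grid_size = {65535, 2, 1}, whose 131070 blocks cover all 100000
            // tiles (surplus blocks in the final row find no tile to consume).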
            // Log device_rle_sweep_kernel configuration
            if (debug_synchronous) _CubLog("Invoking device_rle_sweep_kernel<<<{%d,%d,%d}, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
                scan_grid_size.x, scan_grid_size.y, scan_grid_size.z, device_rle_config.block_threads,
                (long long) stream, device_rle_config.items_per_thread, device_rle_kernel_sm_occupancy);

            // Invoke device_rle_sweep_kernel
            device_rle_sweep_kernel<<<scan_grid_size, device_rle_config.block_threads, 0, stream>>>(
                d_in,
                d_offsets_out,
                d_lengths_out,
                d_num_runs_out,
                tile_status,
                equality_op,
                num_items,
                num_tiles);

            // Check for failure to launch
            if (CubDebug(error = cudaPeekAtLastError())) break;

            // Sync the stream if specified to flush runtime errors
            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
        }
        while (0);

        return error;

#endif // CUB_RUNTIME_ENABLED
    }
    /**
     * Internal dispatch routine
     */
    CUB_RUNTIME_FUNCTION __forceinline__
    static cudaError_t Dispatch(
        void*                       d_temp_storage,     ///< [in] Device-accessible allocation of temporary storage; when NULL, the required size is written to temp_storage_bytes and no work is done
        size_t&                     temp_storage_bytes, ///< [in,out] Reference to size in bytes of the d_temp_storage allocation
        InputIteratorT              d_in,               ///< [in] Pointer to input sequence of data items
        OffsetsOutputIteratorT      d_offsets_out,      ///< [out] Pointer to output sequence of run-offsets
        LengthsOutputIteratorT      d_lengths_out,      ///< [out] Pointer to output sequence of run-lengths
        NumRunsOutputIteratorT      d_num_runs_out,     ///< [out] Pointer to total number of runs encountered
        EqualityOpT                 equality_op,        ///< [in] Equality operator for input items
        OffsetT                     num_items,          ///< [in] Total number of input data items
        cudaStream_t                stream,             ///< [in] CUDA stream to launch kernels within
        bool                        debug_synchronous)  ///< [in] Whether to synchronize the stream after every kernel launch to check for errors
    {
        cudaError error = cudaSuccess;
        do
        {
            // Get PTX version
            int ptx_version;
    #if (CUB_PTX_ARCH == 0)
            if (CubDebug(error = PtxVersion(ptx_version))) break;
    #else
            ptx_version = CUB_PTX_ARCH;
    #endif

            // Get kernel dispatch configuration
            KernelConfig device_rle_config;
            InitConfigs(ptx_version, device_rle_config);
            // Dispatch to the tuned kernels
            if (CubDebug(error = Dispatch(
                d_temp_storage,
                temp_storage_bytes,
                d_in,
                d_offsets_out,
                d_lengths_out,
                d_num_runs_out,
                equality_op,
                num_items,
                stream,
                debug_synchronous,
                ptx_version,
                DeviceCompactInitKernel<ScanTileStateT, NumRunsOutputIteratorT>,
                DeviceRleSweepKernel<PtxRleSweepPolicy, InputIteratorT, OffsetsOutputIteratorT, LengthsOutputIteratorT, NumRunsOutputIteratorT, ScanTileStateT, EqualityOpT, OffsetT>,
                device_rle_config))) break;
        }
        while (0);

        return error;
    }
};
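/*
 * Usage sketch (illustrative, not part of the original header): driving
 * DeviceRleDispatch through CUB's two-phase temporary-storage idiom. The
 * device arrays d_in, d_offsets_out, d_lengths_out, and d_num_runs_out are
 * assumed to be allocated by the caller; Equality is CUB's default equality
 * functor from thread_operators.cuh.
 *
 *     typedef DeviceRleDispatch<int*, int*, int*, int*, Equality, int> DispatchT;
 *
 *     // Phase 1: query required temporary storage size (no work is done)
 *     void*  d_temp_storage     = NULL;
 *     size_t temp_storage_bytes = 0;
 *     DispatchT::Dispatch(d_temp_storage, temp_storage_bytes,
 *         d_in, d_offsets_out, d_lengths_out, d_num_runs_out,
 *         Equality(), num_items, 0, false);
 *
 *     // Phase 2: allocate the blob and run the encode
 *     cudaMalloc(&d_temp_storage, temp_storage_bytes);
 *     DispatchT::Dispatch(d_temp_storage, temp_storage_bytes,
 *         d_in, d_offsets_out, d_lengths_out, d_num_runs_out,
 *         Equality(), num_items, 0, false);
 */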