41#include "../../agent/agent_reduce_by_key.cuh"
42#include "../../thread/thread_operators.cuh"
43#include "../../grid/grid_queue.cuh"
44#include "../../util_device.cuh"
45#include "../../util_namespace.cuh"
61 typename AgentReduceByKeyPolicyT,
62 typename KeysInputIteratorT,
63 typename UniqueOutputIteratorT,
64 typename ValuesInputIteratorT,
65 typename AggregatesOutputIteratorT,
66 typename NumRunsOutputIteratorT,
67 typename ScanTileStateT,
69 typename ReductionOpT,
72__global__
void DeviceReduceByKeyKernel(
73 KeysInputIteratorT d_keys_in,
86 AgentReduceByKeyPolicyT,
88 UniqueOutputIteratorT,
90 AggregatesOutputIteratorT,
91 NumRunsOutputIteratorT,
98 __shared__
typename AgentReduceByKeyT::TempStorage temp_storage;
118 typename KeysInputIteratorT,
119 typename UniqueOutputIteratorT,
120 typename ValuesInputIteratorT,
121 typename AggregatesOutputIteratorT,
122 typename NumRunsOutputIteratorT,
123 typename EqualityOpT,
124 typename ReductionOpT,
133 typedef typename std::iterator_traits<KeysInputIteratorT>::value_type KeyInputT;
137 typename std::iterator_traits<KeysInputIteratorT>::value_type,
138 typename std::iterator_traits<UniqueOutputIteratorT>::value_type>::Type
KeyOutputT;
141 typedef typename std::iterator_traits<ValuesInputIteratorT>::value_type ValueInputT;
145 typename std::iterator_traits<ValuesInputIteratorT>::value_type,
146 typename std::iterator_traits<AggregatesOutputIteratorT>::value_type>::Type
ValueOutputT;
150 INIT_KERNEL_THREADS = 128,
167 NOMINAL_4B_ITEMS_PER_THREAD = 6,
168 ITEMS_PER_THREAD = (MAX_INPUT_BYTES <= 8) ? 6 :
CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD,
CUB_MAX(1, ((NOMINAL_4B_ITEMS_PER_THREAD * 8) + COMBINED_INPUT_BYTES - 1) / COMBINED_INPUT_BYTES)),
184 NOMINAL_4B_ITEMS_PER_THREAD = 6,
185 ITEMS_PER_THREAD =
CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD,
CUB_MAX(1, ((NOMINAL_4B_ITEMS_PER_THREAD * 8) + COMBINED_INPUT_BYTES - 1) / COMBINED_INPUT_BYTES)),
201 NOMINAL_4B_ITEMS_PER_THREAD = 11,
202 ITEMS_PER_THREAD =
CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD,
CUB_MAX(1, ((NOMINAL_4B_ITEMS_PER_THREAD * 8) + COMBINED_INPUT_BYTES - 1) / COMBINED_INPUT_BYTES)),
218 NOMINAL_4B_ITEMS_PER_THREAD = 7,
219 ITEMS_PER_THREAD =
CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD,
CUB_MAX(1, ((NOMINAL_4B_ITEMS_PER_THREAD * 8) + COMBINED_INPUT_BYTES - 1) / COMBINED_INPUT_BYTES)),
235 NOMINAL_4B_ITEMS_PER_THREAD = 5,
236 ITEMS_PER_THREAD =
CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD,
CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 8) / COMBINED_INPUT_BYTES)),
253#if (CUB_PTX_ARCH >= 350)
256#elif (CUB_PTX_ARCH >= 300)
259#elif (CUB_PTX_ARCH >= 200)
262#elif (CUB_PTX_ARCH >= 130)
281 template <
typename KernelConfig>
282 CUB_RUNTIME_FUNCTION __forceinline__
287 #if (CUB_PTX_ARCH > 0)
291 reduce_by_key_config.template Init<PtxReduceByKeyPolicy>();
296 if (ptx_version >= 350)
298 reduce_by_key_config.template Init<typename Policy350::ReduceByKeyPolicyT>();
300 else if (ptx_version >= 300)
302 reduce_by_key_config.template Init<typename Policy300::ReduceByKeyPolicyT>();
304 else if (ptx_version >= 200)
306 reduce_by_key_config.template Init<typename Policy200::ReduceByKeyPolicyT>();
308 else if (ptx_version >= 130)
310 reduce_by_key_config.template Init<typename Policy130::ReduceByKeyPolicyT>();
314 reduce_by_key_config.template Init<typename Policy110::ReduceByKeyPolicyT>();
327 int items_per_thread;
330 template <
typename PolicyT>
331 CUB_RUNTIME_FUNCTION __forceinline__
334 block_threads = PolicyT::BLOCK_THREADS;
335 items_per_thread = PolicyT::ITEMS_PER_THREAD;
336 tile_items = block_threads * items_per_thread;
350 typename ScanInitKernelT,
351 typename ReduceByKeyKernelT>
352 CUB_RUNTIME_FUNCTION __forceinline__
354 void* d_temp_storage,
355 size_t& temp_storage_bytes,
356 KeysInputIteratorT d_keys_in,
365 bool debug_synchronous,
367 ScanInitKernelT init_kernel,
368 ReduceByKeyKernelT reduce_by_key_kernel,
372#ifndef CUB_RUNTIME_ENABLED
373 (void)d_temp_storage;
374 (void)temp_storage_bytes;
384 (void)debug_synchronous;
386 (void)reduce_by_key_kernel;
387 (void)reduce_by_key_config;
390 return CubDebug(cudaErrorNotSupported);
394 cudaError error = cudaSuccess;
399 if (
CubDebug(error = cudaGetDevice(&device_ordinal)))
break;
403 if (
CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal)))
break;
406 int tile_size = reduce_by_key_config.block_threads * reduce_by_key_config.items_per_thread;
410 size_t allocation_sizes[1];
411 if (
CubDebug(error = ScanTileStateT::AllocationSize(
num_tiles, allocation_sizes[0])))
break;
414 void* allocations[1];
416 if (d_temp_storage == NULL)
427 int init_grid_size =
CUB_MAX(1, (
num_tiles + INIT_KERNEL_THREADS - 1) / INIT_KERNEL_THREADS);
428 if (debug_synchronous)
_CubLog(
"Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (
long long) stream);
431 init_kernel<<<init_grid_size, INIT_KERNEL_THREADS, 0, stream>>>(
437 if (
CubDebug(error = cudaPeekAtLastError()))
break;
447 int reduce_by_key_sm_occupancy;
449 reduce_by_key_sm_occupancy,
450 reduce_by_key_kernel,
451 reduce_by_key_config.block_threads)))
break;
455 if (
CubDebug(error = cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal)))
break;;
462 if (debug_synchronous)
_CubLog(
"Invoking %d reduce_by_key_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
463 start_tile, scan_grid_size, reduce_by_key_config.block_threads, (
long long) stream, reduce_by_key_config.items_per_thread, reduce_by_key_sm_occupancy);
466 reduce_by_key_kernel<<<scan_grid_size, reduce_by_key_config.block_threads, 0, stream>>>(
479 if (
CubDebug(error = cudaPeekAtLastError()))
break;
496 CUB_RUNTIME_FUNCTION __forceinline__
498 void* d_temp_storage,
499 size_t& temp_storage_bytes,
500 KeysInputIteratorT d_keys_in,
509 bool debug_synchronous)
511 cudaError error = cudaSuccess;
516 #if (CUB_PTX_ARCH == 0)
541 DeviceCompactInitKernel<ScanTileStateT, NumRunsOutputIteratorT>,
542 DeviceReduceByKeyKernel<PtxReduceByKeyPolicy, KeysInputIteratorT, UniqueOutputIteratorT, ValuesInputIteratorT, AggregatesOutputIteratorT, NumRunsOutputIteratorT, ScanTileStateT, EqualityOpT, ReductionOpT, OffsetT>,
543 reduce_by_key_config)))
break;
BlockRadixRank provides operations for ranking unsigned integer types within a CUDA thread block.
@ BLOCK_LOAD_WARP_TRANSPOSE
@ LOAD_LDG
Cache as texture.
@ LOAD_DEFAULT
Default (no modifier)
#define _CubLog(format,...)
Log macro for printf statements.
CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t PtxVersion(int &ptx_version)
Retrieves the PTX version that will be used on the current device (major * 100 + minor * 10)
__host__ __device__ __forceinline__ cudaError_t AliasTemporaries(void *d_temp_storage, size_t &temp_storage_bytes, void *(&allocations)[ALLOCATIONS], size_t(&allocation_sizes)[ALLOCATIONS])
CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t MaxSmOccupancy(int &max_sm_occupancy, KernelPtr kernel_ptr, int block_threads, int dynamic_smem_bytes=0)
Computes maximum SM occupancy in thread blocks for executing the given kernel function pointer kernel...
#define CubDebug(e)
Debug macro.
CUB_RUNTIME_FUNCTION static __forceinline__ cudaError_t SyncStream(cudaStream_t stream)
#define CUB_MAX(a, b)
Select maximum(a, b)
#define CUB_MIN(a, b)
Select minimum(a, b)
Optional outer namespace(s)
UniqueOutputIteratorT d_unique_out
< Pointer to the input sequence of keys
UniqueOutputIteratorT ValuesInputIteratorT AggregatesOutputIteratorT NumRunsOutputIteratorT d_num_runs_out
Pointer to total number of runs encountered (i.e., the length of d_unique_out)
KeyT const ValueT ValueT OffsetT OffsetT num_items
[in] Total number of input data items
OutputIteratorT OffsetT GridEvenShare< OffsetT > ReductionOpT reduction_op
< [in] Binary reduction functor
__launch_bounds__(int(AgentHistogramPolicyT::BLOCK_THREADS)) __global__ void DeviceHistogramSweepKernel(SampleIteratorT d_samples
< Signed integer type for global offsets
UniqueOutputIteratorT ValuesInputIteratorT AggregatesOutputIteratorT NumRunsOutputIteratorT ScanTileStateT int start_tile
The starting tile for the current grid.
KeyT const ValueT * d_values_in
[in] Input values buffer
OffsetsOutputIteratorT LengthsOutputIteratorT NumRunsOutputIteratorT ScanTileStateT EqualityOpT OffsetT int num_tiles
< [in] Total number of tiles for the entire problem
OffsetT OffsetT
[in] Total number of input data items
UniqueOutputIteratorT ValuesInputIteratorT AggregatesOutputIteratorT NumRunsOutputIteratorT ScanTileStateT tile_state
Tile status interface.
UniqueOutputIteratorT ValuesInputIteratorT AggregatesOutputIteratorT d_aggregates_out
Pointer to the output sequence of value aggregates (one aggregate per run)
UniqueOutputIteratorT ValuesInputIteratorT AggregatesOutputIteratorT NumRunsOutputIteratorT ScanTileStateT int EqualityOpT equality_op
KeyT equality operator.
< The BlockScan algorithm to use
AgentReduceByKey implements a stateful abstraction of CUDA thread blocks for participating in device-...
< Signed integer type for global offsets
CUB_RUNTIME_FUNCTION static __forceinline__ cudaError_t Dispatch(void *d_temp_storage, size_t &temp_storage_bytes, KeysInputIteratorT d_keys_in, UniqueOutputIteratorT d_unique_out, ValuesInputIteratorT d_values_in, AggregatesOutputIteratorT d_aggregates_out, NumRunsOutputIteratorT d_num_runs_out, EqualityOpT equality_op, ReductionOpT reduction_op, OffsetT num_items, cudaStream_t stream, bool debug_synchronous)
CUB_RUNTIME_FUNCTION static __forceinline__ cudaError_t Dispatch(void *d_temp_storage, size_t &temp_storage_bytes, KeysInputIteratorT d_keys_in, UniqueOutputIteratorT d_unique_out, ValuesInputIteratorT d_values_in, AggregatesOutputIteratorT d_aggregates_out, NumRunsOutputIteratorT d_num_runs_out, EqualityOpT equality_op, ReductionOpT reduction_op, OffsetT num_items, cudaStream_t stream, bool debug_synchronous, int, ScanInitKernelT init_kernel, ReduceByKeyKernelT reduce_by_key_kernel, KernelConfig reduce_by_key_config)
< Function type of cub::DeviceReduceByKeyKernelT
CUB_RUNTIME_FUNCTION static __forceinline__ void InitConfigs(int ptx_version, KernelConfig &reduce_by_key_config)
Type selection (IF ? ThenType : ElseType)
#define CUB_PTX_ARCH
CUB_PTX_ARCH reflects the PTX version targeted by the active compiler pass (or zero during the host p...