41 #include "../../agent/agent_reduce_by_key.cuh"
42 #include "../../thread/thread_operators.cuh"
43 #include "../../grid/grid_queue.cuh"
44 #include "../../util_device.cuh"
45 #include "../../util_namespace.cuh"
61 typename AgentReduceByKeyPolicyT,
62 typename KeysInputIteratorT,
63 typename UniqueOutputIteratorT,
64 typename ValuesInputIteratorT,
65 typename AggregatesOutputIteratorT,
66 typename NumRunsOutputIteratorT,
67 typename ScanTileStateT,
69 typename ReductionOpT,
72 __global__
void DeviceReduceByKeyKernel(
73 KeysInputIteratorT d_keys_in,
86 AgentReduceByKeyPolicyT,
88 UniqueOutputIteratorT,
90 AggregatesOutputIteratorT,
91 NumRunsOutputIteratorT,
98 __shared__
typename AgentReduceByKeyT::TempStorage temp_storage;
118 typename KeysInputIteratorT,
119 typename UniqueOutputIteratorT,
120 typename ValuesInputIteratorT,
121 typename AggregatesOutputIteratorT,
122 typename NumRunsOutputIteratorT,
123 typename EqualityOpT,
124 typename ReductionOpT,
133 typedef typename std::iterator_traits<KeysInputIteratorT>::value_type KeyInputT;
137 typename std::iterator_traits<KeysInputIteratorT>::value_type,
138 typename std::iterator_traits<UniqueOutputIteratorT>::value_type>::Type
KeyOutputT;
141 typedef typename std::iterator_traits<ValuesInputIteratorT>::value_type ValueInputT;
145 typename std::iterator_traits<ValuesInputIteratorT>::value_type,
146 typename std::iterator_traits<AggregatesOutputIteratorT>::value_type>::Type
ValueOutputT;
150 INIT_KERNEL_THREADS = 128,
167 NOMINAL_4B_ITEMS_PER_THREAD = 6,
168 ITEMS_PER_THREAD = (MAX_INPUT_BYTES <= 8) ? 6 :
CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD,
CUB_MAX(1, ((NOMINAL_4B_ITEMS_PER_THREAD * 8) + COMBINED_INPUT_BYTES - 1) / COMBINED_INPUT_BYTES)),
184 NOMINAL_4B_ITEMS_PER_THREAD = 6,
185 ITEMS_PER_THREAD =
CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD,
CUB_MAX(1, ((NOMINAL_4B_ITEMS_PER_THREAD * 8) + COMBINED_INPUT_BYTES - 1) / COMBINED_INPUT_BYTES)),
201 NOMINAL_4B_ITEMS_PER_THREAD = 11,
202 ITEMS_PER_THREAD =
CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD,
CUB_MAX(1, ((NOMINAL_4B_ITEMS_PER_THREAD * 8) + COMBINED_INPUT_BYTES - 1) / COMBINED_INPUT_BYTES)),
218 NOMINAL_4B_ITEMS_PER_THREAD = 7,
219 ITEMS_PER_THREAD =
CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD,
CUB_MAX(1, ((NOMINAL_4B_ITEMS_PER_THREAD * 8) + COMBINED_INPUT_BYTES - 1) / COMBINED_INPUT_BYTES)),
235 NOMINAL_4B_ITEMS_PER_THREAD = 5,
236 ITEMS_PER_THREAD =
CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD,
CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 8) / COMBINED_INPUT_BYTES)),
253 #if (CUB_PTX_ARCH >= 350)
254 typedef Policy350 PtxPolicy;
256 #elif (CUB_PTX_ARCH >= 300)
257 typedef Policy300 PtxPolicy;
259 #elif (CUB_PTX_ARCH >= 200)
260 typedef Policy200 PtxPolicy;
262 #elif (CUB_PTX_ARCH >= 130)
263 typedef Policy130 PtxPolicy;
266 typedef Policy110 PtxPolicy;
271 struct PtxReduceByKeyPolicy : PtxPolicy::ReduceByKeyPolicyT {};
281 template <
typename KernelConfig>
282 CUB_RUNTIME_FUNCTION __forceinline__
287 #if (CUB_PTX_ARCH > 0)
291 reduce_by_key_config.template Init<PtxReduceByKeyPolicy>();
296 if (ptx_version >= 350)
298 reduce_by_key_config.template Init<typename Policy350::ReduceByKeyPolicyT>();
300 else if (ptx_version >= 300)
302 reduce_by_key_config.template Init<typename Policy300::ReduceByKeyPolicyT>();
304 else if (ptx_version >= 200)
306 reduce_by_key_config.template Init<typename Policy200::ReduceByKeyPolicyT>();
308 else if (ptx_version >= 130)
310 reduce_by_key_config.template Init<typename Policy130::ReduceByKeyPolicyT>();
314 reduce_by_key_config.template Init<typename Policy110::ReduceByKeyPolicyT>();
327 int items_per_thread;
330 template <
typename PolicyT>
331 CUB_RUNTIME_FUNCTION __forceinline__
334 block_threads = PolicyT::BLOCK_THREADS;
335 items_per_thread = PolicyT::ITEMS_PER_THREAD;
336 tile_items = block_threads * items_per_thread;
350 typename ScanInitKernelT,
351 typename ReduceByKeyKernelT>
352 CUB_RUNTIME_FUNCTION __forceinline__
354 void* d_temp_storage,
355 size_t& temp_storage_bytes,
356 KeysInputIteratorT d_keys_in,
365 bool debug_synchronous,
367 ScanInitKernelT init_kernel,
368 ReduceByKeyKernelT reduce_by_key_kernel,
372 #ifndef CUB_RUNTIME_ENABLED
373 (void)d_temp_storage;
374 (void)temp_storage_bytes;
384 (void)debug_synchronous;
386 (void)reduce_by_key_kernel;
387 (void)reduce_by_key_config;
390 return CubDebug(cudaErrorNotSupported);
394 cudaError error = cudaSuccess;
399 if (
CubDebug(error = cudaGetDevice(&device_ordinal)))
break;
403 if (
CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal)))
break;
406 int tile_size = reduce_by_key_config.block_threads * reduce_by_key_config.items_per_thread;
410 size_t allocation_sizes[1];
411 if (
CubDebug(error = ScanTileStateT::AllocationSize(
num_tiles, allocation_sizes[0])))
break;
414 void* allocations[1];
416 if (d_temp_storage == NULL)
427 int init_grid_size =
CUB_MAX(1, (
num_tiles + INIT_KERNEL_THREADS - 1) / INIT_KERNEL_THREADS);
428 if (debug_synchronous)
_CubLog(
"Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (
long long) stream);
431 init_kernel<<<init_grid_size, INIT_KERNEL_THREADS, 0, stream>>>(
437 if (
CubDebug(error = cudaPeekAtLastError()))
break;
447 int reduce_by_key_sm_occupancy;
449 reduce_by_key_sm_occupancy,
450 reduce_by_key_kernel,
451 reduce_by_key_config.block_threads)))
break;
455 if (
CubDebug(error = cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal)))
break;;
462 if (debug_synchronous)
_CubLog(
"Invoking %d reduce_by_key_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
463 start_tile, scan_grid_size, reduce_by_key_config.block_threads, (
long long) stream, reduce_by_key_config.items_per_thread, reduce_by_key_sm_occupancy);
466 reduce_by_key_kernel<<<scan_grid_size, reduce_by_key_config.block_threads, 0, stream>>>(
479 if (
CubDebug(error = cudaPeekAtLastError()))
break;
489 #endif // CUB_RUNTIME_ENABLED
496 CUB_RUNTIME_FUNCTION __forceinline__
498 void* d_temp_storage,
499 size_t& temp_storage_bytes,
500 KeysInputIteratorT d_keys_in,
509 bool debug_synchronous)
511 cudaError error = cudaSuccess;
516 #if (CUB_PTX_ARCH == 0)
541 DeviceCompactInitKernel<ScanTileStateT, NumRunsOutputIteratorT>,
542 DeviceReduceByKeyKernel<PtxReduceByKeyPolicy, KeysInputIteratorT, UniqueOutputIteratorT, ValuesInputIteratorT, AggregatesOutputIteratorT, NumRunsOutputIteratorT, ScanTileStateT, EqualityOpT, ReductionOpT, OffsetT>,
543 reduce_by_key_config)))
break;