#pragma once

#include <stdio.h>
#include <iterator>

#include "../../agent/agent_reduce.cuh"
#include "../../iterator/arg_index_input_iterator.cuh"
#include "../../thread/thread_operators.cuh"
#include "../../grid/grid_even_share.cuh"
#include "../../util_debug.cuh"
#include "../../util_device.cuh"
#include "../../util_namespace.cuh"

/// Optional outer namespace(s)
CUB_NS_PREFIX

/// CUB namespace
namespace cub {
/// Reduce region kernel entry point (multi-block): computes one privatized reduction per thread block
template <
    typename ChainedPolicyT,    ///< Chained tuning policy
    typename InputIteratorT,    ///< Random-access input iterator type
    typename OutputIteratorT,   ///< Output iterator type
    typename OffsetT,           ///< Signed integer type for global offsets
    typename ReductionOpT>      ///< Binary reduction functor type
__global__ void DeviceReduceKernel(
    InputIteratorT d_in, OutputIteratorT d_out, OffsetT num_items,
    GridEvenShare<OffsetT> even_share, ReductionOpT reduction_op)
{
    // The output value type (the input iterator's value type if the output iterator's value type is void)
    typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),
        typename std::iterator_traits<InputIteratorT>::value_type,
        typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT;

    // Thread block abstraction for reducing input tiles, configured by the active ReducePolicy
    typedef AgentReduce<
            typename ChainedPolicyT::ActivePolicy::ReducePolicy,
            InputIteratorT, OutputIteratorT, OffsetT, ReductionOpT>
        AgentReduceT;

    // Shared memory storage
    __shared__ typename AgentReduceT::TempStorage temp_storage;

    // Consume this block's even-share of input tiles
    OutputT block_aggregate = AgentReduceT(temp_storage, d_in, reduction_op).ConsumeTiles(even_share);

    // Thread 0 records this block's partial aggregate
    if (threadIdx.x == 0)
        d_out[blockIdx.x] = block_aggregate;
}


/// Single-tile kernel entry point: reduces an entire range (or the per-block partials) within one thread block
template <
    typename ChainedPolicyT,    ///< Chained tuning policy
    typename InputIteratorT,    ///< Random-access input iterator type
    typename OutputIteratorT,   ///< Output iterator type
    typename OffsetT,           ///< Signed integer type for global offsets
    typename ReductionOpT,      ///< Binary reduction functor type
    typename OutputT>           ///< Output value type
__global__ void DeviceReduceSingleTileKernel(
    InputIteratorT d_in, OutputIteratorT d_out, OffsetT num_items,
    ReductionOpT reduction_op, OutputT init)
{
    // Thread block abstraction for reducing input tiles, configured by the active SingleTilePolicy
    typedef AgentReduce<
            typename ChainedPolicyT::ActivePolicy::SingleTilePolicy,
            InputIteratorT, OutputIteratorT, OffsetT, ReductionOpT>
        AgentReduceT;

    // Shared memory storage
    __shared__ typename AgentReduceT::TempStorage temp_storage;

    // Empty problem: thread 0 simply records the initial value
    if (num_items == 0)
    {
        if (threadIdx.x == 0)
            *d_out = init;
        return;
    }

    // Consume the input range as one sequence of tiles
    OutputT block_aggregate = AgentReduceT(temp_storage, d_in, reduction_op).ConsumeRange(OffsetT(0), num_items);

    // Thread 0 folds in the initial value and records the final aggregate
    if (threadIdx.x == 0)
        *d_out = reduction_op(init, block_aggregate);
}
/// Normalize reduction output to a segment-relative offset (generic no-op for plain value reductions)
template <
    typename T,
    typename OffsetT,
    typename IteratorT>
__device__ __forceinline__
void NormalizeReductionOutput(T &/*val*/, OffsetT /*base_offset*/, IteratorT /*itr*/)
{}


/// Normalize reduction output to a segment-relative offset (specialized for arg-index reductions)
template <
    typename KeyValuePairT,
    typename OffsetT,
    typename WrappedIteratorT,
    typename OutputValueT>
__device__ __forceinline__
void NormalizeReductionOutput(
    KeyValuePairT &val, OffsetT base_offset,
    ArgIndexInputIterator<WrappedIteratorT, OffsetT, OutputValueT> /*itr*/)
{
    val.key -= base_offset;
}
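/*
 * Note (illustrative sketch, not part of the original file): the second overload above exists for
 * arg-index reductions (ArgMin/ArgMax).  When d_in is an ArgIndexInputIterator, a segment's winning
 * KeyValuePair carries a key that is a global input index; subtracting the segment's base offset
 * reports it relative to the segment start.  The names below are placeholders:
 *
 *    KeyValuePair<int, float> best;                       // e.g. best.key == 1007 (global index),
 *                                                         // segment begins at offset 1000
 *    NormalizeReductionOutput(best, 1000, arg_index_itr); // arg_index_itr is an ArgIndexInputIterator
 *    // best.key == 7, i.e. the seventh item within the segment
 *
 * For plain value reductions the first (no-op) overload is selected and the aggregate is unchanged.
 */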
/// Segmented reduction kernel entry point: one thread block per segment
template <
    typename ChainedPolicyT,    ///< Chained tuning policy
    typename InputIteratorT,    ///< Random-access input iterator type
    typename OutputIteratorT,   ///< Output iterator type
    typename OffsetIteratorT,   ///< Random-access iterator type for reading segment offsets
    typename OffsetT,           ///< Signed integer type for global offsets
    typename ReductionOpT,      ///< Binary reduction functor type
    typename OutputT>           ///< Output value type
__global__ void DeviceSegmentedReduceKernel(
    InputIteratorT d_in, OutputIteratorT d_out,
    OffsetIteratorT d_begin_offsets, OffsetIteratorT d_end_offsets,
    int /*num_segments*/, ReductionOpT reduction_op, OutputT init)
{
    // Thread block abstraction for reducing input tiles, configured by the active ReducePolicy
    typedef AgentReduce<
            typename ChainedPolicyT::ActivePolicy::ReducePolicy,
            InputIteratorT, OutputIteratorT, OffsetT, ReductionOpT>
        AgentReduceT;

    // Shared memory storage
    __shared__ typename AgentReduceT::TempStorage temp_storage;

    // This block's segment of input
    OffsetT segment_begin = d_begin_offsets[blockIdx.x];
    OffsetT segment_end   = d_end_offsets[blockIdx.x];

    // Empty segment: thread 0 simply records the initial value
    if (segment_begin == segment_end)
    {
        if (threadIdx.x == 0)
            d_out[blockIdx.x] = init;
        return;
    }

    // Consume the segment's tiles
    OutputT block_aggregate = AgentReduceT(temp_storage, d_in, reduction_op).ConsumeRange(segment_begin, segment_end);

    // Rebase arg-index keys to the segment origin (no-op for plain value reductions)
    NormalizeReductionOutput(block_aggregate, segment_begin, d_in);

    // Thread 0 folds in the initial value and records the segment's aggregate
    if (threadIdx.x == 0)
        d_out[blockIdx.x] = reduction_op(init, block_aggregate);
}
/// Tuning policy chain for device-wide reduction
template <
    typename OutputT,       ///< Output data type
    typename OffsetT,       ///< Signed integer type for global offsets
    typename ReductionOpT>  ///< Binary reduction functor type
struct DeviceReducePolicy
{
    // Architecture-specific AgentReducePolicy parameterizations (newest match wins at dispatch time):
    //   Policy130: CUB_SCALED_GRANULARITIES(128, 8, OutputT),  x2 vectorization, BLOCK_REDUCE_RAKING,          LOAD_DEFAULT
    //   Policy200: CUB_SCALED_GRANULARITIES(128, 8, OutputT),  x4 vectorization, BLOCK_REDUCE_RAKING,          LOAD_DEFAULT
    //   Policy300: CUB_SCALED_GRANULARITIES(256, 20, OutputT), x2 vectorization, BLOCK_REDUCE_WARP_REDUCTIONS, LOAD_DEFAULT
    //   Policy350: CUB_SCALED_GRANULARITIES(256, 20, OutputT), x4 vectorization, BLOCK_REDUCE_WARP_REDUCTIONS, LOAD_LDG
    //   Policy600: CUB_SCALED_GRANULARITIES(256, 16, OutputT), x4 vectorization, BLOCK_REDUCE_WARP_REDUCTIONS, LOAD_LDG
    // (policy struct definitions elided; Policy600 serves as MaxPolicy, the head of the chain)
};


/// Dispatch helper for device-wide reduction
template <
    typename InputIteratorT,    ///< Random-access input iterator type
    typename OutputIteratorT,   ///< Output iterator type
    typename OffsetT,           ///< Signed integer type for global offsets
    typename ReductionOpT>      ///< Binary reduction functor type
struct DispatchReduce :
    DeviceReducePolicy<
        typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),
            typename std::iterator_traits<InputIteratorT>::value_type,
            typename std::iterator_traits<OutputIteratorT>::value_type>::Type,
        OffsetT,
        ReductionOpT>
{
    /// The output value type (the input iterator's value type if the output iterator's value type is void)
    typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),
        typename std::iterator_traits<InputIteratorT>::value_type,
        typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT;
    // Problem state
    void*           d_temp_storage;         ///< Device-accessible temporary storage (NULL triggers a size request)
    size_t          &temp_storage_bytes;    ///< [in,out] Size in bytes of the d_temp_storage allocation
    InputIteratorT  d_in;                   ///< Input sequence of data items
    OutputIteratorT d_out;                  ///< Output aggregate
    OffsetT         num_items;              ///< Total number of input items
    ReductionOpT    reduction_op;           ///< Binary reduction functor
    OutputT         init;                   ///< Initial value of the reduction
    cudaStream_t    stream;                 ///< CUDA stream to launch kernels within
    bool            debug_synchronous;      ///< Whether to synchronize the stream after each kernel launch
    int             ptx_version;            ///< PTX version of the target device

    /// Constructor: captures the problem state (members mirror the parameters)
    CUB_RUNTIME_FUNCTION __forceinline__
    DispatchReduce(
        void* d_temp_storage, size_t &temp_storage_bytes,
        InputIteratorT d_in, OutputIteratorT d_out, OffsetT num_items,
        ReductionOpT reduction_op, OutputT init,
        cudaStream_t stream, bool debug_synchronous, int ptx_version)
    :
        d_temp_storage(d_temp_storage), temp_storage_bytes(temp_storage_bytes),
        d_in(d_in), d_out(d_out), num_items(num_items),
        reduction_op(reduction_op), init(init),
        stream(stream), debug_synchronous(debug_synchronous), ptx_version(ptx_version)
    {}

    /// Invoke a single thread block to reduce in-core
    template <
        typename ActivePolicyT,         ///< Umbrella policy active for the target device
        typename SingleTileKernelT>     ///< Function type of cub::DeviceReduceSingleTileKernel
    CUB_RUNTIME_FUNCTION __forceinline__
    cudaError_t InvokeSingleTile(
        SingleTileKernelT single_tile_kernel)   ///< [in] Kernel function pointer to a parameterization of cub::DeviceReduceSingleTileKernel
    {
#ifndef CUB_RUNTIME_ENABLED
        (void)single_tile_kernel;

        // Kernel launch not supported from this device
        return CubDebug(cudaErrorNotSupported);
#else
        cudaError error = cudaSuccess;
        do
        {
            // Return if the caller is simply requesting the size of the storage allocation
            if (d_temp_storage == NULL)
            {
                temp_storage_bytes = 1;
                break;
            }

            // Log single-tile kernel configuration
            if (debug_synchronous) _CubLog("Invoking DeviceReduceSingleTileKernel<<<1, %d, 0, %lld>>>(), %d items per thread\n",
                ActivePolicyT::SingleTilePolicy::BLOCK_THREADS,
                (long long) stream,
                ActivePolicyT::SingleTilePolicy::ITEMS_PER_THREAD);

            // Invoke the single-tile kernel
            single_tile_kernel<<<1, ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, 0, stream>>>(
                d_in, d_out, num_items, reduction_op, init);

            // Check for failure to launch
            if (CubDebug(error = cudaPeekAtLastError())) break;

            // Sync the stream if specified to flush runtime errors
            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
        }
        while (0);

        return error;
#endif // CUB_RUNTIME_ENABLED
    }
    /// Invoke two passes to reduce: (1) privatized per-block reductions, (2) a single-tile reduction of the block partials
    template <
        typename ActivePolicyT,         ///< Umbrella policy active for the target device
        typename ReduceKernelT,         ///< Function type of cub::DeviceReduceKernel
        typename SingleTileKernelT>     ///< Function type of cub::DeviceReduceSingleTileKernel
    CUB_RUNTIME_FUNCTION __forceinline__
    cudaError_t InvokePasses(
        ReduceKernelT       reduce_kernel,          ///< [in] Kernel function pointer to a parameterization of cub::DeviceReduceKernel
        SingleTileKernelT   single_tile_kernel)     ///< [in] Kernel function pointer to a parameterization of cub::DeviceReduceSingleTileKernel
    {
#ifndef CUB_RUNTIME_ENABLED
        (void) reduce_kernel;
        (void) single_tile_kernel;

        // Kernel launch not supported from this device
        return CubDebug(cudaErrorNotSupported);
#else
        cudaError error = cudaSuccess;
        do
        {
            // Get device ordinal
            int device_ordinal;
            if (CubDebug(error = cudaGetDevice(&device_ordinal))) break;

            // Get SM count
            int sm_count;
            if (CubDebug(error = cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break;

            // Init regular kernel configuration
            KernelConfig reduce_config;
            if (CubDebug(error = reduce_config.Init<typename ActivePolicyT::ReducePolicy>(reduce_kernel))) break;
            int reduce_device_occupancy = reduce_config.sm_occupancy * sm_count;

            // Even-share work distribution
            int max_blocks = reduce_device_occupancy * CUB_SUBSCRIPTION_FACTOR(ptx_version);
            GridEvenShare<OffsetT> even_share;
            even_share.DispatchInit(num_items, max_blocks, reduce_config.tile_size);

            // Temporary storage allocation requirements
            void* allocations[1];
            size_t allocation_sizes[1] =
            {
                max_blocks * sizeof(OutputT)    // bytes needed for privatized block reductions
            };

            // Alias the temporary allocations from the single storage blob (or compute the size of the blob)
            if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break;
            if (d_temp_storage == NULL)
            {
                // Return if the caller is simply requesting the size of the storage allocation
                return cudaSuccess;
            }

            // Alias the allocation for the privatized per-block reductions
            OutputT *d_block_reductions = (OutputT*) allocations[0];

            // Grid size for the privatized reduction pass
            int reduce_grid_size = even_share.grid_size;

            // Log DeviceReduceKernel configuration
            if (debug_synchronous) _CubLog("Invoking DeviceReduceKernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
                reduce_grid_size,
                ActivePolicyT::ReducePolicy::BLOCK_THREADS,
                (long long) stream,
                ActivePolicyT::ReducePolicy::ITEMS_PER_THREAD,
                reduce_config.sm_occupancy);

            // Invoke DeviceReduceKernel
            reduce_kernel<<<reduce_grid_size, ActivePolicyT::ReducePolicy::BLOCK_THREADS, 0, stream>>>(
                d_in, d_block_reductions, num_items, even_share, reduction_op);

            // Check for failure to launch
            if (CubDebug(error = cudaPeekAtLastError())) break;

            // Sync the stream if specified to flush runtime errors
            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;

            // Log DeviceReduceSingleTileKernel configuration
            if (debug_synchronous) _CubLog("Invoking DeviceReduceSingleTileKernel<<<1, %d, 0, %lld>>>(), %d items per thread\n",
                ActivePolicyT::SingleTilePolicy::BLOCK_THREADS,
                (long long) stream,
                ActivePolicyT::SingleTilePolicy::ITEMS_PER_THREAD);

            // Invoke DeviceReduceSingleTileKernel to reduce the privatized block aggregates
            single_tile_kernel<<<1, ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, 0, stream>>>(
                d_block_reductions, d_out, reduce_grid_size, reduction_op, init);

            // Check for failure to launch
            if (CubDebug(error = cudaPeekAtLastError())) break;

            // Sync the stream if specified to flush runtime errors
            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
        }
        while (0);

        return error;
#endif // CUB_RUNTIME_ENABLED
    }
    /// Invocation: choose single-tile vs. two-pass reduction based on problem size
    template <typename ActivePolicyT>
    CUB_RUNTIME_FUNCTION __forceinline__
    cudaError_t Invoke()
    {
        typedef typename ActivePolicyT::SingleTilePolicy    SingleTilePolicyT;
        typedef typename DispatchReduce::MaxPolicy          MaxPolicyT;

        if (num_items <= (SingleTilePolicyT::BLOCK_THREADS * SingleTilePolicyT::ITEMS_PER_THREAD))
        {
            // Small problem: a single thread block reduces everything in-core
            return InvokeSingleTile<ActivePolicyT>(
                DeviceReduceSingleTileKernel<MaxPolicyT, InputIteratorT, OutputIteratorT, OffsetT, ReductionOpT, OutputT>);
        }
        else
        {
            // Regular problem: privatized per-block reductions followed by a single-tile reduction of the partials
            return InvokePasses<ActivePolicyT>(
                DeviceReduceKernel<typename DispatchReduce::MaxPolicy, InputIteratorT, OutputT*, OffsetT, ReductionOpT>,
                DeviceReduceSingleTileKernel<MaxPolicyT, OutputT*, OutputIteratorT, OffsetT, ReductionOpT, OutputT>);
        }
    }

    /// Internal dispatch routine for computing a device-wide reduction
    CUB_RUNTIME_FUNCTION __forceinline__
    static cudaError_t Dispatch(
        void* d_temp_storage, size_t &temp_storage_bytes,
        InputIteratorT d_in, OutputIteratorT d_out, OffsetT num_items,
        ReductionOpT reduction_op, OutputT init,
        cudaStream_t stream, bool debug_synchronous)
    {
        typedef typename DispatchReduce::MaxPolicy MaxPolicyT;

        cudaError error = cudaSuccess;
        do
        {
            // Get PTX version of the target device
            int ptx_version;
            if (CubDebug(error = PtxVersion(ptx_version))) break;

            // Create dispatch functor and dispatch to the chained tuning policy
            DispatchReduce dispatch(
                d_temp_storage, temp_storage_bytes, d_in, d_out, num_items,
                reduction_op, init, stream, debug_synchronous, ptx_version);
            if (CubDebug(error = MaxPolicyT::Invoke(ptx_version, dispatch))) break;
        }
        while (0);

        return error;
    }
};
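/*
 * Usage sketch (illustrative, not part of the original file): DispatchReduce::Dispatch is the engine
 * behind the public cub::DeviceReduce entry points.  A caller follows the usual two-phase CUB
 * pattern: a first call with d_temp_storage == NULL writes the required size into temp_storage_bytes,
 * then the same call is repeated with a real allocation.  The names d_in, d_out, num_items, and the
 * temp-storage pointers below are placeholders.
 *
 *    // Determine temporary device storage requirements
 *    void   *d_temp_storage = NULL;
 *    size_t temp_storage_bytes = 0;
 *    cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
 *
 *    // Allocate temporary storage and run the reduction
 *    cudaMalloc(&d_temp_storage, temp_storage_bytes);
 *    cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
 */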
/// Dispatch helper for device-wide segmented reduction
template <
    typename InputIteratorT,    ///< Random-access input iterator type
    typename OutputIteratorT,   ///< Output iterator type
    typename OffsetIteratorT,   ///< Random-access iterator type for reading segment offsets
    typename OffsetT,           ///< Signed integer type for global offsets
    typename ReductionOpT>      ///< Binary reduction functor type
struct DispatchSegmentedReduce :
    DeviceReducePolicy<
        typename std::iterator_traits<InputIteratorT>::value_type,
        OffsetT,
        ReductionOpT>
{
    /// The output value type (the input iterator's value type if the output iterator's value type is void)
    typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),
        typename std::iterator_traits<InputIteratorT>::value_type,
        typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT;

    // Problem state mirrors DispatchReduce above, with num_items replaced by
    // num_segments, d_begin_offsets, and d_end_offsets.

    /// Constructor: captures the problem state (members mirror the parameters)
    CUB_RUNTIME_FUNCTION __forceinline__
    DispatchSegmentedReduce(
        void* d_temp_storage, size_t &temp_storage_bytes,
        InputIteratorT d_in, OutputIteratorT d_out, OffsetT num_segments,
        OffsetIteratorT d_begin_offsets, OffsetIteratorT d_end_offsets,
        ReductionOpT reduction_op, OutputT init,
        cudaStream_t stream, bool debug_synchronous, int ptx_version)
    :
        d_temp_storage(d_temp_storage), temp_storage_bytes(temp_storage_bytes),
        d_in(d_in), d_out(d_out), num_segments(num_segments),
        d_begin_offsets(d_begin_offsets), d_end_offsets(d_end_offsets),
        reduction_op(reduction_op), init(init),
        stream(stream), debug_synchronous(debug_synchronous), ptx_version(ptx_version)
    {}
    /// Invocation: launch one thread block per segment
    template <
        typename ActivePolicyT,                 ///< Umbrella policy active for the target device
        typename DeviceSegmentedReduceKernelT>  ///< Function type of cub::DeviceSegmentedReduceKernel
    CUB_RUNTIME_FUNCTION __forceinline__
    cudaError_t InvokePasses(
        DeviceSegmentedReduceKernelT segmented_reduce_kernel)  ///< [in] Kernel function pointer to a parameterization of cub::DeviceSegmentedReduceKernel
    {
#ifndef CUB_RUNTIME_ENABLED
        (void)segmented_reduce_kernel;

        // Kernel launch not supported from this device
        return CubDebug(cudaErrorNotSupported);
#else
        cudaError error = cudaSuccess;
        do
        {
            // Return if the caller is simply requesting the size of the storage allocation
            if (d_temp_storage == NULL)
            {
                temp_storage_bytes = 1;
                return cudaSuccess;
            }

            // Init kernel configuration
            KernelConfig segmented_reduce_config;
            if (CubDebug(error = segmented_reduce_config.Init<typename ActivePolicyT::SegmentedReducePolicy>(segmented_reduce_kernel))) break;

            // Log segmented reduce kernel configuration
            if (debug_synchronous) _CubLog("Invoking SegmentedDeviceReduceKernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
                num_segments,
                ActivePolicyT::SegmentedReducePolicy::BLOCK_THREADS,
                (long long) stream,
                ActivePolicyT::SegmentedReducePolicy::ITEMS_PER_THREAD,
                segmented_reduce_config.sm_occupancy);

            // Invoke DeviceSegmentedReduceKernel (one thread block per segment)
            segmented_reduce_kernel<<<num_segments, ActivePolicyT::SegmentedReducePolicy::BLOCK_THREADS, 0, stream>>>(
                d_in, d_out, d_begin_offsets, d_end_offsets, num_segments, reduction_op, init);

            // Check for failure to launch
            if (CubDebug(error = cudaPeekAtLastError())) break;

            // Sync the stream if specified to flush runtime errors
            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
        }
        while (0);

        return error;
#endif // CUB_RUNTIME_ENABLED
    }
    /// Invocation: instantiate the segmented kernel for the max policy and dispatch
    template <typename ActivePolicyT>
    CUB_RUNTIME_FUNCTION __forceinline__
    cudaError_t Invoke()
    {
        typedef typename DispatchSegmentedReduce::MaxPolicy MaxPolicyT;

        return InvokePasses<ActivePolicyT>(
            DeviceSegmentedReduceKernel<MaxPolicyT, InputIteratorT, OutputIteratorT, OffsetIteratorT, OffsetT, ReductionOpT, OutputT>);
    }

    /// Internal dispatch routine for computing a device-wide segmented reduction
    CUB_RUNTIME_FUNCTION __forceinline__
    static cudaError_t Dispatch(
        void* d_temp_storage, size_t &temp_storage_bytes,
        InputIteratorT d_in, OutputIteratorT d_out, int num_segments,
        OffsetIteratorT d_begin_offsets, OffsetIteratorT d_end_offsets,
        ReductionOpT reduction_op, OutputT init,
        cudaStream_t stream, bool debug_synchronous)
    {
        typedef typename DispatchSegmentedReduce::MaxPolicy MaxPolicyT;

        // Nothing to do for an empty set of segments
        if (num_segments <= 0)
            return cudaSuccess;

        cudaError error = cudaSuccess;
        do
        {
            // Get PTX version of the target device
            int ptx_version;
            if (CubDebug(error = PtxVersion(ptx_version))) break;

            // Create dispatch functor and dispatch to the chained tuning policy
            DispatchSegmentedReduce dispatch(
                d_temp_storage, temp_storage_bytes, d_in, d_out, num_segments,
                d_begin_offsets, d_end_offsets, reduction_op, init,
                stream, debug_synchronous, ptx_version);
            if (CubDebug(error = MaxPolicyT::Invoke(ptx_version, dispatch))) break;
        }
        while (0);

        return error;
    }
};


}               // CUB namespace
CUB_NS_POSTFIX  // Optional outer namespace(s)
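/*
 * Usage sketch (illustrative, not part of the original file): DispatchSegmentedReduce::Dispatch backs
 * the public cub::DeviceSegmentedReduce entry points.  Each segment i spans
 * [d_begin_offsets[i], d_end_offsets[i]) and is reduced by one thread block; empty segments
 * (begin == end) simply receive the initial value.  The names below are placeholders.
 *
 *    // Determine temporary device storage requirements
 *    void   *d_temp_storage = NULL;
 *    size_t temp_storage_bytes = 0;
 *    cub::DeviceSegmentedReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out,
 *                                    num_segments, d_begin_offsets, d_end_offsets);
 *
 *    // Allocate temporary storage and run the segmented reduction
 *    cudaMalloc(&d_temp_storage, temp_storage_bytes);
 *    cub::DeviceSegmentedReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out,
 *                                    num_segments, d_begin_offsets, d_end_offsets);
 */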