#include "../../agent/agent_reduce.cuh"
#include "../../iterator/arg_index_input_iterator.cuh"
#include "../../thread/thread_operators.cuh"
#include "../../grid/grid_even_share.cuh"
#include "../../util_debug.cuh"
#include "../../util_device.cuh"
#include "../../util_namespace.cuh"
/// Reduce region kernel entry point (multi-block).  Computes privatized reductions, one per thread block.
template <
    typename ChainedPolicyT,    ///< Chained tuning policy
    typename InputIteratorT,    ///< Random-access input iterator type for reading input items
    typename OutputIteratorT,   ///< Output iterator type for recording the reduced aggregate
    typename OffsetT,           ///< Signed integer type for global offsets
    typename ReductionOpT>      ///< Binary reduction functor type having member T operator()(const T &a, const T &b)
__global__ void DeviceReduceKernel(
    InputIteratorT          d_in,           ///< [in] Pointer to the input sequence of data items
    OutputIteratorT         d_out,          ///< [out] Pointer to the privatized per-block aggregates
    OffsetT                 num_items,      ///< [in] Total number of input data items
    GridEvenShare<OffsetT>  even_share,     ///< [in] Even-share descriptor for mapping an equal number of tiles onto each thread block
    ReductionOpT            reduction_op)   ///< [in] Binary reduction functor
{
    // The output value type (the input iterator's value type if the output iterator's value type is void)
    typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),
        typename std::iterator_traits<InputIteratorT>::value_type,
        typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT;

    // Thread block abstraction for reducing input tiles
    typedef AgentReduce<
            typename ChainedPolicyT::ActivePolicy::ReducePolicy,
            InputIteratorT, OutputIteratorT, OffsetT, ReductionOpT>
        AgentReduceT;

    // Shared memory storage
    __shared__ typename AgentReduceT::TempStorage temp_storage;

    // Consume this block's even-share of input tiles
    OutputT block_aggregate = AgentReduceT(temp_storage, d_in, reduction_op).ConsumeTiles(even_share);

    // Thread 0 records the block-wide aggregate
    if (threadIdx.x == 0)
        d_out[blockIdx.x] = block_aggregate;
}
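// -----------------------------------------------------------------------------
// Illustrative usage (not part of this file): the kernels and dispatch classes
// here are normally reached through the public cub::DeviceReduce interface,
// which follows CUB's usual double-call convention -- the first call with a
// NULL d_temp_storage pointer only writes the required temporary-storage size.
// A minimal sketch, assuming device arrays d_in / d_out holding num_items ints:
//
//     size_t temp_storage_bytes = 0;
//     void  *d_temp_storage     = NULL;
//     cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
//     cudaMalloc(&d_temp_storage, temp_storage_bytes);
//     cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
//     cudaFree(d_temp_storage);
// -----------------------------------------------------------------------------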
 
/// Single-tile kernel entry point (single block).  Can be used to aggregate privatized thread block reductions from a previous multi-block reduction pass.
template <
    typename ChainedPolicyT,    ///< Chained tuning policy
    typename InputIteratorT,    ///< Random-access input iterator type for reading input items
    typename OutputIteratorT,   ///< Output iterator type for recording the reduced aggregate
    typename OffsetT,           ///< Signed integer type for global offsets
    typename ReductionOpT,      ///< Binary reduction functor type having member T operator()(const T &a, const T &b)
    typename OutputT>           ///< Data element type that is convertible to the value type of OutputIteratorT
__global__ void DeviceReduceSingleTileKernel(
    InputIteratorT  d_in,           ///< [in] Pointer to the input sequence of data items
    OutputIteratorT d_out,          ///< [out] Pointer to the output aggregate
    OffsetT         num_items,      ///< [in] Total number of input data items
    ReductionOpT    reduction_op,   ///< [in] Binary reduction functor
    OutputT         init)           ///< [in] The initial value of the reduction
{
    // Thread block abstraction for reducing the single input tile
    typedef AgentReduce<
            typename ChainedPolicyT::ActivePolicy::SingleTilePolicy,
            InputIteratorT, OutputIteratorT, OffsetT, ReductionOpT>
        AgentReduceT;

    __shared__ typename AgentReduceT::TempStorage temp_storage;

    // Empty problem: thread 0 simply records the initial value
    if (num_items == 0)
    {
        if (threadIdx.x == 0)
            *d_out = init;
        return;
    }

    // Consume the input tile
    OutputT block_aggregate = AgentReduceT(temp_storage, d_in, reduction_op).ConsumeRange(OffsetT(0), num_items);

    // Thread 0 folds in the initial value and records the result
    if (threadIdx.x == 0)
        *d_out = reduction_op(init, block_aggregate);
}
 
/// Normalize input iterator to segment offset (generic: no-op)
template <typename T, typename OffsetT, typename IteratorT>
__device__ __forceinline__
void NormalizeReductionOutput(
    T &/*val*/, OffsetT /*base_offset*/, IteratorT /*itr*/)
{}

/// Normalize input iterator to segment offset (specialized for arg-index: rebase the key to the segment)
template <typename KeyValuePairT, typename OffsetT, typename WrappedIteratorT, typename OutputValueT>
__device__ __forceinline__
void NormalizeReductionOutput(
    KeyValuePairT &val, OffsetT base_offset, ArgIndexInputIterator<WrappedIteratorT, OffsetT, OutputValueT> /*itr*/)
{
    val.key -= base_offset;
}
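// Illustrative note (not from this file): when an arg-based reduction such as
// cub::DeviceSegmentedReduce::ArgMin is dispatched, the input is wrapped in an
// ArgIndexInputIterator whose keys are global indices.  For a segment beginning
// at offset 1024, the rebasing above turns a winning global index of 1047 into
// the segment-relative index 23 that the caller expects.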
 
/// Segmented reduction kernel entry point (one block per segment)
template <
    typename ChainedPolicyT,    ///< Chained tuning policy
    typename InputIteratorT,    ///< Random-access input iterator type for reading input items
    typename OutputIteratorT,   ///< Output iterator type for recording the reduced aggregates
    typename OffsetIteratorT,   ///< Random-access input iterator type for reading segment offsets
    typename OffsetT,           ///< Signed integer type for global offsets
    typename ReductionOpT,      ///< Binary reduction functor type having member T operator()(const T &a, const T &b)
    typename OutputT>           ///< Data element type that is convertible to the value type of OutputIteratorT
__global__ void DeviceSegmentedReduceKernel(
    InputIteratorT  d_in,               ///< [in] Pointer to the input sequence of data items
    OutputIteratorT d_out,              ///< [out] Pointer to the output aggregates (one per segment)
    OffsetIteratorT d_begin_offsets,    ///< [in] Sequence of beginning offsets, one per segment
    OffsetIteratorT d_end_offsets,      ///< [in] Sequence of ending offsets, one per segment
    int             /*num_segments*/,   ///< [in] The number of segments
    ReductionOpT    reduction_op,       ///< [in] Binary reduction functor
    OutputT         init)               ///< [in] The initial value of the reduction
{
    // Thread block abstraction for reducing input tiles
    typedef AgentReduce<
            typename ChainedPolicyT::ActivePolicy::ReducePolicy,
            InputIteratorT, OutputIteratorT, OffsetT, ReductionOpT>
        AgentReduceT;

    __shared__ typename AgentReduceT::TempStorage temp_storage;

    OffsetT segment_begin = d_begin_offsets[blockIdx.x];
    OffsetT segment_end   = d_end_offsets[blockIdx.x];

    // Empty segment: thread 0 simply records the initial value
    if (segment_begin == segment_end)
    {
        if (threadIdx.x == 0)
            d_out[blockIdx.x] = init;
        return;
    }

    // Consume this segment's input tiles
    OutputT block_aggregate = AgentReduceT(temp_storage, d_in, reduction_op).ConsumeRange(segment_begin, segment_end);

    // Rebase arg-index keys to the segment (no-op for plain values)
    NormalizeReductionOutput(block_aggregate, segment_begin, d_in);

    if (threadIdx.x == 0)
        d_out[blockIdx.x] = reduction_op(init, block_aggregate);
}
 
/// Tuning policy chain for device-wide reduction
template <
    typename OutputT,           ///< Data type
    typename OffsetT,           ///< Signed integer type for global offsets
    typename ReductionOpT>      ///< Binary reduction functor type having member T operator()(const T &a, const T &b)
struct DeviceReducePolicy
{
    // ... per-architecture AgentReducePolicy specializations (through Policy600, the MaxPolicy) elided ...
};


/// Utility class for dispatching the appropriately-tuned kernels for device-wide reduction
template <
    typename InputIteratorT,    ///< Random-access input iterator type for reading input items
    typename OutputIteratorT,   ///< Output iterator type for recording the reduced aggregate
    typename OffsetT,           ///< Signed integer type for global offsets
    typename ReductionOpT>      ///< Binary reduction functor type having member T operator()(const T &a, const T &b)
struct DispatchReduce :
    DeviceReducePolicy<
        typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),
            typename std::iterator_traits<InputIteratorT>::value_type,
            typename std::iterator_traits<OutputIteratorT>::value_type>::Type,
        OffsetT,
        ReductionOpT>
{
    // The output value type (the input iterator's value type if the output iterator's value type is void)
    typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),
        typename std::iterator_traits<InputIteratorT>::value_type,
        typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT;
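    // Illustrative aside (not from this file): the If<> / Equals<> metafunctions
    // predate C++11; the same selection in standard C++ would read
    //
    //     using OutputT = typename std::conditional<
    //         std::is_same<typename std::iterator_traits<OutputIteratorT>::value_type, void>::value,
    //         typename std::iterator_traits<InputIteratorT>::value_type,
    //         typename std::iterator_traits<OutputIteratorT>::value_type>::type;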
 
    // Problem-state data members (d_temp_storage, temp_storage_bytes, d_in, d_out,
    // num_items, reduction_op, init, stream, debug_synchronous, ptx_version) elided

    /// Constructor
    CUB_RUNTIME_FUNCTION __forceinline__
    DispatchReduce(
        void* d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in,
        OutputIteratorT d_out, OffsetT num_items, ReductionOpT reduction_op, OutputT init,
        cudaStream_t stream, bool debug_synchronous, int ptx_version)
        // ... member-wise initialization elided ...
    {}

    /// Invoke a single thread block to reduce in-core
    template <
        typename ActivePolicyT,         ///< Umbrella policy active for the target device
        typename SingleTileKernelT>     ///< Function type of cub::DeviceReduceSingleTileKernel
    CUB_RUNTIME_FUNCTION __forceinline__
    cudaError_t InvokeSingleTile(
        SingleTileKernelT single_tile_kernel)   ///< [in] Kernel function pointer to parameterization of cub::DeviceReduceSingleTileKernel
    {
#ifndef CUB_RUNTIME_ENABLED
        (void)single_tile_kernel;

        // Kernel launch not supported from this device
        return CubDebug(cudaErrorNotSupported);
#else
        cudaError error = cudaSuccess;
        do
        {
            // Return if the caller is simply requesting the size of the storage allocation
            if (d_temp_storage == NULL)
            {
                temp_storage_bytes = 1;
                break;
            }

            // Log single_tile_kernel configuration
            if (debug_synchronous) _CubLog("Invoking DeviceReduceSingleTileKernel<<<1, %d, 0, %lld>>>(), %d items per thread\n",
                ActivePolicyT::SingleTilePolicy::BLOCK_THREADS,
                (long long) stream,
                ActivePolicyT::SingleTilePolicy::ITEMS_PER_THREAD);

            // Invoke single_tile_kernel
            single_tile_kernel<<<1, ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, 0, stream>>>(
                d_in, d_out, num_items, reduction_op, init);

            // Check for failure to launch
            if (CubDebug(error = cudaPeekAtLastError())) break;

            // Sync the stream if specified to flush runtime errors
            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
        }
        while (0);

        return error;
#endif // CUB_RUNTIME_ENABLED
    }
 
    /// Invoke two passes to reduce: a privatized multi-block pass followed by a single-tile pass over the per-block partials
    template <
        typename ActivePolicyT,         ///< Umbrella policy active for the target device
        typename ReduceKernelT,         ///< Function type of cub::DeviceReduceKernel
        typename SingleTileKernelT>     ///< Function type of cub::DeviceReduceSingleTileKernel
    CUB_RUNTIME_FUNCTION __forceinline__
    cudaError_t InvokePasses(
        ReduceKernelT     reduce_kernel,        ///< [in] Kernel function pointer to parameterization of cub::DeviceReduceKernel
        SingleTileKernelT single_tile_kernel)   ///< [in] Kernel function pointer to parameterization of cub::DeviceReduceSingleTileKernel
    {
#ifndef CUB_RUNTIME_ENABLED
        (void) reduce_kernel;
        (void) single_tile_kernel;

        // Kernel launch not supported from this device
        return CubDebug(cudaErrorNotSupported);
#else
        cudaError error = cudaSuccess;
        do
        {
            // Get device ordinal and SM count
            int device_ordinal;
            if (CubDebug(error = cudaGetDevice(&device_ordinal))) break;

            int sm_count;
            if (CubDebug(error = cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break;

            // Init kernel configuration and device-wide occupancy for the privatized pass
            KernelConfig reduce_config;
            if (CubDebug(error = reduce_config.Init<typename ActivePolicyT::ReducePolicy>(reduce_kernel))) break;
            int reduce_device_occupancy = reduce_config.sm_occupancy * sm_count;

            // Even-share work distribution across an oversubscribed grid
            int max_blocks = reduce_device_occupancy * CUB_SUBSCRIPTION_FACTOR(ptx_version);
            GridEvenShare<OffsetT> even_share;
            even_share.DispatchInit(num_items, max_blocks, reduce_config.tile_size);

            // Temporary storage: one privatized partial reduction per block
            void*  allocations[1];
            size_t allocation_sizes[1] =
            {
                max_blocks * sizeof(OutputT)    // bytes needed for privatized block reductions
            };

            // Alias the temporary allocations from the single storage blob (or compute the necessary size of the blob)
            if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break;
            if (d_temp_storage == NULL)
            {
                // Return if the caller is simply requesting the size of the storage allocation
                return cudaSuccess;
            }

            // Alias the allocation for the privatized per-block reductions
            OutputT *d_block_reductions = (OutputT*) allocations[0];

            // Grid size for the privatized pass
            int reduce_grid_size = even_share.grid_size;

            // Log reduce_kernel configuration
            if (debug_synchronous) _CubLog("Invoking DeviceReduceKernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
                reduce_grid_size,
                ActivePolicyT::ReducePolicy::BLOCK_THREADS,
                (long long) stream,
                ActivePolicyT::ReducePolicy::ITEMS_PER_THREAD,
                reduce_config.sm_occupancy);

            // First pass: privatized per-block reductions
            reduce_kernel<<<reduce_grid_size, ActivePolicyT::ReducePolicy::BLOCK_THREADS, 0, stream>>>(
                d_in, d_block_reductions, num_items, even_share, reduction_op);

            // Check for failure to launch
            if (CubDebug(error = cudaPeekAtLastError())) break;

            // Sync the stream if specified to flush runtime errors
            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;

            // Log single_tile_kernel configuration
            if (debug_synchronous) _CubLog("Invoking DeviceReduceSingleTileKernel<<<1, %d, 0, %lld>>>(), %d items per thread\n",
                ActivePolicyT::SingleTilePolicy::BLOCK_THREADS,
                (long long) stream,
                ActivePolicyT::SingleTilePolicy::ITEMS_PER_THREAD);

            // Second pass: a single block reduces the per-block partials and folds in the initial value
            single_tile_kernel<<<1, ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, 0, stream>>>(
                d_block_reductions, d_out, reduce_grid_size, reduction_op, init);

            // Check for failure to launch
            if (CubDebug(error = cudaPeekAtLastError())) break;

            // Sync the stream if specified to flush runtime errors
            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
        }
        while (0);

        return error;
#endif // CUB_RUNTIME_ENABLED
    }
 
    /// Invocation
    template <typename ActivePolicyT>
    CUB_RUNTIME_FUNCTION __forceinline__
    cudaError_t Invoke()
    {
        typedef typename ActivePolicyT::SingleTilePolicy    SingleTilePolicyT;
        typedef typename DispatchReduce::MaxPolicy          MaxPolicyT;

        // Force kernel code-generation in all compiler passes
        if (num_items <= (SingleTilePolicyT::BLOCK_THREADS * SingleTilePolicyT::ITEMS_PER_THREAD))
        {
            // Small problem: reduce in-core with a single thread block
            return InvokeSingleTile<ActivePolicyT>(
                DeviceReduceSingleTileKernel<MaxPolicyT, InputIteratorT, OutputIteratorT, OffsetT, ReductionOpT, OutputT>);
        }
        else
        {
            // Regular problem: privatized multi-block pass followed by a single-tile pass over the partials
            return InvokePasses<ActivePolicyT>(
                DeviceReduceKernel<typename DispatchReduce::MaxPolicy, InputIteratorT, OutputT*, OffsetT, ReductionOpT>,
                DeviceReduceSingleTileKernel<MaxPolicyT, OutputT*, OutputIteratorT, OffsetT, ReductionOpT, OutputT>);
        }
    }
 
    /// Internal dispatch routine for computing a device-wide reduction
    CUB_RUNTIME_FUNCTION static __forceinline__
    cudaError_t Dispatch(
        void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out,
        OffsetT num_items, ReductionOpT reduction_op, OutputT init, cudaStream_t stream, bool debug_synchronous)
    {
        cudaError error = cudaSuccess;
        // ... query the PTX version, construct a DispatchReduce functor, and invoke it through the chained MaxPolicy (elided) ...
        return error;
    }
};
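// -----------------------------------------------------------------------------
// Illustrative sketch (not part of CUB): Invoke() above picks the single-tile
// path for small problems -- e.g. at most 256 * 16 = 4096 items for the
// Policy600 tuning of 4-byte data -- and otherwise runs the classic two-pass
// scheme that InvokePasses() implements: a grid of blocks each writes one
// privatized partial, then a single block reduces the partials.  A
// stripped-down equivalent for a plain float sum (grid-stride first pass,
// hypothetical kernel name):

__global__ void ExamplePartialSumKernel(const float *d_in, float *d_partials, int num_items)
{
    __shared__ float smem[256];     // assumes blockDim.x == 256

    // Each thread folds its grid-strided share of the input
    float thread_sum = 0.0f;
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num_items; i += gridDim.x * blockDim.x)
        thread_sum += d_in[i];

    // Block-wide tree reduction in shared memory
    smem[threadIdx.x] = thread_sum;
    __syncthreads();
    for (int stride = blockDim.x / 2; stride > 0; stride /= 2)
    {
        if (threadIdx.x < stride)
            smem[threadIdx.x] += smem[threadIdx.x + stride];
        __syncthreads();
    }

    // One privatized partial per block (the real DeviceReduceKernel runs
    // AgentReduce over an even-share of tiles instead)
    if (threadIdx.x == 0)
        d_partials[blockIdx.x] = smem[0];
}

// A second launch, ExamplePartialSumKernel<<<1, 256>>>(d_partials, d_result, grid_size),
// then plays the role of DeviceReduceSingleTileKernel over the partials.
// -----------------------------------------------------------------------------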
 
/// Utility class for dispatching the appropriately-tuned kernels for device-wide segmented reduction
template <
    typename InputIteratorT,    ///< Random-access input iterator type for reading input items
    typename OutputIteratorT,   ///< Output iterator type for recording the reduced aggregates
    typename OffsetIteratorT,   ///< Random-access input iterator type for reading segment offsets
    typename OffsetT,           ///< Signed integer type for global offsets
    typename ReductionOpT>      ///< Binary reduction functor type having member T operator()(const T &a, const T &b)
struct DispatchSegmentedReduce :
    DeviceReducePolicy<
        typename std::iterator_traits<InputIteratorT>::value_type,
        OffsetT,
        ReductionOpT>
{
    // The output value type (the input iterator's value type if the output iterator's value type is void)
    typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),
        typename std::iterator_traits<InputIteratorT>::value_type,
        typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT;
 
    // Problem-state data members (d_temp_storage, temp_storage_bytes, d_in, d_out,
    // num_segments, d_begin_offsets, d_end_offsets, reduction_op, init, stream,
    // debug_synchronous, ptx_version) elided

    /// Constructor
    CUB_RUNTIME_FUNCTION __forceinline__
    DispatchSegmentedReduce(
        void* d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out,
        OffsetT num_segments, OffsetIteratorT d_begin_offsets, OffsetIteratorT d_end_offsets,
        ReductionOpT reduction_op, OutputT init, cudaStream_t stream, bool debug_synchronous, int ptx_version)
        // ... member-wise initialization elided ...
    {}

    /// Invocation (one thread block per segment)
    template <
        typename ActivePolicyT,                 ///< Umbrella policy active for the target device
        typename DeviceSegmentedReduceKernelT>  ///< Function type of cub::DeviceSegmentedReduceKernel
    CUB_RUNTIME_FUNCTION __forceinline__
    cudaError_t InvokePasses(
        DeviceSegmentedReduceKernelT segmented_reduce_kernel)  ///< [in] Kernel function pointer to parameterization of cub::DeviceSegmentedReduceKernel
    {
#ifndef CUB_RUNTIME_ENABLED
        (void)segmented_reduce_kernel;

        // Kernel launch not supported from this device
        return CubDebug(cudaErrorNotSupported);
#else
        cudaError error = cudaSuccess;
        do
        {
            // Return if the caller is simply requesting the size of the storage allocation
            if (d_temp_storage == NULL)
            {
                temp_storage_bytes = 1;
                return cudaSuccess;
            }

            // Init kernel configuration
            KernelConfig segmented_reduce_config;
            if (CubDebug(error = segmented_reduce_config.Init<typename ActivePolicyT::SegmentedReducePolicy>(segmented_reduce_kernel))) break;

            // Log segmented_reduce_kernel configuration
            if (debug_synchronous) _CubLog("Invoking SegmentedDeviceReduceKernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
                num_segments,
                ActivePolicyT::SegmentedReducePolicy::BLOCK_THREADS,
                (long long) stream,
                ActivePolicyT::SegmentedReducePolicy::ITEMS_PER_THREAD,
                segmented_reduce_config.sm_occupancy);

            // Invoke DeviceSegmentedReduceKernel: one thread block per segment
            segmented_reduce_kernel<<<num_segments, ActivePolicyT::SegmentedReducePolicy::BLOCK_THREADS, 0, stream>>>(
                d_in, d_out, d_begin_offsets, d_end_offsets, num_segments, reduction_op, init);

            // Check for failure to launch
            if (CubDebug(error = cudaPeekAtLastError())) break;

            // Sync the stream if specified to flush runtime errors
            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
        }
        while (0);

        return error;
#endif // CUB_RUNTIME_ENABLED
    }
 
    /// Invocation
    template <typename ActivePolicyT>
    CUB_RUNTIME_FUNCTION __forceinline__
    cudaError_t Invoke()
    {
        typedef typename DispatchSegmentedReduce::MaxPolicy MaxPolicyT;

        // Force kernel code-generation in all compiler passes
        return InvokePasses<ActivePolicyT>(
            DeviceSegmentedReduceKernel<MaxPolicyT, InputIteratorT, OutputIteratorT, OffsetIteratorT, OffsetT, ReductionOpT, OutputT>);
    }
 
    /// Internal dispatch routine for computing a device-wide segmented reduction
    CUB_RUNTIME_FUNCTION static __forceinline__
    cudaError_t Dispatch(
        void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out,
        int num_segments, OffsetIteratorT d_begin_offsets, OffsetIteratorT d_end_offsets,
        ReductionOpT reduction_op, OutputT init, cudaStream_t stream, bool debug_synchronous)
    {
        cudaError error = cudaSuccess;
        // ... query the PTX version, construct a DispatchSegmentedReduce functor, and invoke it through the chained MaxPolicy (elided) ...
        return error;
    }
};
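// -----------------------------------------------------------------------------
// Illustrative usage (not part of this file): DispatchSegmentedReduce is
// normally reached through cub::DeviceSegmentedReduce.  A minimal sketch,
// assuming num_segments contiguous segments described by an offsets array of
// length num_segments + 1, so segment i spans [d_offsets[i], d_offsets[i+1]):
//
//     size_t temp_storage_bytes = 0;
//     void  *d_temp_storage     = NULL;
//     cub::DeviceSegmentedReduce::Sum(d_temp_storage, temp_storage_bytes,
//         d_in, d_out, num_segments, d_offsets, d_offsets + 1);
//     cudaMalloc(&d_temp_storage, temp_storage_bytes);
//     cub::DeviceSegmentedReduce::Sum(d_temp_storage, temp_storage_bytes,
//         d_in, d_out, num_segments, d_offsets, d_offsets + 1);
//     cudaFree(d_temp_storage);
// -----------------------------------------------------------------------------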
 