41#include "../iterator/arg_index_input_iterator.cuh"
44#include "../util_namespace.cuh"
143 typename InputIteratorT,
144 typename OutputIteratorT,
145 typename ReductionOpT,
149 void *d_temp_storage,
150 size_t &temp_storage_bytes,
152 OutputIteratorT
d_out,
156 cudaStream_t stream = 0,
157 bool debug_synchronous =
false)
226 typename InputIteratorT,
227 typename OutputIteratorT>
230 void *d_temp_storage,
231 size_t &temp_storage_bytes,
233 OutputIteratorT
d_out,
235 cudaStream_t stream = 0,
236 bool debug_synchronous =
false)
243 typename std::iterator_traits<InputIteratorT>::value_type,
244 typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT;
303 typename InputIteratorT,
304 typename OutputIteratorT>
307 void *d_temp_storage,
308 size_t &temp_storage_bytes,
310 OutputIteratorT
d_out,
312 cudaStream_t stream = 0,
313 bool debug_synchronous =
false)
319 typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;
380 typename InputIteratorT,
381 typename OutputIteratorT>
384 void *d_temp_storage,
385 size_t &temp_storage_bytes,
387 OutputIteratorT
d_out,
389 cudaStream_t stream = 0,
390 bool debug_synchronous =
false)
396 typedef typename std::iterator_traits<InputIteratorT>::value_type InputValueT;
401 typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputTupleT;
404 typedef typename OutputTupleT::Value OutputValueT;
408 ArgIndexInputIteratorT d_indexed_in(d_in);
470 typename InputIteratorT,
471 typename OutputIteratorT>
474 void *d_temp_storage,
475 size_t &temp_storage_bytes,
477 OutputIteratorT
d_out,
479 cudaStream_t stream = 0,
480 bool debug_synchronous =
false)
486 typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;
547 typename InputIteratorT,
548 typename OutputIteratorT>
551 void *d_temp_storage,
552 size_t &temp_storage_bytes,
554 OutputIteratorT
d_out,
556 cudaStream_t stream = 0,
557 bool debug_synchronous =
false)
563 typedef typename std::iterator_traits<InputIteratorT>::value_type InputValueT;
568 typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputTupleT;
571 typedef typename OutputTupleT::Value OutputValueT;
575 ArgIndexInputIteratorT d_indexed_in(d_in);
680 typename KeysInputIteratorT,
681 typename UniqueOutputIteratorT,
682 typename ValuesInputIteratorT,
683 typename AggregatesOutputIteratorT,
684 typename NumRunsOutputIteratorT,
685 typename ReductionOpT>
686 CUB_RUNTIME_FUNCTION __forceinline__
688 void *d_temp_storage,
689 size_t &temp_storage_bytes,
690 KeysInputIteratorT d_keys_in,
697 cudaStream_t stream = 0,
698 bool debug_synchronous =
false)
Optional outer namespace(s)
OutputIteratorT OffsetT ReductionOpT OutputT init
[in] The initial value of the reduction
UniqueOutputIteratorT d_unique_out
Pointer to the output sequence of unique keys (one key per run)
UniqueOutputIteratorT ValuesInputIteratorT AggregatesOutputIteratorT NumRunsOutputIteratorT d_num_runs_out
Pointer to total number of runs encountered (i.e., the length of d_unique_out)
KeyT const ValueT ValueT OffsetT OffsetT num_items
[in] Total number of input data items
OutputIteratorT OffsetT GridEvenShare< OffsetT > ReductionOpT reduction_op
[in] Binary reduction functor
KeyT const ValueT * d_values_in
[in] Input values buffer
OffsetT OffsetT
[in] Total number of input data items
UniqueOutputIteratorT ValuesInputIteratorT AggregatesOutputIteratorT d_aggregates_out
Pointer to the output sequence of value aggregates (one aggregate per run)
OutputIteratorT d_out
[out] Pointer to the output aggregate
Arg max functor (keeps the value and offset of the first occurrence of the larger item)
Arg min functor (keeps the value and offset of the first occurrence of the smallest item)
DeviceReduce provides device-wide, parallel operations for computing a reduction across a sequence of...
static CUB_RUNTIME_FUNCTION cudaError_t ArgMax(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_items, cudaStream_t stream=0, bool debug_synchronous=false)
Finds the first device-wide maximum using the greater-than ('>') operator, also returning the index o...
static CUB_RUNTIME_FUNCTION cudaError_t Min(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_items, cudaStream_t stream=0, bool debug_synchronous=false)
Computes a device-wide minimum using the less-than ('<') operator.
CUB_RUNTIME_FUNCTION static __forceinline__ cudaError_t ReduceByKey(void *d_temp_storage, size_t &temp_storage_bytes, KeysInputIteratorT d_keys_in, UniqueOutputIteratorT d_unique_out, ValuesInputIteratorT d_values_in, AggregatesOutputIteratorT d_aggregates_out, NumRunsOutputIteratorT d_num_runs_out, ReductionOpT reduction_op, int num_items, cudaStream_t stream=0, bool debug_synchronous=false)
Reduces segments of values, where segments are demarcated by corresponding runs of identical keys.
static CUB_RUNTIME_FUNCTION cudaError_t ArgMin(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_items, cudaStream_t stream=0, bool debug_synchronous=false)
Finds the first device-wide minimum using the less-than ('<') operator, also returning the index of t...
static CUB_RUNTIME_FUNCTION cudaError_t Max(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_items, cudaStream_t stream=0, bool debug_synchronous=false)
Computes a device-wide maximum using the greater-than ('>') operator.
static CUB_RUNTIME_FUNCTION cudaError_t Reduce(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_items, ReductionOpT reduction_op, T init, cudaStream_t stream=0, bool debug_synchronous=false)
Computes a device-wide reduction using the specified binary reduction_op functor and initial value in...
static CUB_RUNTIME_FUNCTION cudaError_t Sum(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_items, cudaStream_t stream=0, bool debug_synchronous=false)
Computes a device-wide sum using the addition (+) operator.
CUB_RUNTIME_FUNCTION static __forceinline__ cudaError_t Dispatch(void *d_temp_storage, size_t &temp_storage_bytes, KeysInputIteratorT d_keys_in, UniqueOutputIteratorT d_unique_out, ValuesInputIteratorT d_values_in, AggregatesOutputIteratorT d_aggregates_out, NumRunsOutputIteratorT d_num_runs_out, EqualityOpT equality_op, ReductionOpT reduction_op, OffsetT num_items, cudaStream_t stream, bool debug_synchronous, int, ScanInitKernelT init_kernel, ReduceByKeyKernelT reduce_by_key_kernel, KernelConfig reduce_by_key_config)
Function type of cub::DeviceReduceByKeyKernelT
CUB_RUNTIME_FUNCTION static __forceinline__ cudaError_t Dispatch(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, OffsetT num_items, ReductionOpT reduction_op, OutputT init, cudaStream_t stream, bool debug_synchronous)
Default equality functor.
Type selection (IF ? ThenType : ElseType)
A key identifier paired with a corresponding value.