41 #include "../iterator/arg_index_input_iterator.cuh" 44 #include "../util_namespace.cuh" 143 typename InputIteratorT,
144 typename OutputIteratorT,
145 typename ReductionOpT,
149 void *d_temp_storage,
150 size_t &temp_storage_bytes,
152 OutputIteratorT
d_out,
156 cudaStream_t stream = 0,
157 bool debug_synchronous =
false)
226 typename InputIteratorT,
227 typename OutputIteratorT>
230 void *d_temp_storage,
231 size_t &temp_storage_bytes,
233 OutputIteratorT
d_out,
235 cudaStream_t stream = 0,
236 bool debug_synchronous =
false)
243 typename std::iterator_traits<InputIteratorT>::value_type,
244 typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT;
303 typename InputIteratorT,
304 typename OutputIteratorT>
307 void *d_temp_storage,
308 size_t &temp_storage_bytes,
310 OutputIteratorT
d_out,
312 cudaStream_t stream = 0,
313 bool debug_synchronous =
false)
319 typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;
380 typename InputIteratorT,
381 typename OutputIteratorT>
384 void *d_temp_storage,
385 size_t &temp_storage_bytes,
387 OutputIteratorT
d_out,
389 cudaStream_t stream = 0,
390 bool debug_synchronous =
false)
396 typedef typename std::iterator_traits<InputIteratorT>::value_type InputValueT;
401 typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputTupleT;
404 typedef typename OutputTupleT::Value OutputValueT;
408 ArgIndexInputIteratorT d_indexed_in(d_in);
470 typename InputIteratorT,
471 typename OutputIteratorT>
474 void *d_temp_storage,
475 size_t &temp_storage_bytes,
477 OutputIteratorT
d_out,
479 cudaStream_t stream = 0,
480 bool debug_synchronous =
false)
486 typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;
547 typename InputIteratorT,
548 typename OutputIteratorT>
551 void *d_temp_storage,
552 size_t &temp_storage_bytes,
554 OutputIteratorT
d_out,
556 cudaStream_t stream = 0,
557 bool debug_synchronous =
false)
563 typedef typename std::iterator_traits<InputIteratorT>::value_type InputValueT;
568 typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputTupleT;
571 typedef typename OutputTupleT::Value OutputValueT;
575 ArgIndexInputIteratorT d_indexed_in(d_in);
680 typename KeysInputIteratorT,
681 typename UniqueOutputIteratorT,
682 typename ValuesInputIteratorT,
683 typename AggregatesOutputIteratorT,
684 typename NumRunsOutputIteratorT,
685 typename ReductionOpT>
686 CUB_RUNTIME_FUNCTION __forceinline__
688 void *d_temp_storage,
689 size_t &temp_storage_bytes,
690 KeysInputIteratorT d_keys_in,
697 cudaStream_t stream = 0,
698 bool debug_synchronous =
false)
static CUB_RUNTIME_FUNCTION cudaError_t Min(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_items, cudaStream_t stream=0, bool debug_synchronous=false)
Computes a device-wide minimum using the less-than ('<') operator.
KeyT const ValueT ValueT OffsetT OffsetT num_items
[in] Total number of input data items
Optional outer namespace(s)
UniqueOutputIteratorT ValuesInputIteratorT AggregatesOutputIteratorT NumRunsOutputIteratorT d_num_runs_out
Pointer to total number of runs encountered (i.e., the length of d_unique_out)
A key identifier paired with a corresponding value.
Default equality functor.
static CUB_RUNTIME_FUNCTION cudaError_t ArgMax(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_items, cudaStream_t stream=0, bool debug_synchronous=false)
Finds the first device-wide maximum using the greater-than ('>') operator, also returning the index of that item.
CUB_RUNTIME_FUNCTION static __forceinline__ cudaError_t Dispatch(void *d_temp_storage, size_t &temp_storage_bytes, KeysInputIteratorT d_keys_in, UniqueOutputIteratorT d_unique_out, ValuesInputIteratorT d_values_in, AggregatesOutputIteratorT d_aggregates_out, NumRunsOutputIteratorT d_num_runs_out, EqualityOpT equality_op, ReductionOpT reduction_op, OffsetT num_items, cudaStream_t stream, bool debug_synchronous, int, ScanInitKernelT init_kernel, ReduceByKeyKernelT reduce_by_key_kernel, KernelConfig reduce_by_key_config)
< Function type of cub::DeviceReduceByKeyKernelT
OffsetT OffsetT
[in] Total number of input data items
UniqueOutputIteratorT ValuesInputIteratorT AggregatesOutputIteratorT d_aggregates_out
Pointer to the output sequence of value aggregates (one aggregate per run)
CUB_RUNTIME_FUNCTION static __forceinline__ cudaError_t Dispatch(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, OffsetT num_items, ReductionOpT reduction_op, OutputT init, cudaStream_t stream, bool debug_synchronous)
KeyT const ValueT * d_values_in
[in] Input values buffer
OutputIteratorT OffsetT GridEvenShare< OffsetT > ReductionOpT reduction_op
< [in] Binary reduction functor
Arg max functor (keeps the value and offset of the first occurrence of the larger item)
static CUB_RUNTIME_FUNCTION cudaError_t Reduce(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_items, ReductionOpT reduction_op, T init, cudaStream_t stream=0, bool debug_synchronous=false)
Computes a device-wide reduction using the specified binary reduction_op functor and initial value init.
UniqueOutputIteratorT d_unique_out
< [out] Pointer to the output sequence of unique keys (one key per run)
static CUB_RUNTIME_FUNCTION cudaError_t ArgMin(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_items, cudaStream_t stream=0, bool debug_synchronous=false)
Finds the first device-wide minimum using the less-than ('<') operator, also returning the index of that item.
static CUB_RUNTIME_FUNCTION cudaError_t Max(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_items, cudaStream_t stream=0, bool debug_synchronous=false)
Computes a device-wide maximum using the greater-than ('>') operator.
Type selection (IF ? ThenType : ElseType)
Arg min functor (keeps the value and offset of the first occurrence of the smallest item)
static CUB_RUNTIME_FUNCTION cudaError_t Sum(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_items, cudaStream_t stream=0, bool debug_synchronous=false)
Computes a device-wide sum using the addition (+) operator.
DeviceReduce provides device-wide, parallel operations for computing a reduction across a sequence of data items residing within device-accessible memory.
CUB_RUNTIME_FUNCTION static __forceinline__ cudaError_t ReduceByKey(void *d_temp_storage, size_t &temp_storage_bytes, KeysInputIteratorT d_keys_in, UniqueOutputIteratorT d_unique_out, ValuesInputIteratorT d_values_in, AggregatesOutputIteratorT d_aggregates_out, NumRunsOutputIteratorT d_num_runs_out, ReductionOpT reduction_op, int num_items, cudaStream_t stream=0, bool debug_synchronous=false)
Reduces segments of values, where segments are demarcated by corresponding runs of identical keys.
OutputIteratorT OffsetT ReductionOpT OutputT init
< [in] The initial value of the reduction
OutputIteratorT d_out
< [out] Pointer to the output aggregate