40#include "../iterator/arg_index_input_iterator.cuh"
43#include "../util_type.cuh"
44#include "../util_namespace.cuh"
127 typename InputIteratorT,
128 typename OutputIteratorT,
129 typename OffsetIteratorT,
130 typename ReductionOp,
134 void *d_temp_storage,
135 size_t &temp_storage_bytes,
137 OutputIteratorT
d_out,
143 cudaStream_t stream = 0,
144 bool debug_synchronous =
false)
211 typename InputIteratorT,
212 typename OutputIteratorT,
213 typename OffsetIteratorT>
216 void *d_temp_storage,
217 size_t &temp_storage_bytes,
219 OutputIteratorT
d_out,
223 cudaStream_t stream = 0,
224 bool debug_synchronous =
false)
231 typename std::iterator_traits<InputIteratorT>::value_type,
232 typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT;
296 typename InputIteratorT,
297 typename OutputIteratorT,
298 typename OffsetIteratorT>
301 void *d_temp_storage,
302 size_t &temp_storage_bytes,
304 OutputIteratorT
d_out,
308 cudaStream_t stream = 0,
309 bool debug_synchronous =
false)
315 typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;
381 typename InputIteratorT,
382 typename OutputIteratorT,
383 typename OffsetIteratorT>
386 void *d_temp_storage,
387 size_t &temp_storage_bytes,
389 OutputIteratorT
d_out,
393 cudaStream_t stream = 0,
394 bool debug_synchronous =
false)
400 typedef typename std::iterator_traits<InputIteratorT>::value_type InputValueT;
405 typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputTupleT;
408 typedef typename OutputTupleT::Value OutputValueT;
412 ArgIndexInputIteratorT d_indexed_in(d_in);
479 typename InputIteratorT,
480 typename OutputIteratorT,
481 typename OffsetIteratorT>
484 void *d_temp_storage,
485 size_t &temp_storage_bytes,
487 OutputIteratorT
d_out,
491 cudaStream_t stream = 0,
492 bool debug_synchronous =
false)
498 typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;
564 typename InputIteratorT,
565 typename OutputIteratorT,
566 typename OffsetIteratorT>
569 void *d_temp_storage,
570 size_t &temp_storage_bytes,
572 OutputIteratorT
d_out,
576 cudaStream_t stream = 0,
577 bool debug_synchronous =
false)
583 typedef typename std::iterator_traits<InputIteratorT>::value_type InputValueT;
588 typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputTupleT;
591 typedef typename OutputTupleT::Value OutputValueT;
595 ArgIndexInputIteratorT d_indexed_in(d_in);
Optional outer namespace(s)
OutputIteratorT OffsetT GridEvenShare< OffsetT > ReductionOpT reduction_op
[in] Binary reduction functor
KeyT const ValueT ValueT OffsetIteratorT d_begin_offsets
[in] Pointer to the sequence of beginning offsets of length num_segments, such that d_begin_offsets[i...
OffsetT OffsetT
[in] Total number of input data items
KeyT const ValueT ValueT OffsetIteratorT OffsetIteratorT d_end_offsets
[in] Pointer to the sequence of ending offsets of length num_segments, such that d_end_offsets[i]-1 i...
OutputIteratorT d_out
[in] Pointer to the input sequence of data items
Arg max functor (keeps the value and offset of the first occurrence of the larger item)
Arg min functor (keeps the value and offset of the first occurrence of the smaller item)
DeviceSegmentedReduce provides device-wide, parallel operations for computing a reduction across mult...
static CUB_RUNTIME_FUNCTION cudaError_t ArgMax(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_segments, OffsetIteratorT d_begin_offsets, OffsetIteratorT d_end_offsets, cudaStream_t stream=0, bool debug_synchronous=false)
Finds the first device-wide maximum in each segment using the greater-than ('>') operator,...
static CUB_RUNTIME_FUNCTION cudaError_t Reduce(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_segments, OffsetIteratorT d_begin_offsets, OffsetIteratorT d_end_offsets, ReductionOp reduction_op, T initial_value, cudaStream_t stream=0, bool debug_synchronous=false)
Computes a device-wide segmented reduction using the specified binary reduction_op functor.
static CUB_RUNTIME_FUNCTION cudaError_t Sum(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_segments, OffsetIteratorT d_begin_offsets, OffsetIteratorT d_end_offsets, cudaStream_t stream=0, bool debug_synchronous=false)
Computes a device-wide segmented sum using the addition ('+') operator.
static CUB_RUNTIME_FUNCTION cudaError_t ArgMin(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_segments, OffsetIteratorT d_begin_offsets, OffsetIteratorT d_end_offsets, cudaStream_t stream=0, bool debug_synchronous=false)
Finds the first device-wide minimum in each segment using the less-than ('<') operator,...
static CUB_RUNTIME_FUNCTION cudaError_t Min(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_segments, OffsetIteratorT d_begin_offsets, OffsetIteratorT d_end_offsets, cudaStream_t stream=0, bool debug_synchronous=false)
Computes a device-wide segmented minimum using the less-than ('<') operator.
static CUB_RUNTIME_FUNCTION cudaError_t Max(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_segments, OffsetIteratorT d_begin_offsets, OffsetIteratorT d_end_offsets, cudaStream_t stream=0, bool debug_synchronous=false)
Computes a device-wide segmented maximum using the greater-than ('>') operator.
CUB_RUNTIME_FUNCTION static __forceinline__ cudaError_t Dispatch(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_segments, OffsetIteratorT d_begin_offsets, OffsetIteratorT d_end_offsets, ReductionOpT reduction_op, OutputT init, cudaStream_t stream, bool debug_synchronous)
Type selection (IF ? ThenType : ElseType)
A key identifier paired with a corresponding value.