40 #include "../iterator/arg_index_input_iterator.cuh" 43 #include "../util_type.cuh" 44 #include "../util_namespace.cuh" 127 typename InputIteratorT,
128 typename OutputIteratorT,
129 typename OffsetIteratorT,
130 typename ReductionOp,
134 void *d_temp_storage,
135 size_t &temp_storage_bytes,
137 OutputIteratorT
d_out,
143 cudaStream_t stream = 0,
144 bool debug_synchronous =
false)
211 typename InputIteratorT,
212 typename OutputIteratorT,
213 typename OffsetIteratorT>
216 void *d_temp_storage,
217 size_t &temp_storage_bytes,
219 OutputIteratorT
d_out,
223 cudaStream_t stream = 0,
224 bool debug_synchronous =
false)
231 typename std::iterator_traits<InputIteratorT>::value_type,
232 typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT;
296 typename InputIteratorT,
297 typename OutputIteratorT,
298 typename OffsetIteratorT>
301 void *d_temp_storage,
302 size_t &temp_storage_bytes,
304 OutputIteratorT
d_out,
308 cudaStream_t stream = 0,
309 bool debug_synchronous =
false)
315 typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;
381 typename InputIteratorT,
382 typename OutputIteratorT,
383 typename OffsetIteratorT>
386 void *d_temp_storage,
387 size_t &temp_storage_bytes,
389 OutputIteratorT
d_out,
393 cudaStream_t stream = 0,
394 bool debug_synchronous =
false)
400 typedef typename std::iterator_traits<InputIteratorT>::value_type InputValueT;
405 typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputTupleT;
408 typedef typename OutputTupleT::Value OutputValueT;
412 ArgIndexInputIteratorT d_indexed_in(d_in);
479 typename InputIteratorT,
480 typename OutputIteratorT,
481 typename OffsetIteratorT>
484 void *d_temp_storage,
485 size_t &temp_storage_bytes,
487 OutputIteratorT
d_out,
491 cudaStream_t stream = 0,
492 bool debug_synchronous =
false)
498 typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;
564 typename InputIteratorT,
565 typename OutputIteratorT,
566 typename OffsetIteratorT>
569 void *d_temp_storage,
570 size_t &temp_storage_bytes,
572 OutputIteratorT
d_out,
576 cudaStream_t stream = 0,
577 bool debug_synchronous =
false)
583 typedef typename std::iterator_traits<InputIteratorT>::value_type InputValueT;
588 typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputTupleT;
591 typedef typename OutputTupleT::Value OutputValueT;
595 ArgIndexInputIteratorT d_indexed_in(d_in);
static CUB_RUNTIME_FUNCTION cudaError_t Sum(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_segments, OffsetIteratorT d_begin_offsets, OffsetIteratorT d_end_offsets, cudaStream_t stream=0, bool debug_synchronous=false)
Computes a device-wide segmented sum using the addition ('+') operator.
DeviceSegmentedReduce provides device-wide, parallel operations for computing a reduction across multiple sequences of data items residing within device-accessible memory.
KeyT const ValueT ValueT OffsetIteratorT d_begin_offsets
[in] Pointer to the sequence of beginning offsets of length num_segments, such that d_begin_offsets[i] is the first element of the i-th data segment.
CUB_RUNTIME_FUNCTION static __forceinline__ cudaError_t Dispatch(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_segments, OffsetIteratorT d_begin_offsets, OffsetIteratorT d_end_offsets, ReductionOpT reduction_op, OutputT init, cudaStream_t stream, bool debug_synchronous)
Optional outer namespace(s)
KeyT const ValueT ValueT OffsetIteratorT OffsetIteratorT d_end_offsets
[in] Pointer to the sequence of ending offsets of length num_segments, such that d_end_offsets[i]-1 is the last element of the i-th data segment.
static CUB_RUNTIME_FUNCTION cudaError_t ArgMax(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_segments, OffsetIteratorT d_begin_offsets, OffsetIteratorT d_end_offsets, cudaStream_t stream=0, bool debug_synchronous=false)
Finds the first device-wide maximum in each segment using the greater-than ('>') operator, also returning the in-segment index of that item.
static CUB_RUNTIME_FUNCTION cudaError_t Reduce(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_segments, OffsetIteratorT d_begin_offsets, OffsetIteratorT d_end_offsets, ReductionOp reduction_op, T initial_value, cudaStream_t stream=0, bool debug_synchronous=false)
Computes a device-wide segmented reduction using the specified binary reduction_op functor.
A key identifier paired with a corresponding value.
static CUB_RUNTIME_FUNCTION cudaError_t ArgMin(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_segments, OffsetIteratorT d_begin_offsets, OffsetIteratorT d_end_offsets, cudaStream_t stream=0, bool debug_synchronous=false)
Finds the first device-wide minimum in each segment using the less-than ('<') operator, also returning the in-segment index of that item.
OffsetT OffsetT
[in] Total number of input data items
OutputIteratorT OffsetT GridEvenShare< OffsetT > ReductionOpT reduction_op
[in] Binary reduction functor
Arg max functor (keeps the value and offset of the first occurrence of the larger item)
static CUB_RUNTIME_FUNCTION cudaError_t Max(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_segments, OffsetIteratorT d_begin_offsets, OffsetIteratorT d_end_offsets, cudaStream_t stream=0, bool debug_synchronous=false)
Computes a device-wide segmented maximum using the greater-than ('>') operator.
Type selection (IF ? ThenType : ElseType)
Arg min functor (keeps the value and offset of the first occurrence of the smallest item)
static CUB_RUNTIME_FUNCTION cudaError_t Min(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_segments, OffsetIteratorT d_begin_offsets, OffsetIteratorT d_end_offsets, cudaStream_t stream=0, bool debug_synchronous=false)
Computes a device-wide segmented minimum using the less-than ('<') operator.
OutputIteratorT d_out
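[out] Pointer to the output aggregate(s). (Note: the extracted tooltip read "[in] Pointer to the input sequence of data items", which describes the preceding d_in parameter and was mis-attached to d_out.)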