DeviceSegmentedReduce provides device-wide, parallel operations for computing a reduction across multiple sequences of data items residing within device-accessible memory. More...

Static Public Member Functions
template<typename InputIteratorT , typename OutputIteratorT , typename OffsetIteratorT , typename ReductionOp , typename T >
static CUB_RUNTIME_FUNCTION cudaError_t	Reduce (void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_segments, OffsetIteratorT d_begin_offsets, OffsetIteratorT d_end_offsets, ReductionOp reduction_op, T initial_value, cudaStream_t stream=0, bool debug_synchronous=false)
	Computes a device-wide segmented reduction using the specified binary `reduction_op` functor. More...

template<typename InputIteratorT , typename OutputIteratorT , typename OffsetIteratorT >
static CUB_RUNTIME_FUNCTION cudaError_t	Sum (void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_segments, OffsetIteratorT d_begin_offsets, OffsetIteratorT d_end_offsets, cudaStream_t stream=0, bool debug_synchronous=false)
	Computes a device-wide segmented sum using the addition ('+') operator. More...

template<typename InputIteratorT , typename OutputIteratorT , typename OffsetIteratorT >
static CUB_RUNTIME_FUNCTION cudaError_t	Min (void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_segments, OffsetIteratorT d_begin_offsets, OffsetIteratorT d_end_offsets, cudaStream_t stream=0, bool debug_synchronous=false)
	Computes a device-wide segmented minimum using the less-than ('<') operator. More...

template<typename InputIteratorT , typename OutputIteratorT , typename OffsetIteratorT >
static CUB_RUNTIME_FUNCTION cudaError_t	ArgMin (void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_segments, OffsetIteratorT d_begin_offsets, OffsetIteratorT d_end_offsets, cudaStream_t stream=0, bool debug_synchronous=false)
	Finds the first device-wide minimum in each segment using the less-than ('<') operator, also returning the in-segment index of that item. More...

template<typename InputIteratorT , typename OutputIteratorT , typename OffsetIteratorT >
static CUB_RUNTIME_FUNCTION cudaError_t	Max (void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_segments, OffsetIteratorT d_begin_offsets, OffsetIteratorT d_end_offsets, cudaStream_t stream=0, bool debug_synchronous=false)
	Computes a device-wide segmented maximum using the greater-than ('>') operator. More...

template<typename InputIteratorT , typename OutputIteratorT , typename OffsetIteratorT >
static CUB_RUNTIME_FUNCTION cudaError_t	ArgMax (void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_segments, OffsetIteratorT d_begin_offsets, OffsetIteratorT d_end_offsets, cudaStream_t stream=0, bool debug_synchronous=false)
	Finds the first device-wide maximum in each segment using the greater-than ('>') operator, also returning the in-segment index of that item. More...

Detailed Description

DeviceSegmentedReduce provides device-wide, parallel operations for computing a reduction across multiple sequences of data items residing within device-accessible memory.

Overview: A reduction (or fold) uses a binary combining operator to compute a single aggregate from a sequence of input elements.

Usage Considerations: \cdp_class{DeviceSegmentedReduce}

Definition at line 65 of file device_segmented_reduce.cuh.

Member Function Documentation

◆ ArgMax()

template<typename InputIteratorT , typename OutputIteratorT , typename OffsetIteratorT >

static CUB_RUNTIME_FUNCTION cudaError_t cub::DeviceSegmentedReduce::ArgMax	(	void *	d_temp_storage,
		size_t &	temp_storage_bytes,
		InputIteratorT	d_in,
		OutputIteratorT	d_out,
		int	num_segments,
		OffsetIteratorT	d_begin_offsets,
		OffsetIteratorT	d_end_offsets,
		cudaStream_t	stream = `0`,
		bool	debug_synchronous = `false`
	)

inlinestatic

Finds the first device-wide maximum in each segment using the greater-than ('>') operator, also returning the in-segment index of that item.

The output value type of d_out is cub::KeyValuePair <int, T> (assuming the value type of d_in is T)
- The maximum of the i^th segment is written to d_out[i].value and its offset in that segment is written to d_out[i].key.
- The {1, std::numeric_limits<T>::lowest()} tuple is produced for zero-length inputs
When input a contiguous sequence of segments, a single sequence segment_offsets (of length num_segments+1) can be aliased for both the d_begin_offsets and d_end_offsets parameters (where the latter is specified as segment_offsets+1).
Does not support > operators that are non-commutative.
\devicestorage

Snippet: The code snippet below illustrates the argmax-reduction of a device vector of int data elements.

: #include <cub/cub.cuh> // or equivalently <cub/device/device_reduce.cuh>

// Declare, allocate, and initialize device-accessible pointers for input and output

int num_segments; // e.g., 3

int *d_offsets; // e.g., [0, 3, 3, 7]

int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9]

KeyValuePair<int, int> *d_out; // e.g., [{-,-}, {-,-}, {-,-}]

...

// Determine temporary device storage requirements

void *d_temp_storage = NULL;

size_t temp_storage_bytes = 0;

cub::DeviceSegmentedReduce::ArgMax(d_temp_storage, temp_storage_bytes, d_in, d_out,

num_segments, d_offsets, d_offsets + 1);

// Allocate temporary storage

cudaMalloc(&d_temp_storage, temp_storage_bytes);

// Run argmax-reduction

cub::DeviceSegmentedReduce::ArgMax(d_temp_storage, temp_storage_bytes, d_in, d_out,

num_segments, d_offsets, d_offsets + 1);

// d_out <-- [{0,8}, {1,INT_MIN}, {3,9}]

cub::d_out
OutputIteratorT d_out
< [in] Pointer to the input sequence of data items
Definition: dispatch_reduce.cuh:71

cub::DeviceSegmentedReduce::ArgMax
static CUB_RUNTIME_FUNCTION cudaError_t ArgMax(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_segments, OffsetIteratorT d_begin_offsets, OffsetIteratorT d_end_offsets, cudaStream_t stream=0, bool debug_synchronous=false)
Finds the first device-wide maximum in each segment using the greater-than ('>') operator,...
Definition: device_segmented_reduce.cuh:568

Template Parameters

InputIteratorT	[inferred] Random-access input iterator type for reading input items (of some type `T`) \iterator
OutputIteratorT	[inferred] Output iterator type for recording the reduced aggregate (having value type `KeyValuePair<int, T>`) \iterator
OffsetIteratorT	[inferred] Random-access input iterator type for reading segment offsets \iterator

Parameters

[in]	d_temp_storage	Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to `temp_storage_bytes` and no work is done.
[in,out]	temp_storage_bytes	Reference to size in bytes of `d_temp_storage` allocation
[in]	d_in	Pointer to the input sequence of data items
[out]	d_out	Pointer to the output aggregate
[in]	num_segments	The number of segments that comprise the sorting data
[in]	d_begin_offsets	Pointer to the sequence of beginning offsets of length `num_segments`, such that `d_begin_offsets[i]` is the first element of the i^th data segment in `d_keys_` and `d_values_`
[in]	d_end_offsets	Pointer to the sequence of ending offsets of length `num_segments`, such that `d_end_offsets[i]-1` is the last element of the i^th data segment in `d_keys_` and `d_values_`. If `d_end_offsets[i]-1` <= `d_begin_offsets[i]`, the i^th is considered empty.
[in]	stream	[optional] CUDA stream to launch kernels within. Default is stream₀.
[in]	debug_synchronous	[optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is `false`.

Definition at line 568 of file device_segmented_reduce.cuh.

◆ ArgMin()

template<typename InputIteratorT , typename OutputIteratorT , typename OffsetIteratorT >

static CUB_RUNTIME_FUNCTION cudaError_t cub::DeviceSegmentedReduce::ArgMin	(	void *	d_temp_storage,
		size_t &	temp_storage_bytes,
		InputIteratorT	d_in,
		OutputIteratorT	d_out,
		int	num_segments,
		OffsetIteratorT	d_begin_offsets,
		OffsetIteratorT	d_end_offsets,
		cudaStream_t	stream = `0`,
		bool	debug_synchronous = `false`
	)

inlinestatic

Finds the first device-wide minimum in each segment using the less-than ('<') operator, also returning the in-segment index of that item.

The output value type of d_out is cub::KeyValuePair <int, T> (assuming the value type of d_in is T)
- The minimum of the i^th segment is written to d_out[i].value and its offset in that segment is written to d_out[i].key.
- The {1, std::numeric_limits<T>::max()} tuple is produced for zero-length inputs
When input a contiguous sequence of segments, a single sequence segment_offsets (of length num_segments+1) can be aliased for both the d_begin_offsets and d_end_offsets parameters (where the latter is specified as segment_offsets+1).
Does not support < operators that are non-commutative.
\devicestorage

Snippet: The code snippet below illustrates the argmin-reduction of a device vector of int data elements.

: #include <cub/cub.cuh> // or equivalently <cub/device/device_radix_sort.cuh>

// Declare, allocate, and initialize device-accessible pointers for input and output

int num_segments; // e.g., 3

int *d_offsets; // e.g., [0, 3, 3, 7]

int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9]

KeyValuePair<int, int> *d_out; // e.g., [{-,-}, {-,-}, {-,-}]

...

// Determine temporary device storage requirements

void *d_temp_storage = NULL;

size_t temp_storage_bytes = 0;

cub::DeviceSegmentedReduce::ArgMin(d_temp_storage, temp_storage_bytes, d_in, d_out,

num_segments, d_offsets, d_offsets + 1);

// Allocate temporary storage

cudaMalloc(&d_temp_storage, temp_storage_bytes);

// Run argmin-reduction

cub::DeviceSegmentedReduce::ArgMin(d_temp_storage, temp_storage_bytes, d_in, d_out,

num_segments, d_offsets, d_offsets + 1);

// d_out <-- [{1,6}, {1,INT_MAX}, {2,0}]

cub::DeviceSegmentedReduce::ArgMin
static CUB_RUNTIME_FUNCTION cudaError_t ArgMin(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_segments, OffsetIteratorT d_begin_offsets, OffsetIteratorT d_end_offsets, cudaStream_t stream=0, bool debug_synchronous=false)
Finds the first device-wide minimum in each segment using the less-than ('<') operator,...
Definition: device_segmented_reduce.cuh:385

Template Parameters

InputIteratorT	[inferred] Random-access input iterator type for reading input items (of some type `T`) \iterator
OutputIteratorT	[inferred] Output iterator type for recording the reduced aggregate (having value type `KeyValuePair<int, T>`) \iterator
OffsetIteratorT	[inferred] Random-access input iterator type for reading segment offsets \iterator

Parameters

[in]	d_temp_storage	Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to `temp_storage_bytes` and no work is done.
[in,out]	temp_storage_bytes	Reference to size in bytes of `d_temp_storage` allocation
[in]	d_in	Pointer to the input sequence of data items
[out]	d_out	Pointer to the output aggregate
[in]	num_segments	The number of segments that comprise the sorting data
[in]	d_begin_offsets	Pointer to the sequence of beginning offsets of length `num_segments`, such that `d_begin_offsets[i]` is the first element of the i^th data segment in `d_keys_` and `d_values_`
[in]	d_end_offsets	Pointer to the sequence of ending offsets of length `num_segments`, such that `d_end_offsets[i]-1` is the last element of the i^th data segment in `d_keys_` and `d_values_`. If `d_end_offsets[i]-1` <= `d_begin_offsets[i]`, the i^th is considered empty.
[in]	stream	[optional] CUDA stream to launch kernels within. Default is stream₀.
[in]	debug_synchronous	[optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is `false`.

Definition at line 385 of file device_segmented_reduce.cuh.

◆ Max()

template<typename InputIteratorT , typename OutputIteratorT , typename OffsetIteratorT >

static CUB_RUNTIME_FUNCTION cudaError_t cub::DeviceSegmentedReduce::Max	(	void *	d_temp_storage,
		size_t &	temp_storage_bytes,
		InputIteratorT	d_in,
		OutputIteratorT	d_out,
		int	num_segments,
		OffsetIteratorT	d_begin_offsets,
		OffsetIteratorT	d_end_offsets,
		cudaStream_t	stream = `0`,
		bool	debug_synchronous = `false`
	)

inlinestatic

Computes a device-wide segmented maximum using the greater-than ('>') operator.

Uses std::numeric_limits<T>::lowest() as the initial value of the reduction.
When input a contiguous sequence of segments, a single sequence segment_offsets (of length num_segments+1) can be aliased for both the d_begin_offsets and d_end_offsets parameters (where the latter is specified as segment_offsets+1).
Does not support > operators that are non-commutative.
\devicestorage

Snippet: The code snippet below illustrates the max-reduction of a device vector of int data elements.

: #include <cub/cub.cuh> // or equivalently <cub/device/device_radix_sort.cuh>

// Declare, allocate, and initialize device-accessible pointers for input and output

int num_segments; // e.g., 3

int *d_offsets; // e.g., [0, 3, 3, 7]

int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9]

int *d_out; // e.g., [-, -, -]

...

// Determine temporary device storage requirements

void *d_temp_storage = NULL;

size_t temp_storage_bytes = 0;

cub::DeviceSegmentedReduce::Max(d_temp_storage, temp_storage_bytes, d_in, d_out,

num_segments, d_offsets, d_offsets + 1);

// Allocate temporary storage

cudaMalloc(&d_temp_storage, temp_storage_bytes);

// Run max-reduction

cub::DeviceSegmentedReduce::Max(d_temp_storage, temp_storage_bytes, d_in, d_out,

num_segments, d_offsets, d_offsets + 1);

// d_out <-- [8, INT_MIN, 9]

cub::DeviceSegmentedReduce::Max
static CUB_RUNTIME_FUNCTION cudaError_t Max(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_segments, OffsetIteratorT d_begin_offsets, OffsetIteratorT d_end_offsets, cudaStream_t stream=0, bool debug_synchronous=false)
Computes a device-wide segmented maximum using the greater-than ('>') operator.
Definition: device_segmented_reduce.cuh:483

Template Parameters

InputIteratorT	[inferred] Random-access input iterator type for reading input items \iterator
OutputIteratorT	[inferred] Output iterator type for recording the reduced aggregate \iterator
OffsetIteratorT	[inferred] Random-access input iterator type for reading segment offsets \iterator

Parameters

[in]	d_temp_storage	Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to `temp_storage_bytes` and no work is done.
[in,out]	temp_storage_bytes	Reference to size in bytes of `d_temp_storage` allocation
[in]	d_in	Pointer to the input sequence of data items
[out]	d_out	Pointer to the output aggregate
[in]	num_segments	The number of segments that comprise the sorting data
[in]	d_begin_offsets	Pointer to the sequence of beginning offsets of length `num_segments`, such that `d_begin_offsets[i]` is the first element of the i^th data segment in `d_keys_` and `d_values_`
[in]	d_end_offsets	Pointer to the sequence of ending offsets of length `num_segments`, such that `d_end_offsets[i]-1` is the last element of the i^th data segment in `d_keys_` and `d_values_`. If `d_end_offsets[i]-1` <= `d_begin_offsets[i]`, the i^th is considered empty.
[in]	stream	[optional] CUDA stream to launch kernels within. Default is stream₀.
[in]	debug_synchronous	[optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is `false`.

Definition at line 483 of file device_segmented_reduce.cuh.

◆ Min()

template<typename InputIteratorT , typename OutputIteratorT , typename OffsetIteratorT >

static CUB_RUNTIME_FUNCTION cudaError_t cub::DeviceSegmentedReduce::Min	(	void *	d_temp_storage,
		size_t &	temp_storage_bytes,
		InputIteratorT	d_in,
		OutputIteratorT	d_out,
		int	num_segments,
		OffsetIteratorT	d_begin_offsets,
		OffsetIteratorT	d_end_offsets,
		cudaStream_t	stream = `0`,
		bool	debug_synchronous = `false`
	)

inlinestatic

Computes a device-wide segmented minimum using the less-than ('<') operator.

Uses std::numeric_limits<T>::max() as the initial value of the reduction for each segment.
When input a contiguous sequence of segments, a single sequence segment_offsets (of length num_segments+1) can be aliased for both the d_begin_offsets and d_end_offsets parameters (where the latter is specified as segment_offsets+1).
Does not support < operators that are non-commutative.
\devicestorage

Snippet: The code snippet below illustrates the min-reduction of a device vector of int data elements.

: #include <cub/cub.cuh> // or equivalently <cub/device/device_radix_sort.cuh>

// Declare, allocate, and initialize device-accessible pointers for input and output

int num_segments; // e.g., 3

int *d_offsets; // e.g., [0, 3, 3, 7]

int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9]

int *d_out; // e.g., [-, -, -]

...

// Determine temporary device storage requirements

void *d_temp_storage = NULL;

size_t temp_storage_bytes = 0;

cub::DeviceSegmentedReduce::Min(d_temp_storage, temp_storage_bytes, d_in, d_out,

num_segments, d_offsets, d_offsets + 1);

// Allocate temporary storage

cudaMalloc(&d_temp_storage, temp_storage_bytes);

// Run min-reduction

cub::DeviceSegmentedReduce::Min(d_temp_storage, temp_storage_bytes, d_in, d_out,

num_segments, d_offsets, d_offsets + 1);

// d_out <-- [6, INT_MAX, 0]

cub::DeviceSegmentedReduce::Min
static CUB_RUNTIME_FUNCTION cudaError_t Min(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_segments, OffsetIteratorT d_begin_offsets, OffsetIteratorT d_end_offsets, cudaStream_t stream=0, bool debug_synchronous=false)
Computes a device-wide segmented minimum using the less-than ('<') operator.
Definition: device_segmented_reduce.cuh:300

Template Parameters

InputIteratorT	[inferred] Random-access input iterator type for reading input items \iterator
OutputIteratorT	[inferred] Output iterator type for recording the reduced aggregate \iterator
OffsetIteratorT	[inferred] Random-access input iterator type for reading segment offsets \iterator

Parameters

[in]	d_temp_storage	Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to `temp_storage_bytes` and no work is done.
[in,out]	temp_storage_bytes	Reference to size in bytes of `d_temp_storage` allocation
[in]	d_in	Pointer to the input sequence of data items
[out]	d_out	Pointer to the output aggregate
[in]	num_segments	The number of segments that comprise the sorting data
[in]	d_begin_offsets	Pointer to the sequence of beginning offsets of length `num_segments`, such that `d_begin_offsets[i]` is the first element of the i^th data segment in `d_keys_` and `d_values_`
[in]	d_end_offsets	Pointer to the sequence of ending offsets of length `num_segments`, such that `d_end_offsets[i]-1` is the last element of the i^th data segment in `d_keys_` and `d_values_`. If `d_end_offsets[i]-1` <= `d_begin_offsets[i]`, the i^th is considered empty.
[in]	stream	[optional] CUDA stream to launch kernels within. Default is stream₀.
[in]	debug_synchronous	[optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is `false`.

Definition at line 300 of file device_segmented_reduce.cuh.

◆ Reduce()

template<typename InputIteratorT , typename OutputIteratorT , typename OffsetIteratorT , typename ReductionOp , typename T >

static CUB_RUNTIME_FUNCTION cudaError_t cub::DeviceSegmentedReduce::Reduce	(	void *	d_temp_storage,
		size_t &	temp_storage_bytes,
		InputIteratorT	d_in,
		OutputIteratorT	d_out,
		int	num_segments,
		OffsetIteratorT	d_begin_offsets,
		OffsetIteratorT	d_end_offsets,
		ReductionOp	reduction_op,
		T	initial_value,
		cudaStream_t	stream = `0`,
		bool	debug_synchronous = `false`
	)

inlinestatic

Computes a device-wide segmented reduction using the specified binary reduction_op functor.

Does not support binary reduction operators that are non-commutative.
When input a contiguous sequence of segments, a single sequence segment_offsets (of length num_segments+1) can be aliased for both the d_begin_offsets and d_end_offsets parameters (where the latter is specified as segment_offsets+1).
\devicestorage

Snippet: The code snippet below illustrates a custom min-reduction of a device vector of int data elements.

: #include <cub/cub.cuh> // or equivalently <cub/device/device_radix_sort.cuh>

// CustomMin functor

struct CustomMin

{

template <typename T>

CUB_RUNTIME_FUNCTION __forceinline__

T operator()(const T &a, const T &b) const {

return (b < a) ? b : a;

}

};

// Declare, allocate, and initialize device-accessible pointers for input and output

int num_segments; // e.g., 3

int *d_offsets; // e.g., [0, 3, 3, 7]

int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9]

int *d_out; // e.g., [-, -, -]

CustomMin min_op;

int initial_value; // e.g., INT_MAX

...

// Determine temporary device storage requirements

void *d_temp_storage = NULL;

size_t temp_storage_bytes = 0;

cub::DeviceSegmentedReduce::Reduce(d_temp_storage, temp_storage_bytes, d_in, d_out,

num_segments, d_offsets, d_offsets + 1, min_op, initial_value);

// Allocate temporary storage

cudaMalloc(&d_temp_storage, temp_storage_bytes);

// Run reduction

cub::DeviceSegmentedReduce::Reduce(d_temp_storage, temp_storage_bytes, d_in, d_out,

num_segments, d_offsets, d_offsets + 1, min_op, initial_value);

// d_out <-- [6, INT_MAX, 0]

cub::DeviceSegmentedReduce::Reduce
static CUB_RUNTIME_FUNCTION cudaError_t Reduce(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_segments, OffsetIteratorT d_begin_offsets, OffsetIteratorT d_end_offsets, ReductionOp reduction_op, T initial_value, cudaStream_t stream=0, bool debug_synchronous=false)
Computes a device-wide segmented reduction using the specified binary reduction_op functor.
Definition: device_segmented_reduce.cuh:133

Template Parameters

InputIteratorT	[inferred] Random-access input iterator type for reading input items \iterator
OutputIteratorT	[inferred] Output iterator type for recording the reduced aggregate \iterator
OffsetIteratorT	[inferred] Random-access input iterator type for reading segment offsets \iterator
ReductionOp	[inferred] Binary reduction functor type having member `T operator()(const T &a, const T &b)`
T	[inferred] Data element type that is convertible to the `value` type of `InputIteratorT`

Parameters

[in]	d_temp_storage	Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to `temp_storage_bytes` and no work is done.
[in,out]	temp_storage_bytes	Reference to size in bytes of `d_temp_storage` allocation
[in]	d_in	Pointer to the input sequence of data items
[out]	d_out	Pointer to the output aggregate
[in]	num_segments	The number of segments that comprise the sorting data
[in]	d_begin_offsets	Pointer to the sequence of beginning offsets of length `num_segments`, such that `d_begin_offsets[i]` is the first element of the i^th data segment in `d_keys_` and `d_values_`
[in]	d_end_offsets	Pointer to the sequence of ending offsets of length `num_segments`, such that `d_end_offsets[i]-1` is the last element of the i^th data segment in `d_keys_` and `d_values_`. If `d_end_offsets[i]-1` <= `d_begin_offsets[i]`, the i^th is considered empty.
[in]	reduction_op	Binary reduction functor
[in]	initial_value	Initial value of the reduction for each segment
[in]	stream	[optional] CUDA stream to launch kernels within. Default is stream₀.
[in]	debug_synchronous	[optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is `false`.

Definition at line 133 of file device_segmented_reduce.cuh.

◆ Sum()

template<typename InputIteratorT , typename OutputIteratorT , typename OffsetIteratorT >

static CUB_RUNTIME_FUNCTION cudaError_t cub::DeviceSegmentedReduce::Sum	(	void *	d_temp_storage,
		size_t &	temp_storage_bytes,
		InputIteratorT	d_in,
		OutputIteratorT	d_out,
		int	num_segments,
		OffsetIteratorT	d_begin_offsets,
		OffsetIteratorT	d_end_offsets,
		cudaStream_t	stream = `0`,
		bool	debug_synchronous = `false`
	)

inlinestatic

Computes a device-wide segmented sum using the addition ('+') operator.

Uses 0 as the initial value of the reduction for each segment.
When input a contiguous sequence of segments, a single sequence segment_offsets (of length num_segments+1) can be aliased for both the d_begin_offsets and d_end_offsets parameters (where the latter is specified as segment_offsets+1).
Does not support + operators that are non-commutative..
\devicestorage

Snippet: The code snippet below illustrates the sum reduction of a device vector of int data elements.

: #include <cub/cub.cuh> // or equivalently <cub/device/device_radix_sort.cuh>

// Declare, allocate, and initialize device-accessible pointers for input and output

int num_segments; // e.g., 3

int *d_offsets; // e.g., [0, 3, 3, 7]

int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9]

int *d_out; // e.g., [-, -, -]

...

// Determine temporary device storage requirements

void *d_temp_storage = NULL;

size_t temp_storage_bytes = 0;

cub::DeviceSegmentedReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out,

num_segments, d_offsets, d_offsets + 1);

// Allocate temporary storage

cudaMalloc(&d_temp_storage, temp_storage_bytes);

// Run sum-reduction

cub::DeviceSegmentedReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out,

num_segments, d_offsets, d_offsets + 1);

// d_out <-- [21, 0, 17]

cub::DeviceSegmentedReduce::Sum
static CUB_RUNTIME_FUNCTION cudaError_t Sum(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_segments, OffsetIteratorT d_begin_offsets, OffsetIteratorT d_end_offsets, cudaStream_t stream=0, bool debug_synchronous=false)
Computes a device-wide segmented sum using the addition ('+') operator.
Definition: device_segmented_reduce.cuh:215

Template Parameters

InputIteratorT	[inferred] Random-access input iterator type for reading input items \iterator
OutputIteratorT	[inferred] Output iterator type for recording the reduced aggregate \iterator
OffsetIteratorT	[inferred] Random-access input iterator type for reading segment offsets \iterator

Parameters

[in]	d_temp_storage	Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to `temp_storage_bytes` and no work is done.
[in,out]	temp_storage_bytes	Reference to size in bytes of `d_temp_storage` allocation
[in]	d_in	Pointer to the input sequence of data items
[out]	d_out	Pointer to the output aggregate
[in]	num_segments	The number of segments that comprise the sorting data
[in]	d_begin_offsets	Pointer to the sequence of beginning offsets of length `num_segments`, such that `d_begin_offsets[i]` is the first element of the i^th data segment in `d_keys_` and `d_values_`
[in]	d_end_offsets	Pointer to the sequence of ending offsets of length `num_segments`, such that `d_end_offsets[i]-1` is the last element of the i^th data segment in `d_keys_` and `d_values_`. If `d_end_offsets[i]-1` <= `d_begin_offsets[i]`, the i^th is considered empty.
[in]	stream	[optional] CUDA stream to launch kernels within. Default is stream₀.
[in]	debug_synchronous	[optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is `false`.

Definition at line 215 of file device_segmented_reduce.cuh.

The documentation for this struct was generated from the following file:

openfpm_data/src/util/cuda/cub_old/device/device_segmented_reduce.cuh

Static Public Member Functions

Detailed Description

Member Function Documentation

◆ ArgMax()

◆ ArgMin()

◆ Max()

◆ Min()

◆ Reduce()

◆ Sum()