The WarpReduce class provides collective methods for computing a parallel reduction of items partitioned across a CUDA thread warp. More...

Data Structures
struct	TempStorage
	\smemstorage{WarpReduce} More...

Public Types
typedef If<(PTX_ARCH >=300) &&(IS_POW_OF_TWO), WarpReduceShfl< T, LOGICAL_WARP_THREADS, PTX_ARCH >, WarpReduceSmem< T, LOGICAL_WARP_THREADS, PTX_ARCH > >::Type	InternalWarpReduce
	Internal specialization. Use SHFL-based reduction if (architecture is >= SM30) and (LOGICAL_WARP_THREADS is a power-of-two)

Public Member Functions
Collective constructors
__device__ __forceinline__	WarpReduce (TempStorage &temp_storage)
	Collective constructor using the specified memory allocation as temporary storage. Logical warp and lane identifiers are constructed from `threadIdx.x`. More...

Summation reductions
__device__ __forceinline__ T	Sum (T input)
	Computes a warp-wide sum in the calling warp. The output is valid in warp lane₀. More...

__device__ __forceinline__ T	Sum (T input, int valid_items)
	Computes a partially-full warp-wide sum in the calling warp. The output is valid in warp lane₀. More...

template<typename FlagT >
__device__ __forceinline__ T	HeadSegmentedSum (T input, FlagT head_flag)
	Computes a segmented sum in the calling warp where segments are defined by head-flags. The sum of each segment is returned to the first lane in that segment (which always includes lane₀). More...

template<typename FlagT >
__device__ __forceinline__ T	TailSegmentedSum (T input, FlagT tail_flag)
	Computes a segmented sum in the calling warp where segments are defined by tail-flags. The sum of each segment is returned to the first lane in that segment (which always includes lane₀). More...

Generic reductions
template<typename ReductionOp >
__device__ __forceinline__ T	Reduce (T input, ReductionOp reduction_op)
	Computes a warp-wide reduction in the calling warp using the specified binary reduction functor. The output is valid in warp lane₀. More...

template<typename ReductionOp >
__device__ __forceinline__ T	Reduce (T input, ReductionOp reduction_op, int valid_items)
	Computes a partially-full warp-wide reduction in the calling warp using the specified binary reduction functor. The output is valid in warp lane₀. More...

template<typename ReductionOp , typename FlagT >
__device__ __forceinline__ T	HeadSegmentedReduce (T input, FlagT head_flag, ReductionOp reduction_op)
	Computes a segmented reduction in the calling warp where segments are defined by head-flags. The reduction of each segment is returned to the first lane in that segment (which always includes lane₀). More...

template<typename ReductionOp , typename FlagT >
__device__ __forceinline__ T	TailSegmentedReduce (T input, FlagT tail_flag, ReductionOp reduction_op)
	Computes a segmented reduction in the calling warp where segments are defined by tail-flags. The reduction of each segment is returned to the first lane in that segment (which always includes lane₀). More...

Private Types
enum	{ IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)) , IS_POW_OF_TWO = PowerOfTwo<LOGICAL_WARP_THREADS>::VALUE }

typedef InternalWarpReduce::TempStorage	_TempStorage
	Shared memory storage layout type for WarpReduce.

Private Attributes
_TempStorage &	temp_storage
	Shared storage reference.

Detailed Description

template<typename T, int LOGICAL_WARP_THREADS = CUB_PTX_WARP_THREADS, int PTX_ARCH = CUB_PTX_ARCH>
class cub::WarpReduce< T, LOGICAL_WARP_THREADS, PTX_ARCH >

The WarpReduce class provides collective methods for computing a parallel reduction of items partitioned across a CUDA thread warp.

Template Parameters

T	The reduction input/output element type
LOGICAL_WARP_THREADS	[optional] The number of threads per "logical" warp (may be less than the number of hardware warp threads). Default is the warp size of the targeted CUDA compute-capability (e.g., 32 threads for SM20).
PTX_ARCH	[optional] \ptxversion

Overview

A reduction (or fold) uses a binary combining operator to compute a single aggregate from a list of input elements.
Supports "logical" warps smaller than the physical warp size (e.g., logical warps of 8 threads)
The number of entrant threads must be an multiple of LOGICAL_WARP_THREADS

Performance Considerations

Uses special instructions when applicable (e.g., warp SHFL instructions)
Uses synchronization-free communication between warp lanes when applicable
Incurs zero bank conflicts for most types
Computation is slightly more efficient (i.e., having lower instruction overhead) for:
- Summation (vs. generic reduction)
- The architecture's warp size is a whole multiple of LOGICAL_WARP_THREADS

Simple Examples: \warpcollective{WarpReduce}

: The code snippet below illustrates four concurrent warp sum reductions within a block of 128 threads (one per each of the 32-thread warps).

: #include <cub/cub.cuh>

__global__ void ExampleKernel(...)

{

// Specialize WarpReduce for type int

typedef cub::WarpReduce<int> WarpReduce;

// Allocate WarpReduce shared memory for 4 warps

__shared__ typename WarpReduce::TempStorage temp_storage[4];

// Obtain one input item per thread

int thread_data = ...

// Return the warp-wide sums to each lane0 (threads 0, 32, 64, and 96)

int warp_id = threadIdx.x / 32;

int aggregate = WarpReduce(temp_storage[warp_id]).Sum(thread_data);

cub::WarpReduce
The WarpReduce class provides collective methods for computing a parallel reduction of items partitio...
Definition: warp_reduce.cuh:142

cub::WarpReduce::temp_storage
_TempStorage & temp_storage
Shared storage reference.
Definition: warp_reduce.cuh:181

cub::WarpReduce::WarpReduce
__device__ __forceinline__ WarpReduce(TempStorage &temp_storage)
Collective constructor using the specified memory allocation as temporary storage....
Definition: warp_reduce.cuh:203

aggregate
aggregate of properties, from a list of object if create a struct that follow the OPENFPM native stru...
Definition: aggregate.hpp:221

: Suppose the set of input thread_data across the block of threads is {0, 1, 2, 3, ..., 127}. The corresponding output aggregate in threads 0, 32, 64, and 96 will 496, 1520, 2544, and 3568, respectively (and is undefined in other threads).

: The code snippet below illustrates a single warp sum reduction within a block of 128 threads.

: #include <cub/cub.cuh>

__global__ void ExampleKernel(...)

{

// Specialize WarpReduce for type int

typedef cub::WarpReduce<int> WarpReduce;

// Allocate WarpReduce shared memory for one warp

__shared__ typename WarpReduce::TempStorage temp_storage;

...

// Only the first warp performs a reduction

if (threadIdx.x < 32)

{

// Obtain one input item per thread

int thread_data = ...

// Return the warp-wide sum to lane0

int aggregate = WarpReduce(temp_storage).Sum(thread_data);

: Suppose the set of input thread_data across the warp of threads is {0, 1, 2, 3, ..., 31}. The corresponding output aggregate in thread0 will be 496 (and is undefined in other threads).

Definition at line 141 of file warp_reduce.cuh.

Member Enumeration Documentation

◆ anonymous enum

template<typename T , int LOGICAL_WARP_THREADS = CUB_PTX_WARP_THREADS, int PTX_ARCH = CUB_PTX_ARCH>

anonymous enum

private

Enumerator
IS_ARCH_WARP	Whether the logical warp size and the PTX warp size coincide.
IS_POW_OF_TWO	Whether the logical warp size is a power-of-two.

Definition at line 149 of file warp_reduce.cuh.

Constructor & Destructor Documentation

◆ WarpReduce()

template<typename T , int LOGICAL_WARP_THREADS = CUB_PTX_WARP_THREADS, int PTX_ARCH = CUB_PTX_ARCH>

__device__ __forceinline__ cub::WarpReduce< T, LOGICAL_WARP_THREADS, PTX_ARCH >::WarpReduce ( TempStorage & temp_storage )

inline

Collective constructor using the specified memory allocation as temporary storage. Logical warp and lane identifiers are constructed from threadIdx.x.

Parameters

[in] temp_storage Reference to memory allocation having layout type TempStorage

Definition at line 203 of file warp_reduce.cuh.

Member Function Documentation

◆ HeadSegmentedReduce()

template<typename T , int LOGICAL_WARP_THREADS = CUB_PTX_WARP_THREADS, int PTX_ARCH = CUB_PTX_ARCH>

template<typename ReductionOp , typename FlagT >

__device__ __forceinline__ T cub::WarpReduce< T, LOGICAL_WARP_THREADS, PTX_ARCH >::HeadSegmentedReduce	(	T	input,
		FlagT	head_flag,
		ReductionOp	reduction_op
	)

inline

Computes a segmented reduction in the calling warp where segments are defined by head-flags. The reduction of each segment is returned to the first lane in that segment (which always includes lane₀).

Supports non-commutative reduction operators

\smemreuse

Snippet: The code snippet below illustrates a head-segmented warp max reduction within a block of 32 threads (one warp).

: #include <cub/cub.cuh>

__global__ void ExampleKernel(...)

{

// Specialize WarpReduce for type int

typedef cub::WarpReduce<int> WarpReduce;

// Allocate WarpReduce shared memory for one warp

__shared__ typename WarpReduce::TempStorage temp_storage;

// Obtain one input item and flag per thread

int thread_data = ...

int head_flag = ...

// Return the warp-wide reductions to each lane0

int aggregate = WarpReduce(temp_storage).HeadSegmentedReduce(

thread_data, head_flag, cub::Max());

cub::Max
Default max functor.
Definition: thread_operators.cuh:124

: Suppose the set of input thread_data and head_flag across the block of threads is {0, 1, 2, 3, ..., 31 and is {1, 0, 0, 0, 1, 0, 0, 0, ..., 1, 0, 0, 0, respectively. The corresponding output aggregate in threads 0, 4, 8, etc. will be 3, 7, 11, etc. (and is undefined in other threads).

Template Parameters

ReductionOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b)

Parameters

[in]	input	Calling thread's input
[in]	head_flag	Head flag denoting whether or not `input` is the start of a new segment
[in]	reduction_op	Reduction operator

Definition at line 545 of file warp_reduce.cuh.

◆ HeadSegmentedSum()

template<typename T , int LOGICAL_WARP_THREADS = CUB_PTX_WARP_THREADS, int PTX_ARCH = CUB_PTX_ARCH>

template<typename FlagT >

__device__ __forceinline__ T cub::WarpReduce< T, LOGICAL_WARP_THREADS, PTX_ARCH >::HeadSegmentedSum	(	T	input,
		FlagT	head_flag
	)

inline

Computes a segmented sum in the calling warp where segments are defined by head-flags. The sum of each segment is returned to the first lane in that segment (which always includes lane₀).

\smemreuse

Snippet: The code snippet below illustrates a head-segmented warp sum reduction within a block of 32 threads (one warp).

: #include <cub/cub.cuh>

__global__ void ExampleKernel(...)

{

// Specialize WarpReduce for type int

typedef cub::WarpReduce<int> WarpReduce;

// Allocate WarpReduce shared memory for one warp

__shared__ typename WarpReduce::TempStorage temp_storage;

// Obtain one input item and flag per thread

int thread_data = ...

int head_flag = ...

// Return the warp-wide sums to each lane0

int aggregate = WarpReduce(temp_storage).HeadSegmentedSum(

thread_data, head_flag);

: Suppose the set of input thread_data and head_flag across the block of threads is {0, 1, 2, 3, ..., 31 and is {1, 0, 0, 0, 1, 0, 0, 0, ..., 1, 0, 0, 0, respectively. The corresponding output aggregate in threads 0, 4, 8, etc. will be 6, 22, 38, etc. (and is undefined in other threads).

Template Parameters

ReductionOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b)

Parameters

[in]	input	Calling thread's input
[in]	head_flag	Head flag denoting whether or not `input` is the start of a new segment

Definition at line 344 of file warp_reduce.cuh.

◆ Reduce() [1/2]

template<typename T , int LOGICAL_WARP_THREADS = CUB_PTX_WARP_THREADS, int PTX_ARCH = CUB_PTX_ARCH>

template<typename ReductionOp >

__device__ __forceinline__ T cub::WarpReduce< T, LOGICAL_WARP_THREADS, PTX_ARCH >::Reduce	(	T	input,
		ReductionOp	reduction_op
	)

inline

Computes a warp-wide reduction in the calling warp using the specified binary reduction functor. The output is valid in warp lane₀.

Supports non-commutative reduction operators

\smemreuse

Snippet: The code snippet below illustrates four concurrent warp max reductions within a block of 128 threads (one per each of the 32-thread warps).

: #include <cub/cub.cuh>

__global__ void ExampleKernel(...)

{

// Specialize WarpReduce for type int

typedef cub::WarpReduce<int> WarpReduce;

// Allocate WarpReduce shared memory for 4 warps

__shared__ typename WarpReduce::TempStorage temp_storage[4];

// Obtain one input item per thread

int thread_data = ...

// Return the warp-wide reductions to each lane0

int warp_id = threadIdx.x / 32;

int aggregate = WarpReduce(temp_storage[warp_id]).Reduce(

thread_data, cub::Max());

: Suppose the set of input thread_data across the block of threads is {0, 1, 2, 3, ..., 127}. The corresponding output aggregate in threads 0, 32, 64, and 96 will 31, 63, 95, and 127, respectively (and is undefined in other threads).

Template Parameters

ReductionOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b)

Parameters

[in]	input	Calling thread's input
[in]	reduction_op	Binary reduction operator

Definition at line 445 of file warp_reduce.cuh.

◆ Reduce() [2/2]

template<typename T , int LOGICAL_WARP_THREADS = CUB_PTX_WARP_THREADS, int PTX_ARCH = CUB_PTX_ARCH>

template<typename ReductionOp >

__device__ __forceinline__ T cub::WarpReduce< T, LOGICAL_WARP_THREADS, PTX_ARCH >::Reduce	(	T	input,
		ReductionOp	reduction_op,
		int	valid_items
	)

inline

Computes a partially-full warp-wide reduction in the calling warp using the specified binary reduction functor. The output is valid in warp lane₀.

All threads across the calling warp must agree on the same value for valid_items. Otherwise the result is undefined.

Supports non-commutative reduction operators

\smemreuse

Snippet: The code snippet below illustrates a max reduction within a single, partially-full block of 32 threads (one warp).

: #include <cub/cub.cuh>

__global__ void ExampleKernel(int *d_data, int valid_items)

{

// Specialize WarpReduce for type int

typedef cub::WarpReduce<int> WarpReduce;

// Allocate WarpReduce shared memory for one warp

__shared__ typename WarpReduce::TempStorage temp_storage;

// Obtain one input item per thread if in range

int thread_data;

if (threadIdx.x < valid_items)

thread_data = d_data[threadIdx.x];

// Return the warp-wide reductions to each lane0

int aggregate = WarpReduce(temp_storage).Reduce(

thread_data, cub::Max(), valid_items);

: Suppose the input d_data is {0, 1, 2, 3, 4, ... and valid_items is 4. The corresponding output aggregate in thread0 is 3 (and is undefined in other threads).

Template Parameters

ReductionOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b)

Parameters

[in]	input	Calling thread's input
[in]	reduction_op	Binary reduction operator
[in]	valid_items	Total number of valid items in the calling thread's logical warp (may be less than `LOGICAL_WARP_THREADS`)

Definition at line 494 of file warp_reduce.cuh.

◆ Sum() [1/2]

template<typename T , int LOGICAL_WARP_THREADS = CUB_PTX_WARP_THREADS, int PTX_ARCH = CUB_PTX_ARCH>

__device__ __forceinline__ T cub::WarpReduce< T, LOGICAL_WARP_THREADS, PTX_ARCH >::Sum ( T input )

inline

Computes a warp-wide sum in the calling warp. The output is valid in warp lane₀.

\smemreuse

Snippet: The code snippet below illustrates four concurrent warp sum reductions within a block of 128 threads (one per each of the 32-thread warps).

: #include <cub/cub.cuh>

__global__ void ExampleKernel(...)

{

// Specialize WarpReduce for type int

typedef cub::WarpReduce<int> WarpReduce;

// Allocate WarpReduce shared memory for 4 warps

__shared__ typename WarpReduce::TempStorage temp_storage[4];

// Obtain one input item per thread

int thread_data = ...

// Return the warp-wide sums to each lane0

int warp_id = threadIdx.x / 32;

int aggregate = WarpReduce(temp_storage[warp_id]).Sum(thread_data);

: Suppose the set of input thread_data across the block of threads is {0, 1, 2, 3, ..., 127}. The corresponding output aggregate in threads 0, 32, 64, and 96 will 496, 1520, 2544, and 3568, respectively (and is undefined in other threads).

Parameters

[in] input Calling thread's input

Definition at line 251 of file warp_reduce.cuh.

◆ Sum() [2/2]

template<typename T , int LOGICAL_WARP_THREADS = CUB_PTX_WARP_THREADS, int PTX_ARCH = CUB_PTX_ARCH>

__device__ __forceinline__ T cub::WarpReduce< T, LOGICAL_WARP_THREADS, PTX_ARCH >::Sum	(	T	input,
		int	valid_items
	)

inline

Computes a partially-full warp-wide sum in the calling warp. The output is valid in warp lane₀.

All threads across the calling warp must agree on the same value for valid_items. Otherwise the result is undefined.

\smemreuse

Snippet: The code snippet below illustrates a sum reduction within a single, partially-full block of 32 threads (one warp).

: #include <cub/cub.cuh>

__global__ void ExampleKernel(int *d_data, int valid_items)

{

// Specialize WarpReduce for type int

typedef cub::WarpReduce<int> WarpReduce;

// Allocate WarpReduce shared memory for one warp

__shared__ typename WarpReduce::TempStorage temp_storage;

// Obtain one input item per thread if in range

int thread_data;

if (threadIdx.x < valid_items)

thread_data = d_data[threadIdx.x];

// Return the warp-wide sums to each lane0

int aggregate = WarpReduce(temp_storage).Sum(

thread_data, valid_items);

: Suppose the input d_data is {0, 1, 2, 3, 4, ... and valid_items is 4. The corresponding output aggregate in thread0 is 6 (and is undefined in other threads).

Parameters

[in]	input	Calling thread's input
[in]	valid_items	Total number of valid items in the calling thread's logical warp (may be less than `LOGICAL_WARP_THREADS`)

Definition at line 295 of file warp_reduce.cuh.

◆ TailSegmentedReduce()

template<typename T , int LOGICAL_WARP_THREADS = CUB_PTX_WARP_THREADS, int PTX_ARCH = CUB_PTX_ARCH>

template<typename ReductionOp , typename FlagT >

__device__ __forceinline__ T cub::WarpReduce< T, LOGICAL_WARP_THREADS, PTX_ARCH >::TailSegmentedReduce	(	T	input,
		FlagT	tail_flag,
		ReductionOp	reduction_op
	)

inline

Computes a segmented reduction in the calling warp where segments are defined by tail-flags. The reduction of each segment is returned to the first lane in that segment (which always includes lane₀).

Supports non-commutative reduction operators

\smemreuse

Snippet: The code snippet below illustrates a tail-segmented warp max reduction within a block of 32 threads (one warp).

: #include <cub/cub.cuh>

__global__ void ExampleKernel(...)

{

// Specialize WarpReduce for type int

typedef cub::WarpReduce<int> WarpReduce;

// Allocate WarpReduce shared memory for one warp

__shared__ typename WarpReduce::TempStorage temp_storage;

// Obtain one input item and flag per thread

int thread_data = ...

int tail_flag = ...

// Return the warp-wide reductions to each lane0

int aggregate = WarpReduce(temp_storage).TailSegmentedReduce(

thread_data, tail_flag, cub::Max());

: Suppose the set of input thread_data and tail_flag across the block of threads is {0, 1, 2, 3, ..., 31 and is {0, 0, 0, 1, 0, 0, 0, 1, ..., 0, 0, 0, 1, respectively. The corresponding output aggregate in threads 0, 4, 8, etc. will be 3, 7, 11, etc. (and is undefined in other threads).

Template Parameters

ReductionOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b)

Parameters

[in]	input	Calling thread's input
[in]	tail_flag	Tail flag denoting whether or not `input` is the end of the current segment
[in]	reduction_op	Reduction operator

Definition at line 596 of file warp_reduce.cuh.

◆ TailSegmentedSum()

template<typename T , int LOGICAL_WARP_THREADS = CUB_PTX_WARP_THREADS, int PTX_ARCH = CUB_PTX_ARCH>

template<typename FlagT >

__device__ __forceinline__ T cub::WarpReduce< T, LOGICAL_WARP_THREADS, PTX_ARCH >::TailSegmentedSum	(	T	input,
		FlagT	tail_flag
	)

inline

Computes a segmented sum in the calling warp where segments are defined by tail-flags. The sum of each segment is returned to the first lane in that segment (which always includes lane₀).

\smemreuse

Snippet: The code snippet below illustrates a tail-segmented warp sum reduction within a block of 32 threads (one warp).

: #include <cub/cub.cuh>

__global__ void ExampleKernel(...)

{

// Specialize WarpReduce for type int

typedef cub::WarpReduce<int> WarpReduce;

// Allocate WarpReduce shared memory for one warp

__shared__ typename WarpReduce::TempStorage temp_storage;

// Obtain one input item and flag per thread

int thread_data = ...

int tail_flag = ...

// Return the warp-wide sums to each lane0

int aggregate = WarpReduce(temp_storage).TailSegmentedSum(

thread_data, tail_flag);

: Suppose the set of input thread_data and tail_flag across the block of threads is {0, 1, 2, 3, ..., 31 and is {0, 0, 0, 1, 0, 0, 0, 1, ..., 0, 0, 0, 1, respectively. The corresponding output aggregate in threads 0, 4, 8, etc. will be 6, 22, 38, etc. (and is undefined in other threads).

Template Parameters

ReductionOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b)

Parameters

[in]	input	Calling thread's input
[in]	tail_flag	Head flag denoting whether or not `input` is the start of a new segment

Definition at line 391 of file warp_reduce.cuh.

The documentation for this class was generated from the following file:

openfpm_data/src/util/cuda/cub_old/warp/warp_reduce.cuh

Data Structures

Public Types

Public Member Functions

Private Types

Private Attributes

Detailed Description

template<typename T, int LOGICAL_WARP_THREADS = CUB_PTX_WARP_THREADS, int PTX_ARCH = CUB_PTX_ARCH> class cub::WarpReduce< T, LOGICAL_WARP_THREADS, PTX_ARCH >

Member Enumeration Documentation

◆ anonymous enum

Constructor & Destructor Documentation

◆ WarpReduce()

Member Function Documentation

◆ HeadSegmentedReduce()

◆ HeadSegmentedSum()

◆ Reduce() [1/2]

◆ Reduce() [2/2]

◆ Sum() [1/2]

◆ Sum() [2/2]

◆ TailSegmentedReduce()

◆ TailSegmentedSum()

template<typename T, int LOGICAL_WARP_THREADS = CUB_PTX_WARP_THREADS, int PTX_ARCH = CUB_PTX_ARCH>
class cub::WarpReduce< T, LOGICAL_WARP_THREADS, PTX_ARCH >