template<typename T, int BLOCK_DIM_X, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>
class cub::BlockShuffle< T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >
The BlockShuffle class provides collective methods for shuffling data partitioned across a CUDA thread block.
- Template Parameters
-
T | The data type to be exchanged. |
BLOCK_DIM_X | The thread block length in threads along the X dimension |
BLOCK_DIM_Y | [optional] The thread block length in threads along the Y dimension (default: 1) |
BLOCK_DIM_Z | [optional] The thread block length in threads along the Z dimension (default: 1) |
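PTX_ARCH | [optional] The PTX compute capability for which to specialize this collective, formatted as per the __CUDA_ARCH__ macro (e.g., 350 for sm_35). Useful for determining the collective's storage requirements for a given device from the host. (default: the value of __CUDA_ARCH__ during the current compiler pass) |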
- Overview
- It is commonplace for blocks of threads to rearrange data items between threads. The BlockShuffle abstraction allows threads to efficiently shift items either (a) up to their successor or (b) down to their predecessor.
Definition at line 70 of file block_shuffle.cuh.
|
|
__device__ __forceinline__ | BlockShuffle () |
| Collective constructor using a private static allocation of shared memory as temporary storage.
|
|
__device__ __forceinline__ | BlockShuffle (TempStorage &temp_storage) |
| Collective constructor using the specified memory allocation as temporary storage. More...
|
|
|
__device__ __forceinline__ void | Offset (T input, T &output, int distance=1) |
| Each thread_i obtains the input provided by thread_{i+distance}. The offset distance may be negative. More...
|
|
__device__ __forceinline__ void | Rotate (T input, T &output, unsigned int distance=1) |
| Each thread_i obtains the input provided by thread_{(i+distance) % BLOCK_THREADS}. More...
|
|
template<int ITEMS_PER_THREAD> |
__device__ __forceinline__ void | Up (T(&input)[ITEMS_PER_THREAD], T(&prev)[ITEMS_PER_THREAD]) |
| The thread block rotates its blocked arrangement of input items, shifting it up by one item. More...
|
|
template<int ITEMS_PER_THREAD> |
__device__ __forceinline__ void | Up (T(&input)[ITEMS_PER_THREAD], T(&prev)[ITEMS_PER_THREAD], T &block_suffix) |
| The thread block rotates its blocked arrangement of input items, shifting it up by one item. All threads receive the item input[ITEMS_PER_THREAD-1] provided by thread_{BLOCK_THREADS-1}. More...
|
|
template<int ITEMS_PER_THREAD> |
__device__ __forceinline__ void | Down (T(&input)[ITEMS_PER_THREAD], T(&prev)[ITEMS_PER_THREAD]) |
| The thread block rotates its blocked arrangement of input items, shifting it down by one item. More...
|
|
template<int ITEMS_PER_THREAD> |
__device__ __forceinline__ void | Down (T(&input)[ITEMS_PER_THREAD], T(&prev)[ITEMS_PER_THREAD], T &block_prefix) |
| The thread block rotates its blocked arrangement of input items, shifting it down by one item. All threads receive the item input[0] provided by thread_0. More...
|
|
template<typename T , int BLOCK_DIM_X, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>
template<int ITEMS_PER_THREAD>
__device__ __forceinline__ void cub::BlockShuffle< T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::Down |
( |
T(&) |
input[ITEMS_PER_THREAD], |
|
|
T(&) |
prev[ITEMS_PER_THREAD] |
|
) |
| |
|
inline |
The thread block rotates its blocked arrangement of input
items, shifting it down by one item.
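- Assumes a blocked arrangement of (block-threads * items-per-thread) items across the thread block, where thread_i owns the i-th range of items-per-thread contiguous items. For multi-dimensional thread blocks, a row-major thread ordering is assumed.
- Efficiency is increased with increased granularity ITEMS_PER_THREAD. Performance is also typically increased until the additional register pressure or shared memory allocation size causes SM occupancy to fall too low.
- A subsequent __syncthreads() thread block barrier should be invoked after calling this method if the collective's temporary storage (e.g., temp_storage) is to be reused or repurposed.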
- Parameters
-
[in] | input | The calling thread's input items |
[out] | prev | The corresponding successor items (may be aliased to input). The value prev[ITEMS_PER_THREAD-1] is not updated for thread_{BLOCK_THREADS-1}. |
Definition at line 263 of file block_shuffle.cuh.
template<typename T , int BLOCK_DIM_X, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>
template<int ITEMS_PER_THREAD>
__device__ __forceinline__ void cub::BlockShuffle< T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::Down |
( |
T(&) |
input[ITEMS_PER_THREAD], |
|
|
T(&) |
prev[ITEMS_PER_THREAD], |
|
|
T & |
block_prefix |
|
) |
| |
|
inline |
The thread block rotates its blocked arrangement of input items, shifting it down by one item. All threads receive the item input[0] provided by thread_0.
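- Assumes a blocked arrangement of (block-threads * items-per-thread) items across the thread block, where thread_i owns the i-th range of items-per-thread contiguous items. For multi-dimensional thread blocks, a row-major thread ordering is assumed.
- Efficiency is increased with increased granularity ITEMS_PER_THREAD. Performance is also typically increased until the additional register pressure or shared memory allocation size causes SM occupancy to fall too low.
- A subsequent __syncthreads() thread block barrier should be invoked after calling this method if the collective's temporary storage (e.g., temp_storage) is to be reused or repurposed.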
- Parameters
-
[in] | input | The calling thread's input items |
[out] | prev | The corresponding successor items (may be aliased to input). The value prev[ITEMS_PER_THREAD-1] is not updated for thread_{BLOCK_THREADS-1}. |
[out] | block_prefix | The item input[0] from thread_0, provided to all threads |
Definition at line 289 of file block_shuffle.cuh.
template<typename T , int BLOCK_DIM_X, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>
__device__ __forceinline__ void cub::BlockShuffle< T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::Offset |
( |
T |
input, |
|
|
T & |
output, |
|
|
int |
distance = 1 |
|
) |
| |
|
inline |
Each thread_i obtains the input provided by thread_{i+distance}. The offset distance may be negative.
-
- Parameters
-
[in] | input | The input item from the calling thread (threadi) |
[out] | output | The input item from the successor (or predecessor) thread thread_{i+distance} (may be aliased to input). This value is only updated for thread_i when 0 <= (i + distance) <= BLOCK_THREADS-1 |
[in] | distance | Offset distance (may be negative) |
Definition at line 171 of file block_shuffle.cuh.
template<typename T , int BLOCK_DIM_X, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>
__device__ __forceinline__ void cub::BlockShuffle< T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::Rotate |
( |
T |
input, |
|
|
T & |
output, |
|
|
unsigned int |
distance = 1 |
|
) |
| |
|
inline |
Each thread_i obtains the input provided by thread_{(i+distance) % BLOCK_THREADS}.
-
- Parameters
-
[in] | input | The calling thread's input item |
[out] | output | The input item from thread_{(i+distance) % BLOCK_THREADS} (may be aliased to input). This value is not updated for thread_{BLOCK_THREADS-1} |
[in] | distance | Offset distance (0 < distance < BLOCK_THREADS ) |
Definition at line 191 of file block_shuffle.cuh.
template<typename T , int BLOCK_DIM_X, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>
template<int ITEMS_PER_THREAD>
__device__ __forceinline__ void cub::BlockShuffle< T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::Up |
( |
T(&) |
input[ITEMS_PER_THREAD], |
|
|
T(&) |
prev[ITEMS_PER_THREAD] |
|
) |
| |
|
inline |
The thread block rotates its blocked arrangement of input
items, shifting it up by one item.
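- Assumes a blocked arrangement of (block-threads * items-per-thread) items across the thread block, where thread_i owns the i-th range of items-per-thread contiguous items. For multi-dimensional thread blocks, a row-major thread ordering is assumed.
- Efficiency is increased with increased granularity ITEMS_PER_THREAD. Performance is also typically increased until the additional register pressure or shared memory allocation size causes SM occupancy to fall too low.
- A subsequent __syncthreads() thread block barrier should be invoked after calling this method if the collective's temporary storage (e.g., temp_storage) is to be reused or repurposed.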
- Parameters
-
[in] | input | The calling thread's input items |
[out] | prev | The corresponding predecessor items (may be aliased to input). The item prev[0] is not updated for thread_0. |
Definition at line 217 of file block_shuffle.cuh.
template<typename T , int BLOCK_DIM_X, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>
template<int ITEMS_PER_THREAD>
__device__ __forceinline__ void cub::BlockShuffle< T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::Up |
( |
T(&) |
input[ITEMS_PER_THREAD], |
|
|
T(&) |
prev[ITEMS_PER_THREAD], |
|
|
T & |
block_suffix |
|
) |
| |
|
inline |
The thread block rotates its blocked arrangement of input items, shifting it up by one item. All threads receive the item input[ITEMS_PER_THREAD-1] provided by thread_{BLOCK_THREADS-1}.
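- Assumes a blocked arrangement of (block-threads * items-per-thread) items across the thread block, where thread_i owns the i-th range of items-per-thread contiguous items. For multi-dimensional thread blocks, a row-major thread ordering is assumed.
- Efficiency is increased with increased granularity ITEMS_PER_THREAD. Performance is also typically increased until the additional register pressure or shared memory allocation size causes SM occupancy to fall too low.
- A subsequent __syncthreads() thread block barrier should be invoked after calling this method if the collective's temporary storage (e.g., temp_storage) is to be reused or repurposed.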
- Parameters
-
[in] | input | The calling thread's input items |
[out] | prev | The corresponding predecessor items (may be aliased to input). The item prev[0] is not updated for thread_0. |
[out] | block_suffix | The item input[ITEMS_PER_THREAD-1] from thread_{BLOCK_THREADS-1}, provided to all threads |
Definition at line 244 of file block_shuffle.cuh.