The BlockExchange class provides collective methods for rearranging data partitioned across a CUDA thread block. More...
The BlockExchange class provides collective methods for rearranging data partitioned across a CUDA thread block.
T | The data type to be exchanged. |
BLOCK_DIM_X | The thread block length in threads along the X dimension |
ITEMS_PER_THREAD | The number of items partitioned onto each thread. |
WARP_TIME_SLICING | [optional] When true , only use enough shared memory for a single warp's worth of tile data, time-slicing the block-wide exchange over multiple synchronized rounds. Yields a smaller memory footprint at the expense of decreased parallelism. (Default: false) |
BLOCK_DIM_Y | [optional] The thread block length in threads along the Y dimension (default: 1) |
BLOCK_DIM_Z | [optional] The thread block length in threads along the Z dimension (default: 1) |
PTX_ARCH | [optional] \ptxversion |
thread_data
across the block of threads is { [0,128,256,384], [1,129,257,385], ..., [127,255,383,511] }
. The corresponding output thread_data
in those threads will be { [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }
.Definition at line 116 of file block_exchange.cuh.
Data Structures | |
struct | TempStorage |
\smemstorage{BlockExchange} More... | |
Public Member Functions | |
Collective constructors | |
__device__ __forceinline__ | BlockExchange () |
Collective constructor using a private static allocation of shared memory as temporary storage. | |
__device__ __forceinline__ | BlockExchange (TempStorage &temp_storage) |
Collective constructor using the specified memory allocation as temporary storage. | |
Structured exchanges | |
template<typename OutputT > | |
__device__ __forceinline__ void | StripedToBlocked (InputT input_items[ITEMS_PER_THREAD], OutputT output_items[ITEMS_PER_THREAD]) |
Transposes data items from striped arrangement to blocked arrangement. | |
template<typename OutputT > | |
__device__ __forceinline__ void | BlockedToStriped (InputT input_items[ITEMS_PER_THREAD], OutputT output_items[ITEMS_PER_THREAD]) |
Transposes data items from blocked arrangement to striped arrangement. | |
template<typename OutputT > | |
__device__ __forceinline__ void | WarpStripedToBlocked (InputT input_items[ITEMS_PER_THREAD], OutputT output_items[ITEMS_PER_THREAD]) |
Transposes data items from warp-striped arrangement to blocked arrangement. | |
template<typename OutputT > | |
__device__ __forceinline__ void | BlockedToWarpStriped (InputT input_items[ITEMS_PER_THREAD], OutputT output_items[ITEMS_PER_THREAD]) |
Transposes data items from blocked arrangement to warp-striped arrangement. | |
Scatter exchanges | |
template<typename OutputT , typename OffsetT > | |
__device__ __forceinline__ void | ScatterToBlocked (InputT input_items[ITEMS_PER_THREAD], OutputT output_items[ITEMS_PER_THREAD], OffsetT ranks[ITEMS_PER_THREAD]) |
Exchanges data items annotated by rank into blocked arrangement. | |
template<typename OutputT , typename OffsetT > | |
__device__ __forceinline__ void | ScatterToStriped (InputT input_items[ITEMS_PER_THREAD], OutputT output_items[ITEMS_PER_THREAD], OffsetT ranks[ITEMS_PER_THREAD]) |
Exchanges data items annotated by rank into striped arrangement. | |
template<typename OutputT , typename OffsetT > | |
__device__ __forceinline__ void | ScatterToStripedGuarded (InputT input_items[ITEMS_PER_THREAD], OutputT output_items[ITEMS_PER_THREAD], OffsetT ranks[ITEMS_PER_THREAD]) |
Exchanges data items annotated by rank into striped arrangement. Items with rank -1 are not exchanged. | |
template<typename OutputT , typename OffsetT , typename ValidFlag > | |
__device__ __forceinline__ void | ScatterToStripedFlagged (InputT input_items[ITEMS_PER_THREAD], OutputT output_items[ITEMS_PER_THREAD], OffsetT ranks[ITEMS_PER_THREAD], ValidFlag is_valid[ITEMS_PER_THREAD]) |
Exchanges valid data items annotated by rank into striped arrangement. | |
__device__ __forceinline__ void | StripedToBlocked (InputT items[ITEMS_PER_THREAD]) |
__device__ __forceinline__ void | BlockedToStriped (InputT items[ITEMS_PER_THREAD]) |
__device__ __forceinline__ void | WarpStripedToBlocked (InputT items[ITEMS_PER_THREAD]) |
__device__ __forceinline__ void | BlockedToWarpStriped (InputT items[ITEMS_PER_THREAD]) |
template<typename OffsetT > | |
__device__ __forceinline__ void | ScatterToBlocked (InputT items[ITEMS_PER_THREAD], OffsetT ranks[ITEMS_PER_THREAD]) |
template<typename OffsetT > | |
__device__ __forceinline__ void | ScatterToStriped (InputT items[ITEMS_PER_THREAD], OffsetT ranks[ITEMS_PER_THREAD]) |
template<typename OffsetT > | |
__device__ __forceinline__ void | ScatterToStripedGuarded (InputT items[ITEMS_PER_THREAD], OffsetT ranks[ITEMS_PER_THREAD]) |
template<typename OffsetT , typename ValidFlag > | |
__device__ __forceinline__ void | ScatterToStripedFlagged (InputT items[ITEMS_PER_THREAD], OffsetT ranks[ITEMS_PER_THREAD], ValidFlag is_valid[ITEMS_PER_THREAD]) |
Private Types | |
enum | { BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z , LOG_WARP_THREADS = CUB_LOG_WARP_THREADS(PTX_ARCH) , WARP_THREADS = 1 << LOG_WARP_THREADS , WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS , LOG_SMEM_BANKS = CUB_LOG_SMEM_BANKS(PTX_ARCH) , SMEM_BANKS = 1 << LOG_SMEM_BANKS , TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD , TIME_SLICES = (WARP_TIME_SLICING) ? WARPS : 1 , TIME_SLICED_THREADS = (WARP_TIME_SLICING) ? CUB_MIN(BLOCK_THREADS, WARP_THREADS) : BLOCK_THREADS , TIME_SLICED_ITEMS = TIME_SLICED_THREADS * ITEMS_PER_THREAD , WARP_TIME_SLICED_THREADS = CUB_MIN(BLOCK_THREADS, WARP_THREADS) , WARP_TIME_SLICED_ITEMS = WARP_TIME_SLICED_THREADS * ITEMS_PER_THREAD , INSERT_PADDING = (ITEMS_PER_THREAD > 4) && (PowerOfTwo<ITEMS_PER_THREAD>::VALUE) , PADDING_ITEMS = (INSERT_PADDING) ? (TIME_SLICED_ITEMS >> LOG_SMEM_BANKS) : 0 } |
Constants. More... | |
Private Member Functions | |
struct | __align__ (16) _TempStorage |
Shared memory storage layout type. | |
__device__ __forceinline__ _TempStorage & | PrivateStorage () |
Internal storage allocator. | |
template<typename OutputT > | |
__device__ __forceinline__ void | BlockedToStriped (InputT input_items[ITEMS_PER_THREAD], OutputT output_items[ITEMS_PER_THREAD], Int2Type< false >) |
template<typename OutputT > | |
__device__ __forceinline__ void | BlockedToStriped (InputT input_items[ITEMS_PER_THREAD], OutputT output_items[ITEMS_PER_THREAD], Int2Type< true >) |
template<typename OutputT > | |
__device__ __forceinline__ void | BlockedToWarpStriped (InputT input_items[ITEMS_PER_THREAD], OutputT output_items[ITEMS_PER_THREAD], Int2Type< false >) |
template<typename OutputT > | |
__device__ __forceinline__ void | BlockedToWarpStriped (InputT input_items[ITEMS_PER_THREAD], OutputT output_items[ITEMS_PER_THREAD], Int2Type< true >) |
template<typename OutputT > | |
__device__ __forceinline__ void | StripedToBlocked (InputT input_items[ITEMS_PER_THREAD], OutputT output_items[ITEMS_PER_THREAD], Int2Type< false >) |
template<typename OutputT > | |
__device__ __forceinline__ void | StripedToBlocked (InputT input_items[ITEMS_PER_THREAD], OutputT output_items[ITEMS_PER_THREAD], Int2Type< true >) |
template<typename OutputT > | |
__device__ __forceinline__ void | WarpStripedToBlocked (InputT input_items[ITEMS_PER_THREAD], OutputT output_items[ITEMS_PER_THREAD], Int2Type< false >) |
template<typename OutputT > | |
__device__ __forceinline__ void | WarpStripedToBlocked (InputT input_items[ITEMS_PER_THREAD], OutputT output_items[ITEMS_PER_THREAD], Int2Type< true >) |
template<typename OutputT , typename OffsetT > | |
__device__ __forceinline__ void | ScatterToBlocked (InputT input_items[ITEMS_PER_THREAD], OutputT output_items[ITEMS_PER_THREAD], OffsetT ranks[ITEMS_PER_THREAD], Int2Type< false >) |
template<typename OutputT , typename OffsetT > | |
__device__ __forceinline__ void | ScatterToBlocked (InputT input_items[ITEMS_PER_THREAD], OutputT output_items[ITEMS_PER_THREAD], OffsetT ranks[ITEMS_PER_THREAD], Int2Type< true >) |
template<typename OutputT , typename OffsetT > | |
__device__ __forceinline__ void | ScatterToStriped (InputT input_items[ITEMS_PER_THREAD], OutputT output_items[ITEMS_PER_THREAD], OffsetT ranks[ITEMS_PER_THREAD], Int2Type< false >) |
template<typename OutputT , typename OffsetT > | |
__device__ __forceinline__ void | ScatterToStriped (InputT input_items[ITEMS_PER_THREAD], OutputT output_items[ITEMS_PER_THREAD], OffsetT ranks[ITEMS_PER_THREAD], Int2Type< true >) |
Private Attributes | |
_TempStorage & | temp_storage |
Shared storage reference. | |
unsigned int | linear_tid |
Linear thread-id. | |
unsigned int | lane_id |
unsigned int | warp_id |
unsigned int | warp_offset |
|
private |
Constants.
Enumerator | |
---|---|
BLOCK_THREADS | The thread block size in threads. |
Definition at line 125 of file block_exchange.cuh.
|
inline |
Collective constructor using a private static allocation of shared memory as temporary storage.
Definition at line 713 of file block_exchange.cuh.
|
inline |
Collective constructor using the specified memory allocation as temporary storage.
[in] | temp_storage | Reference to memory allocation having layout type TempStorage |
Definition at line 726 of file block_exchange.cuh.
|
inlineprivate |
Shared memory storage layout type.
Definition at line 1247 of file block_exchange.cuh.
|
inline |
Transposes data items from blocked arrangement to striped arrangement.
thread_data
across the block of threads is { [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }
. The corresponding output thread_data
in those threads will be { [0,128,256,384], [1,129,257,385], ..., [127,255,383,511] }
in preparation for storing to device-accessible memory. [in] | input_items | Items to exchange, converting between striped and blocked arrangements. |
[out] | output_items | Items from exchange, converting between striped and blocked arrangements. |
Definition at line 829 of file block_exchange.cuh.
|
inlineprivate |
Transposes data items from blocked arrangement to striped arrangement. Specialized for no timeslicing.
[in] | input_items | Items to exchange, converting between blocked and striped arrangements. |
[out] | output_items | Items to exchange, converting between blocked and striped arrangements. |
Definition at line 200 of file block_exchange.cuh.
|
inlineprivate |
Transposes data items from blocked arrangement to striped arrangement. Specialized for warp-timeslicing.
[in] | input_items | Items to exchange, converting between blocked and striped arrangements. |
[out] | output_items | Items to exchange, converting between blocked and striped arrangements. |
Definition at line 229 of file block_exchange.cuh.
|
inline |
items | [in-out] Items to exchange, converting between striped and blocked arrangements. |
Definition at line 1069 of file block_exchange.cuh.
|
inline |
Transposes data items from blocked arrangement to warp-striped arrangement.
thread_data
across the block of threads is { [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }
. The corresponding output thread_data
in those threads will be { [0,32,64,96], [1,33,65,97], [2,34,66,98], ..., [415,447,479,511] }
in preparation for storing to device-accessible memory. (The first 128 items are striped across the first warp of 32 threads, the second 128 items are striped across the second warp, etc.) [in] | input_items | Items to exchange, converting between striped and blocked arrangements. |
[out] | output_items | Items from exchange, converting between striped and blocked arrangements. |
Definition at line 928 of file block_exchange.cuh.
|
inlineprivate |
Transposes data items from blocked arrangement to warp-striped arrangement. Specialized for no timeslicing
[in] | input_items | Items to exchange, converting between blocked and striped arrangements. |
[out] | output_items | Items to exchange, converting between blocked and striped arrangements. |
Definition at line 289 of file block_exchange.cuh.
|
inlineprivate |
Transposes data items from blocked arrangement to warp-striped arrangement. Specialized for warp-timeslicing
[in] | input_items | Items to exchange, converting between blocked and striped arrangements. |
[out] | output_items | Items to exchange, converting between blocked and striped arrangements. |
Definition at line 317 of file block_exchange.cuh.
|
inline |
items | [in-out] Items to exchange, converting between striped and blocked arrangements. |
Definition at line 1081 of file block_exchange.cuh.
|
inlineprivate |
Internal storage allocator.
Definition at line 189 of file block_exchange.cuh.
|
inline |
Exchanges data items annotated by rank into blocked arrangement.
OffsetT | [inferred] Signed integer type for local offsets |
[in] | input_items | Items to exchange, converting between striped and blocked arrangements. |
[out] | output_items | Items from exchange, converting between striped and blocked arrangements. |
[in] | ranks | Corresponding scatter ranks |
Definition at line 953 of file block_exchange.cuh.
|
inlineprivate |
Exchanges data items annotated by rank into blocked arrangement. Specialized for no timeslicing.
[in] | input_items | Items to exchange, converting between blocked and striped arrangements. |
[out] | output_items | Items to exchange, converting between blocked and striped arrangements. |
[in] | ranks | Corresponding scatter ranks |
Definition at line 534 of file block_exchange.cuh.
|
inlineprivate |
Exchanges data items annotated by rank into blocked arrangement. Specialized for warp-timeslicing.
[in] | input_items | Items to exchange, converting between blocked and striped arrangements. |
[out] | output_items | Items to exchange, converting between blocked and striped arrangements. |
[in] | ranks | Corresponding scatter ranks |
Definition at line 563 of file block_exchange.cuh.
|
inline |
items | [in-out] Items to exchange, converting between striped and blocked arrangements. | |
[in] | ranks | Corresponding scatter ranks |
Definition at line 1088 of file block_exchange.cuh.
|
inline |
Exchanges data items annotated by rank into striped arrangement.
OffsetT | [inferred] Signed integer type for local offsets |
[in] | input_items | Items to exchange, converting between striped and blocked arrangements. |
[out] | output_items | Items from exchange, converting between striped and blocked arrangements. |
[in] | ranks | Corresponding scatter ranks |
Definition at line 972 of file block_exchange.cuh.
|
inlineprivate |
Exchanges data items annotated by rank into striped arrangement. Specialized for no timeslicing.
[in] | input_items | Items to exchange, converting between blocked and striped arrangements. |
[out] | output_items | Items to exchange, converting between blocked and striped arrangements. |
[in] | ranks | Corresponding scatter ranks |
Definition at line 616 of file block_exchange.cuh.
|
inlineprivate |
Exchanges data items annotated by rank into striped arrangement. Specialized for warp-timeslicing.
[in] | input_items | Items to exchange, converting between blocked and striped arrangements. |
[out] | output_items | Items to exchange, converting between blocked and striped arrangements. |
[in] | ranks | Corresponding scatter ranks |
Definition at line 646 of file block_exchange.cuh.
|
inline |
items | [in-out] Items to exchange, converting between striped and blocked arrangements. | |
[in] | ranks | Corresponding scatter ranks |
Definition at line 1096 of file block_exchange.cuh.
|
inline |
Exchanges valid data items annotated by rank into striped arrangement.
OffsetT | [inferred] Signed integer type for local offsets |
ValidFlag | [inferred] FlagT type denoting which items are valid |
[in] | input_items | Items to exchange, converting between striped and blocked arrangements. |
[out] | output_items | Items from exchange, converting between striped and blocked arrangements. |
[in] | ranks | Corresponding scatter ranks |
[in] | is_valid | Corresponding flag denoting item validity |
Definition at line 1029 of file block_exchange.cuh.
|
inline |
items | [in-out] Items to exchange, converting between striped and blocked arrangements. | |
[in] | ranks | Corresponding scatter ranks |
[in] | is_valid | Corresponding flag denoting item validity |
Definition at line 1112 of file block_exchange.cuh.
|
inline |
Exchanges data items annotated by rank into striped arrangement. Items with rank -1 are not exchanged.
OffsetT | [inferred] Signed integer type for local offsets |
[in] | input_items | Items to exchange, converting between striped and blocked arrangements. |
[out] | output_items | Items from exchange, converting between striped and blocked arrangements. |
[in] | ranks | Corresponding scatter ranks |
Definition at line 991 of file block_exchange.cuh.
|
inline |
items | [in-out] Items to exchange, converting between striped and blocked arrangements. | |
[in] | ranks | Corresponding scatter ranks |
Definition at line 1104 of file block_exchange.cuh.
|
inline |
Transposes data items from striped arrangement to blocked arrangement.
thread_data
across the block of threads is { [0,128,256,384], [1,129,257,385], ..., [127,255,383,511] }
after loading from device-accessible memory. The corresponding output thread_data
in those threads will be { [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }
. [in] | input_items | Items to exchange, converting between striped and blocked arrangements. |
[out] | output_items | Items from exchange, converting between striped and blocked arrangements. |
Definition at line 780 of file block_exchange.cuh.
|
inlineprivate |
Transposes data items from striped arrangement to blocked arrangement. Specialized for no timeslicing.
[in] | input_items | Items to exchange, converting between blocked and striped arrangements. |
[out] | output_items | Items to exchange, converting between blocked and striped arrangements. |
Definition at line 376 of file block_exchange.cuh.
|
inlineprivate |
Transposes data items from striped arrangement to blocked arrangement. Specialized for warp-timeslicing.
[in] | input_items | Items to exchange, converting between blocked and striped arrangements. |
[out] | output_items | Items to exchange, converting between blocked and striped arrangements. |
Definition at line 406 of file block_exchange.cuh.
|
inline |
items | [in-out] Items to exchange, converting between striped and blocked arrangements. |
Definition at line 1063 of file block_exchange.cuh.
|
inline |
Transposes data items from warp-striped arrangement to blocked arrangement.
thread_data
across the block of threads is { [0,32,64,96], [1,33,65,97], [2,34,66,98], ..., [415,447,479,511] }
after loading from device-accessible memory. (The first 128 items are striped across the first warp of 32 threads, the second 128 items are striped across the second warp, etc.) The corresponding output thread_data
in those threads will be { [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }
. [in] | input_items | Items to exchange, converting between striped and blocked arrangements. |
[out] | output_items | Items from exchange, converting between striped and blocked arrangements. |
Definition at line 877 of file block_exchange.cuh.
|
inlineprivate |
Transposes data items from warp-striped arrangement to blocked arrangement. Specialized for no timeslicing
[in] | input_items | Items to exchange, converting between blocked and striped arrangements. |
[out] | output_items | Items to exchange, converting between blocked and striped arrangements. |
Definition at line 467 of file block_exchange.cuh.
|
inlineprivate |
Transposes data items from warp-striped arrangement to blocked arrangement. Specialized for warp-timeslicing
[in] | input_items | Items to exchange, converting between blocked and striped arrangements. |
[out] | output_items | Items to exchange, converting between blocked and striped arrangements. |
Definition at line 496 of file block_exchange.cuh.
|
inline |
items | [in-out] Items to exchange, converting between striped and blocked arrangements. |
Definition at line 1075 of file block_exchange.cuh.
|
private |
Definition at line 179 of file block_exchange.cuh.
|
private |
Linear thread-id.
Definition at line 178 of file block_exchange.cuh.
|
private |
Shared storage reference.
Definition at line 175 of file block_exchange.cuh.
|
private |
Definition at line 180 of file block_exchange.cuh.
|
private |
Definition at line 181 of file block_exchange.cuh.