The BlockExchange class provides collective methods for rearranging data partitioned across a CUDA thread block. More...
The BlockExchange class provides collective methods for rearranging data partitioned across a CUDA thread block.

| T | The data type to be exchanged. |
| BLOCK_DIM_X | The thread block length in threads along the X dimension |
| ITEMS_PER_THREAD | The number of items partitioned onto each thread. |
| WARP_TIME_SLICING | [optional] When true, only use enough shared memory for a single warp's worth of tile data, time-slicing the block-wide exchange over multiple synchronized rounds. Yields a smaller memory footprint at the expense of decreased parallelism. (Default: false) |
| BLOCK_DIM_Y | [optional] The thread block length in threads along the Y dimension (default: 1) |
| BLOCK_DIM_Z | [optional] The thread block length in threads along the Z dimension (default: 1) |
| PTX_ARCH | [optional] \ptxversion |
thread_data across the block of threads is { [0,128,256,384], [1,129,257,385], ..., [127,255,383,511] }. The corresponding output thread_data in those threads will be { [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }.Definition at line 116 of file block_exchange.cuh.
Data Structures | |
| struct | TempStorage |
| \smemstorage{BlockExchange} More... | |
Public Member Functions | |
Collective constructors | |
| __device__ __forceinline__ | BlockExchange () |
| Collective constructor using a private static allocation of shared memory as temporary storage. | |
| __device__ __forceinline__ | BlockExchange (TempStorage &temp_storage) |
| Collective constructor using the specified memory allocation as temporary storage. | |
Structured exchanges | |
| template<typename OutputT > | |
| __device__ __forceinline__ void | StripedToBlocked (InputT input_items[ITEMS_PER_THREAD], OutputT output_items[ITEMS_PER_THREAD]) |
| Transposes data items from striped arrangement to blocked arrangement. | |
| template<typename OutputT > | |
| __device__ __forceinline__ void | BlockedToStriped (InputT input_items[ITEMS_PER_THREAD], OutputT output_items[ITEMS_PER_THREAD]) |
| Transposes data items from blocked arrangement to striped arrangement. | |
| template<typename OutputT > | |
| __device__ __forceinline__ void | WarpStripedToBlocked (InputT input_items[ITEMS_PER_THREAD], OutputT output_items[ITEMS_PER_THREAD]) |
| Transposes data items from warp-striped arrangement to blocked arrangement. | |
| template<typename OutputT > | |
| __device__ __forceinline__ void | BlockedToWarpStriped (InputT input_items[ITEMS_PER_THREAD], OutputT output_items[ITEMS_PER_THREAD]) |
| Transposes data items from blocked arrangement to warp-striped arrangement. | |
Scatter exchanges | |
| template<typename OutputT , typename OffsetT > | |
| __device__ __forceinline__ void | ScatterToBlocked (InputT input_items[ITEMS_PER_THREAD], OutputT output_items[ITEMS_PER_THREAD], OffsetT ranks[ITEMS_PER_THREAD]) |
| Exchanges data items annotated by rank into blocked arrangement. | |
| template<typename OutputT , typename OffsetT > | |
| __device__ __forceinline__ void | ScatterToStriped (InputT input_items[ITEMS_PER_THREAD], OutputT output_items[ITEMS_PER_THREAD], OffsetT ranks[ITEMS_PER_THREAD]) |
| Exchanges data items annotated by rank into striped arrangement. | |
| template<typename OutputT , typename OffsetT > | |
| __device__ __forceinline__ void | ScatterToStripedGuarded (InputT input_items[ITEMS_PER_THREAD], OutputT output_items[ITEMS_PER_THREAD], OffsetT ranks[ITEMS_PER_THREAD]) |
| Exchanges data items annotated by rank into striped arrangement. Items with rank -1 are not exchanged. | |
| template<typename OutputT , typename OffsetT , typename ValidFlag > | |
| __device__ __forceinline__ void | ScatterToStripedFlagged (InputT input_items[ITEMS_PER_THREAD], OutputT output_items[ITEMS_PER_THREAD], OffsetT ranks[ITEMS_PER_THREAD], ValidFlag is_valid[ITEMS_PER_THREAD]) |
| Exchanges valid data items annotated by rank into striped arrangement. | |
| __device__ __forceinline__ void | StripedToBlocked (InputT items[ITEMS_PER_THREAD]) |
| __device__ __forceinline__ void | BlockedToStriped (InputT items[ITEMS_PER_THREAD]) |
| __device__ __forceinline__ void | WarpStripedToBlocked (InputT items[ITEMS_PER_THREAD]) |
| __device__ __forceinline__ void | BlockedToWarpStriped (InputT items[ITEMS_PER_THREAD]) |
| template<typename OffsetT > | |
| __device__ __forceinline__ void | ScatterToBlocked (InputT items[ITEMS_PER_THREAD], OffsetT ranks[ITEMS_PER_THREAD]) |
| template<typename OffsetT > | |
| __device__ __forceinline__ void | ScatterToStriped (InputT items[ITEMS_PER_THREAD], OffsetT ranks[ITEMS_PER_THREAD]) |
| template<typename OffsetT > | |
| __device__ __forceinline__ void | ScatterToStripedGuarded (InputT items[ITEMS_PER_THREAD], OffsetT ranks[ITEMS_PER_THREAD]) |
| template<typename OffsetT , typename ValidFlag > | |
| __device__ __forceinline__ void | ScatterToStripedFlagged (InputT items[ITEMS_PER_THREAD], OffsetT ranks[ITEMS_PER_THREAD], ValidFlag is_valid[ITEMS_PER_THREAD]) |
Private Types | |
| enum | { BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z , LOG_WARP_THREADS = CUB_LOG_WARP_THREADS(PTX_ARCH) , WARP_THREADS = 1 << LOG_WARP_THREADS , WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS , LOG_SMEM_BANKS = CUB_LOG_SMEM_BANKS(PTX_ARCH) , SMEM_BANKS = 1 << LOG_SMEM_BANKS , TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD , TIME_SLICES = (WARP_TIME_SLICING) ? WARPS : 1 , TIME_SLICED_THREADS = (WARP_TIME_SLICING) ? CUB_MIN(BLOCK_THREADS, WARP_THREADS) : BLOCK_THREADS , TIME_SLICED_ITEMS = TIME_SLICED_THREADS * ITEMS_PER_THREAD , WARP_TIME_SLICED_THREADS = CUB_MIN(BLOCK_THREADS, WARP_THREADS) , WARP_TIME_SLICED_ITEMS = WARP_TIME_SLICED_THREADS * ITEMS_PER_THREAD , INSERT_PADDING = (ITEMS_PER_THREAD > 4) && (PowerOfTwo<ITEMS_PER_THREAD>::VALUE) , PADDING_ITEMS = (INSERT_PADDING) ? (TIME_SLICED_ITEMS >> LOG_SMEM_BANKS) : 0 } |
| Constants. More... | |
Private Member Functions | |
| struct | __align__ (16) _TempStorage |
| Shared memory storage layout type. | |
| __device__ __forceinline__ _TempStorage & | PrivateStorage () |
| Internal storage allocator. | |
| template<typename OutputT > | |
| __device__ __forceinline__ void | BlockedToStriped (InputT input_items[ITEMS_PER_THREAD], OutputT output_items[ITEMS_PER_THREAD], Int2Type< false >) |
| template<typename OutputT > | |
| __device__ __forceinline__ void | BlockedToStriped (InputT input_items[ITEMS_PER_THREAD], OutputT output_items[ITEMS_PER_THREAD], Int2Type< true >) |
| template<typename OutputT > | |
| __device__ __forceinline__ void | BlockedToWarpStriped (InputT input_items[ITEMS_PER_THREAD], OutputT output_items[ITEMS_PER_THREAD], Int2Type< false >) |
| template<typename OutputT > | |
| __device__ __forceinline__ void | BlockedToWarpStriped (InputT input_items[ITEMS_PER_THREAD], OutputT output_items[ITEMS_PER_THREAD], Int2Type< true >) |
| template<typename OutputT > | |
| __device__ __forceinline__ void | StripedToBlocked (InputT input_items[ITEMS_PER_THREAD], OutputT output_items[ITEMS_PER_THREAD], Int2Type< false >) |
| template<typename OutputT > | |
| __device__ __forceinline__ void | StripedToBlocked (InputT input_items[ITEMS_PER_THREAD], OutputT output_items[ITEMS_PER_THREAD], Int2Type< true >) |
| template<typename OutputT > | |
| __device__ __forceinline__ void | WarpStripedToBlocked (InputT input_items[ITEMS_PER_THREAD], OutputT output_items[ITEMS_PER_THREAD], Int2Type< false >) |
| template<typename OutputT > | |
| __device__ __forceinline__ void | WarpStripedToBlocked (InputT input_items[ITEMS_PER_THREAD], OutputT output_items[ITEMS_PER_THREAD], Int2Type< true >) |
| template<typename OutputT , typename OffsetT > | |
| __device__ __forceinline__ void | ScatterToBlocked (InputT input_items[ITEMS_PER_THREAD], OutputT output_items[ITEMS_PER_THREAD], OffsetT ranks[ITEMS_PER_THREAD], Int2Type< false >) |
| template<typename OutputT , typename OffsetT > | |
| __device__ __forceinline__ void | ScatterToBlocked (InputT input_items[ITEMS_PER_THREAD], OutputT output_items[ITEMS_PER_THREAD], OffsetT ranks[ITEMS_PER_THREAD], Int2Type< true >) |
| template<typename OutputT , typename OffsetT > | |
| __device__ __forceinline__ void | ScatterToStriped (InputT input_items[ITEMS_PER_THREAD], OutputT output_items[ITEMS_PER_THREAD], OffsetT ranks[ITEMS_PER_THREAD], Int2Type< false >) |
| template<typename OutputT , typename OffsetT > | |
| __device__ __forceinline__ void | ScatterToStriped (InputT input_items[ITEMS_PER_THREAD], OutputT output_items[ITEMS_PER_THREAD], OffsetT ranks[ITEMS_PER_THREAD], Int2Type< true >) |
Private Attributes | |
| _TempStorage & | temp_storage |
| Shared storage reference. | |
| unsigned int | linear_tid |
| Linear thread-id. | |
| unsigned int | lane_id |
| unsigned int | warp_id |
| unsigned int | warp_offset |
|
private |
Constants.
| Enumerator | |
|---|---|
| BLOCK_THREADS | The thread block size in threads. |
Definition at line 125 of file block_exchange.cuh.
|
inline |
Collective constructor using a private static allocation of shared memory as temporary storage.
Definition at line 713 of file block_exchange.cuh.
|
inline |
Collective constructor using the specified memory allocation as temporary storage.
| [in] | temp_storage | Reference to memory allocation having layout type TempStorage |
Definition at line 726 of file block_exchange.cuh.
|
inlineprivate |
Shared memory storage layout type.
Definition at line 1247 of file block_exchange.cuh.
|
inline |
Transposes data items from blocked arrangement to striped arrangement.
thread_data across the block of threads is { [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }. The corresponding output thread_data in those threads will be { [0,128,256,384], [1,129,257,385], ..., [127,255,383,511] } in preparation for storing to device-accessible memory. | [in] | input_items | Items to exchange, converting between striped and blocked arrangements. |
| [out] | output_items | Items from exchange, converting between striped and blocked arrangements. |
Definition at line 829 of file block_exchange.cuh.
|
inlineprivate |
Transposes data items from blocked arrangement to striped arrangement. Specialized for no timeslicing.
| [in] | input_items | Items to exchange, converting between blocked and striped arrangements. |
| [out] | output_items | Items to exchange, converting between blocked and striped arrangements. |
Definition at line 200 of file block_exchange.cuh.
|
inlineprivate |
Transposes data items from blocked arrangement to striped arrangement. Specialized for warp-timeslicing.
| [in] | input_items | Items to exchange, converting between blocked and striped arrangements. |
| [out] | output_items | Items to exchange, converting between blocked and striped arrangements. |
Definition at line 229 of file block_exchange.cuh.
|
inline |
| items | [in-out] Items to exchange, converting between striped and blocked arrangements. |
Definition at line 1069 of file block_exchange.cuh.
|
inline |
Transposes data items from blocked arrangement to warp-striped arrangement.
thread_data across the block of threads is { [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }. The corresponding output thread_data in those threads will be { [0,32,64,96], [1,33,65,97], [2,34,66,98], ..., [415,447,479,511] } in preparation for storing to device-accessible memory. (The first 128 items are striped across the first warp of 32 threads, the second 128 items are striped across the second warp, etc.) | [in] | input_items | Items to exchange, converting between striped and blocked arrangements. |
| [out] | output_items | Items from exchange, converting between striped and blocked arrangements. |
Definition at line 928 of file block_exchange.cuh.
|
inlineprivate |
Transposes data items from blocked arrangement to warp-striped arrangement. Specialized for no timeslicing
| [in] | input_items | Items to exchange, converting between blocked and striped arrangements. |
| [out] | output_items | Items to exchange, converting between blocked and striped arrangements. |
Definition at line 289 of file block_exchange.cuh.
|
inlineprivate |
Transposes data items from blocked arrangement to warp-striped arrangement. Specialized for warp-timeslicing
| [in] | input_items | Items to exchange, converting between blocked and striped arrangements. |
| [out] | output_items | Items to exchange, converting between blocked and striped arrangements. |
Definition at line 317 of file block_exchange.cuh.
|
inline |
| items | [in-out] Items to exchange, converting between striped and blocked arrangements. |
Definition at line 1081 of file block_exchange.cuh.
|
inlineprivate |
Internal storage allocator.
Definition at line 189 of file block_exchange.cuh.
|
inline |
Exchanges data items annotated by rank into blocked arrangement.
| OffsetT | [inferred] Signed integer type for local offsets |
| [in] | input_items | Items to exchange, converting between striped and blocked arrangements. |
| [out] | output_items | Items from exchange, converting between striped and blocked arrangements. |
| [in] | ranks | Corresponding scatter ranks |
Definition at line 953 of file block_exchange.cuh.
|
inlineprivate |
Exchanges data items annotated by rank into blocked arrangement. Specialized for no timeslicing.
| [in] | input_items | Items to exchange, converting between blocked and striped arrangements. |
| [out] | output_items | Items to exchange, converting between blocked and striped arrangements. |
| [in] | ranks | Corresponding scatter ranks |
Definition at line 534 of file block_exchange.cuh.
|
inlineprivate |
Exchanges data items annotated by rank into blocked arrangement. Specialized for warp-timeslicing.
| [in] | input_items | Items to exchange, converting between blocked and striped arrangements. |
| [out] | output_items | Items to exchange, converting between blocked and striped arrangements. |
| [in] | ranks | Corresponding scatter ranks |
Definition at line 563 of file block_exchange.cuh.
|
inline |
| items | [in-out] Items to exchange, converting between striped and blocked arrangements. | |
| [in] | ranks | Corresponding scatter ranks |
Definition at line 1088 of file block_exchange.cuh.
|
inline |
Exchanges data items annotated by rank into striped arrangement.
| OffsetT | [inferred] Signed integer type for local offsets |
| [in] | input_items | Items to exchange, converting between striped and blocked arrangements. |
| [out] | output_items | Items from exchange, converting between striped and blocked arrangements. |
| [in] | ranks | Corresponding scatter ranks |
Definition at line 972 of file block_exchange.cuh.
|
inlineprivate |
Exchanges data items annotated by rank into striped arrangement. Specialized for no timeslicing.
| [in] | input_items | Items to exchange, converting between blocked and striped arrangements. |
| [out] | output_items | Items to exchange, converting between blocked and striped arrangements. |
| [in] | ranks | Corresponding scatter ranks |
Definition at line 616 of file block_exchange.cuh.
|
inlineprivate |
Exchanges data items annotated by rank into striped arrangement. Specialized for warp-timeslicing.
| [in] | input_items | Items to exchange, converting between blocked and striped arrangements. |
| [out] | output_items | Items to exchange, converting between blocked and striped arrangements. |
| [in] | ranks | Corresponding scatter ranks |
Definition at line 646 of file block_exchange.cuh.
|
inline |
| items | [in-out] Items to exchange, converting between striped and blocked arrangements. | |
| [in] | ranks | Corresponding scatter ranks |
Definition at line 1096 of file block_exchange.cuh.
|
inline |
Exchanges valid data items annotated by rank into striped arrangement.
| OffsetT | [inferred] Signed integer type for local offsets |
| ValidFlag | [inferred] FlagT type denoting which items are valid |
| [in] | input_items | Items to exchange, converting between striped and blocked arrangements. |
| [out] | output_items | Items from exchange, converting between striped and blocked arrangements. |
| [in] | ranks | Corresponding scatter ranks |
| [in] | is_valid | Corresponding flag denoting item validity |
Definition at line 1029 of file block_exchange.cuh.
|
inline |
| items | [in-out] Items to exchange, converting between striped and blocked arrangements. | |
| [in] | ranks | Corresponding scatter ranks |
| [in] | is_valid | Corresponding flag denoting item validity |
Definition at line 1112 of file block_exchange.cuh.
|
inline |
Exchanges data items annotated by rank into striped arrangement. Items with rank -1 are not exchanged.
| OffsetT | [inferred] Signed integer type for local offsets |
| [in] | input_items | Items to exchange, converting between striped and blocked arrangements. |
| [out] | output_items | Items from exchange, converting between striped and blocked arrangements. |
| [in] | ranks | Corresponding scatter ranks |
Definition at line 991 of file block_exchange.cuh.
|
inline |
| items | [in-out] Items to exchange, converting between striped and blocked arrangements. | |
| [in] | ranks | Corresponding scatter ranks |
Definition at line 1104 of file block_exchange.cuh.
|
inline |
Transposes data items from striped arrangement to blocked arrangement.
thread_data across the block of threads is { [0,128,256,384], [1,129,257,385], ..., [127,255,383,511] } after loading from device-accessible memory. The corresponding output thread_data in those threads will be { [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }. | [in] | input_items | Items to exchange, converting between striped and blocked arrangements. |
| [out] | output_items | Items from exchange, converting between striped and blocked arrangements. |
Definition at line 780 of file block_exchange.cuh.
|
inlineprivate |
Transposes data items from striped arrangement to blocked arrangement. Specialized for no timeslicing.
| [in] | input_items | Items to exchange, converting between blocked and striped arrangements. |
| [out] | output_items | Items to exchange, converting between blocked and striped arrangements. |
Definition at line 376 of file block_exchange.cuh.
|
inlineprivate |
Transposes data items from striped arrangement to blocked arrangement. Specialized for warp-timeslicing.
| [in] | input_items | Items to exchange, converting between blocked and striped arrangements. |
| [out] | output_items | Items to exchange, converting between blocked and striped arrangements. |
Definition at line 406 of file block_exchange.cuh.
|
inline |
| items | [in-out] Items to exchange, converting between striped and blocked arrangements. |
Definition at line 1063 of file block_exchange.cuh.
|
inline |
Transposes data items from warp-striped arrangement to blocked arrangement.
thread_data across the block of threads is { [0,32,64,96], [1,33,65,97], [2,34,66,98], ..., [415,447,479,511] } after loading from device-accessible memory. (The first 128 items are striped across the first warp of 32 threads, the second 128 items are striped across the second warp, etc.) The corresponding output thread_data in those threads will be { [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }. | [in] | input_items | Items to exchange, converting between striped and blocked arrangements. |
| [out] | output_items | Items from exchange, converting between striped and blocked arrangements. |
Definition at line 877 of file block_exchange.cuh.
|
inlineprivate |
Transposes data items from warp-striped arrangement to blocked arrangement. Specialized for no timeslicing
| [in] | input_items | Items to exchange, converting between blocked and striped arrangements. |
| [out] | output_items | Items to exchange, converting between blocked and striped arrangements. |
Definition at line 467 of file block_exchange.cuh.
|
inlineprivate |
Transposes data items from warp-striped arrangement to blocked arrangement. Specialized for warp-timeslicing
| [in] | input_items | Items to exchange, converting between blocked and striped arrangements. |
| [out] | output_items | Items to exchange, converting between blocked and striped arrangements. |
Definition at line 496 of file block_exchange.cuh.
|
inline |
| items | [in-out] Items to exchange, converting between striped and blocked arrangements. |
Definition at line 1075 of file block_exchange.cuh.
|
private |
Definition at line 179 of file block_exchange.cuh.
|
private |
Linear thread-id.
Definition at line 178 of file block_exchange.cuh.
|
private |
Shared storage reference.
Definition at line 175 of file block_exchange.cuh.
|
private |
Definition at line 180 of file block_exchange.cuh.
|
private |
Definition at line 181 of file block_exchange.cuh.