template<typename InputT,
int BLOCK_DIM_X,
int ITEMS_PER_THREAD,
BlockLoadAlgorithm ALGORITHM = BLOCK_LOAD_DIRECT,
int BLOCK_DIM_Y = 1,
int BLOCK_DIM_Z = 1,
int PTX_ARCH = CUB_PTX_ARCH>
class cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >
The BlockLoad class provides collective data movement methods for loading a linear segment of items from memory into a blocked arrangement across a CUDA thread block.
- Template Parameters
InputT | The data type to read into (which must be convertible from the input iterator's value type). |
BLOCK_DIM_X | The thread block length in threads along the X dimension. |
ITEMS_PER_THREAD | The number of consecutive items partitioned onto each thread. |
ALGORITHM | [optional] cub::BlockLoadAlgorithm tuning policy (default: cub::BLOCK_LOAD_DIRECT). |
BLOCK_DIM_Y | [optional] The thread block length in threads along the Y dimension (default: 1). |
BLOCK_DIM_Z | [optional] The thread block length in threads along the Z dimension (default: 1). |
PTX_ARCH | [optional] The targeted PTX compute capability for which to specialize this collective, formatted as per the __CUDA_ARCH__ macro (e.g., 350 for sm_35). Useful for determining the collective's shared memory requirements for a given device from the host (default: the value of __CUDA_ARCH__ during the current compiler pass). |
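- The following is a minimal sketch (not part of the original documentation) showing how the optional template parameters above can be supplied; the kernel name ExampleKernel2D and the 32x4 block shape are illustrative assumptions.
#include <cub/cub.cuh>

__global__ void ExampleKernel2D(int *d_data)
{
    // Specialize BlockLoad for a 2D block of 32x4 threads (128 total), 4 integer
    // items per thread, the BLOCK_LOAD_TRANSPOSE policy, and BLOCK_DIM_Y = 4
    typedef cub::BlockLoad<int, 32, 4, cub::BLOCK_LOAD_TRANSPOSE, 4> BlockLoad;

    // Shared memory required by the collective
    __shared__ typename BlockLoad::TempStorage temp_storage;

    // Load 512 consecutive integers, blocked across the 128 threads
    int thread_data[4];
    BlockLoad(temp_storage).Load(d_data, thread_data);
}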
- Overview
- A Simple Example
- Every thread in the block uses the BlockLoad class by first specializing the BlockLoad type, then instantiating an instance with a temporary storage allocation.
- The code snippet below illustrates the loading of a linear segment of 512 integers into a "blocked" arrangement across 128 threads where each thread owns 4 consecutive items. The load is specialized for BLOCK_LOAD_WARP_TRANSPOSE, meaning memory references are efficiently coalesced using a warp-striped access pattern (after which items are locally reordered among threads).
#include <cub/cub.cuh>   // or equivalently <cub/block/block_load.cuh>

__global__ void ExampleKernel(int *d_data, ...)
{
    // Specialize BlockLoad for a 1D block of 128 threads owning 4 integer items each
    typedef cub::BlockLoad<int, 128, 4, cub::BLOCK_LOAD_WARP_TRANSPOSE> BlockLoad;

    // Allocate shared memory for BlockLoad
    __shared__ typename BlockLoad::TempStorage temp_storage;

    // Load a segment of consecutive items that are blocked across threads
    int thread_data[4];
    BlockLoad(temp_storage).Load(d_data, thread_data);
- Suppose the input d_data is 0, 1, 2, 3, 4, 5, .... The set of thread_data across the block of threads will be { [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }.
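- As a hedged end-to-end sketch (not part of the original documentation), the example kernel above might be driven from the host as follows; the simplified signature ExampleKernel(int *) and the printf inspection are assumptions for illustration.
#include <cub/cub.cuh>
#include <cstdio>

__global__ void ExampleKernel(int *d_data)
{
    typedef cub::BlockLoad<int, 128, 4, cub::BLOCK_LOAD_WARP_TRANSPOSE> BlockLoad;
    __shared__ typename BlockLoad::TempStorage temp_storage;

    // Load 512 consecutive integers into a blocked arrangement (4 per thread)
    int thread_data[4];
    BlockLoad(temp_storage).Load(d_data, thread_data);

    // Thread 0 should now own [0,1,2,3]
    if (threadIdx.x == 0)
        printf("thread 0: [%d,%d,%d,%d]\n",
               thread_data[0], thread_data[1], thread_data[2], thread_data[3]);
}

int main()
{
    const int num_items = 512;
    int h_data[num_items];
    for (int i = 0; i < num_items; ++i) h_data[i] = i;   // 0, 1, 2, 3, ...

    int *d_data;
    cudaMalloc(&d_data, num_items * sizeof(int));
    cudaMemcpy(d_data, h_data, num_items * sizeof(int), cudaMemcpyHostToDevice);

    ExampleKernel<<<1, 128>>>(d_data);   // one block of 128 threads, 4 items per thread
    cudaDeviceSynchronize();

    cudaFree(d_data);
    return 0;
}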
Definition at line 640 of file block_load.cuh.
- Public Methods
__device__ __forceinline__ BlockLoad ()
    Collective constructor using a private static allocation of shared memory as temporary storage.

__device__ __forceinline__ BlockLoad (TempStorage &temp_storage)
    Collective constructor using the specified memory allocation as temporary storage.

template<typename InputIteratorT>
__device__ __forceinline__ void Load (InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD])
    Load a linear segment of items from memory.

template<typename InputIteratorT>
__device__ __forceinline__ void Load (InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD], int valid_items)
    Load a linear segment of items from memory, guarded by range.

template<typename InputIteratorT, typename DefaultT>
__device__ __forceinline__ void Load (InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD], int valid_items, DefaultT oob_default)
    Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements.
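- The guarded Load overloads above are useful for partially full tiles at the end of an input. The following is a minimal sketch of their use (assumptions: the kernel name GuardedLoadKernel and the -1 sentinel are illustrative, not from the original page).
#include <cub/cub.cuh>

__global__ void GuardedLoadKernel(int *d_data, int num_valid)
{
    typedef cub::BlockLoad<int, 128, 4, cub::BLOCK_LOAD_WARP_TRANSPOSE> BlockLoad;
    __shared__ typename BlockLoad::TempStorage temp_storage;

    int thread_data[4];

    // Guarded by range: items at positions >= num_valid are left unmodified (uninitialized here)
    BlockLoad(temp_storage).Load(d_data, thread_data, num_valid);

    __syncthreads();   // barrier before reusing temp_storage for another collective call

    // Guarded by range with a fall-back: out-of-bounds items are assigned -1
    BlockLoad(temp_storage).Load(d_data, thread_data, num_valid, -1);
}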